Hansimov committed on
Commit
4bc1e6a
1 Parent(s): 3cb12a2

:gem: [Feature] Minimal working script for embedding

Browse files
Files changed (2) hide show
  1. transforms/__init__.py +0 -0
  2. transforms/embed.py +39 -0
transforms/__init__.py ADDED
File without changes
transforms/embed.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from typing import Union

from configs.envs import ENVS

# The Hugging Face settings must be exported BEFORE `transformers` is imported.
# The Hugging Face libraries resolve HF_ENDPOINT into a module-level constant
# when they are first imported, so assigning it afterwards would leave the
# default endpoint in effect and silently ignore the configured mirror.
os.environ["HF_ENDPOINT"] = ENVS["HF_ENDPOINT"]
os.environ["HF_TOKEN"] = ENVS["HF_TOKEN"]

# Deliberately imported after the environment is configured (see note above).
from numpy.linalg import norm  # noqa: E402
from tclogger import logger  # noqa: E402
from transformers import AutoModel  # noqa: E402
13
+
14
+
15
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors (or batches of vectors).

    Args:
        a: A 1-D array (single vector) or 2-D array with one vector per row.
        b: A 1-D array (single vector) or 2-D array with one vector per row.

    Returns:
        A scalar when both inputs are 1-D; a 1-D array of similarities when
        exactly one input is 2-D; an ``(m, n)`` matrix of pairwise
        similarities when ``a`` has ``m`` rows and ``b`` has ``n`` rows.

    Zero-length vectors are not special-cased: the division by a zero norm is
    left to the array library.
    """
    # Norms are taken per vector (last axis). A bare ``norm(x)`` on a 2-D
    # array is the Frobenius norm of the whole matrix, which would scale every
    # pairwise dot product by the same global constant instead of normalizing
    # each row.
    a_norm = norm(a, axis=-1)
    b_norm = norm(b, axis=-1)

    if a.ndim > 1 and b.ndim > 1:
        # Pairwise case: build the (m, n) grid of norm products.
        denominator = a_norm[:, None] * b_norm[None, :]
    else:
        # At least one side is a single vector, so plain broadcasting lines up
        # with the shape of ``a @ b.T``.
        denominator = a_norm * b_norm

    return (a @ b.T) / denominator
17
+
18
+
19
class JinaAIEmbedder:
    """Thin wrapper around a Hugging Face model that exposes ``encode``.

    The model named by ``model_name`` is loaded as soon as the instance is
    constructed and stored on ``self.model``.
    """

    def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-base-zh"):
        """Remember the model name and load the model right away."""
        self.model_name = model_name
        self.load_model()

    def load_model(self):
        """Load ``self.model_name`` via ``AutoModel`` into ``self.model``.

        ``trust_remote_code=True`` is passed through to ``from_pretrained``.
        NOTE(review): this flag lets the model repository supply its own
        Python code, so only point ``model_name`` at repositories you trust.
        """
        loaded = AutoModel.from_pretrained(
            self.model_name,
            trust_remote_code=True,
        )
        self.model = loaded

    def encode(self, text: Union[str, list[str]]):
        """Encode one string or a list of strings with the loaded model.

        A single string is wrapped in a one-element list, so the underlying
        ``self.model.encode`` always receives a list. Its result is returned
        unchanged.
        """
        batch = [text] if isinstance(text, str) else text
        return self.model.encode(batch)
31
+
32
+
33
if __name__ == "__main__":
    # Smoke test: embed the same question in English and Chinese and log the
    # resulting embeddings. `encode` also accepts a single plain string.
    samples = ["How is the weather today?", "今天天气怎么样?"]
    vectors = JinaAIEmbedder().encode(samples)
    logger.success(vectors)