dengkane committed on
Commit d995b49 (1 parent: 83db5d1)

Update app.py

Files changed (1)
  1. app.py +22 -24
app.py CHANGED
@@ -1,31 +1,29 @@
import streamlit as st
# To make things easier later, we're also importing numpy and pandas for
# working with sample data.
- import numpy as np
- import pandas as pd
import torch
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer('moka-ai/m3e-base')
+
+ #Our sentences we like to encode
+ sentences = [
+     '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem',
+     '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
+     '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算,异质文本检索等功能,未来还会支持代码检索,ALL in one'
+ ]
+
+ #Sentences are encoded by calling model.encode()
+ embeddings = model.encode(sentences)
+
+ #Print the embeddings
+ for sentence, embedding in zip(sentences, embeddings):
+     print("Sentence:", sentence)
+     print("Embedding:", embedding)
+     print("")
+
+
import faiss
- import numpy as np
- from transformers import AutoTokenizer, AutoModel
-
- # Load the embedding model and tokenizer
- model_name = "moka-ai/m3e-base"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModel.from_pretrained(model_name)
-
- # Generate some random text contents
- texts = ["This is the first document.", "This is the second document.", "And this is the third one.", "Is this the first document?"]
-
- # Convert the text contents to embeddings
- embeddings = []
- for text in texts:
-     input_ids = tokenizer.encode(text, return_tensors="pt")
-     with torch.no_grad():
-         embedding = model(input_ids)[0][0].numpy()
-     embeddings.append(embedding)
- embeddings = np.array(embeddings)
-
- # Create a Faiss index
d = embeddings.shape[1] # Dimension of the embeddings
index = faiss.IndexFlatIP(d) # Index that uses inner product (dot product) similarity

@@ -33,7 +31,7 @@ index = faiss.IndexFlatIP(d) # Index that uses inner product (dot product) similarity
index.add(embeddings)

# Search for similar documents
- query = "This is a new document."
+ query = "训练脚本."
input_ids = tokenizer.encode(query, return_tensors="pt")
with torch.no_grad():
    query_embedding = model(input_ids)[0][0].numpy()
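Note on the file as committed: the search block at the end of app.py still references tokenizer and calls model(input_ids), but this commit removed the transformers tokenizer and rebound model to a SentenceTransformer, so those last three lines now raise a NameError. A minimal sketch of how the query step could be brought in line with the new sentence-transformers flow (not part of this commit; the top-k value and the result printing are illustrative):

import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('moka-ai/m3e-base')

sentences = [
    '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem',
    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算,异质文本检索等功能,未来还会支持代码检索,ALL in one',
]

# encode() returns a float32 numpy array of shape (n, d), which is what faiss expects
embeddings = model.encode(sentences)

index = faiss.IndexFlatIP(embeddings.shape[1])  # inner-product (dot-product) similarity
index.add(embeddings)

# Encode the query with the same model instead of the removed tokenizer/model pair;
# encode() on a one-element list yields a (1, d) array, ready for index.search()
query = "训练脚本."  # "training script"
query_embedding = model.encode([query])

k = 3  # illustrative top-k
scores, ids = index.search(query_embedding, k)
for score, i in zip(scores[0], ids[0]):
    print(f"{score:.4f}  {sentences[i]}")

Because IndexFlatIP ranks by raw dot product, scores depend on vector magnitude; passing normalize_embeddings=True to model.encode() would make the inner product equal to cosine similarity.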