dengkane committed on
Commit
44df7d3
1 Parent(s): bddb13b

Update app.py

Files changed (1)
  1. app.py +30 -40
app.py CHANGED
@@ -4,57 +4,47 @@ import streamlit as st
 import numpy as np
 import pandas as pd
 import faiss
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
 
-from sentence_transformers import SentenceTransformer
-
-model = SentenceTransformer('moka-ai/m3e-base')
-
-#Our sentences we like to encode
-sentences = [
-    '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem',
-    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
-    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算,异质文本检索等功能,未来还会支持代码检索,ALL in one'
-]
-
-#Sentences are encoded by calling model.encode()
-embeddings = model.encode(sentences)
-
-#Print the embeddings
-#for sentence, embedding in zip(sentences, embeddings):
-#    st.write("Sentence:", sentence)
-#    st.write("Embedding:", embedding)
-#    st.write("")
-
-
-def get_embedding(text_content):
-    return model.encode(text_content)
+# Load the embedding model and tokenizer
+model_name = "moka-ai/m3e-base"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModel.from_pretrained(model_name)
 
-# Load the text file as knowledge
-knowledge_file = 'knowledge.txt'
-knowledge = []
-with open(knowledge_file, 'r', encoding='utf-8') as file:
-    for line in file:
-        knowledge.append(line.strip())
+# Generate some random text contents
+texts = ["This is the first document.", "This is the second document.", "And this is the third one.", "Is this the first document?"]
 
-# Perform embedding for the knowledge texts and add to index
+# Convert the text contents to embeddings
 embeddings = []
-for text in knowledge:
-    # Add your code here for text embedding (e.g., using word embeddings, sentence transformers, etc.)
-    embedding = get_embedding(text)
-    embeddings.append(embedding)
+for text in texts:
+    input_ids = tokenizer.encode(text, return_tensors="pt")
+    with torch.no_grad():
+        embedding = model(input_ids)[0][0].numpy()
+    embeddings.append(embedding)
 embeddings = np.array(embeddings)
 
-# Create an index
-index = faiss.IndexFlatIP(embeddings.shape[1])  # Use Inner Product (IP) as similarity measure
+# Create a Faiss index
+d = embeddings.shape[1]  # Dimension of the embeddings
+index = faiss.IndexFlatIP(d)  # Index that uses inner product (dot product) similarity
 
+# Add the embeddings to the index
 index.add(embeddings)
 
-# Get user input for a question
-question = st.text_input("Enter your question: ")
+# Search for similar documents
+query = "This is a new document."
+input_ids = tokenizer.encode(query, return_tensors="pt")
+with torch.no_grad():
+    query_embedding = model(input_ids)[0][0].numpy()
+k = 2  # Number of similar documents to retrieve
+D, I = index.search(query_embedding.reshape(1, -1), k)
 
-# Perform embedding for the question
-question_embedding = get_embedding(question)
+# Print the results
+st.write(f"Query: {query}")
+for i in range(k):
+    st.write(f"Rank {i+1}: {texts[I[0][i]]} (similarity score: {D[0][i]})")
 
+
 # Search index for the most similar content
 k = 5  # Number of results to retrieve
 D, I = index.search(np.array([question_embedding]), k)
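
Note on the added hunk: it calls torch.no_grad() although no import torch appears within the diff (torch may be imported in the first lines of app.py, outside this hunk), and model(input_ids)[0][0] returns one vector per token rather than a single fixed-size vector per text. The sketch below shows one possible way to obtain fixed-size sentence vectors with the same transformers API; the helper name embed_text and the mean-pooling choice are illustrative assumptions, not part of this commit.

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

model_name = "moka-ai/m3e-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(text):
    # Tokenize and run the encoder without tracking gradients.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        last_hidden = model(**inputs).last_hidden_state  # shape: (1, seq_len, hidden_size)
    # Mean-pool the token vectors into one fixed-size sentence vector (illustrative choice).
    return last_hidden.mean(dim=1).squeeze(0).numpy()

texts = ["This is the first document.", "This is the second document."]
embeddings = np.vstack([embed_text(t) for t in texts]).astype("float32")  # (n_texts, hidden_size)

If IndexFlatIP is intended to approximate cosine similarity, the document vectors and the query vector would also need L2 normalization (for example with faiss.normalize_L2 on float32 arrays) before calling index.add and index.search.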