Spaces:

dengkane
/

learn-streamlit

Sleeping

App Files Community

dengkane committed on Aug 30, 2023

Commit

44df7d3

•

1 Parent(s): bddb13b

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -40

app.py CHANGED Viewed

@@ -4,57 +4,47 @@ import streamlit as st
 import numpy as np
 import pandas as pd
 import faiss
-from sentence_transformers import SentenceTransformer
-model = SentenceTransformer('moka-ai/m3e-base')
-#Our sentences we like to encode
-sentences = [
-    '* Moka 此文本嵌入模型由 MokaAI 训练并开源，训练脚本使用 uniem',
-    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
-    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算，异质文本检索等功能，未来还会支持代码检索，ALL in one'
-]
-#Sentences are encoded by calling model.encode()
-embeddings = model.encode(sentences)
-#Print the embeddings
-#for sentence, embedding in zip(sentences, embeddings):
-#    st.write("Sentence:", sentence)
-#    st.write("Embedding:", embedding)
-#    st.write("")
-def get_embedding(text_content):
-    return model.encode(text_content)
-# Load the text file as knowledge
-knowledge_file = 'knowledge.txt'
-knowledge = []
-with open(knowledge_file, 'r', encoding='utf-8') as file:
-    for line in file:
-        knowledge.append(line.strip())
-# Perform embedding for the knowledge texts and add to index
 embeddings = []
-for text in knowledge:
-    # Add your code here for text embedding (e.g., using word embeddings, sentence transformers, etc.)
-    embedding = get_embedding(text)
-    embeddings.append(embedding)
 embeddings = np.array(embeddings)
-# Create an index
-index = faiss.IndexFlatIP(embeddings.shape[1])  # Use Inner Product (IP) as similarity measure
 index.add(embeddings)
-# Get user input for a question
-question = st.text_input("Enter your question: ")
-# Perform embedding for the question
-question_embedding = get_embedding(question)
 # Search index for the most similar content
 k = 5  # Number of results to retrieve
 D, I = index.search(np.array([question_embedding]), k)

 import numpy as np
 import pandas as pd
 import faiss
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
+# Load the embedding model and tokenizer
+model_name = "moka-ai/m3e-base"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModel.from_pretrained(model_name)
+# Generate some random text contents
+texts = ["This is the first document.", "This is the second document.", "And this is the third one.", "Is this the first document?"]
+# Convert the text contents to embeddings
 embeddings = []
+for text in texts:
+    input_ids = tokenizer.encode(text, return_tensors="pt")
+    with torch.no_grad():
+        embedding = model(input_ids)[0][0].numpy()
+        embeddings.append(embedding)
 embeddings = np.array(embeddings)
+# Create a Faiss index
+d = embeddings.shape[1]  # Dimension of the embeddings
+index = faiss.IndexFlatIP(d)  # Index that uses inner product (dot product) similarity
+# Add the embeddings to the index
 index.add(embeddings)
+# Search for similar documents
+query = "This is a new document."
+input_ids = tokenizer.encode(query, return_tensors="pt")
+with torch.no_grad():
+    query_embedding = model(input_ids)[0][0].numpy()
+k = 2  # Number of similar documents to retrieve
+D, I = index.search(query_embedding.reshape(1, -1), k)
+# Print the results
+st.write(f"Query: {query}")
+for i in range(k):
+    st.write(f"Rank {i+1}: {texts[I[0][i]]} (similarity score: {D[0][i]})")
 # Search index for the most similar content
 k = 5  # Number of results to retrieve
 D, I = index.search(np.array([question_embedding]), k)