# SexBot / milvus_db.py
# Pew404's picture
# Upload folder using huggingface_hub
# 13fbd2e verified
from pymilvus import MilvusClient, AnnSearchRequest, RRFRanker
from langchain_community.embeddings.ollama import OllamaEmbeddings
from pymilvus import WeightedRanker
# Module-level reciprocal-rank-fusion ranker, created at import time.
# NOTE(review): nothing in the visible part of this file reads `reranker`
# (hybrid_search builds its own WeightedRanker) — confirm whether it is
# still needed by other modules before removing it.
reranker = RRFRanker(k=10)
# The triple-quoted string below is a bare string expression, not executed
# code: it keeps an earlier single-vector search example around for reference.
"""
embed_model = OllamaEmbeddings(model="bge-m3")
client = MilvusClient(uri="http://192.168.5.103:19530")
query = "Can I take pills?"
query_embedding = embed_model.embed_query(query)
# single vector search
res = client.search(
collection_name="t_sur_sex_ed_article_spider",
data=[query_embedding],
limit=2,
search_params={"metric_type": "COSINE", "params": {}},
anns_field="chunk_vector",
output_fields=["title", "chunk", "link", "category"]
)
"""
# hybrid search
def hybrid_search(query, embed_model: OllamaEmbeddings, collection_name, client: MilvusClient, *, limit: int = 3):
    """Run a weighted hybrid (multi-field) vector search for ``query``.

    The query text is embedded once; the same embedding is then searched
    against several ANN fields and the per-field hits are fused with a
    ``WeightedRanker``.

    Args:
        query: Text to embed and search for.
        embed_model: Embedding model; only ``embed_query`` is called.
        collection_name: Name of the Milvus collection to search.
        client: Milvus client used to execute the hybrid search.
        limit: Number of hits requested from each per-field search and
            returned after reranking. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The value returned by ``client.hybrid_search``, unmodified.
    """
    query_embedding = embed_model.embed_query(query)

    # One row per searched field: (anns_field, weight given to the ranker).
    # Keeping field and weight side by side prevents the positional weights
    # from drifting out of sync with the request order.
    # NOTE(review): "tags" is searched as a COSINE vector field — confirm it
    # really is a vector field in the collection schema.
    weighted_fields = (
        ("title_vector", 0.3),
        ("chunk_vector", 0.6),
        ("tags", 0.1),
    )

    requests = [
        AnnSearchRequest(
            data=[query_embedding],
            anns_field=field,
            param={
                "metric_type": "COSINE",
                "params": {"nprobe": 10, "level": 3},
            },
            limit=limit,
        )
        for field, _ in weighted_fields
    ]

    # Rerank: weights are passed positionally, in the same order as requests.
    ranker = WeightedRanker(*(weight for _, weight in weighted_fields))

    return client.hybrid_search(
        collection_name=collection_name,
        reqs=requests,
        ranker=ranker,
        limit=limit,
        output_fields=["title", "chunk", "link"],
    )
def single_vector_search(query, embed_model: OllamaEmbeddings, collection_name, client: MilvusClient, anns_field, *, limit: int = 20):
    """Search one vector field and order the hits by their like/dislike ratio.

    Args:
        query: Text to embed and search for.
        embed_model: Embedding model; only ``embed_query`` is called.
        collection_name: Name of the Milvus collection to search.
        client: Milvus client used to execute the search.
        anns_field: Name of the vector field to search against.
        limit: Maximum number of hits requested from Milvus. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        A list with the hits for the query, sorted in descending order of
        ``likes / (dislikes + 1)``. Hits with equal scores keep the order in
        which the search returned them.
    """
    query_embedding = embed_model.embed_query(query)

    # A single query vector is sent, so only the first result set is relevant.
    hits = client.search(
        collection_name=collection_name,
        data=[query_embedding],
        limit=limit,
        search_params={"metric_type": "COSINE", "params": {}},
        anns_field=anns_field,
        filter="content_type == 'A'",
        output_fields=["title", "content", "url", "content_type", "likes", "dislikes"],
    )[0]

    def _like_ratio(hit):
        # Missing or null counters count as 0 so one incomplete row cannot
        # abort the whole sort with a KeyError/TypeError.
        entity = hit["entity"]
        likes = entity.get("likes") or 0
        dislikes = entity.get("dislikes") or 0
        # +1 keeps the ratio defined when there are no dislikes.
        return likes / (dislikes + 1)

    return sorted(hits, key=_like_ratio, reverse=True)
if __name__ == "__main__":
    # Manual smoke test: run a hybrid search against the article collection
    # and print the matched chunk of every hit.
    embedder = OllamaEmbeddings(model="bge-m3")
    milvus = MilvusClient(uri="http://192.168.5.103:19530")
    question = "How to make a good blow job"

    # Only one query is sent, so the first result set holds all the hits.
    article_hits = hybrid_search(
        question, embedder, "t_sur_sex_ed_article_spider", milvus
    )[0]
    for hit in article_hits:
        chunk_text = hit["entity"]["chunk"]
        print(chunk_text)
        print("\n #############################")

    # Disabled alternative: query the Q&A collection through a single vector
    # field and print content, content_type, likes and dislikes of each hit.
    # qa_hits = single_vector_search(
    #     question, embedder, "t_sur_sex_ed_question_answer_spider",
    #     milvus, "content_vector")
    # for hit in qa_hits:
    #     print(hit["entity"]["content"])
    #     print(hit["entity"]["content_type"])
    #     print(hit["entity"]["likes"])
    #     print(hit["entity"]["dislikes"])
    #     print("\n #############################")