File size: 3,485 Bytes
13fbd2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from pymilvus import MilvusClient, AnnSearchRequest, RRFRanker
from langchain_community.embeddings.ollama import OllamaEmbeddings
from pymilvus import WeightedRanker

# Module-level Reciprocal Rank Fusion ranker (k=10). Nothing in the code
# visible in this file reads it: hybrid_search() builds its own WeightedRanker.
reranker = RRFRanker(k=10)

# NOTE(review): the bare string below is never assigned or executed. It looks
# like an earlier single-vector search experiment kept for reference -- confirm
# whether it is still needed.
"""
embed_model = OllamaEmbeddings(model="bge-m3")
client = MilvusClient(uri="http://192.168.5.103:19530")


query = "Can I take pills?"
query_embedding = embed_model.embed_query(query)

# single vector search
res = client.search(
    collection_name="t_sur_sex_ed_article_spider",
    data=[query_embedding],
    limit=2,
    search_params={"metric_type": "COSINE", "params": {}},
    anns_field="chunk_vector",
    output_fields=["title", "chunk", "link", "category"]
)
"""

# hybrid search
def hybrid_search(query, embed_model: OllamaEmbeddings, collection_name, client: MilvusClient):
    """Run a weighted multi-field vector search for ``query``.

    The query is embedded once and the same vector is searched against the
    ``title_vector``, ``chunk_vector`` and ``tags`` fields. The three
    candidate lists are then fused by a ``WeightedRanker``.

    Args:
        query: Text passed to ``embed_model.embed_query``.
        embed_model: Embedding model used to vectorise the query.
        collection_name: Name of the Milvus collection to search.
        client: Milvus client used to issue the hybrid search.

    Returns:
        Whatever ``client.hybrid_search`` returns, unmodified. At most 3
        fused hits are requested, each carrying ``title``, ``chunk`` and
        ``link``.
    """
    vector = embed_model.embed_query(query)

    # One ANN request per searched field. Every request gets its own fresh
    # data list and param dict so nothing is shared between them.
    requests = [
        AnnSearchRequest(
            data=[vector],
            anns_field=field_name,
            param={
                "metric_type": "COSINE",
                "params": {"nprobe": 10, "level": 3},
            },
            limit=3,
        )
        for field_name in ("title_vector", "chunk_vector", "tags")
    ]

    # The weights are given in the same order as the requests above:
    # title 0.3, chunk 0.6, tags 0.1.
    return client.hybrid_search(
        collection_name=collection_name,
        ranker=WeightedRanker(0.3, 0.6, 0.1),
        reqs=requests,
        limit=3,
        output_fields=["title", "chunk", "link"],
    )
    

def _hit_score(hit):
    """Return the like/dislike ratio used to rank a single search hit."""
    entity = hit["entity"]
    # A null field value arrives as None; count it as zero votes instead of
    # letting the arithmetic below raise TypeError.
    likes = entity["likes"] or 0
    dislikes = entity["dislikes"] or 0
    # The +1 keeps the divisor non-zero for hits without dislikes. Clamping
    # stops a bad negative count (e.g. -1) from producing a zero divisor.
    return likes / (max(dislikes, 0) + 1)


def single_vector_search(query, embed_model: OllamaEmbeddings, collection_name, client: MilvusClient, anns_field):
    """Search one vector field and rank the hits by their like/dislike ratio.

    Args:
        query: Text passed to ``embed_model.embed_query``.
        embed_model: Embedding model used to vectorise the query.
        collection_name: Name of the Milvus collection to search.
        client: Milvus client used to issue the search.
        anns_field: Name of the vector field to search against.

    Returns:
        A new list of up to 20 hits restricted to ``content_type == 'A'``,
        sorted by ``likes / (dislikes + 1)`` in descending order. Hits with
        equal scores keep the order the search returned them in. An empty
        list is returned when the search yields no result set.

    Raises:
        KeyError: If a hit lacks ``entity``, ``likes`` or ``dislikes``.
    """
    query_embedding = embed_model.embed_query(query)
    response = client.search(
        collection_name=collection_name,
        data=[query_embedding],
        limit=20,
        search_params={"metric_type": "COSINE", "params": {}},
        anns_field=anns_field,
        filter="content_type == 'A'",
        output_fields=["title", "content", "url", "content_type", "likes", "dislikes"],
    )
    # Only one query vector is sent, so only the first result set matters.
    # An empty response becomes "no hits" rather than an IndexError.
    hits = response[0] if response else []
    return sorted(hits, key=_hit_score, reverse=True)
    

if __name__ == "__main__":
    # Manual smoke test: embed one question, run the hybrid search against
    # the article collection and print the chunk text of every fused hit.
    embedder = OllamaEmbeddings(model="bge-m3")
    milvus = MilvusClient(uri="http://192.168.5.103:19530")

    question = "How to make a good blow job"
    result_sets = hybrid_search(question, embedder, "t_sur_sex_ed_article_spider", milvus)
    # A single query was issued, so only the first result set is relevant.
    article_hits = result_sets[0]
    # single_search_res = single_vector_search(question, embedder, "t_sur_sex_ed_question_answer_spider", milvus, "content_vector")
    for hit in article_hits:
        print(hit["entity"]["chunk"])
        print("\n #############################")

    # Disabled: dump of the single-vector Q&A search results.
    # print("===============================\n\n")
    # for hit in single_search_res:
    #     print(hit["entity"]["content"])
    #     print(hit["entity"]["content_type"])
    #     print(hit["entity"]["likes"])
    #     print(hit["entity"]["dislikes"])
    #     print("\n #############################")