File size: 1,813 Bytes
2c5f455
9b14ff1
 
 
2c5f455
 
9b14ff1
 
 
2c5f455
 
 
 
 
 
 
 
 
 
 
 
baaabde
9b14ff1
 
2c5f455
 
 
9b14ff1
 
2c5f455
 
 
 
 
 
9b14ff1
 
 
2c5f455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# vectordb_relank_law.py
import faiss
import numpy as np
import os
from chromadb import PersistentClient
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from retriever.reranker import rerank_documents

# chroma vector config v2
embedding_models = [
    "upskyy/bge-m3-korean", 
    "jhgan/ko-sbert-sts", 
    "BM-K/KoSimCSE-roberta", 
    "BM-K/KoSimCSE-v2-multitask", 
    "snunlp/KR-SBERT-V40K-klueNLI-augSTS", 
    "beomi/KcELECTRA-small-v2022", 
]
# law_db config v2
CHROMA_PATH = os.path.abspath("data/index/law_db")
COLLECTION_NAME = "law_all"
EMBEDDING_MODEL_NAME = embedding_models[0]  # ์‚ฌ์šฉํ•˜๊ณ ์ž ํ•˜๋Š” ๋ชจ๋ธ ์„ ํƒ


# 1. ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ v2
# embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


# 2. ์ž„๋ฒ ๋”ฉ ํ•จ์ˆ˜ ์„ค์ •
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# 3. Chroma ํด๋ผ์ด์–ธํŠธ ๋ฐ ์ปฌ๋ ‰์…˜ ๋กœ๋“œ
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_fn)



# 4. ๊ฒ€์ƒ‰ ํ•จ์ˆ˜
def search_documents(query: str, top_k: int = 5):
    print(f"\n๐Ÿ” ๊ฒ€์ƒ‰์–ด: '{query}'")
    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    for i, (doc, meta, dist) in enumerate(zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    )):
        print(f"\n๐Ÿ“„ ๊ฒฐ๊ณผ {i+1} (์œ ์‚ฌ๋„: {1 - dist:.2f})")
        print(f"๋ฌธ์„œ: {doc[:150]}...")
        print("๋ฉ”ํƒ€๋ฐ์ดํ„ฐ:")
        print(meta)