|
|
|
import faiss
|
|
import numpy as np
|
|
import os
|
|
from chromadb import PersistentClient
|
|
from chromadb.utils import embedding_functions
|
|
from sentence_transformers import SentenceTransformer
|
|
from retriever.reranker import rerank_documents
|
|
|
|
|
|
# Candidate sentence-embedding model identifiers. Only the entry selected by
# EMBEDDING_MODEL_NAME below is used; the rest are kept as alternatives.
# NOTE(review): the names suggest these are all Korean-language checkpoints —
# confirm before relying on that.
embedding_models = [
    "upskyy/bge-m3-korean",
    "jhgan/ko-sbert-sts",
    "BM-K/KoSimCSE-roberta",
    "BM-K/KoSimCSE-v2-multitask",
    "snunlp/KR-SBERT-V40K-klueNLI-augSTS",
    "beomi/KcELECTRA-small-v2022",
]

# Location of the persistent Chroma store. The relative path is resolved
# against the current working directory at import time, so the result depends
# on where the process was started.
CHROMA_PATH = os.path.abspath("data/index/exam_db")

# Name of the Chroma collection that is queried.
COLLECTION_NAME = "exam_all"

# Model identifier used for embedding; change the index to switch candidates.
EMBEDDING_MODEL_NAME = embedding_models[0]
|
|
|
|
|
|
|
|
# Load the selected checkpoint directly through sentence-transformers.
# NOTE(review): nothing visible in this section reads `embedding_model`, and
# the embedding function below presumably loads the same checkpoint again by
# name — confirm whether both instances are really needed.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embedding function handed to Chroma so that text queries are embedded with
# the model named by EMBEDDING_MODEL_NAME.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

# Open the on-disk Chroma store and look up the collection by name.
# get_collection only retrieves an existing collection (it does not create
# one), so the index is expected to have been built beforehand.
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn,
)
|
|
|
|
|
|
def search_documents(query: str, top_k: int = 5) -> None:
    """Query the module-level Chroma ``collection`` and print the best matches.

    The user-facing labels are Korean: "검색어" (search query), "결과"
    (result), "유사도" (similarity), "문서" (document) and "메타데이터"
    (metadata).

    Args:
        query: Free-text search string, passed to Chroma as the single query
            text.
        top_k: Number of results requested from Chroma (default 5).

    Returns:
        None. The matches are only written to stdout.
    """
    # NOTE(review): the two leading emoji were reconstructed from a partially
    # lost byte sequence — confirm the intended glyphs.
    print(f"\n🔍 검색어: '{query}'")

    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Each result field holds one list per query text; a single query was
    # sent, so index 0 carries its hits.
    hits = zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    )
    for rank, (doc, meta, dist) in enumerate(hits, start=1):
        # NOTE(review): `1 - dist` is a similarity score only if the
        # collection was built with cosine distance — confirm the
        # collection's distance metric.
        print(f"\n📄 결과 {rank} (유사도: {1 - dist:.2f})")
        # Only the first 150 characters are shown; the ellipsis is appended
        # unconditionally, even for shorter documents.
        print(f"문서: {doc[:150]}...")
        print("메타데이터:")
        print(meta)