Spaces:
Configuration error
Configuration error
import os | |
import chromadb | |
from openai import OpenAI | |
from dotenv import load_dotenv | |
from backend.config import CHROMA_DB_PATH | |
# ─── ENVIRONMENT ──────────────────────────────────────────────────────────────
# Load variables from a local .env file (if present) into the process
# environment so the os.getenv() lookups below can see them.
load_dotenv()

# NOTE(review): the fallback is a non-secret placeholder — presumably the
# self-hosted endpoint below does not validate API keys; confirm before
# pointing this at a real provider.
API_KEY = os.getenv("OPENAI_API_KEY", "TRANMINHDUONGDEPTRAI")

# The tunnel address is not stable, so allow overriding it through the
# OPENAI_BASE_URL environment variable. An unset or empty variable falls back
# to the original hard-coded endpoint.
BASE_URL = (
    os.getenv("OPENAI_BASE_URL")
    or "https://glowing-workable-arachnid.ngrok-free.app/v1"
)

# Shared OpenAI-compatible client used by the embedding helper in this module.
openai_client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
# ─── CHROMA SETUP ─────────────────────────────────────────────────────────────
# On-disk Chroma client rooted at the path configured in backend.config.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Reuse the "legal_docs" collection when it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(name="legal_docs")
# ─── EMBEDDING FUNCTION ───────────────────────────────────────────────────────
def embed_query(query_text):
    """Return the embedding vector for a single query string.

    The text is sent as a one-element batch to the embeddings endpoint of the
    module-level ``openai_client`` using the ``Qwen3-Embedding-0.6B`` model.

    Args:
        query_text: The text to embed.

    Returns:
        The ``embedding`` attribute of the first (and only) item in the
        response's ``data`` list.
    """
    response = openai_client.embeddings.create(
        input=[query_text],
        model="Qwen3-Embedding-0.6B",
    )
    # One input was submitted, so the matching result sits at index 0.
    first_item = response.data[0]
    return first_item.embedding
# ─── TOP-K RETRIEVAL ──────────────────────────────────────────────────────────
def query_top_k(query_text, k=5):
    """Fetch the ``k`` nearest stored chunks for a query.

    The query is embedded with ``embed_query`` and the module-level
    ``collection`` is searched with that single embedding.

    Args:
        query_text: The text to search for.
        k: Number of results requested from the collection (default 5).

    Returns:
        A list of ``(chunk, distance)`` tuples, paired in the order the
        collection returned them. Empty when the collection reports no
        documents or no distances.
    """
    embedding = embed_query(query_text)
    hits = collection.query(query_embeddings=[embedding], n_results=k)

    # 'documents' and 'distances' are lists of lists with one inner list per
    # query; only one query was submitted, so the first inner list is used.
    chunks = []
    if hits['documents']:
        chunks = hits['documents'][0]

    distances = []
    if hits['distances']:
        distances = hits['distances'][0]

    return list(zip(chunks, distances))
# Example usage: | |
if __name__ == "__main__":
    # Manual smoke test: run one sample query and print each hit with its
    # distance, followed by a separator line.
    sample_question = "An interesting fact about the humming bird"
    separator = '-' * 40
    for text, distance in query_top_k(sample_question, k=3):
        print(f"Score: {distance:.4f}\n{text}\n{separator}")