TenderIQ / core /bidder_processor.py
JaydeepR's picture
Step 8: vector store and bidder processor — ChromaDB indexing and retrieval
a337229
from pathlib import Path
from core import audit, vectorstore
from core.chunker import chunk_bidder
from core.ocr_pipeline import extract_document
from core.schemas import Criterion, Evidence
def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Extract, chunk and index every document submitted by one bidder.

    Each file is run through ``extract_document`` and ``chunk_bidder``; the
    resulting chunks are added to the ``"bidder_chunks"`` collection together
    with one metadata dict per chunk, and a ``"bidder_processed"`` audit
    entry is written for the file. Files that yield no chunks are skipped
    without indexing or auditing.

    Args:
        bidder_id: Identifier stored in every chunk's metadata.
        files: Paths of the bidder's documents, processed in order.
    """
    store = vectorstore.get_collection("bidder_chunks")

    for path in files:
        doc_chunks = chunk_bidder(extract_document(path), bidder_id, path.name)
        if doc_chunks:
            chunk_meta = []
            for piece in doc_chunks:
                entry = {
                    "bidder_id": bidder_id,
                    "doc_name": piece["doc_name"],
                    "page": piece["page"],
                    "source_type": piece["source_type"],
                }
                # A missing confidence is stored as the sentinel -1.0
                # (presumably because the store's metadata cannot hold
                # None -- confirm against the vectorstore wrapper).
                raw_confidence = piece["ocr_confidence"]
                if raw_confidence is None:
                    entry["ocr_confidence"] = -1.0
                else:
                    entry["ocr_confidence"] = float(raw_confidence)
                chunk_meta.append(entry)

            vectorstore.add_chunks(store, doc_chunks, chunk_meta)
            audit.log(
                "bidder_processed",
                bidder_id=bidder_id,
                doc_name=path.name,
                chunk_count=len(doc_chunks),
            )
def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
    """Retrieve indexed bidder chunks relevant to a criterion as ``Evidence``.

    The query text is the criterion title followed by its query hints joined
    with spaces. The ``"bidder_chunks"`` collection is queried for ``k``
    results, filtered to metadata whose ``bidder_id`` equals *bidder_id*.

    Args:
        bidder_id: Bidder whose chunks are searched.
        criterion: Supplies ``title`` and ``query_hints`` for the query text.
        k: Value passed as ``k`` to ``vectorstore.query``.

    Returns:
        One ``Evidence`` per query result, in the order returned by the
        query. A negative stored ``ocr_confidence`` is reported as ``None``.
    """
    title = criterion.title
    hints = " ".join(criterion.query_hints)
    query_text = f"{title} {hints}"

    store = vectorstore.get_collection("bidder_chunks")
    hits = vectorstore.query(
        store, query_text, k=k, where={"bidder_id": bidder_id}
    )

    def _as_evidence(hit) -> Evidence:
        # Convert a single query result into an Evidence record.
        info = hit["metadata"]
        confidence = info.get("ocr_confidence")
        # Negative values are the "no confidence available" sentinel.
        confidence = (
            None if confidence is not None and confidence < 0 else confidence
        )
        return Evidence(
            bidder_id=bidder_id,
            doc_name=info["doc_name"],
            page=info["page"],
            text=hit["text"],
            source_type=info["source_type"],
            ocr_confidence=confidence,
        )

    return [_as_evidence(hit) for hit in hits]