| from pathlib import Path |
|
|
| from core import audit, vectorstore |
| from core.chunker import chunk_bidder |
| from core.ocr_pipeline import extract_document |
| from core.schemas import Criterion, Evidence |
|
|
|
|
def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Extract, chunk, and index each of a bidder's files.

    Every file is passed through ``extract_document`` and ``chunk_bidder``.
    The resulting chunks are added to the ``"bidder_chunks"`` collection along
    with one metadata dict per chunk, and a ``"bidder_processed"`` audit event
    is logged for that file. A file that yields no chunks is skipped: nothing
    is indexed and no audit event is written for it.

    Args:
        bidder_id: Identifier recorded in every chunk's metadata and in the
            audit event.
        files: Paths of the documents to process, handled in order.
    """
    store = vectorstore.get_collection("bidder_chunks")
    for path in files:
        doc_chunks = chunk_bidder(extract_document(path), bidder_id, path.name)
        if not doc_chunks:
            continue

        chunk_meta = []
        for piece in doc_chunks:
            entry = {
                "bidder_id": bidder_id,
                "doc_name": piece["doc_name"],
                "page": piece["page"],
                "source_type": piece["source_type"],
            }
            # A missing confidence is stored as the sentinel -1.0, not None.
            # NOTE(review): presumably the store cannot hold None metadata
            # values -- confirm against core.vectorstore.
            confidence = piece["ocr_confidence"]
            entry["ocr_confidence"] = -1.0 if confidence is None else float(confidence)
            chunk_meta.append(entry)

        vectorstore.add_chunks(store, doc_chunks, chunk_meta)
        audit.log(
            "bidder_processed",
            bidder_id=bidder_id,
            doc_name=path.name,
            chunk_count=len(doc_chunks),
        )
|
|
|
|
def gather_evidence(
    bidder_id: str,
    criterion: Criterion,
    k: int = 4,
) -> list[Evidence]:
    """Retrieve the chunks of one bidder that best match a criterion.

    The query text is the criterion title followed by a space and its query
    hints joined with spaces. The ``"bidder_chunks"`` collection is queried
    for ``k`` results restricted to ``bidder_id``, and each result is turned
    into an ``Evidence`` record.

    Args:
        bidder_id: Bidder whose chunks are searched; also set on each record.
        criterion: Supplies ``title`` and ``query_hints`` for the query text.
        k: Number of results requested from the vector store.

    Returns:
        One ``Evidence`` per query result, in the order the store returned
        them. A negative stored OCR confidence is reported as ``None``.
    """
    hint_text = " ".join(criterion.query_hints)
    query_text = f"{criterion.title} {hint_text}"
    store = vectorstore.get_collection("bidder_chunks")
    hits = vectorstore.query(
        store, query_text, k=k, where={"bidder_id": bidder_id}
    )

    def _to_evidence(hit):
        # Build one Evidence record from a single query result.
        info = hit["metadata"]
        stored_conf = info.get("ocr_confidence")
        # Negative values are the "no confidence" sentinel; map them to None.
        confidence = (
            None if stored_conf is None or stored_conf < 0 else stored_conf
        )
        return Evidence(
            bidder_id=bidder_id,
            doc_name=info["doc_name"],
            page=info["page"],
            text=hit["text"],
            source_type=info["source_type"],
            ocr_confidence=confidence,
        )

    return [_to_evidence(hit) for hit in hits]
|
|