|
|
|
|
|
|
|
|
"""Seed the vector store with initial data""" |
|
|
|
|
|
import sys |
|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Put the parent of this script's directory at the front of the import path.
# NOTE(review): presumably this is the project root, so that the `vector` and
# `app` packages imported below resolve when the script is run directly.
_project_root = Path(__file__).parent.parent
sys.path.insert(0, str(_project_root))
|
|
|
|
|
from vector.store import VectorStore |
|
|
from vector.embeddings import get_embedding_model |
|
|
from app.config import DATA_DIR |
|
|
|
|
|
def _build_documents(companies):
    """Flatten company records into parallel ``(texts, metadata)`` lists.

    For each company this emits one "description" document, then one "pain"
    document per entry in ``pains`` and one "note" document per entry in
    ``notes``. ``texts[i]`` and ``metadata[i]`` always describe the same
    document.

    Raises:
        KeyError: if a record lacks ``name``, ``id``, ``industry`` or ``size``.
    """
    texts = []
    metadata = []

    def add(company_id, doc_type, text):
        # Keep both lists in lockstep so index i refers to the same document.
        texts.append(text)
        metadata.append({
            "company_id": company_id,
            "type": doc_type,
            "text": text,
        })

    for company in companies:
        name = company["name"]
        company_id = company["id"]

        add(
            company_id,
            "description",
            f"{name} is a {company['industry']} company with {company['size']} employees",
        )

        # `or []` also covers an explicit JSON null, not just a missing key.
        for pain in company.get("pains") or []:
            add(company_id, "pain", f"{name} challenge: {pain}")

        for note in company.get("notes") or []:
            add(company_id, "note", f"{name}: {note}")

    return texts, metadata


def seed_vectorstore():
    """Build the initial vector index from ``DATA_DIR / "companies.json"``.

    Steps, in order:
      1. Create the ``VectorStore`` and fetch the embedding model.
      2. Load the companies JSON file; if it does not exist, print an error
         and return without touching the store.
      3. Turn every company into description / pain / note documents.
      4. Encode all document texts and add them, with their metadata, to the
         store.
      5. Run a small retrieval smoke test for the first company and print the
         top results.

    Progress is reported with ``print``; nothing is returned.

    NOTE(review): this function never calls an explicit save; persistence is
    presumably handled inside ``store.add`` (the script reports
    ``store.index_path`` afterwards) — confirm against ``VectorStore``.
    """
    print("Initializing vector store...")
    store = VectorStore()
    model = get_embedding_model()

    companies_file = DATA_DIR / "companies.json"
    if not companies_file.exists():
        print(f"Error: {companies_file} not found")
        return

    # JSON is UTF-8 by specification; do not rely on the platform's default
    # locale encoding, which breaks on non-ASCII company data.
    with open(companies_file, encoding="utf-8") as f:
        companies = json.load(f)

    print(f"Loading {len(companies)} companies...")
    texts, metadata = _build_documents(companies)

    print(f"Encoding {len(texts)} documents...")
    embeddings = model.encode(texts)

    print("Adding to index...")
    store.add(embeddings, metadata)

    print(f"Vector store initialized with {len(texts)} documents")
    print(f"Index saved to: {store.index_path}")

    print("\nTesting retrieval...")
    # Imported here, as in the original script, so the retriever module is
    # only loaded once the store has been populated.
    from vector.retriever import Retriever

    retriever = Retriever()

    # Smoke test: query with the first company's id only (no-op when empty).
    for company in companies[:1]:
        results = retriever.retrieve(company["id"], k=3)
        print(f"\nTop results for {company['name']}:")
        for r in results:
            print(f"  - {r['text'][:80]}... (score: {r.get('score', 0):.3f})")
|
|
|
|
|
# Run the seeding routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    seed_vectorstore()