Spaces:
Runtime error
Runtime error
| import asyncio | |
| import numpy as np | |
| from app.core.database import SessionLocal | |
| from app.models.website import WebsiteContent | |
| from app.services.vector_db import VectorDB | |
| from app.services.vector_operations import VectorOperations | |
async def populate_vectordb(website_id: int = 8) -> None:
    """Rebuild the vector store for one website from its stored page content.

    Loads every ``WebsiteContent`` row for ``website_id``, splits each row's
    text into chunks, requests one embedding per chunk, and saves the
    resulting vectors together with per-chunk metadata via ``VectorDB``.

    Args:
        website_id: ID of the website whose content is indexed. Defaults to
            8, the value this script was originally hard-coded to.

    Returns:
        None. Progress is reported on stdout. The function returns early,
        without touching the vector store, when the website has no content
        rows or when no chunk passes the minimum-length filter.
    """
    print(f"Populating VectorDB for Website {website_id}...")

    texts = []
    metadata = []

    db = SessionLocal()
    try:
        # Get content
        contents = (
            db.query(WebsiteContent)
            .filter(WebsiteContent.website_id == website_id)
            .all()
        )
        print(f"Found {len(contents)} content records")

        if not contents:
            print("No content found!")
            return

        # Prepare data. Done while the session is still open so every ORM
        # attribute read below happens on an attached instance.
        for record in contents:
            # A row without text has nothing to chunk; skip it rather than
            # hand None/"" to the chunker.
            if not record.content:
                continue

            # Chunk text
            chunks = VectorOperations.chunk_text(record.content, max_tokens=500)
            for i, chunk in enumerate(chunks):
                # Drop fragments of 50 characters or fewer (after stripping).
                if len(chunk.strip()) > 50:
                    texts.append(chunk)
                    metadata.append({
                        "page_url": record.page_url,
                        "content": chunk,
                        "chunk_id": f"{record.id}_{i}",
                    })
    finally:
        # Always release the session, including on early return or error;
        # the original never closed it.
        db.close()

    print(f"Created {len(texts)} chunks")

    if not texts:
        return

    # Generate embeddings (one awaited request per chunk, in order, so that
    # embeddings[i] lines up with metadata[i]).
    print("Generating embeddings...")
    embeddings = []
    for i, text in enumerate(texts):
        if i % 10 == 0:
            print(f"Processing chunk {i}/{len(texts)}")
        embedding = await VectorOperations.get_embedding(text)
        embeddings.append(embedding)

    # Save to VectorDB
    print("Saving to VectorDB...")
    vector_db = VectorDB()
    embeddings_array = np.array(embeddings, dtype=np.float32)
    vector_db.add_vectors(embeddings_array, metadata)
    vector_db.save(website_id)

    print("✓ VectorDB populated successfully!")
if __name__ == "__main__":
    # Script entry point: build the coroutine, then drive it to completion
    # on a fresh event loop.
    job = populate_vectordb()
    asyncio.run(job)