Making the frontend functional

Browse files
- api.py: +5 −10
- vector_db.py: +71 −3
api.py
CHANGED
|
@@ -8,7 +8,7 @@ from fastapi import FastAPI, HTTPException
|
|
| 8 |
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
| 11 |
-
from vector_db import
|
| 12 |
from retriever.retriever import HybridRetriever
|
| 13 |
from retriever.generator import RAGGenerator
|
| 14 |
from retriever.processor import ChunkProcessor
|
|
@@ -19,9 +19,6 @@ from models.qwen_2_5 import Qwen2_5
|
|
| 19 |
from models.deepseek_v3 import DeepSeek_V3
|
| 20 |
from models.tiny_aya import TinyAya
|
| 21 |
|
| 22 |
-
# Reuse the same query-only helper for loading BM25 corpus from Pinecone metadata.
|
| 23 |
-
from query_only import _load_chunks_from_pinecone
|
| 24 |
-
|
| 25 |
|
| 26 |
class PredictRequest(BaseModel):
|
| 27 |
query: str = Field(..., min_length=1, description="User query text")
|
|
@@ -102,20 +99,18 @@ def startup_event() -> None:
|
|
| 102 |
if not hf_token:
|
| 103 |
raise RuntimeError("HF_TOKEN not found in environment variables")
|
| 104 |
|
| 105 |
-
index_name = "arxiv-
|
| 106 |
embed_model_name = "all-MiniLM-L6-v2"
|
| 107 |
|
| 108 |
startup_start = time.perf_counter()
|
| 109 |
|
| 110 |
-
index =
|
| 111 |
api_key=pinecone_api_key,
|
| 112 |
-
index_name=index_name
|
| 113 |
-
dimension=384,
|
| 114 |
-
metric="cosine",
|
| 115 |
)
|
| 116 |
|
| 117 |
chunks_start = time.perf_counter()
|
| 118 |
-
final_chunks =
|
| 119 |
chunk_load_time = time.perf_counter() - chunks_start
|
| 120 |
|
| 121 |
if not final_chunks:
|
|
|
|
| 8 |
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
| 11 |
+
from vector_db import get_index_by_name, load_chunks_from_pinecone
|
| 12 |
from retriever.retriever import HybridRetriever
|
| 13 |
from retriever.generator import RAGGenerator
|
| 14 |
from retriever.processor import ChunkProcessor
|
|
|
|
| 19 |
from models.deepseek_v3 import DeepSeek_V3
|
| 20 |
from models.tiny_aya import TinyAya
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
class PredictRequest(BaseModel):
|
| 24 |
query: str = Field(..., min_length=1, description="User query text")
|
|
|
|
| 99 |
if not hf_token:
|
| 100 |
raise RuntimeError("HF_TOKEN not found in environment variables")
|
| 101 |
|
| 102 |
+
index_name = "arxiv-tournament-recursive"
|
| 103 |
embed_model_name = "all-MiniLM-L6-v2"
|
| 104 |
|
| 105 |
startup_start = time.perf_counter()
|
| 106 |
|
| 107 |
+
index = get_index_by_name(
|
| 108 |
api_key=pinecone_api_key,
|
| 109 |
+
index_name=index_name
|
|
|
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
chunks_start = time.perf_counter()
|
| 113 |
+
final_chunks = load_chunks_from_pinecone(index)
|
| 114 |
chunk_load_time = time.perf_counter() - chunks_start
|
| 115 |
|
| 116 |
if not final_chunks:
|
vector_db.py
CHANGED
|
@@ -6,6 +6,21 @@ def slugify_technique(name):
|
|
| 6 |
"""Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
|
| 7 |
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
|
| 10 |
"""
|
| 11 |
Creates/Returns an index specifically for a technique.
|
|
@@ -29,8 +44,8 @@ def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cos
|
|
| 29 |
while not pc.describe_index(full_index_name).status['ready']:
|
| 30 |
time.sleep(1)
|
| 31 |
|
| 32 |
-
|
| 33 |
-
return
|
| 34 |
|
| 35 |
def refresh_pinecone_index(index, final_chunks, batch_size=100):
|
| 36 |
"""
|
|
@@ -91,4 +106,57 @@ def prepare_vectors_for_upsert(final_chunks):
|
|
| 91 |
def upsert_to_pinecone(index, chunks, batch_size=100):
|
| 92 |
for i in range(0, len(chunks), batch_size):
|
| 93 |
batch = chunks[i : i + batch_size]
|
| 94 |
-
index.upsert(vectors=batch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
|
| 7 |
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
| 8 |
|
| 9 |
+
def get_index_by_name(api_key: str, index_name: str):
    """
    Connect to an existing Pinecone index by its full name.

    Intended for the API/production path, where the index name is already
    known and nothing should be created on the fly.

    Raises:
        ValueError: If no index with that name exists in the project.
    """
    client = Pinecone(api_key=api_key)

    # Validate up front so a missing index surfaces as a clear error
    # instead of a 404 crash from the data-plane call.
    known_names = {idx.name for idx in client.list_indexes()}
    if index_name not in known_names:
        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")

    print(f" Connecting to Index: {index_name}")
    return client.Index(index_name)
|
| 23 |
+
|
| 24 |
def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
|
| 25 |
"""
|
| 26 |
Creates/Returns an index specifically for a technique.
|
|
|
|
| 44 |
while not pc.describe_index(full_index_name).status['ready']:
|
| 45 |
time.sleep(1)
|
| 46 |
|
| 47 |
+
# Use our new helper to return the index object
|
| 48 |
+
return get_index_by_name(api_key, full_index_name)
|
| 49 |
|
| 50 |
def refresh_pinecone_index(index, final_chunks, batch_size=100):
|
| 51 |
"""
|
|
|
|
| 106 |
def upsert_to_pinecone(index, chunks, batch_size=100):
    """Upsert *chunks* into *index*, batch_size vectors per request."""
    total = len(chunks)
    start = 0
    while start < total:
        index.upsert(vectors=chunks[start : start + batch_size])
        start += batch_size
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, object]]:
    """
    Scan the whole Pinecone index and rebuild the chunk corpus for BM25
    from the stored metadata.

    Args:
        index: A connected Pinecone index (must support
            ``describe_index_stats``, ``list`` and ``fetch``).
        batch_size: Page size used when listing vector IDs per namespace.

    Returns:
        One dict per vector with non-empty ``text`` metadata, shaped as
        ``{"id": ..., "metadata": {"text", "title", "url", "chunk_index"}}``.
    """
    # FIX: the original annotated the return as list[dict[str, any]] — the
    # lowercase builtin `any`, not a type — and mixed in typing-style
    # List/Dict/Any below; unified on import-free builtin generics.
    stats = index.describe_index_stats()
    namespaces = list(stats.get('namespaces', {}).keys())
    # If no namespaces are explicitly named, Pinecone uses an empty string
    # for the default one.
    if not namespaces:
        namespaces = [""]

    all_chunks: list[dict[str, object]] = []
    seen_ids = set()

    print(f"Loading vectors for BM25 from namespaces: {namespaces}")

    for ns in namespaces:
        # Pinecone's list() generator returns batches of IDs.
        for id_batch in index.list(namespace=ns, limit=batch_size):
            if not id_batch:
                continue

            # Fetch the actual content (metadata) for this batch of IDs.
            fetched = index.fetch(ids=id_batch, namespace=ns)
            vectors = getattr(fetched, "vectors", {})

            for vector_id, vector_data in vectors.items():
                # Guard against the same ID appearing on multiple pages.
                if vector_id in seen_ids:
                    continue
                seen_ids.add(vector_id)

                # Safely extract metadata; fetch responses may carry
                # metadata=None, so normalize to an empty dict.
                metadata = getattr(vector_data, "metadata", {}) or {}
                text = metadata.get("text")

                # Vectors without usable text contribute nothing to BM25.
                if not text:
                    continue

                all_chunks.append({
                    "id": vector_id,
                    "metadata": {
                        "text": text,
                        "title": metadata.get("title", "Untitled"),
                        "url": metadata.get("url", ""),
                        "chunk_index": metadata.get("chunk_index", 0),
                    },
                })

        print(f" Finished namespace: '{ns if ns else 'default'}'")

    print(f"Total chunks loaded into memory: {len(all_chunks)}")
    return all_chunks
|