ramailkk committed on
Commit
15c009d
·
1 Parent(s): 6cb3d7c

making the frontend functional

Browse files
Files changed (2) hide show
  1. api.py +5 -10
  2. vector_db.py +71 -3
api.py CHANGED
@@ -8,7 +8,7 @@ from fastapi import FastAPI, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, Field
10
 
11
- from vector_db import get_pinecone_index
12
  from retriever.retriever import HybridRetriever
13
  from retriever.generator import RAGGenerator
14
  from retriever.processor import ChunkProcessor
@@ -19,9 +19,6 @@ from models.qwen_2_5 import Qwen2_5
19
  from models.deepseek_v3 import DeepSeek_V3
20
  from models.tiny_aya import TinyAya
21
 
22
- # Reuse the same query-only helper for loading BM25 corpus from Pinecone metadata.
23
- from query_only import _load_chunks_from_pinecone
24
-
25
 
26
  class PredictRequest(BaseModel):
27
  query: str = Field(..., min_length=1, description="User query text")
@@ -102,20 +99,18 @@ def startup_event() -> None:
102
  if not hf_token:
103
  raise RuntimeError("HF_TOKEN not found in environment variables")
104
 
105
- index_name = "arxiv-index"
106
  embed_model_name = "all-MiniLM-L6-v2"
107
 
108
  startup_start = time.perf_counter()
109
 
110
- index = get_pinecone_index(
111
  api_key=pinecone_api_key,
112
- index_name=index_name,
113
- dimension=384,
114
- metric="cosine",
115
  )
116
 
117
  chunks_start = time.perf_counter()
118
- final_chunks = _load_chunks_from_pinecone(index)
119
  chunk_load_time = time.perf_counter() - chunks_start
120
 
121
  if not final_chunks:
 
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, Field
10
 
11
+ from vector_db import get_index_by_name, load_chunks_from_pinecone
12
  from retriever.retriever import HybridRetriever
13
  from retriever.generator import RAGGenerator
14
  from retriever.processor import ChunkProcessor
 
19
  from models.deepseek_v3 import DeepSeek_V3
20
  from models.tiny_aya import TinyAya
21
 
 
 
 
22
 
23
  class PredictRequest(BaseModel):
24
  query: str = Field(..., min_length=1, description="User query text")
 
99
  if not hf_token:
100
  raise RuntimeError("HF_TOKEN not found in environment variables")
101
 
102
+ index_name = "arxiv-tournament-recursive"
103
  embed_model_name = "all-MiniLM-L6-v2"
104
 
105
  startup_start = time.perf_counter()
106
 
107
+ index = get_index_by_name(
108
  api_key=pinecone_api_key,
109
+ index_name=index_name
 
 
110
  )
111
 
112
  chunks_start = time.perf_counter()
113
+ final_chunks = load_chunks_from_pinecone(index)
114
  chunk_load_time = time.perf_counter() - chunks_start
115
 
116
  if not final_chunks:
vector_db.py CHANGED
@@ -6,6 +6,21 @@ def slugify_technique(name):
6
  """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
7
  return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
10
  """
11
  Creates/Returns an index specifically for a technique.
@@ -29,8 +44,8 @@ def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cos
29
  while not pc.describe_index(full_index_name).status['ready']:
30
  time.sleep(1)
31
 
32
- print(f" Using Index: {full_index_name}")
33
- return pc.Index(full_index_name)
34
 
35
  def refresh_pinecone_index(index, final_chunks, batch_size=100):
36
  """
@@ -91,4 +106,57 @@ def prepare_vectors_for_upsert(final_chunks):
91
  def upsert_to_pinecone(index, chunks, batch_size=100):
92
  for i in range(0, len(chunks), batch_size):
93
  batch = chunks[i : i + batch_size]
94
- index.upsert(vectors=batch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
7
  return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
8
 
9
def get_index_by_name(api_key: str, index_name: str):
    """
    Connect to an existing Pinecone index by its full name.

    Intended for the API/production path where the exact index name is
    already known. Verifies the index exists up front so the caller gets
    a clear ValueError instead of a raw Pinecone 404.
    """
    client = Pinecone(api_key=api_key)

    # Guard against a typo'd or missing index before connecting.
    known_names = {idx.name for idx in client.list_indexes()}
    if index_name not in known_names:
        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")

    print(f" Connecting to Index: {index_name}")
    return client.Index(index_name)
24
  def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
25
  """
26
  Creates/Returns an index specifically for a technique.
 
44
  while not pc.describe_index(full_index_name).status['ready']:
45
  time.sleep(1)
46
 
47
+ # Use our new helper to return the index object
48
+ return get_index_by_name(api_key, full_index_name)
49
 
50
  def refresh_pinecone_index(index, final_chunks, batch_size=100):
51
  """
 
106
def upsert_to_pinecone(index, chunks, batch_size=100):
    """Upsert `chunks` into the Pinecone index in slices of `batch_size`."""
    total = len(chunks)
    start = 0
    while start < total:
        # Slicing past the end is safe; the final batch may be short.
        index.upsert(vectors=chunks[start : start + batch_size])
        start += batch_size
110
+
111
+
112
def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict]:
    """
    Scan a Pinecone index and rebuild the chunk list needed for the
    in-memory BM25 corpus.

    Pages through every namespace with ``index.list()``, fetches metadata
    for each batch of vector IDs, and keeps only vectors that carry a
    non-empty ``"text"`` metadata field.

    Args:
        index: Pinecone index handle exposing ``describe_index_stats``,
            ``list`` and ``fetch``.
        batch_size: How many vector IDs to request per ``list()`` page.

    Returns:
        One dict per usable vector, shaped as
        ``{"id": ..., "metadata": {"text", "title", "url", "chunk_index"}}``.
    """
    stats = index.describe_index_stats()
    namespaces = list(stats.get('namespaces', {}).keys())
    # If no namespaces are explicitly named, Pinecone uses an empty string
    # for the default namespace.
    if not namespaces:
        namespaces = [""]

    all_chunks: list[dict] = []
    seen_ids = set()  # guard against the same ID appearing in multiple pages

    print(f"Loading vectors for BM25 from namespaces: {namespaces}")

    for ns in namespaces:
        # Pinecone's list() generator returns batches of IDs
        for id_batch in index.list(namespace=ns, limit=batch_size):
            if not id_batch:
                continue

            # Fetch the actual content (metadata) for this batch of IDs
            fetched = index.fetch(ids=id_batch, namespace=ns)
            vectors = getattr(fetched, "vectors", {})

            for vector_id, vector_data in vectors.items():
                if vector_id in seen_ids:
                    continue
                seen_ids.add(vector_id)

                # Safely extract metadata; the attribute may exist but be
                # None on real fetch responses, so fall back to {} either way.
                metadata = getattr(vector_data, "metadata", None) or {}
                text = metadata.get("text")

                # Vectors without usable text can't feed BM25 — skip them.
                if not text:
                    continue

                all_chunks.append({
                    "id": vector_id,
                    "metadata": {
                        "text": text,
                        "title": metadata.get("title", "Untitled"),
                        "url": metadata.get("url", ""),
                        "chunk_index": metadata.get("chunk_index", 0),
                    },
                })

        print(f" Finished namespace: '{ns if ns else 'default'}'")

    print(f"Total chunks loaded into memory: {len(all_chunks)}")
    return all_chunks