ramailkk committed on
Commit
15c009d
·
1 Parent(s): 6cb3d7c

making the frontend functional

Browse files
Files changed (2) hide show
  1. api.py +5 -10
  2. vector_db.py +71 -3
api.py CHANGED
@@ -8,7 +8,7 @@ from fastapi import FastAPI, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, Field
10
 
11
- from vector_db import get_pinecone_index
12
  from retriever.retriever import HybridRetriever
13
  from retriever.generator import RAGGenerator
14
  from retriever.processor import ChunkProcessor
@@ -19,9 +19,6 @@ from models.qwen_2_5 import Qwen2_5
19
  from models.deepseek_v3 import DeepSeek_V3
20
  from models.tiny_aya import TinyAya
21
 
22
- # Reuse the same query-only helper for loading BM25 corpus from Pinecone metadata.
23
- from query_only import _load_chunks_from_pinecone
24
-
25
 
26
  class PredictRequest(BaseModel):
27
  query: str = Field(..., min_length=1, description="User query text")
@@ -102,20 +99,18 @@ def startup_event() -> None:
102
  if not hf_token:
103
  raise RuntimeError("HF_TOKEN not found in environment variables")
104
 
105
- index_name = "arxiv-index"
106
  embed_model_name = "all-MiniLM-L6-v2"
107
 
108
  startup_start = time.perf_counter()
109
 
110
- index = get_pinecone_index(
111
  api_key=pinecone_api_key,
112
- index_name=index_name,
113
- dimension=384,
114
- metric="cosine",
115
  )
116
 
117
  chunks_start = time.perf_counter()
118
- final_chunks = _load_chunks_from_pinecone(index)
119
  chunk_load_time = time.perf_counter() - chunks_start
120
 
121
  if not final_chunks:
 
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, Field
10
 
11
+ from vector_db import get_index_by_name, load_chunks_from_pinecone
12
  from retriever.retriever import HybridRetriever
13
  from retriever.generator import RAGGenerator
14
  from retriever.processor import ChunkProcessor
 
19
  from models.deepseek_v3 import DeepSeek_V3
20
  from models.tiny_aya import TinyAya
21
 
 
 
 
22
 
23
  class PredictRequest(BaseModel):
24
  query: str = Field(..., min_length=1, description="User query text")
 
99
  if not hf_token:
100
  raise RuntimeError("HF_TOKEN not found in environment variables")
101
 
102
+ index_name = "arxiv-tournament-recursive"
103
  embed_model_name = "all-MiniLM-L6-v2"
104
 
105
  startup_start = time.perf_counter()
106
 
107
+ index = get_index_by_name(
108
  api_key=pinecone_api_key,
109
+ index_name=index_name
 
 
110
  )
111
 
112
  chunks_start = time.perf_counter()
113
+ final_chunks = load_chunks_from_pinecone(index)
114
  chunk_load_time = time.perf_counter() - chunks_start
115
 
116
  if not final_chunks:
vector_db.py CHANGED
@@ -6,6 +6,21 @@ def slugify_technique(name):
6
  """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
7
  return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
10
  """
11
  Creates/Returns an index specifically for a technique.
@@ -29,8 +44,8 @@ def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cos
29
  while not pc.describe_index(full_index_name).status['ready']:
30
  time.sleep(1)
31
 
32
- print(f" Using Index: {full_index_name}")
33
- return pc.Index(full_index_name)
34
 
35
  def refresh_pinecone_index(index, final_chunks, batch_size=100):
36
  """
@@ -91,4 +106,57 @@ def prepare_vectors_for_upsert(final_chunks):
91
  def upsert_to_pinecone(index, chunks, batch_size=100):
92
  for i in range(0, len(chunks), batch_size):
93
  batch = chunks[i : i + batch_size]
94
- index.upsert(vectors=batch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
7
  return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
8
 
9
def get_index_by_name(api_key: str, index_name: str):
    """
    Connect to an existing Pinecone index by its full name.

    Intended for the API/production path where the exact index name is
    already known. Verifies the index exists up front so the caller gets
    a clear ValueError instead of a raw Pinecone 404.
    """
    client = Pinecone(api_key=api_key)

    # Guard against a typo'd or missing index before connecting.
    known_names = {idx.name for idx in client.list_indexes()}
    if index_name not in known_names:
        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")

    print(f" Connecting to Index: {index_name}")
    return client.Index(index_name)
24
  def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
25
  """
26
  Creates/Returns an index specifically for a technique.
 
44
  while not pc.describe_index(full_index_name).status['ready']:
45
  time.sleep(1)
46
 
47
+ # Use our new helper to return the index object
48
+ return get_index_by_name(api_key, full_index_name)
49
 
50
  def refresh_pinecone_index(index, final_chunks, batch_size=100):
51
  """
 
106
def upsert_to_pinecone(index, chunks, batch_size=100):
    """Upsert `chunks` into the Pinecone index in slices of `batch_size`."""
    total = len(chunks)
    start = 0
    while start < total:
        # Slicing past the end is safe; the final batch may be short.
        index.upsert(vectors=chunks[start : start + batch_size])
        start += batch_size
110
+
111
+
112
def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict]:
    """
    Scan a Pinecone index and rebuild the chunk list needed for the
    in-memory BM25 corpus.

    Pages through every namespace with ``index.list()``, fetches metadata
    for each batch of vector IDs, and keeps only vectors that carry a
    non-empty ``"text"`` metadata field.

    Args:
        index: Pinecone index handle exposing ``describe_index_stats``,
            ``list`` and ``fetch``.
        batch_size: How many vector IDs to request per ``list()`` page.

    Returns:
        One dict per usable vector, shaped as
        ``{"id": ..., "metadata": {"text", "title", "url", "chunk_index"}}``.
    """
    stats = index.describe_index_stats()
    namespaces = list(stats.get('namespaces', {}).keys())
    # If no namespaces are explicitly named, Pinecone uses an empty string
    # for the default namespace.
    if not namespaces:
        namespaces = [""]

    all_chunks: list[dict] = []
    seen_ids = set()  # guard against the same ID appearing in multiple pages

    print(f"Loading vectors for BM25 from namespaces: {namespaces}")

    for ns in namespaces:
        # Pinecone's list() generator returns batches of IDs
        for id_batch in index.list(namespace=ns, limit=batch_size):
            if not id_batch:
                continue

            # Fetch the actual content (metadata) for this batch of IDs
            fetched = index.fetch(ids=id_batch, namespace=ns)
            vectors = getattr(fetched, "vectors", {})

            for vector_id, vector_data in vectors.items():
                if vector_id in seen_ids:
                    continue
                seen_ids.add(vector_id)

                # Safely extract metadata; the attribute may exist but be
                # None on real fetch responses, so fall back to {} either way.
                metadata = getattr(vector_data, "metadata", None) or {}
                text = metadata.get("text")

                # Vectors without usable text can't feed BM25 — skip them.
                if not text:
                    continue

                all_chunks.append({
                    "id": vector_id,
                    "metadata": {
                        "text": text,
                        "title": metadata.get("title", "Untitled"),
                        "url": metadata.get("url", ""),
                        "chunk_index": metadata.get("chunk_index", 0),
                    },
                })

        print(f" Finished namespace: '{ns if ns else 'default'}'")

    print(f"Total chunks loaded into memory: {len(all_chunks)}")
    return all_chunks