Spaces:

shreyanshknayak
/

Hackrx-QA-llama-index

Sleeping

App Files Files Community

shreyanshknayak committed on Aug 6

Commit

d339ddb

verified ·

1 Parent(s): ab9f9fe

Upload 3 files

Browse files

Files changed (3) hide show

main_2.py +239 -0
pipeline_2.py +184 -0
requirements.txt +219 -0

main_2.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# File: main_2.py
+# (Modified to load embedding model at startup and await async pipeline run)
+import os
+import tempfile
+import asyncio
+import time
+from typing import List, Dict, Any
+from urllib.parse import urlparse, unquote
+from pathlib import Path
+import httpx
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, HttpUrl
+from groq import AsyncGroq
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+import torch # Import torch to check for CUDA availability
+from dotenv import load_dotenv
+load_dotenv()
+# Import the Pipeline class from pipeline_2.py
+from pipeline_2 import Pipeline
+# FastAPI application setup; the endpoint and startup hook below are registered on this instance.
+app = FastAPI(
+    title="Llama-Index RAG with Groq",
+    description="An API to process a PDF from a URL and answer a list of questions using a Llama-Index RAG pipeline.",
+)
+# --- Pydantic Models for API Request and Response ---
+class RunRequest(BaseModel):
+    # URL the /rag/run endpoint downloads the PDF from.
+    documents: HttpUrl
+    # Questions to answer against the downloaded document.
+    questions: List[str]
+class Answer(BaseModel):
+    # One question from the request paired with the answer text generated for it.
+    question: str
+    answer: str
+class RunResponse(BaseModel):
+    # One Answer per submitted question, in request order.
+    answers: List[Answer]
+    # Total wall-clock seconds for the request, measured with time.perf_counter().
+    processing_time: float
+    # Seconds per stage; the endpoint records "download_pdf", "pipeline_setup", "retrieval" and "generation".
+    step_timings: Dict[str, float]
+# --- Global Configurations ---
+# "gsk_..." is a placeholder default; it is treated as "not configured" below and by the endpoint.
+GROQ_API_KEY = os.getenv("GROQ_API_KEY", "gsk_...")
+# Groq chat model used for answer generation.
+GROQ_MODEL_NAME = "llama3-70b-8192"
+# Model name passed to HuggingFaceEmbedding at startup.
+EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# Global variable to hold the initialized embedding model (None until startup_event has run)
+embed_model_instance: HuggingFaceEmbedding | None = None
+# Warn at import time when only the placeholder key is present.
+if GROQ_API_KEY == "gsk_...":
+    print("WARNING: GROQ_API_KEY is not set. Please set it in your environment or main.py.")
+@app.on_event("startup")
+async def startup_event():
+    """
+    Loads the embedding model once when the application starts.
+    This prevents re-loading it on every API call.
+    """
+    global embed_model_instance
+    print(f"Loading embedding model '{EMBEDDING_MODEL_NAME}' at startup...")
+    # Check for GPU availability and use it if possible
+    # Assuming 16GB VRAM, a standard device check is sufficient
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    embed_model_instance = await asyncio.to_thread(HuggingFaceEmbedding, model_name=EMBEDDING_MODEL_NAME, device=device)
+    print("Embedding model loaded successfully.")
+# --- Async Groq Generation Function ---
+async def generate_answer_with_groq(query: str, retrieved_results: List[dict], groq_api_key: str) -> str:
+    """
+    Generates an answer using the Groq API based on the query and retrieved chunks' content.
+    """
+    if not groq_api_key:
+        return "Error: Groq API key is not set. Cannot generate answer."
+    client = AsyncGroq(api_key=groq_api_key)
+    context_parts = []
+    for i, res in enumerate(retrieved_results):
+        content = res.get("content", "")
+        metadata = res.get("document_metadata", {})
+        section_heading = metadata.get("section_heading", metadata.get("file_name", "N/A"))
+        context_parts.append(
+            f"--- Context Chunk {i+1} ---\n"
+            f"Document Part: {section_heading}\n"
+            f"Content: {content}\n"
+            f"-------------------------"
+        )
+    context = "\n\n".join(context_parts)
+    prompt = (
+        f"You are a specialized document analyzer assistant. Your task is to answer the user's question "
+        f"solely based on the provided context. If the answer cannot be found in the provided context, "
+        f"clearly state that you do not have enough information.\n\n"
+        f"Context:\n{context}\n\n"
+        f"Question: {query}\n\n"
+        f"Answer:"
+    )
+    try:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            model=GROQ_MODEL_NAME,
+            temperature=0.7,
+            max_tokens=500,
+        )
+        answer = chat_completion.choices[0].message.content
+        return answer
+    except Exception as e:
+        print(f"An error occurred during Groq API call: {e}")
+        return "Could not generate an answer due to an API error."
+# --- FastAPI Endpoint ---
+@app.post("/rag/run", response_model=RunResponse)
+async def run_rag_pipeline(request: RunRequest):
+    """
+    Runs the RAG pipeline for a given PDF document URL and a list of questions.
+    The PDF is downloaded, processed, and then the questions are answered.
+    """
+    pdf_url = request.documents
+    questions = request.questions
+    local_pdf_path = None
+    step_timings = {}
+    start_time_total = time.perf_counter()
+    if not embed_model_instance:
+         raise HTTPException(
+            status_code=500,
+            detail="Embedding model not loaded. Application startup failed."
+        )
+    if not GROQ_API_KEY or GROQ_API_KEY == "gsk_...":
+        raise HTTPException(
+            status_code=500,
+            detail="Groq API key is not configured. Please set the GROQ_API_KEY environment variable."
+        )
+    try:
+        # 1. Download PDF
+        start_time = time.perf_counter()
+        async with httpx.AsyncClient() as client:
+            try:
+                response = await client.get(str(pdf_url), timeout=30.0, follow_redirects=True)
+                response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
+                doc_bytes = response.content
+                print("Download successful.")
+            except httpx.HTTPStatusError as e:
+                raise HTTPException(status_code=e.response.status_code, detail=f"HTTP error downloading PDF: {e.response.status_code} - {e.response.text}")
+            except httpx.RequestError as e:
+                raise HTTPException(status_code=400, detail=f"Network error downloading PDF: {e}")
+            except Exception as e:
+                raise HTTPException(status_code=500, detail=f"An unexpected error occurred during download: {e}")
+        # Determine a temporary local filename
+        parsed_path = urlparse(str(pdf_url)).path
+        filename = unquote(os.path.basename(parsed_path))
+        if not filename or not filename.lower().endswith(".pdf"):
+            # If the URL doesn't provide a valid PDF filename, create a generic one.
+            filename = "downloaded_document.pdf"
+        # Use tempfile to create a secure temporary file
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf_file:
+            temp_pdf_file.write(doc_bytes)
+            local_pdf_path = temp_pdf_file.name
+        end_time = time.perf_counter()
+        step_timings["download_pdf"] = end_time - start_time
+        print(f"PDF download took {step_timings['download_pdf']:.2f} seconds.")
+        # 2. Initialize and Run the Pipeline (Parsing, Node Creation, Embeddings)
+        start_time = time.perf_counter()
+        # The Pipeline's run() method is now async, so await it directly
+        pipeline = Pipeline(groq_api_key=GROQ_API_KEY, pdf_path=local_pdf_path, embed_model=embed_model_instance)
+        await pipeline.run() # Changed from asyncio.to_thread(pipeline.run)
+        end_time = time.perf_counter()
+        step_timings["pipeline_setup"] = end_time - start_time
+        print(f"Pipeline setup took {step_timings['pipeline_setup']:.2f} seconds.")
+        # 3. Concurrent Retrieval Phase
+        start_time_retrieval = time.perf_counter()
+        print(f"\nStarting concurrent retrieval for {len(questions)} questions...")
+        retrieval_tasks = [asyncio.to_thread(pipeline.retrieve_nodes, q) for q in questions]
+        all_retrieved_results = await asyncio.gather(*retrieval_tasks)
+        end_time_retrieval = time.perf_counter()
+        step_timings["retrieval"] = end_time_retrieval - start_time_retrieval
+        print(f"Retrieval phase completed in {step_timings['retrieval']:.2f} seconds.")
+        # 4. Concurrent Generation Phase
+        start_time_generation = time.perf_counter()
+        print(f"\nStarting concurrent answer generation for {len(questions)} questions...")
+        generation_tasks = [
+            generate_answer_with_groq(q, retrieved_results, GROQ_API_KEY)
+            for q, retrieved_results in zip(questions, all_retrieved_results)
+        ]
+        all_answer_texts = await asyncio.gather(*generation_tasks)
+        end_time_generation = time.perf_counter()
+        step_timings["generation"] = end_time_generation - start_time_generation
+        print(f"Generation phase completed in {step_timings['generation']:.2f} seconds.")
+        end_time_total = time.perf_counter()
+        total_processing_time = end_time_total - start_time_total
+        answers = [Answer(question=q, answer=a) for q, a in zip(questions, all_answer_texts)]
+        return RunResponse(
+            answers=answers,
+            processing_time=total_processing_time,
+            step_timings=step_timings,
+        )
+    except HTTPException as e:
+        raise e
+    except Exception as e:
+        print(f"An unhandled error occurred: {e}")
+        raise HTTPException(
+            status_code=500, detail=f"An internal server error occurred: {e}"
+        )
+    finally:
+        if local_pdf_path and os.path.exists(local_pdf_path):
+            os.unlink(local_pdf_path)
+            print(f"Cleaned up temporary PDF file: {local_pdf_path}")

pipeline_2.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# File: pipeline_2.py
+# (Modified to accept a pre-initialized embedding model and generate embeddings concurrently)
+import time
+from pathlib import Path
+from typing import List, Any
+import asyncio # Import asyncio for concurrent operations
+from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
+from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
+from llama_index.core.retrievers import AutoMergingRetriever, BaseRetriever
+from llama_index.core.storage.docstore import SimpleDocumentStore
+from llama_index.readers.file import PyMuPDFReader
+from llama_index.llms.groq import Groq
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+class Pipeline:
+    """
+    A pipeline to process a PDF, create nodes, and generate embeddings.
+    It exposes a retriever to fetch nodes for a given query,
+    but does not handle the answer generation itself. The embedding
+    model is now passed in, not initialized internally.
+    """
+    def __init__(self, groq_api_key: str, pdf_path: str, embed_model: HuggingFaceEmbedding):
+        """
+        Initializes the pipeline with API keys, file path, and a pre-initialized embedding model.
+        Args:
+            groq_api_key (str): Your API key for Groq.
+            pdf_path (str): The path to the PDF file to be processed.
+            embed_model (HuggingFaceEmbedding): The pre-initialized embedding model.
+        """
+        self.groq_api_key = groq_api_key
+        self.pdf_path = Path(pdf_path)
+        self.embed_model = embed_model
+        # Configure Llama-Index LLM setting only
+        Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
+        # Initialize components
+        self.documents: List[Document] = []
+        self.nodes: List[Any] = []
+        self.storage_context: StorageContext | None = None
+        self.index: VectorStoreIndex | None = None
+        self.retriever: BaseRetriever | None = None
+        self.leaf_nodes: List[Any] = []
+        self.root_nodes: List[Any] = []
+    def _parse_pdf(self) -> None:
+        """Parses the PDF file into Llama-Index Document objects."""
+        print(f"Parsing PDF at: {self.pdf_path}")
+        start_time = time.perf_counter()
+        loader = PyMuPDFReader()
+        docs = loader.load(file_path=self.pdf_path)
+        # Concatenate all document parts into a single document for simpler processing
+        # Adjust this if you need to maintain per-page document context
+        doc_text = "\n\n".join([d.get_content() for d in docs])
+        self.documents = [Document(text=doc_text)]
+        end_time = time.perf_counter()
+        print(f"PDF parsing completed in {end_time - start_time:.2f} seconds.")
+    def _create_nodes(self) -> None:
+        """Creates hierarchical nodes from the parsed documents."""
+        print("Creating nodes from documents...")
+        start_time = time.perf_counter()
+        node_parser = HierarchicalNodeParser.from_defaults()
+        self.nodes = node_parser.get_nodes_from_documents(self.documents)
+        self.leaf_nodes = get_leaf_nodes(self.nodes)
+        self.root_nodes = get_root_nodes(self.nodes)
+        end_time = time.perf_counter()
+        print(f"Node creation completed in {end_time - start_time:.2f} seconds.")
+    async def _generate_embeddings_concurrently(self) -> None:
+        """
+        Generates embeddings for leaf nodes concurrently using asyncio.to_thread
+        and then builds the VectorStoreIndex.
+        """
+        print("Generating embeddings for leaf nodes concurrently...")
+        start_time_embeddings = time.perf_counter()
+        # Define a batch size for sending texts to the embedding model.
+        # For a 16GB VRAM GPU, a batch size of 300 for 'all-MiniLM-L6-v2' is a reasonable starting point.
+        # You might be able to increase this further depending on the exact model and GPU utilization.
+        BATCH_SIZE = 300
+        embedding_tasks = []
+        # Extract text content from leaf nodes
+        node_texts = [node.get_content() for node in self.leaf_nodes]
+        # Create batches of texts and schedule embedding generation in separate threads
+        for i in range(0, len(node_texts), BATCH_SIZE):
+            batch_texts = node_texts[i : i + BATCH_SIZE]
+            # Use asyncio.to_thread to run the synchronous embedding model call in a separate thread
+            # This prevents blocking the main event loop
+            embedding_tasks.append(asyncio.to_thread(self.embed_model.get_text_embedding_batch, texts=batch_texts, show_progress=False))
+        # Wait for all concurrent embedding tasks to complete
+        all_embeddings_batches = await asyncio.gather(*embedding_tasks)
+        # Flatten the list of lists of embeddings into a single list
+        flat_embeddings = [emb for sublist in all_embeddings_batches for emb in sublist]
+        # Assign the generated embeddings back to their respective leaf nodes
+        for i, node in enumerate(self.leaf_nodes):
+            node.embedding = flat_embeddings[i]
+        end_time_embeddings = time.perf_counter()
+        print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
+        # Now, build the VectorStoreIndex using the nodes that now have pre-computed embeddings
+        print("Building VectorStoreIndex...")
+        start_time_index_build = time.perf_counter()
+        # Add all nodes (root and leaf) to the document store
+        docstore = SimpleDocumentStore()
+        docstore.add_documents(self.nodes)
+        self.storage_context = StorageContext.from_defaults(docstore=docstore)
+        # When nodes already have embeddings, VectorStoreIndex will use them
+        self.index = VectorStoreIndex(
+            self.leaf_nodes, # Pass leaf nodes which now contain their embeddings
+            storage_context=self.storage_context,
+            embed_model=self.embed_model # Still pass the embed_model, though it won't re-embed if nodes have embeddings
+        )
+        end_time_index_build = time.perf_counter()
+        print(f"VectorStoreIndex built in {end_time_index_build - start_time_index_build:.2f} seconds.")
+        print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
+    def _setup_retriever(self) -> None:
+        """Sets up the retriever."""
+        print("Setting up retriever...")
+        base_retriever = self.index.as_retriever(similarity_top_k=6)
+        self.retriever = AutoMergingRetriever(
+            base_retriever, storage_context=self.storage_context, verbose=True
+        )
+    async def run(self) -> None:
+        """Runs the entire pipeline from parsing to retriever setup."""
+        if not self.pdf_path.exists():
+            raise FileNotFoundError(f"PDF file not found at: {self.pdf_path}")
+        self._parse_pdf()
+        self._create_nodes()
+        await self._generate_embeddings_concurrently() # Await the async embedding generation
+        self._setup_retriever()
+        print("Pipeline is ready for retrieval.")
+    def retrieve_nodes(self, query_str: str) -> List[dict]:
+        """
+        Retrieves relevant nodes for a given query and converts them to a
+        list of dictionaries for external use.
+        Args:
+            query_str (str): The query string.
+        Returns:
+            List[dict]: A list of dictionaries with node content and metadata.
+        """
+        if not self.retriever:
+            raise RuntimeError("Retriever is not initialized. Run the pipeline first.")
+        print(f"\nRetrieving nodes for query: '{query_str}'")
+        start_time = time.perf_counter()
+        # This is a synchronous call
+        nodes = self.retriever.retrieve(query_str)
+        end_time = time.perf_counter()
+        print(f"Retrieval completed in {end_time - start_time:.2f} seconds. Found {len(nodes)} nodes.")
+        # Convert the Llama-Index nodes to a dictionary format
+        retrieved_results = [
+            {
+                "content": n.text,
+                "document_metadata": n.metadata
+            }
+            for n in nodes
+        ]
+        return retrieved_results

requirements.txt ADDED Viewed

	@@ -0,0 +1,219 @@

+acres==0.5.0
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+aiosqlite==0.21.0
+annotated-types==0.7.0
+anyio==4.10.0
+appnope==0.1.4
+argon2-cffi==25.1.0
+argon2-cffi-bindings==25.1.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.5
+asyncio==4.0.0
+attrs==25.3.0
+babel==2.17.0
+banks==2.2.0
+beautifulsoup4==4.13.4
+bleach==6.2.0
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.4.2
+ci-info==0.3.0
+click==8.2.1
+cohere==5.16.2
+colorama==0.4.6
+comm==0.2.3
+configobj==5.0.9
+configparser==7.2.0
+contourpy==1.3.3
+cycler==0.12.1
+dataclasses-json==0.6.7
+debugpy==1.8.15
+decorator==5.2.1
+defusedxml==0.7.1
+Deprecated==1.2.18
+dirtyjson==1.0.8
+distro==1.9.0
+dotenv==0.9.9
+etelemetry==0.3.1
+executing==2.2.0
+fastapi==0.116.1
+fastavro==1.12.0
+fastjsonschema==2.21.1
+filelock==3.18.0
+filetype==1.2.0
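+# NOTE(review): the PyPI package "fitz" is a different project from PyMuPDF, which is pinned separately below and ships its own `fitz` module; confirm this pin (and "frontend") is actually needed.
+fitz==0.0.1.dev2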
+fonttools==4.59.0
+fqdn==1.5.1
+frontend==0.0.3
+frozenlist==1.7.0
+fsspec==2025.7.0
+greenlet==3.2.3
+griffe==1.9.0
+groq==0.31.0
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
+httplib2==0.22.0
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.34.3
+idna==3.10
+ipykernel==6.30.1
+ipython==9.4.0
+ipython_pygments_lexers==1.1.1
+ipywidgets==8.1.7
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+json5==0.12.0
+jsonpointer==3.0.0
+jsonschema==4.25.0
+jsonschema-specifications==2025.4.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.12.0
+jupyter-lsp==2.2.6
+jupyter_client==8.6.3
+jupyter_core==5.8.1
+jupyter_server==2.16.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.4.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.15
+kiwisolver==1.4.8
+lark==1.2.2
+llama-cloud==0.1.35
+llama-cloud-services==0.6.54
+llama-index==0.13.0
+llama-index-cli==0.5.0
+llama-index-core==0.13.0
+llama-index-embeddings-cohere==0.6.0
+llama-index-embeddings-huggingface==0.6.0
+llama-index-embeddings-openai==0.5.0
+llama-index-indices-managed-llama-cloud==0.9.0
+llama-index-instrumentation==0.4.0
+llama-index-llms-groq==0.4.0
+llama-index-llms-openai==0.5.0
+llama-index-llms-openai-like==0.5.0
+llama-index-readers-file==0.5.0
+llama-index-readers-llama-parse==0.5.0
+llama-index-vector-stores-pinecone==0.7.0
+llama-index-workflows==1.2.0
+llama-parse==0.6.54
+looseversion==1.3.0
+lxml==6.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+matplotlib==3.10.5
+matplotlib-inline==0.1.7
+mistune==3.1.3
+mpmath==1.3.0
+multidict==6.6.3
+mypy_extensions==1.1.0
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.5
+nibabel==5.3.2
+nipype==1.10.0
+nltk==3.9.1
+notebook==7.4.5
+notebook_shim==0.2.4
+numpy==2.3.2
+openai==1.98.0
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
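+# NOTE(review): pathlib has been part of the standard library since Python 3.4; this PyPI backport is likely unnecessary — confirm before keeping it.
+pathlib==1.0.1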
+pexpect==4.9.0
+pillow==11.3.0
+pinecone==7.3.0
+pinecone-plugin-assistant==1.7.0
+pinecone-plugin-interface==0.0.7
+platformdirs==4.3.8
+prometheus_client==0.22.1
+prompt_toolkit==3.0.51
+propcache==0.3.2
+prov==2.1.1
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+puremagic==1.30
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pydot==4.0.1
+Pygments==2.19.2
+PyMuPDF==1.26.3
+pyparsing==3.2.3
+pypdf==5.9.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+python-json-logger==3.3.0
+pytz==2025.2
+pyxnat==1.6.3
+PyYAML==6.0.2
+pyzmq==27.0.1
+rdflib==7.1.4
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.4
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rfc3987-syntax==1.1.0
+rpds-py==0.26.0
+safetensors==0.5.3
+scikit-learn==1.7.1
+scipy==1.16.1
+Send2Trash==1.8.3
+sentence-transformers==5.0.0
+setuptools==80.9.0
+simplejson==3.20.1
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+SQLAlchemy==2.0.42
+stack-data==0.6.3
+starlette==0.47.2
+striprtf==0.0.26
+sympy==1.14.0
+tenacity==9.1.2
+terminado==0.18.1
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tinycss2==1.4.0
+tokenizers==0.21.4
+torch==2.7.1
+tornado==6.5.1
+tqdm==4.67.1
+traitlets==5.14.3
+traits==7.0.2
+transformers==4.54.1
+types-python-dateutil==2.9.0.20250708
+types-requests==2.32.4.20250611
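+# NOTE(review): typing has been part of the standard library since Python 3.5; this PyPI backport is likely unnecessary — confirm before keeping it.
+typing==3.7.4.3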
+typing-inspect==0.9.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+tzdata==2025.2
+uri-template==1.3.0
+urllib3==2.5.0
+uvicorn==0.35.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.14
+wrapt==1.17.2
+yarl==1.20.1