Commit 3efe7a4
Parent(s): ccbeb57

Add Dockerfile for Streamlit deployment

Files changed:
- .gitignore +34 -0
- Data_Cleaning.py +99 -0
- Dockerfile +21 -8
- Embeddings.py +319 -0
- Logger.py +109 -0
- README.md +37 -0
- app.py +60 -0
- app_colabcode.ipynb +805 -0
- config.json +25 -0
- evaluation.py +161 -0
- requirements.txt +18 -3
.gitignore
ADDED
@@ -0,0 +1,34 @@

# Python cache
__pycache__/
*.pyc
*.pyo
*.pyd

# Logs
logs/
*.log

# Checkpoints and outputs
*.ckpt
*.idx
*.pkl
*.jsonl

# Environment files
.env
*.env
*.bak

# Jupyter/Colab
.ipynb_checkpoints/

# System files
.DS_Store
Thumbs.db

# Project Files
eval_dataset.json
test_questions.txt
experiment.py
Retrieval_Summarization.py
run_evalution.py
Data_Cleaning.py
ADDED
@@ -0,0 +1,99 @@

import pdfplumber
import os
import multiprocessing
from tqdm import tqdm

from Logger import GetLogger

class GetDataCleaning:
    def __init__(self, root_folder, excluding_folder=[], logger=None):

        if not logger:
            obj = GetLogger()
            logger = obj.get_logger()
        self.logger = logger

        self.root_folder = root_folder
        self.excluding_folder = excluding_folder

        self.folder_list = [item for item in os.listdir(self.root_folder) if (("txt" not in item.split("_")) and (item not in excluding_folder))]
        self.logger.info("all the folder list is generated successfully")

    def pdf_to_txt(self, pdf_path, txt_path):
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)

    def clean_txt(self, text):
        lines = text.split("\n")
        cleaned = []

        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.isdigit():
                continue
            if line in ["Infosys", "ICICI Bank"]:
                continue
            cleaned.append(line)

        return " ".join(cleaned)

    def process_file(self, folder, file, logger):
        """Single file processing pipeline"""

        input_pdf = os.path.join(self.root_folder, folder, file)
        output_txt = os.path.join(self.root_folder, folder + "_txt", file.replace(".pdf", ".txt"))
        output_cleaned = os.path.join(self.root_folder, folder + "_cleaned_txt", file.replace(".pdf", ".txt"))

        # Convert PDF → TXT
        self.pdf_to_txt(input_pdf, output_txt)

        # Clean text
        raw_text = open(output_txt, encoding="utf-8").read()
        cleaned_text = self.clean_txt(raw_text)

        with open(output_cleaned, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        logger.info(f"✅ Processed: {folder}/{file}")

    def run(self, workers=4):
        try:
            self.logger.info("🚀 Starting Cleaning Process")
            for folder in self.folder_list:

                os.makedirs(os.path.join(self.root_folder, folder + "_txt"), exist_ok=True)
                os.makedirs(os.path.join(self.root_folder, folder + "_cleaned_txt"), exist_ok=True)

                pdf_files = [
                    f for f in os.listdir(os.path.join(self.root_folder, folder))
                    if f.endswith(".pdf")
                ]

                # Run parallel processing
                with multiprocessing.Pool(processes=workers) as pool:
                    pool.starmap(self.process_file, [(folder, f, self.logger) for f in pdf_files])
                    pool.close()
                    pool.join()

                self.logger.info(f"Data Cleaning completed for folder: {folder}")
        except Exception as e:
            self.logger.error(f"Got Error: {e}")


# if __name__ == "__main__":
#     obj = GetDataCleaning(root_folder="financial_reports", excluding_folder=["ICICI"])
#     obj.run()
#     obj.process_file("ICICI", "icici-bank-23.pdf", obj.logger)  # for experiment only
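A minimal usage sketch for this cleaning pipeline, assuming a `financial_reports/` directory with one sub-folder of PDFs per company (the folder name is illustrative, not part of the commit):

```python
# Minimal sketch: run the PDF -> cleaned-text pipeline.
# Assumes financial_reports/<Company>/ sub-folders of PDFs already exist.
from Data_Cleaning import GetDataCleaning

if __name__ == "__main__":  # guard required for multiprocessing.Pool on spawn-based platforms
    cleaner = GetDataCleaning(root_folder="financial_reports")
    cleaner.run(workers=4)  # writes <folder>_txt/ and <folder>_cleaned_txt/ alongside each source folder
```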
Dockerfile
CHANGED
@@ -1,20 +1,33 @@
-FROM python:3.13.5-slim
+
+# Base image with Python
+FROM python:3.11-slim
+
+# Prevent Python from writing .pyc files and buffering output
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Set working directory
 WORKDIR /app
 
+# Install system dependencies (for faiss, etc.)
 RUN apt-get update && apt-get install -y \
     build-essential \
-    curl \
     git \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 
-
-COPY
-
-
-
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+# Copy requirements first (better cache usage)
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy project files
+COPY . .
+
+# Expose Streamlit default port
+EXPOSE 7860
+
+# Run Streamlit app
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
Embeddings.py
ADDED
@@ -0,0 +1,319 @@

import os
import glob
import pickle, json
from tqdm import tqdm
import numpy as np

# Try imports with friendly errors
try:
    import faiss
except Exception as e:
    raise ImportError("faiss is required. Install cpu version: `pip install faiss-cpu` or install via conda for GPU (faiss-gpu).") from e

try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    raise ImportError("sentence-transformers is required. `pip install sentence-transformers`") from e

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from dotenv import load_dotenv


from Data_Cleaning import GetDataCleaning
from Logger import GetLogger


class GetEmbeddings:
    """
    Embedding pipeline for cleaned text files.
    Generates embeddings using SentenceTransformers, builds a FAISS index,
    and allows searching queries against the vector database.
    """

    def __init__(self, config_path="config.json", logger=None):

        with open(config_path, "r") as f:
            self.config = json.load(f)

        cfg_paths = self.config["paths"]
        cfg_emb = self.config["embedding"]

        self.root = cfg_paths["root"]
        self.cleaned_suffix = "_cleaned_txt"
        self.chunk_words = cfg_emb["chunk_words"]
        self.batch_size = cfg_emb["batch_size"]
        self.faiss_index_path = cfg_paths["faiss_index"]
        self.metadata_path = cfg_paths["metadata"]
        self.embedding_model = cfg_emb["model"]

        if not logger:
            obj = GetLogger()
            logger = obj.get_logger()
        self.logger = logger
        self.logger.info("Initializing Embedding Pipeline...")

        # Device
        self.device = "cuda" if self.check_cuda() and cfg_emb["use_gpu"] else "cpu"
        load_dotenv()
        self.hf_token = os.getenv("HF_TOKEN")

    def check_cuda(self):
        """Return True if CUDA is available and usable."""
        try:
            if torch.cuda.is_available():
                _ = torch.cuda.current_device()
                self.logger.info(f"✅ CUDA available. Device: {torch.cuda.get_device_name(0)}")
                return True
            self.logger.info("⚠️ CUDA not available. Using CPU.")
            return False
        except Exception as e:
            self.logger.error(f"Error checking CUDA, defaulting to CPU. Error: {e}")
            return False

    def list_cleaned_files(self):
        """Return sorted list of cleaned text files under root/*{cleaned_suffix}/*.txt"""
        pattern = os.path.join(self.root, f"*{self.cleaned_suffix}", "*.txt")
        files = glob.glob(pattern)
        files.sort()
        return files

    def read_text_file(self, path):
        """Read a text file and return string content."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def chunk_text_words(self, text):
        """
        Simple word-based chunking.
        Returns list of text chunks.
        """
        words = text.split()
        if not words:
            return []
        return [" ".join(words[i:i + self.chunk_words]) for i in range(0, len(words), self.chunk_words)]

    def save_index_and_metadata(self):
        """Save FAISS index and metadata to disk."""
        os.makedirs(os.path.dirname(self.faiss_index_path), exist_ok=True)
        faiss.write_index(self.index, self.faiss_index_path)
        with open(self.metadata_path, "wb") as f:
            pickle.dump(self.metadata, f)
        self.logger.info(f"💾 Saved FAISS index to {self.faiss_index_path}")
        self.logger.info(f"💾 Saved metadata to {self.metadata_path}")

    def load_index_and_metadata(self):
        """Load FAISS index and metadata if they exist."""
        if os.path.exists(self.faiss_index_path) and os.path.exists(self.metadata_path):
            try:
                self.index = faiss.read_index(self.faiss_index_path)
                with open(self.metadata_path, "rb") as f:
                    self.metadata = pickle.load(f)
                self.logger.info("✅ Loaded existing FAISS index + metadata from disk.")
                return True
            except Exception as e:
                self.logger.warning(f"⚠️ Failed to load FAISS index/metadata, will rebuild. Error: {e}")
                return False
        return False

    def load_encoder(self):
        """Load the sentence-transformer encoder."""
        self.encoder = SentenceTransformer(self.embedding_model, device=self.device)
        self.logger.info(f"Loaded embedding model '{self.embedding_model}' on {self.device}")
        return self.encoder

    def building_embeddings_index(self, files):
        """Build embeddings for all text chunks and return FAISS index + metadata."""

        all_embeddings, metadata = [], []
        next_id = 0
        # Iterate files and chunks
        for fp in tqdm(files, desc="Files", unit="file"):
            text = self.read_text_file(fp)

            if not text.strip():
                continue

            # metadata: infer company and file from path
            # e.g., financial_reports/Infosys_cleaned_txt/Infosys_2023_AR.txt
            rel = os.path.relpath(fp, self.root)
            folder = rel.split(os.sep)[0]
            filename = os.path.basename(fp)

            chunks = self.chunk_text_words(text)
            if not chunks:
                continue

            for i in range(0, len(chunks), self.batch_size):
                batch = chunks[i:i + self.batch_size]
                embs = self.encoder.encode(batch, show_progress_bar=False, convert_to_numpy=True)
                embs = embs.astype(np.float32)

                for j, vec in enumerate(embs):
                    all_embeddings.append(vec)
                    metadata.append({
                        "id": next_id,
                        "source_folder": folder,
                        "file": filename,
                        "chunk_id": i + j,
                        "text": batch[j]  # store chunk text for retrieval
                    })
                    next_id += 1

        if not all_embeddings:
            raise RuntimeError("No embeddings were produced. Check cleaned files and chunking.")

        emb_matrix = np.vstack(all_embeddings).astype(np.float32)
        faiss.normalize_L2(emb_matrix)

        # Build FAISS index (IndexFlatIP over normalized vectors = cosine similarity)
        dim = emb_matrix.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(emb_matrix)
        self.metadata = metadata
        self.logger.info(f"✅ Built FAISS index with {self.index.ntotal} vectors, dim={dim}")

        return self.index, self.metadata

    def run(self):
        """Main entry: load or build embeddings + FAISS index."""
        if self.load_index_and_metadata():
            return

        files = self.list_cleaned_files()
        if not files:
            self.logger.error("❌ No cleaned text files found.")
            raise SystemExit(1)
        self.load_encoder()
        self.building_embeddings_index(files)
        self.save_index_and_metadata()

    def load_summarizer(self, model_name="google/gemma-2b"):
        """
        Load summarizer LLM once.
        If already loaded, skip.
        """
        if hasattr(self, "summarizer_pipeline"):
            self.logger.info("ℹ️ Summarizer already loaded, skipping reload.")
            return

        try:
            self.logger.info(f"⏳ Loading summarizer model '{model_name}'...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=self.hf_token)
            self.summarizer_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map=self.device,
                token=self.hf_token
            )
            self.summarizer_pipeline = pipeline(
                "text-generation",
                model=self.summarizer_model,
                tokenizer=self.tokenizer
            )
            self.logger.info(f"✅ Summarizer model '{model_name}' loaded successfully.")

        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                self.logger.warning("⚠️ CUDA OOM while loading summarizer. Retrying on CPU...")
                self.device = "cpu"
                torch.cuda.empty_cache()
                return self.load_summarizer(model_name=model_name)
            else:
                self.logger.error(f"❌ Failed to load summarizer: {e}")
                raise

    def summarize_chunks(self, chunks, max_content_tokens=2048, max_output_tokens=256):
        """
        Summarize a list of text chunks using the LLM.
        - Chunks are joined until they fit into max_content_tokens
        - Generates a concise summary.
        """

        if not hasattr(self, "summarizer_pipeline"):
            self.load_summarizer()
            self.logger.info("Summarizer was not initialized; called load_summarizer() with default parameters.")

        # Join chunks into one context, respecting token budget
        context = " ".join(chunks)
        input_tokens = len(self.tokenizer.encode(context))

        if input_tokens > max_content_tokens:
            # Trim to fit context window (approximate: trims by word count, not tokens)
            context = " ".join(context.split()[:max_content_tokens])
            self.logger.warning("⚠️ Context truncated to fit within model token limit.")

        # Build summarization prompt
        prompt = f"""
        Summarize the following financial report excerpts into a concise answer.
        Keep it factual, short, and grounded in the text.

        Excerpts:
        {context}

        Summary:
        """

        try:
            output = self.summarizer_pipeline(
                prompt,
                max_new_tokens=max_output_tokens,
                do_sample=False
            )[0]["generated_text"]

            if "Summary:" in output:
                summary = output.split("Summary:")[-1].strip()
            else:
                summary = output.strip()

            return summary

        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                self.logger.warning("⚠️ CUDA OOM during summarization. Retrying on CPU...")
                self.device = "cpu"
                torch.cuda.empty_cache()
                return self.summarize_chunks(chunks, max_content_tokens, max_output_tokens)
            else:
                self.logger.error(f"❌ Summarizer failed: {e}. Falling back to raw chunks.")
                return " ".join(chunks[:2])  # fallback: return first 2 chunks

    def answer_query(self, query, top_k=3):
        """
        End-to-end QA:
        - Retrieve relevant chunks from FAISS
        - Summarize into a final answer.
        """
        try:
            # Step 1: Retrieve
            self.logger.info(f"🔍 Searching vector DB for query: {query}")
            q_emb = self.encoder.encode(query, show_progress_bar=False, convert_to_numpy=True).reshape(1, -1)
            faiss.normalize_L2(q_emb)

            scores, idxs = self.index.search(q_emb, k=top_k)
            chunks = [self.metadata[idx]["text"] for idx in idxs[0]]

            # Step 2: Summarize
            summary = self.summarize_chunks(chunks)

            # Log results
            self.logger.info(f"✅ Final Answer: {summary}")
            return summary

        except Exception as e:
            self.logger.error(f"Error in answer_query: {e}")
            return None


# Example
# ge = GetEmbeddings()
# ge.run()
# # NEW STEP
# ge.load_summarizer("google/gemma-2b")
# answer = ge.answer_query("What are the key highlights from Q2 financial report?")
# print(answer)
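The pipeline reads its settings from config.json, whose contents are not shown in this commit view. A plausible minimal sketch follows: the key names are taken from what `__init__` and the Evaluator read, but every value (paths, model name, chunk size) is an illustrative assumption, not the committed config. The second half is a toy check of the design choice noted in `building_embeddings_index`: inner product over L2-normalized vectors equals cosine similarity.

```python
# Illustrative only: key names mirror what Embeddings.py reads; all values are assumptions.
import json

import numpy as np
import faiss

config = {
    "paths": {
        "root": "financial_reports",
        "faiss_index": "artifacts/index.idx",
        "metadata": "artifacts/metadata.pkl",
        "eval_dataset": "eval_dataset.json",
    },
    "embedding": {
        "model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_words": 200,
        "batch_size": 32,
        "use_gpu": True,
    },
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)

# Why IndexFlatIP + normalize_L2 gives cosine similarity: after L2-normalizing,
# the inner product of two vectors is exactly their cosine. Toy check:
vecs = np.random.rand(5, 8).astype(np.float32)
faiss.normalize_L2(vecs)                  # in-place row-wise normalization
index = faiss.IndexFlatIP(vecs.shape[1])  # inner-product index
index.add(vecs)
query = vecs[:1].copy()
scores, ids = index.search(query, 3)
print(ids[0], scores[0])                  # first hit is the query itself, score ~1.0
```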
Logger.py
ADDED
@@ -0,0 +1,109 @@

import logging
import os, json
from logging.handlers import TimedRotatingFileHandler
from datetime import datetime

class GetLogger:
    def __init__(self, logging_level="INFO", log_to_console=True, log_dir="logs"):
        """
        Advanced Logger
        - Logs to both file (rotating) and console
        - Default rotation: daily, keep last 3 logs
        - Safe filename (no ':' in timestamp)
        """

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging_level.upper())

        # Avoid duplicate handlers
        if self.logger.hasHandlers():
            self.logger.handlers.clear()

        # Ensure log directory exists
        os.makedirs(log_dir, exist_ok=True)

        # File handler (rotates daily, keeps 3 backups)
        file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".log"
        log_path = os.path.join(log_dir, file_name)
        file_handler = TimedRotatingFileHandler(
            filename=log_path, when="D", interval=1, backupCount=3, encoding="utf-8"
        )

        formatter = logging.Formatter(
            "%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(funcName)s() - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"
        )
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

        # Console handler (optional)
        if log_to_console:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)

    def get_logger(self):
        return self.logger

    def delete_logger(self):
        """Remove all handlers and delete logger."""
        handlers = self.logger.handlers[:]
        for handler in handlers:
            self.logger.removeHandler(handler)
            handler.close()
        del self.logger

class MetricsLogger:
    """
    Collects evaluation metrics and saves aggregated statistics.
    """
    def __init__(self, save_path="logs/metrics_summary.json", logger=None):
        self.save_path = save_path
        self.metrics = []  # store per-query metrics
        self.logger = logger or logging.getLogger(__name__)

    def log_query_metrics(self, query, result_dict):
        """
        Log metrics for a single query.
        Example: result_dict = {"latency_sec": 0.5, "rougeL": 0.7, ...}
        """
        record = {"query": query}
        record.update(result_dict)
        self.metrics.append(record)
        self.logger.info(f"📊 Metrics logged for query: {query[:50]}...")

    def summarize(self):
        """Aggregate metrics (mean values)."""
        if not self.metrics:
            return {}

        summary = {}
        keys = [k for k in self.metrics[0].keys() if k != "query"]
        for key in keys:
            values = [m[key] for m in self.metrics if key in m and isinstance(m[key], (int, float))]
            if values:
                summary[f"avg_{key}"] = float(sum(values) / len(values))

        return summary

    def save(self):
        """Save all metrics + summary to JSON."""
        os.makedirs(os.path.dirname(self.save_path), exist_ok=True)
        data = {
            "per_query": self.metrics,
            "summary": self.summarize()
        }
        with open(self.save_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        self.logger.info(f"✅ Metrics saved to {self.save_path}")
        return data


# Example
if __name__ == "__main__":
    obj = GetLogger()
    logger = obj.get_logger()
    logger.info("✅ Logger initialized successfully")
    logger.warning("⚠️ This is a warning")
    logger.error("❌ This is an error")
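The `__main__` block above exercises only GetLogger; a companion sketch for MetricsLogger, with made-up metric names and values (real ones come from the evaluation pipeline):

```python
from Logger import GetLogger, MetricsLogger

logger = GetLogger().get_logger()
metrics = MetricsLogger(save_path="logs/metrics_summary.json", logger=logger)

# Hypothetical per-query results, in the shape log_query_metrics expects.
metrics.log_query_metrics("What was FY23 revenue?", {"latency_sec": 0.8, "rougeL": 0.41, "bleu": 0.12})
metrics.log_query_metrics("Summarize Q2 highlights.", {"latency_sec": 1.1, "rougeL": 0.37, "bleu": 0.09})

print(metrics.summarize())  # e.g. {'avg_latency_sec': 0.95, 'avg_rougeL': 0.39, 'avg_bleu': 0.105}
metrics.save()              # writes per-query records plus the averages to JSON
```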
README.md
CHANGED
@@ -18,3 +18,40 @@ Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :hear
 
 If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
 forums](https://discuss.streamlit.io).
+
+# 📊 Financial QA Agent
+
+An AI-powered financial report assistant built with **RAG (Retrieval-Augmented Generation)**.
+This app lets you upload financial reports, search them with semantic embeddings, and get concise answers/summaries using an open-source LLM.
+
+## 🚀 Features
+- Cleans financial report text files automatically
+- Generates sentence-transformer embeddings and indexes them with FAISS for efficient retrieval
+- Summarizes answers using `google/gemma-2b` (or lightweight models for deployment)
+- Streamlit UI for easy interaction
+- Evaluation pipeline with ROUGE, BLEU, and BERTScore
+
+## 🛠️ Tech Stack
+- **Streamlit** for UI
+- **FAISS** for vector search
+- **Sentence-Transformers** for embeddings
+- **Transformers** (Gemma/LLMs) for summarization
+- **Scikit-learn, NLTK, BERTScore** for evaluation metrics
+
+## 📂 Project Structure
+├── app.py                 # Main Streamlit app (entrypoint)
+├── Embeddings.py          # Embedding + FAISS pipeline
+├── Data_Cleaning.py       # Data cleaning utility
+├── Logger.py              # Logging utility
+├── evaluation.py          # Evaluation pipeline
+├── config.json            # Configurations
+├── eval_dataset.json      # Sample evaluation dataset
+├── requirements.txt       # Dependencies
+├── README.md              # Project documentation
+└── .gitignore             # Ignore unnecessary files
+
+
+## ⚡ Running Locally
+```bash
+pip install -r requirements.txt
+streamlit run app.py
+```
app.py
ADDED
@@ -0,0 +1,60 @@

import streamlit as st
import faiss
from Embeddings import GetEmbeddings


# Load Agent once and cache it
@st.cache_resource
def load_agent():
    agent = GetEmbeddings(config_path="config.json")
    agent.run()              # Build/load FAISS
    agent.load_summarizer()  # Load summarizer model
    encoder = agent.load_encoder()
    return agent, encoder


def main():
    st.set_page_config(page_title="📊 Financial QA Agent", layout="wide")

    st.title("📊 Financial QA Agent")
    st.markdown(
        """
        Ask questions about financial reports.
        The system retrieves relevant sections from company reports and summarizes them into concise answers.
        """
    )

    # Sidebar
    st.sidebar.header("⚙️ Settings")
    show_debug = st.sidebar.checkbox("Show retrieved chunks", value=False)

    # Load Agent
    agent, encoder = load_agent()

    # User Input
    query = st.text_area("Enter your financial question:", height=100)

    if st.button("Get Answer"):
        if query.strip() == "":
            st.warning("⚠️ Please enter a query.")
        else:
            with st.spinner("🔎 Searching and generating answer..."):
                answer = agent.answer_query(query, top_k=3)

            st.subheader("✅ Answer")
            st.write(answer)

            if show_debug:
                st.subheader("📂 Retrieved Chunks (Debug)")
                # Show top chunks used
                q_emb = encoder.encode(query, convert_to_numpy=True).reshape(1, -1)
                faiss.normalize_L2(q_emb)
                scores, idxs = agent.index.search(q_emb, k=3)
                for score, idx in zip(scores[0], idxs[0]):
                    st.markdown(f"**Score:** {score:.4f}")
                    st.write(agent.metadata[idx]["text"][:500] + "...")


if __name__ == "__main__":
    main()
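The same pipeline behind `load_agent()` can be exercised without Streamlit; a minimal smoke-test sketch, assuming `config.json` and cleaned text files exist (the question string is made up):

```python
# Hypothetical smoke test for the pipeline behind app.py (no UI involved).
from Embeddings import GetEmbeddings

agent = GetEmbeddings(config_path="config.json")
agent.run()              # load or build the FAISS index + metadata
agent.load_encoder()     # needed before answer_query can embed the question
agent.load_summarizer()  # downloads google/gemma-2b on first use

# answer_query returns a summary string, or None if anything fails.
print(agent.answer_query("What are the key highlights from the Q2 report?", top_k=3))
```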
app_colabcode.ipynb
ADDED
@@ -0,0 +1,805 @@

Colab notebook; cell sources and captured outputs rendered from the JSON:

Cell 1 — install project requirements plus the cloudflared tunnel binary:

!pip install -r requirements.txt -q
!pip install streamlit cloudflared -q
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
!dpkg -i cloudflared-linux-amd64.deb

!cloudflared --version

Output:
Selecting previously unselected package cloudflared.
(Reading database ... 126441 files and directories currently installed.)
Preparing to unpack cloudflared-linux-amd64.deb ...
Unpacking cloudflared (2025.9.1) ...
Setting up cloudflared (2025.9.1) ...
Processing triggers for man-db (2.10.2-1) ...
cloudflared version 2025.9.1 (built 2025-09-22-13:28 UTC)

Cell 2 — empty code cell.

Cell 3 — CUDA availability check:

import torch

if torch.cuda.is_available():
    print(f"✅ CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    # return True
else:
    print("⚠️ CUDA not available. Falling back to CPU.")
    # return False


# # Load the allocator
# new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
#     'alloc.so', 'my_malloc', 'my_free')
# # Swap the current allocator
# torch.cuda.memory.change_current_allocator(new_alloc)

Output:
✅ CUDA is available. Using GPU: Tesla T4

Cell 4 — %%writefile Embeddings.py. A Colab variant of the Embeddings.py committed above; its source is identical except that the `from Data_Cleaning import GetDataCleaning` and `from Logger import GetLogger` imports and the logger setup in `__init__` are commented out, every `self.logger.*` call is replaced with `print`, `from google.colab import userdata` replaces `from dotenv import load_dotenv`, and the Hugging Face token is hard-coded as a placeholder (`self.hf_token = "your_token"`).

Output:
Overwriting Embeddings.py

Cell 5 — %%writefile Evaluator.py:

%%writefile Evaluator.py
import os
import json
import time
import numpy as np
from tqdm import tqdm

# from Logger import GetLogger, MetricsLogger
# from Embeddings import GetEmbeddings

# Metrics
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

class Evaluator:
    """
    Evaluation pipeline for financial QA Agent.
    Uses eval_dataset.json to run queries, collect answers, and compute metrics.
    """
    def __init__(self, config_path="config.json", logger=None):
        with open(config_path, "r") as f:
            self.config = json.load(f)
        self.paths = self.config["paths"]


        # if not logger:
        #     obj = GetLogger()
        #     logger = obj.get_logger()
        # self.logger = logger

        # # Metrics logger
        # self.metrics_logger = MetricsLogger(logger=self.logger)

        # Initialize Agent
        self.agent = GetEmbeddings(config_path=config_path, logger=None)
        self.agent.run()  # Load or rebuild FAISS + embeddings
        self.agent.load_summarizer()  # Load summarizer
        self.encoder = self.agent.load_encoder()

        # Load Dataset
        self.dataset = self.load_dataset()
        self.results = []
        self.failed_queries = []

    def load_dataset(self):
        path = self.paths["eval_dataset"]
        if not os.path.exists(path):
            raise FileNotFoundError(f"Dataset not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def measure_latency(self, func, *args, **kwargs):
        """Helper: measure time taken by a function call."""
        start = time.time()
        result = func(*args, **kwargs)
        latency = time.time() - start
        return result, latency

    def evaluate_query(self, query, reference):
        """Run one query, compare answer vs. reference, compute metrics."""
        # try:
        # Run pipeline
        system_answer, latency = self.measure_latency(self.agent.answer_query, query)

        # 1. Embedding similarity (proxy retrieval quality)
        ref_emb = self.encoder.encode([reference], convert_to_numpy=True)
        ans_emb = self.encoder.encode([system_answer], convert_to_numpy=True)
        retrieval_quality = float(cosine_similarity(ref_emb, ans_emb)[0][0])

        # 2. ROUGE-L
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_score = scorer.score(reference, system_answer)['rougeL'].fmeasure

        # 3. BLEU (with smoothing for short texts)
        smoothie = SmoothingFunction().method4
        bleu = sentence_bleu([reference.split()], system_answer.split(), smoothing_function=smoothie)

        # 4. BERTScore (semantic similarity)
        P, R, F1 = bert_score([system_answer], [reference], lang="en")
        bert_f1 = float(F1.mean())

        metrics = {
            "query": query,
            "reference": reference,
            "system_answer": system_answer,
            "retrieval_quality": retrieval_quality,
            "rougeL": rouge_score,
            "bleu": bleu,
            "bertscore_f1": bert_f1,
            "latency_sec": latency
        }

        # Log into metrics logger
        # self.metrics_logger.log_query_metrics(query, metrics)

        return metrics

        # except Exception as e:
        #     print(f"Error evaluating query '{query}': {e}")
        #     return None


    def run(self):
        """Run evaluation on entire dataset."""
        print("Starting Evaluation...")

        for item in tqdm(self.dataset, desc="Queries"):
            query = item["query"]

Output:
Writing Evaluator.py
" query = item[\"query\"]\n",
|
| 556 |
+
" reference = item[\"reference\"]\n",
|
| 557 |
+
" result = self.evaluate_query(query, reference)\n",
|
| 558 |
+
" if result:\n",
|
| 559 |
+
" self.results.append(result)\n",
|
| 560 |
+
"\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" # Save result\n",
|
| 563 |
+
" with open(self.paths[\"eval_results\"], \"w\", encoding=\"utf-8\") as f:\n",
|
| 564 |
+
" json.dump(self.results, f, indent=2)\n",
|
| 565 |
+
"\n",
|
| 566 |
+
" if self.failed_queries:\n",
|
| 567 |
+
" with open(self.paths[\"failed_queries\"], \"w\", encoding=\"utf-8\") as f:\n",
|
| 568 |
+
" json.dump(self.failed_queries, f, indent=2)\n",
|
| 569 |
+
"\n",
|
| 570 |
+
"\n",
|
| 571 |
+
" # Save metrics summary\n",
|
| 572 |
+
" # summary = self.metrics_logger.save()\n",
|
| 573 |
+
" summary = None\n",
|
| 574 |
+
" print(f\"Evaluation Complete.\")\n",
|
| 575 |
+
" print(f\"📊 Evaluation summary: {summary}\")\n",
|
| 576 |
+
"\n",
|
| 577 |
+
" return self.results, summary\n",
|
| 578 |
+
"\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"if __name__ == \"__main__\":\n",
|
| 581 |
+
" evaluator = Evaluator()\n",
|
| 582 |
+
" results, summary = evaluator.run()\n",
|
| 583 |
+
"\n",
|
| 584 |
+
" print(\"\\n=== Sample Results ===\")\n",
|
| 585 |
+
" print(json.dumps(results[:2], indent=2))\n",
|
| 586 |
+
" print(\"\\n=== Summary ===\")\n",
|
| 587 |
+
" print(json.dumps(summary, indent=2))\n"
|
| 588 |
+
]
|
| 589 |
+
},
|
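As a toy illustration of the metric APIs called in evaluate_query above (all strings are made up; BERTScore is omitted here only because it pulls a large model, but bert_score([answer], [reference], lang="en") returns P, R, F1 tensors analogously):

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "Revenue grew 12% driven by cloud services."
answer = "Cloud services drove revenue growth of 12%."

# ROUGE-L F1: longest-common-subsequence overlap
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print(scorer.score(reference, answer)["rougeL"].fmeasure)

# BLEU with smoothing, which keeps short texts from scoring zero
smoothie = SmoothingFunction().method4
print(sentence_bleu([reference.split()], answer.split(), smoothing_function=smoothie))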
| 590 |
+
{
|
| 591 |
+
"cell_type": "code",
|
| 592 |
+
"execution_count": null,
|
| 593 |
+
"metadata": {
|
| 594 |
+
"colab": {
|
| 595 |
+
"base_uri": "https://localhost:8080/"
|
| 596 |
+
},
|
| 597 |
+
"id": "_SgMUhSJbdcu",
|
| 598 |
+
"outputId": "c79fe42b-517f-40b7-cc2b-71ddaae05084"
|
| 599 |
+
},
|
| 600 |
+
"outputs": [
|
| 601 |
+
{
|
| 602 |
+
"name": "stdout",
|
| 603 |
+
"output_type": "stream",
|
| 604 |
+
"text": [
|
| 605 |
+
"Overwriting app.py\n"
|
| 606 |
+
]
|
| 607 |
+
}
|
| 608 |
+
],
|
| 609 |
+
"source": [
|
| 610 |
+
"%%writefile app.py\n",
|
| 611 |
+
"import streamlit as st\n",
|
| 612 |
+
"import json\n",
|
| 613 |
+
"import faiss\n",
|
| 614 |
+
"import numpy as np\n",
|
| 615 |
+
"import re\n",
|
| 616 |
+
"from Embeddings import GetEmbeddings\n",
|
| 617 |
+
"from Logger import GetLogger\n",
|
| 618 |
+
"\n",
|
| 619 |
+
"# ================================\n",
|
| 620 |
+
"# Load Config\n",
|
| 621 |
+
"# ================================\n",
|
| 622 |
+
"with open(\"config.json\", \"r\") as f:\n",
|
| 623 |
+
" config = json.load(f)\n",
|
| 624 |
+
"\n",
|
| 625 |
+
"# Initialize Logger\n",
|
| 626 |
+
"log_obj = GetLogger()\n",
|
| 627 |
+
"logger = log_obj.get_logger()\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"# Initialize QA Agent\n",
|
| 630 |
+
"@st.cache_resource\n",
|
| 631 |
+
"def load_agent():\n",
|
| 632 |
+
" agent = GetEmbeddings(config_path=\"config.json\", logger=logger)\n",
|
| 633 |
+
" agent.run() # load or build FAISS index\n",
|
| 634 |
+
" encoder = agent.load_encoder()\n",
|
| 635 |
+
" agent.load_summarizer()\n",
|
| 636 |
+
" return agent, encoder\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"agent, encoder = load_agent()\n",
|
| 639 |
+
"\n",
|
| 640 |
+
"# ================================\n",
|
| 641 |
+
"# Streamlit UI\n",
|
| 642 |
+
"# ================================\n",
|
| 643 |
+
"st.set_page_config(page_title=\"Financial QA Agent\", layout=\"wide\")\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"# --- Header ---\n",
|
| 646 |
+
"st.title(\"💹 Financial Report QA Agent\")\n",
|
| 647 |
+
"st.markdown(\n",
|
| 648 |
+
" \"\"\"\n",
|
| 649 |
+
" Welcome!\n",
|
| 650 |
+
" This tool lets you **query annual financial reports** (Infosys, ICICI Bank, etc.)\n",
|
| 651 |
+
" and get **summarized answers** with supporting evidence from the text.\n",
|
| 652 |
+
" \"\"\"\n",
|
| 653 |
+
")\n",
|
| 654 |
+
"\n",
|
| 655 |
+
"# Sidebar - Settings\n",
|
| 656 |
+
"st.sidebar.header(\"⚙️ Settings\")\n",
|
| 657 |
+
"top_k = st.sidebar.slider(\"Top K Chunks\", 1, 10, 3)\n",
|
| 658 |
+
"max_output_tokens = st.sidebar.slider(\"Max Summary Tokens\", 64, 512, 256)\n",
|
| 659 |
+
"\n",
|
| 660 |
+
"# --- Keyword highlighting ---\n",
|
| 661 |
+
"def highlight_keywords(text, keywords=[\"risk\", \"revenue\", \"profit\", \"growth\", \"loss\"]):\n",
|
| 662 |
+
" pattern = re.compile(r\"\\b(\" + \"|\".join(keywords) + r\")\\b\", re.IGNORECASE)\n",
|
| 663 |
+
" return pattern.sub(lambda m: f\"**{m.group(0)}**\", text)\n",
|
| 664 |
+
"\n",
|
| 665 |
+
"# --- Session State for Query History ---\n",
|
| 666 |
+
"if \"history\" not in st.session_state:\n",
|
| 667 |
+
" st.session_state[\"history\"] = []\n",
|
| 668 |
+
"\n",
|
| 669 |
+
"# --- Query input ---\n",
|
| 670 |
+
"query = st.text_input(\"🔍 Enter your question:\", placeholder=\"e.g., What are the main risk factors in 2023?\")\n",
|
| 671 |
+
"\n",
|
| 672 |
+
"if st.button(\"Get Answer\"):\n",
|
| 673 |
+
" if query.strip() == \"\":\n",
|
| 674 |
+
" st.warning(\"Please enter a query.\")\n",
|
| 675 |
+
" else:\n",
|
| 676 |
+
" with st.spinner(\"Searching reports...\"):\n",
|
| 677 |
+
" try:\n",
|
| 678 |
+
" # Retrieve + summarize\n",
|
| 679 |
+
" answer = agent.answer_query(query, top_k=top_k)\n",
|
| 680 |
+
"\n",
|
| 681 |
+
" # --- Display final answer ---\n",
|
| 682 |
+
" st.subheader(\"📌 Answer\")\n",
|
| 683 |
+
" st.success(answer)\n",
|
| 684 |
+
"\n",
|
| 685 |
+
" # --- Show supporting chunks ---\n",
|
| 686 |
+
" st.subheader(\"📂 Supporting Chunks\")\n",
|
| 687 |
+
" q_emb = encoder.encode(query, convert_to_numpy=True).reshape(1, -1)\n",
|
| 688 |
+
" faiss.normalize_L2(q_emb)\n",
|
| 689 |
+
" scores, idxs = agent.index.search(q_emb.astype(np.float32), k=top_k)\n",
|
| 690 |
+
"\n",
|
| 691 |
+
" for score, idx in zip(scores[0], idxs[0]):\n",
|
| 692 |
+
" meta = agent.metadata[idx]\n",
|
| 693 |
+
" with st.expander(f\"📄 {meta['file']} | Chunk {meta['chunk_id']} | Score: {score:.4f}\"):\n",
|
| 694 |
+
" chunk_text = highlight_keywords(meta['text'][:1000])\n",
|
| 695 |
+
" st.markdown(chunk_text)\n",
|
| 696 |
+
"\n",
|
| 697 |
+
" # --- Save Query & Answer to History ---\n",
|
| 698 |
+
" st.session_state[\"history\"].append({\"query\": query, \"answer\": answer})\n",
|
| 699 |
+
"\n",
|
| 700 |
+
" # --- Log query + answer ---\n",
|
| 701 |
+
" logger.info(f\"User Query: {query}\")\n",
|
| 702 |
+
" logger.info(f\"System Answer: {answer}\")\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" # --- Save persistent history JSON ---\n",
|
| 705 |
+
" with open(\"ui_query_history.json\", \"w\", encoding=\"utf-8\") as f:\n",
|
| 706 |
+
" json.dump(st.session_state[\"history\"], f, indent=2)\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" except Exception as e:\n",
|
| 709 |
+
" st.error(f\"Error: {e}\")\n",
|
| 710 |
+
" logger.error(f\"Streamlit UI error: {e}\")\n",
|
| 711 |
+
"\n",
|
| 712 |
+
"# --- Show History in Sidebar ---\n",
|
| 713 |
+
"if st.session_state[\"history\"]:\n",
|
| 714 |
+
" st.sidebar.subheader(\"🕘 Query History\")\n",
|
| 715 |
+
" for item in st.session_state[\"history\"][-5:]: # show last 5 queries\n",
|
| 716 |
+
" st.sidebar.write(f\"**Q:** {item['query']}\")\n",
|
| 717 |
+
" st.sidebar.write(f\"**A:** {item['answer'][:100]}...\")\n",
|
| 718 |
+
" st.sidebar.markdown(\"---\")\n"
|
| 719 |
+
]
|
| 720 |
+
},
|
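A quick standalone check of the highlight_keywords helper above: it wraps matched terms in Markdown bold, which st.markdown then renders (the sample sentence is made up):

import re

def highlight_keywords(text, keywords=["risk", "revenue", "profit", "growth", "loss"]):
    pattern = re.compile(r"\b(" + "|".join(keywords) + r")\b", re.IGNORECASE)
    return pattern.sub(lambda m: f"**{m.group(0)}**", text)

print(highlight_keywords("Revenue growth offset the credit loss."))
# -> **Revenue** **growth** offset the credit **loss**.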
| 721 |
+
{
|
| 722 |
+
"cell_type": "code",
|
| 723 |
+
"execution_count": null,
|
| 724 |
+
"metadata": {
|
| 725 |
+
"colab": {
|
| 726 |
+
"base_uri": "https://localhost:8080/"
|
| 727 |
+
},
|
| 728 |
+
"id": "6UAnlclVckzM",
|
| 729 |
+
"outputId": "bb65eead-5953-4a4f-f838-14fadc1469dd"
|
| 730 |
+
},
|
| 731 |
+
"outputs": [
|
| 732 |
+
{
|
| 733 |
+
"name": "stdout",
|
| 734 |
+
"output_type": "stream",
|
| 735 |
+
"text": [
|
| 736 |
+
"\u001b[90m2025-09-29T13:35:21Z\u001b[0m \u001b[32mINF\u001b[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps\n",
|
| 737 |
+
"\u001b[90m2025-09-29T13:35:21Z\u001b[0m \u001b[32mINF\u001b[0m Requesting new quick Tunnel on trycloudflare.com...\n",
|
| 738 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m +--------------------------------------------------------------------------------------------+\n",
|
| 739 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m | Your quick Tunnel has been created! Visit it at (it may take some time to be reachable): |\n",
|
| 740 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m | https://ease-library-cases-gibraltar.trycloudflare.com |\n",
|
| 741 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m +--------------------------------------------------------------------------------------------+\n",
|
| 742 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Cannot determine default configuration path. No file [config.yml config.yaml] in [~/.cloudflared ~/.cloudflare-warp ~/cloudflare-warp /etc/cloudflared /usr/local/etc/cloudflared]\n",
|
| 743 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Version 2025.9.1 (Checksum 3dc1dc4252eae3c691861f926e2b8640063a2ce534b07b7a3f4ec2de439ecfe3)\n",
|
| 744 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m GOOS: linux, GOVersion: go1.24.4, GoArch: amd64\n",
|
| 745 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Settings: map[ha-connections:1 no-autoupdate:true protocol:quic url:http://localhost:8501]\n",
|
| 746 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m cloudflared will not automatically update if installed by a package manager.\n",
|
| 747 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Generated Connector ID: b7e0104f-71af-4b1e-a366-b3b15b2c86d9\n",
|
| 748 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Initial protocol quic\n",
|
| 749 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use 172.28.0.12 as source for IPv4\n",
|
| 750 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use :: as source for IPv6\n",
|
| 751 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Cannot determine default origin certificate path. No file cert.pem in [~/.cloudflared ~/.cloudflare-warp ~/cloudflare-warp /etc/cloudflared /usr/local/etc/cloudflared]. You need to specify the origin certificate path by specifying the origincert option in the configuration file, or set TUNNEL_ORIGIN_CERT environment variable \u001b[36moriginCertPath=\u001b[0m\n",
|
| 752 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use 172.28.0.12 as source for IPv4\n",
|
| 753 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use :: as source for IPv6\n",
|
| 754 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Starting metrics server on 127.0.0.1:20241/metrics\n",
|
| 755 |
+
"\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Tunnel connection curve preferences: [X25519MLKEM768 CurveP256] \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
|
| 756 |
+
"2025/09/29 13:35:25 failed to sufficiently increase receive buffer size (was: 208 kiB, wanted: 7168 kiB, got: 416 kiB). See https://github.com/quic-go/quic-go/wiki/UDP-Buffer-Sizes for details.\n",
|
| 757 |
+
"\u001b[90m2025-09-29T13:35:26Z\u001b[0m \u001b[32mINF\u001b[0m Registered tunnel connection \u001b[36mconnIndex=\u001b[0m0 \u001b[36mconnection=\u001b[0mc535a197-93c0-4941-a9ab-b32533b50549 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113 \u001b[36mlocation=\u001b[0msin02 \u001b[36mprotocol=\u001b[0mquic\n",
|
| 758 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Initiating graceful shutdown due to signal interrupt ...\n",
|
| 759 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m failed to run the datagram handler \u001b[31merror=\u001b[0m\u001b[31m\"context canceled\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
|
| 760 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m failed to serve tunnel connection \u001b[31merror=\u001b[0m\u001b[31m\"accept stream listener encountered a failure while serving\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
|
| 761 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Serve tunnel error \u001b[31merror=\u001b[0m\u001b[31m\"accept stream listener encountered a failure while serving\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
|
| 762 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Retrying connection in up to 1s \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
|
| 763 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Connection terminated \u001b[36mconnIndex=\u001b[0m0\n",
|
| 764 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m no more connections active and exiting\n",
|
| 765 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Tunnel server stopped\n",
|
| 766 |
+
"\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Metrics server stopped\n"
|
| 767 |
+
]
|
| 768 |
+
}
|
| 769 |
+
],
|
| 770 |
+
"source": [
|
| 771 |
+
"import threading, os\n",
|
| 772 |
+
"\n",
|
| 773 |
+
"# Kill anything on port 8501 (just in case)\n",
|
| 774 |
+
"os.system(\"kill -9 $(lsof -t -i:8501) 2>/dev/null\")\n",
|
| 775 |
+
"\n",
|
| 776 |
+
"# Run Streamlit in background\n",
|
| 777 |
+
"def run_app():\n",
|
| 778 |
+
" os.system(\"streamlit run app.py --server.port 8501\")\n",
|
| 779 |
+
"\n",
|
| 780 |
+
"thread = threading.Thread(target=run_app)\n",
|
| 781 |
+
"thread.start()\n",
|
| 782 |
+
"\n",
|
| 783 |
+
"# Start cloudflared tunnel\n",
|
| 784 |
+
"!cloudflared tunnel --url http://localhost:8501 --no-autoupdate\n"
|
| 785 |
+
]
|
| 786 |
+
}
|
| 787 |
+
],
|
| 788 |
+
"metadata": {
|
| 789 |
+
"accelerator": "GPU",
|
| 790 |
+
"colab": {
|
| 791 |
+
"gpuType": "T4",
|
| 792 |
+
"machine_shape": "hm",
|
| 793 |
+
"provenance": []
|
| 794 |
+
},
|
| 795 |
+
"kernelspec": {
|
| 796 |
+
"display_name": "Python 3",
|
| 797 |
+
"name": "python3"
|
| 798 |
+
},
|
| 799 |
+
"language_info": {
|
| 800 |
+
"name": "python"
|
| 801 |
+
}
|
| 802 |
+
},
|
| 803 |
+
"nbformat": 4,
|
| 804 |
+
"nbformat_minor": 0
|
| 805 |
+
}
|
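Note: the final cell starts Streamlit in a background thread and opens a Cloudflare quick tunnel only because Colab exposes no public ports. On a local machine, streamlit run app.py alone serves the UI at http://localhost:8501.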
config.json
ADDED
|
@@ -0,0 +1,25 @@
|
| 1 |
+
{
|
| 2 |
+
"paths": {
|
| 3 |
+
"root": "financial_reports",
|
| 4 |
+
"faiss_index": "financial_reports/faiss_index.idx",
|
| 5 |
+
"metadata": "financial_reports/faiss_metadata.pkl",
|
| 6 |
+
"eval_dataset": "eval_dataset.json",
|
| 7 |
+
"eval_results": "eval_results.json",
|
| 8 |
+
"failed_queries": "failed_queries.json"
|
| 9 |
+
},
|
| 10 |
+
"embedding": {
|
| 11 |
+
"model": "sentence-transformers/all-MiniLM-L6-v2",
|
| 12 |
+
"chunk_words": 600,
|
| 13 |
+
"batch_size": 64,
|
| 14 |
+
"use_gpu": true
|
| 15 |
+
},
|
| 16 |
+
"summarizer": {
|
| 17 |
+
"model": "google/gemma-2b",
|
| 18 |
+
"max_content_tokens": 2048,
|
| 19 |
+
"max_output_tokens": 256
|
| 20 |
+
},
|
| 21 |
+
"logging": {
|
| 22 |
+
"level": "INFO",
|
| 23 |
+
"log_dir": "logs"
|
| 24 |
+
}
|
| 25 |
+
}
|
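A minimal sketch of how the modules above consume this file (key names match the config as committed):

import json

with open("config.json", "r") as f:
    config = json.load(f)

paths = config["paths"]          # index, metadata, and evaluation file locations
emb_cfg = config["embedding"]    # encoder model plus chunking/batch settings
print(paths["faiss_index"], emb_cfg["model"])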
evaluation.py
ADDED
|
@@ -0,0 +1,161 @@
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
import numpy as np
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import nltk
|
| 7 |
+
|
| 8 |
+
from Logger import GetLogger, MetricsLogger
|
| 9 |
+
from Embeddings import GetEmbeddings
|
| 10 |
+
|
| 11 |
+
# Metrics
|
| 12 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 13 |
+
from rouge_score import rouge_scorer
|
| 14 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
| 15 |
+
from bert_score import score as bert_score
|
| 16 |
+
|
| 17 |
+
class Evaluator:
|
| 18 |
+
"""
|
| 19 |
+
Evaluation pipeline for the financial QA agent.
|
| 20 |
+
Uses eval_dataset.json to run queries, collect answers, and compute metrics.
|
| 21 |
+
"""
|
| 22 |
+
def __init__(self, config_path="config.json", logger=None):
|
| 23 |
+
with open(config_path, "r") as f:
|
| 24 |
+
self.config = json.load(f)
|
| 25 |
+
self.paths = self.config["paths"]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
if not logger:
|
| 29 |
+
obj = GetLogger()
|
| 30 |
+
logger = obj.get_logger()
|
| 31 |
+
self.logger = logger
|
| 32 |
+
|
| 33 |
+
# Metrics logger
|
| 34 |
+
self.metrics_logger = MetricsLogger(logger=self.logger)
|
| 35 |
+
|
| 36 |
+
# Initialize Agent
|
| 37 |
+
self.agent = GetEmbeddings(config_path=config_path, logger=self.logger)
|
| 38 |
+
self.agent.run() # Load or rebuild FAISS + embeddings
|
| 39 |
+
self.agent.load_summarizer() # Load summarizer
|
| 40 |
+
self.encoder = self.agent.load_encoder()
|
| 41 |
+
|
| 42 |
+
# Load Dataset
|
| 43 |
+
self.dataset = self.load_dataset()
|
| 44 |
+
self.results = []
|
| 45 |
+
self.failed_queries = []
|
| 46 |
+
|
| 47 |
+
nltk.download('punkt', quiet=True)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_dataset(self):
|
| 51 |
+
path = self.paths["eval_dataset"]
|
| 52 |
+
if not os.path.exists(path):
|
| 53 |
+
raise FileNotFoundError(f"Dataset not found: {path}")
|
| 54 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 55 |
+
return json.load(f)
|
| 56 |
+
|
| 57 |
+
def measure_latency(self, func, *args, **kwargs):
|
| 58 |
+
"""Helper: measure time taken by a function call."""
|
| 59 |
+
start = time.time()
|
| 60 |
+
result = func(*args, **kwargs)
|
| 61 |
+
latency = time.time() - start
|
| 62 |
+
return result, latency
|
| 63 |
+
|
| 64 |
+
def evaluate_query(self, query, reference):
|
| 65 |
+
"""Run one query, compare answer vs. reference, compute metrics."""
|
| 66 |
+
try:
|
| 67 |
+
# Run pipeline
|
| 68 |
+
system_answer, latency = self.measure_latency(self.agent.answer_query, query)
|
| 69 |
+
|
| 70 |
+
# 1. Embedding similarity (proxy retrieval quality)
|
| 71 |
+
ref_emb = self.encoder.encode([reference], convert_to_numpy=True)
|
| 72 |
+
ans_emb = self.encoder.encode([system_answer], convert_to_numpy=True)
|
| 73 |
+
retrieval_quality = float(cosine_similarity(ref_emb, ans_emb)[0][0])
|
| 74 |
+
|
| 75 |
+
# 2. ROUGE-L
|
| 76 |
+
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
|
| 77 |
+
rouge_score = scorer.score(reference, system_answer)['rougeL'].fmeasure
|
| 78 |
+
|
| 79 |
+
# 3. BLEU (with smoothing for short texts)
|
| 80 |
+
smoothie = SmoothingFunction().method4
|
| 81 |
+
bleu = sentence_bleu([reference.split()], system_answer.split(), smoothing_function=smoothie)
|
| 82 |
+
|
| 83 |
+
# 4. BERTScore (semantic similarity)
|
| 84 |
+
P, R, F1 = bert_score([system_answer], [reference], lang="en")
|
| 85 |
+
bert_f1 = float(F1.mean())
|
| 86 |
+
|
| 87 |
+
metrics = {
|
| 88 |
+
"query": query,
|
| 89 |
+
"reference": reference,
|
| 90 |
+
"system_answer": system_answer,
|
| 91 |
+
"retrieval_quality": retrieval_quality,
|
| 92 |
+
"rougeL": rouge_score,
|
| 93 |
+
"bleu": bleu,
|
| 94 |
+
"bertscore_f1": bert_f1,
|
| 95 |
+
"latency_sec": latency
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
# Log into metrics logger
|
| 99 |
+
self.metrics_logger.log_query_metrics(query, metrics)
|
| 100 |
+
|
| 101 |
+
return metrics
|
| 102 |
+
|
| 103 |
+
except Exception as e:
|
| 104 |
+
self.logger.error(f"Error evaluating query '{query}': {e}")
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
def aggregate_summary(self):
|
| 108 |
+
"""Aggregate metrics across all queries for global averages."""
|
| 109 |
+
if not self.results:
|
| 110 |
+
return {}
|
| 111 |
+
|
| 112 |
+
summary = {
|
| 113 |
+
"avg_retrieval_quality": float(np.mean([r["retrieval_quality"] for r in self.results])),
|
| 114 |
+
"avg_rougeL": float(np.mean([r["rougeL"] for r in self.results])),
|
| 115 |
+
"avg_bleu": float(np.mean([r["bleu"] for r in self.results])),
|
| 116 |
+
"avg_bertscore_f1": float(np.mean([r["bertscore_f1"] for r in self.results])),
|
| 117 |
+
"avg_latency_sec": float(np.mean([r["latency_sec"] for r in self.results])),
|
| 118 |
+
"num_queries": len(self.results)
|
| 119 |
+
}
|
| 120 |
+
return summary
|
| 121 |
+
|
| 122 |
+
def run(self):
|
| 123 |
+
"""Run evaluation on entire dataset."""
|
| 124 |
+
self.logger.info("Starting Evaluation...")
|
| 125 |
+
|
| 126 |
+
for item in tqdm(self.dataset, desc="Queries"):
|
| 127 |
+
query = item["query"]
|
| 128 |
+
reference = item["reference"]
|
| 129 |
+
result = self.evaluate_query(query, reference)
|
| 130 |
+
if result:
|
| 131 |
+
self.results.append(result)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# Save result
|
| 135 |
+
with open(self.paths["eval_results"], "w", encoding="utf-8") as f:
|
| 136 |
+
json.dump(self.results, f, indent=2)
|
| 137 |
+
|
| 138 |
+
if self.failed_queries:
|
| 139 |
+
with open(self.paths["failed_queries"], "w", encoding="utf-8") as f:
|
| 140 |
+
json.dump(self.failed_queries, f, indent=2)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# Save metrics summary
|
| 144 |
+
summary = self.aggregate_summary() # NEW: aggregated averages
|
| 145 |
+
self.logger.info(f"📊 Evaluation summary: {summary}")
|
| 146 |
+
|
| 147 |
+
# Also save aggregated summary separately
|
| 148 |
+
with open(self.paths.get("eval_summary", "eval_summary.json"), "w", encoding="utf-8") as f:
|
| 149 |
+
json.dump(summary, f, indent=2)
|
| 150 |
+
|
| 151 |
+
return self.results, summary
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
if __name__ == "__main__":
|
| 155 |
+
evaluator = Evaluator()
|
| 156 |
+
results, summary = evaluator.run()
|
| 157 |
+
|
| 158 |
+
print("\n=== Sample Results ===")
|
| 159 |
+
print(json.dumps(results[:2], indent=2))
|
| 160 |
+
print("\n=== Summary ===")
|
| 161 |
+
print(json.dumps(summary, indent=2))
|
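evaluation.py assumes eval_dataset.json is a JSON list of objects with "query" and "reference" fields (see load_dataset and run above); the file itself is not part of this commit. A minimal sketch that writes a valid file with made-up rows:

import json

sample = [
    {"query": "What are the main risk factors in 2023?",
     "reference": "Key risks include credit concentration and currency exposure."},
    {"query": "How did revenue change year over year?",
     "reference": "Revenue grew 12%, driven by cloud services."},
]

with open("eval_dataset.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=2)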
requirements.txt
CHANGED
|
@@ -1,3 +1,18 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 1 |
+
pdfplumber
|
| 2 |
+
tqdm
|
| 3 |
+
transformers
|
| 4 |
+
sentence-transformers
|
| 5 |
+
numpy
|
| 6 |
+
faiss-cpu
|
| 7 |
+
python-dotenv
|
| 8 |
+
accelerate
|
| 9 |
+
protobuf
|
| 10 |
+
tiktoken
|
| 11 |
+
SentencePiece
|
| 12 |
+
bitsandbytes
|
| 13 |
+
nltk
|
| 14 |
+
rouge-score
|
| 15 |
+
bert-score
|
| 16 |
+
streamlit
|
| 17 |
+
python-dateutil
|
| 18 |
+
protobuf<4.0.0
|