PlantBasedTen committed on
Commit
bb59984
1 Parent(s): d432e43

Upload 22 files

financial_bot/__init__.py ADDED
@@ -0,0 +1,55 @@
+import logging
+import logging.config
+from pathlib import Path
+
+import yaml
+from dotenv import find_dotenv, load_dotenv
+
+logger = logging.getLogger(__name__)
+
+
+def initialize(logging_config_path: str = "logging.yaml", env_file_path: str = ".env"):
+    """
+    Initializes the logger and environment variables.
+
+    Args:
+        logging_config_path (str): The path to the logging configuration file. Defaults to "logging.yaml".
+        env_file_path (str): The path to the environment variables file. Defaults to ".env".
+    """
+
+    logger.info("Initializing logger...")
+    try:
+        initialize_logger(config_path=logging_config_path)
+    except FileNotFoundError:
+        logger.warning(
+            f"No logging configuration file found at: {logging_config_path}. Setting logging level to INFO."
+        )
+        logging.basicConfig(level=logging.INFO)
+
+    logger.info("Initializing env vars...")
+    if env_file_path is None:
+        env_file_path = find_dotenv(raise_error_if_not_found=True, usecwd=False)
+
+    logger.info(f"Loading environment variables from: {env_file_path}")
+    found_env_file = load_dotenv(env_file_path, verbose=True, override=True)
+    if found_env_file is False:
+        raise RuntimeError(f"Could not find environment file at: {env_file_path}")
+
+
+def initialize_logger(
+    config_path: str = "logging.yaml", logs_dir_name: str = "logs"
+) -> None:
+    """Initialize the logger from a YAML config file."""
+
+    # Create the logs directory.
+    config_path_parent = Path(config_path).parent
+    logs_dir = config_path_parent / logs_dir_name
+    logs_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(config_path, "rt") as f:
+        config = yaml.safe_load(f.read())
+
+    # Make sure that existing loggers will still work.
+    config["disable_existing_loggers"] = False
+
+    logging.config.dictConfig(config)
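
For reference, a minimal usage sketch of this module; it assumes a `logging.yaml` and a `.env` file exist next to the caller (both paths are illustrative, not part of this commit):

from financial_bot import initialize

# Illustrative paths; the files are assumed to exist in the working directory.
initialize(logging_config_path="logging.yaml", env_file_path=".env")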
financial_bot/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.96 kB).
financial_bot/__pycache__/base.cpython-310.pyc ADDED
Binary file (936 Bytes).
financial_bot/__pycache__/chains.cpython-310.pyc ADDED
Binary file (6.98 kB).
financial_bot/__pycache__/constants.cpython-310.pyc ADDED
Binary file (720 Bytes).
financial_bot/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (4.37 kB).
financial_bot/__pycache__/handlers.cpython-310.pyc ADDED
Binary file (2.59 kB).
financial_bot/__pycache__/langchain_bot.cpython-310.pyc ADDED
Binary file (7.71 kB).
financial_bot/__pycache__/models.cpython-310.pyc ADDED
Binary file (8.25 kB).
financial_bot/__pycache__/qdrant.cpython-310.pyc ADDED
Binary file (1.56 kB).
financial_bot/__pycache__/template.cpython-310.pyc ADDED
Binary file (3.84 kB).
financial_bot/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.34 kB).
financial_bot/base.py ADDED
@@ -0,0 +1,38 @@
+from threading import Lock
+
+
+class SingletonMeta(type):
+    """
+    This is a thread-safe implementation of a Singleton.
+    """
+
+    _instances = {}
+
+    _lock: Lock = Lock()
+
+    """
+    We now have a lock object that will be used to synchronize threads during
+    first access to the Singleton.
+    """
+
+    def __call__(cls, *args, **kwargs):
+        """
+        Possible changes to the value of the `__init__` argument do not affect
+        the returned instance.
+        """
+        # Now, imagine that the program has just been launched. Since there's no
+        # Singleton instance yet, multiple threads can reach this point almost
+        # at the same time. The first of them will acquire the lock and will
+        # proceed further, while the rest will wait here until the lock is
+        # released.
+        with cls._lock:
+            # The first thread to acquire the lock reaches this conditional,
+            # goes inside and creates the Singleton instance. Once it leaves the
+            # lock block, a thread that might have been waiting for the lock
+            # release may then enter this section. But since the Singleton field
+            # is already initialized, the thread won't create a new object.
+            if cls not in cls._instances:
+                instance = super().__call__(*args, **kwargs)
+                cls._instances[cls] = instance
+
+        return cls._instances[cls]
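
A short sketch of how the metaclass behaves; `AppConfig` is a hypothetical class used only for illustration:

class AppConfig(metaclass=SingletonMeta):
    def __init__(self, value: int = 0):
        self.value = value

first = AppConfig(value=1)
second = AppConfig(value=2)
assert first is second      # only one instance is ever created
assert second.value == 1    # later __init__ arguments are ignored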
financial_bot/chains.py ADDED
@@ -0,0 +1,226 @@
+import time
+from typing import Any, Dict, List, Optional
+
+import qdrant_client
+from langchain import chains
+from langchain.callbacks.manager import CallbackManagerForChainRun
+from langchain.chains.base import Chain
+from langchain.llms import HuggingFacePipeline
+from unstructured.cleaners.core import (
+    clean,
+    clean_extra_whitespace,
+    clean_non_ascii_chars,
+    group_broken_paragraphs,
+    replace_unicode_quotes,
+)
+
+from financial_bot.embeddings import EmbeddingModelSingleton
+from financial_bot.template import PromptTemplate
+
+
+class StatelessMemorySequentialChain(chains.SequentialChain):
+    """
+    A sequential chain that uses a stateless memory to store context between calls.
+
+    This chain overrides the _call and prep_outputs methods to load and clear the memory
+    before and after each call, respectively.
+    """
+
+    history_input_key: str = "to_load_history"
+
+    def _call(self, inputs: Dict[str, str], **kwargs) -> Dict[str, str]:
+        """
+        Override _call to load the history before calling the chain.
+
+        This method loads the history from the input dictionary and saves it to the
+        stateless memory. It then updates the inputs dictionary with the memory values
+        and removes the history input key. Finally, it calls the parent _call method
+        with the updated inputs and returns the results.
+        """
+
+        to_load_history = inputs[self.history_input_key]
+        for (
+            human,
+            ai,
+        ) in to_load_history:
+            self.memory.save_context(
+                inputs={self.memory.input_key: human},
+                outputs={self.memory.output_key: ai},
+            )
+        memory_values = self.memory.load_memory_variables({})
+        inputs.update(memory_values)
+
+        del inputs[self.history_input_key]
+
+        return super()._call(inputs, **kwargs)
+
+    def prep_outputs(
+        self,
+        inputs: Dict[str, str],
+        outputs: Dict[str, str],
+        return_only_outputs: bool = False,
+    ) -> Dict[str, str]:
+        """
+        Override prep_outputs to clear the internal memory after each call.
+
+        This method calls the parent prep_outputs method to get the results, then
+        clears the stateless memory and removes the memory key from the results
+        dictionary. It then returns the updated results.
+        """
+
+        results = super().prep_outputs(inputs, outputs, return_only_outputs)
+
+        # Clear the internal memory.
+        self.memory.clear()
+        if self.memory.memory_key in results:
+            results[self.memory.memory_key] = ""
+
+        return results
+
+
+class ContextExtractorChain(Chain):
+    """
+    Encode the question, search the vector store for the top-k matching articles and
+    return their summaries as context from the Alpaca news document collection.
+
+    Attributes:
+    -----------
+    top_k : int
+        The number of top matches to retrieve from the vector store.
+    embedding_model : EmbeddingModelSingleton
+        The embedding model to use for encoding the question.
+    vector_store : qdrant_client.QdrantClient
+        The vector store to search for matches.
+    vector_collection : str
+        The name of the collection to search in the vector store.
+    """
+
+    top_k: int = 1
+    embedding_model: EmbeddingModelSingleton
+    vector_store: qdrant_client.QdrantClient
+    vector_collection: str
+
+    @property
+    def input_keys(self) -> List[str]:
+        return ["about_me", "question"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return ["context"]
+
+    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        _, quest_key = self.input_keys
+        question_str = inputs[quest_key]
+
+        cleaned_question = self.clean(question_str)
+        # TODO: Instead of cutting the question at 'max_input_length', chunk the question in 'max_input_length' chunks,
+        # pass them through the model and average the embeddings.
+        cleaned_question = cleaned_question[: self.embedding_model.max_input_length]
+        embeddings = self.embedding_model(cleaned_question)
+
+        # TODO: Using the metadata, use the filter to take into consideration only the news from the last 24 hours
+        # (or other time frame).
+        matches = self.vector_store.search(
+            query_vector=embeddings,
+            k=self.top_k,
+            collection_name=self.vector_collection,
+        )
+
+        context = ""
+        for match in matches:
+            context += match.payload["summary"] + "\n"
+
+        return {
+            "context": context,
+        }
+
+    def clean(self, question: str) -> str:
+        """
+        Clean the input question by removing unwanted characters.
+
+        Parameters:
+        -----------
+        question : str
+            The input question to clean.
+
+        Returns:
+        --------
+        str
+            The cleaned question.
+        """
+        question = clean(question)
+        question = replace_unicode_quotes(question)
+        question = clean_non_ascii_chars(question)
+
+        return question
+
+
+class FinancialBotQAChain(Chain):
+    """This custom chain handles LLM generation upon a given prompt."""
+
+    hf_pipeline: HuggingFacePipeline
+    template: PromptTemplate
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Returns a list of input keys for the chain"""
+
+        return ["context"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Returns a list of output keys for the chain"""
+
+        return ["answer"]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Calls the chain with the given inputs and returns the output"""
+
+        inputs = self.clean(inputs)
+        prompt = self.template.format_infer(
+            {
+                "user_context": inputs["about_me"],
+                "news_context": inputs["context"],
+                "chat_history": inputs["chat_history"],
+                "question": inputs["question"],
+            }
+        )
+
+        start_time = time.time()
+        response = self.hf_pipeline(prompt["prompt"])
+        end_time = time.time()
+        duration_milliseconds = (end_time - start_time) * 1000
+
+        if run_manager:
+            run_manager.on_chain_end(
+                outputs={
+                    "answer": response,
+                },
+                # TODO: Count tokens instead of using len().
+                metadata={
+                    "prompt": prompt["prompt"],
+                    "prompt_template_variables": prompt["payload"],
+                    "prompt_template": self.template.infer_raw_template,
+                    "usage.prompt_tokens": len(prompt["prompt"]),
+                    "usage.total_tokens": len(prompt["prompt"]) + len(response),
+                    "usage.actual_new_tokens": len(response),
+                    "duration_milliseconds": duration_milliseconds,
+                },
+            )
+
+        return {"answer": response}
+
+    def clean(self, inputs: Dict[str, str]) -> Dict[str, str]:
+        """Cleans the inputs by removing extra whitespace and grouping broken paragraphs."""
+
+        for key, input_value in inputs.items():
+            cleaned_input = clean_extra_whitespace(input_value)
+            cleaned_input = group_broken_paragraphs(cleaned_input)
+
+            inputs[key] = cleaned_input
+
+        return inputs
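
To make the flow between the two custom chains easier to follow, here is an illustrative sketch of the payload shapes only (all values are made up, not real data):

# What StatelessMemorySequentialChain receives from the caller:
inputs = {
    "about_me": "I am a retail investor.",        # illustrative
    "question": "Should I buy Tesla shares?",     # illustrative
    "to_load_history": [],                        # list of (human, ai) tuples
}
# ContextExtractorChain adds {"context": "<top-k news summaries>"} to the payload,
# and FinancialBotQAChain finally returns {"answer": "<generated text>"}.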
financial_bot/constants.py ADDED
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+# == Embeddings model ==
+EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+EMBEDDING_MODEL_MAX_INPUT_LENGTH = 384
+
+# == Vector Database ==
+VECTOR_DB_OUTPUT_COLLECTION_NAME = "alpaca_financial_news"
+VECTOR_DB_SEARCH_TOPK = 1
+
+# == LLM Model ==
+LLM_MODEL_ID = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
+LLM_QLORA_CHECKPOINT = "plantbased/mistral-7b-instruct-v0.2-4bit"
+
+LLM_INFERENCE_MAX_NEW_TOKENS = 500
+LLM_INFERENCE_TEMPERATURE = 1.0
+
+
+# == Prompt Template ==
+TEMPLATE_NAME = "mistral"
+
+# === Misc ===
+CACHE_DIR = Path.home() / ".cache" / "hands-on-llms"
financial_bot/embeddings.py ADDED
@@ -0,0 +1,123 @@
+import logging
+import traceback
+from typing import Optional, Union
+
+import numpy as np
+from transformers import AutoModel, AutoTokenizer
+
+from financial_bot import constants
+from financial_bot.base import SingletonMeta
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingModelSingleton(metaclass=SingletonMeta):
+    """
+    A singleton class that provides a pre-trained transformer model for generating embeddings of input text.
+
+    Args:
+        model_id (str): The identifier of the pre-trained transformer model to use.
+        max_input_length (int): The maximum length of input text to tokenize.
+        device (str): The device to use for running the model (e.g. "cpu", "cuda").
+        cache_dir (Optional[Path]): The directory to cache the pre-trained model files.
+            If None, the default cache directory is used.
+
+    Attributes:
+        max_input_length (int): The maximum length of input text to tokenize.
+        tokenizer (AutoTokenizer): The tokenizer used to tokenize input text.
+    """
+
+    def __init__(
+        self,
+        model_id: str = constants.EMBEDDING_MODEL_ID,
+        max_input_length: int = constants.EMBEDDING_MODEL_MAX_INPUT_LENGTH,
+        device: str = "cuda:0",
+        cache_dir: Optional[str] = None,
+    ):
+        """
+        Initializes the EmbeddingModelSingleton instance.
+
+        Args:
+            model_id (str): The identifier of the pre-trained transformer model to use.
+            max_input_length (int): The maximum length of input text to tokenize.
+            device (str): The device to use for running the model (e.g. "cpu", "cuda").
+            cache_dir (Optional[Path]): The directory to cache the pre-trained model files.
+                If None, the default cache directory is used.
+        """
+
+        self._model_id = model_id
+        self._device = device
+        self._max_input_length = max_input_length
+
+        self._tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self._model = AutoModel.from_pretrained(
+            model_id,
+            cache_dir=str(cache_dir) if cache_dir else None,
+        ).to(self._device)
+        self._model.eval()
+
+    @property
+    def max_input_length(self) -> int:
+        """
+        Returns the maximum length of input text to tokenize.
+
+        Returns:
+            int: The maximum length of input text to tokenize.
+        """
+
+        return self._max_input_length
+
+    @property
+    def tokenizer(self) -> AutoTokenizer:
+        """
+        Returns the tokenizer used to tokenize input text.
+
+        Returns:
+            AutoTokenizer: The tokenizer used to tokenize input text.
+        """
+
+        return self._tokenizer
+
+    def __call__(
+        self, input_text: str, to_list: bool = True
+    ) -> Union[np.ndarray, list]:
+        """
+        Generates embeddings for the input text using the pre-trained transformer model.
+
+        Args:
+            input_text (str): The input text to generate embeddings for.
+            to_list (bool): Whether to return the embeddings as a list or numpy array. Defaults to True.
+
+        Returns:
+            Union[np.ndarray, list]: The embeddings generated for the input text.
+        """
+
+        try:
+            tokenized_text = self._tokenizer(
+                input_text,
+                padding=True,
+                truncation=True,
+                return_tensors="pt",
+                max_length=self._max_input_length,
+            ).to(self._device)
+        except Exception:
+            logger.error(traceback.format_exc())
+            logger.error(f"Error tokenizing the following input text: {input_text}")
+
+            return [] if to_list else np.array([])
+
+        try:
+            result = self._model(**tokenized_text)
+        except Exception:
+            logger.error(traceback.format_exc())
+            logger.error(
+                f"Error generating embeddings for the following model_id: {self._model_id} and input text: {input_text}"
+            )
+
+            return [] if to_list else np.array([])
+
+        embeddings = result.last_hidden_state[:, 0, :].cpu().detach().numpy()
+        if to_list:
+            embeddings = embeddings.flatten().tolist()
+
+        return embeddings
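
A hedged usage sketch, run on CPU so no GPU is required (the model weights are fetched from the Hugging Face Hub on first use):

from financial_bot.embeddings import EmbeddingModelSingleton

embedding_model = EmbeddingModelSingleton(device="cpu")
vector = embedding_model("Tesla shares rallied after strong quarterly earnings.")
print(len(vector))  # 384 for sentence-transformers/all-MiniLM-L6-v2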
financial_bot/handlers.py ADDED
@@ -0,0 +1,64 @@
+from typing import Any, Dict, Optional
+
+import comet_llm
+from langchain.callbacks.base import BaseCallbackHandler
+
+from financial_bot import constants
+
+
+class CometLLMMonitoringHandler(BaseCallbackHandler):
+    """
+    A callback handler for monitoring LLM models using Comet.ml.
+
+    Args:
+        project_name (str): The name of the Comet.ml project to log to.
+        llm_model_id (str): The ID of the LLM model to use for inference.
+        llm_qlora_model_id (str): The ID of the PEFT model to use for inference.
+        llm_inference_max_new_tokens (int): The maximum number of new tokens to generate during inference.
+        llm_inference_temperature (float): The temperature to use during inference.
+    """
+
+    def __init__(
+        self,
+        project_name: Optional[str] = None,
+        llm_model_id: str = constants.LLM_MODEL_ID,
+        llm_qlora_model_id: str = constants.LLM_QLORA_CHECKPOINT,
+        llm_inference_max_new_tokens: int = constants.LLM_INFERENCE_MAX_NEW_TOKENS,
+        llm_inference_temperature: float = constants.LLM_INFERENCE_TEMPERATURE,
+    ):
+        self._project_name = project_name
+        self._llm_model_id = llm_model_id
+        self._llm_qlora_model_id = llm_qlora_model_id
+        self._llm_inference_max_new_tokens = llm_inference_max_new_tokens
+        self._llm_inference_temperature = llm_inference_temperature
+
+    def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
+        """
+        A callback function that logs the prompt and output to Comet.ml.
+
+        Args:
+            outputs (Dict[str, Any]): The output of the LLM model.
+            **kwargs (Any): Additional arguments passed to the function.
+        """
+
+        should_log_prompt = "metadata" in kwargs
+        if should_log_prompt:
+            metadata = kwargs["metadata"]
+
+            comet_llm.log_prompt(
+                project=self._project_name,
+                prompt=metadata["prompt"],
+                output=outputs["answer"],
+                prompt_template=metadata["prompt_template"],
+                prompt_template_variables=metadata["prompt_template_variables"],
+                metadata={
+                    "usage.prompt_tokens": metadata["usage.prompt_tokens"],
+                    "usage.total_tokens": metadata["usage.total_tokens"],
+                    "usage.max_new_tokens": self._llm_inference_max_new_tokens,
+                    "usage.temperature": self._llm_inference_temperature,
+                    "usage.actual_new_tokens": metadata["usage.actual_new_tokens"],
+                    "model": self._llm_model_id,
+                    "peft_model": self._llm_qlora_model_id,
+                },
+                duration=metadata["duration_milliseconds"],
+            )
financial_bot/langchain_bot.py ADDED
@@ -0,0 +1,223 @@
+import logging
+import os
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple
+
+from langchain import chains
+from langchain.memory import ConversationBufferWindowMemory
+
+from financial_bot import constants
+from financial_bot.chains import (
+    ContextExtractorChain,
+    FinancialBotQAChain,
+    StatelessMemorySequentialChain,
+)
+from financial_bot.embeddings import EmbeddingModelSingleton
+from financial_bot.handlers import CometLLMMonitoringHandler
+from financial_bot.models import build_huggingface_pipeline
+from financial_bot.qdrant import build_qdrant_client
+from financial_bot.template import get_llm_template
+
+logger = logging.getLogger(__name__)
+
+
+class FinancialBot:
+    """
+    A language chain bot that uses a language model to generate responses to user inputs.
+
+    Args:
+        llm_model_id (str): The ID of the Hugging Face language model to use.
+        llm_qlora_model_id (str): The ID of the Hugging Face QLoRA model to use.
+        llm_template_name (str): The name of the LLM template to use.
+        llm_inference_max_new_tokens (int): The maximum number of new tokens to generate during inference.
+        llm_inference_temperature (float): The temperature to use during inference.
+        vector_collection_name (str): The name of the Qdrant vector collection to use.
+        vector_db_search_topk (int): The number of nearest neighbors to search for in the Qdrant vector database.
+        model_cache_dir (Path): The directory to use for caching the language model and embedding model.
+        streaming (bool): Whether to use the Hugging Face streaming API for inference.
+        embedding_model_device (str): The device to use for the embedding model.
+        debug (bool): Whether to enable debug mode.
+
+    Attributes:
+        finbot_chain (Chain): The language chain that generates responses to user inputs.
+    """
+
+    def __init__(
+        self,
+        llm_model_id: str = constants.LLM_MODEL_ID,
+        llm_qlora_model_id: str = constants.LLM_QLORA_CHECKPOINT,
+        llm_template_name: str = constants.TEMPLATE_NAME,
+        llm_inference_max_new_tokens: int = constants.LLM_INFERENCE_MAX_NEW_TOKENS,
+        llm_inference_temperature: float = constants.LLM_INFERENCE_TEMPERATURE,
+        vector_collection_name: str = constants.VECTOR_DB_OUTPUT_COLLECTION_NAME,
+        vector_db_search_topk: int = constants.VECTOR_DB_SEARCH_TOPK,
+        model_cache_dir: Path = constants.CACHE_DIR,
+        streaming: bool = False,
+        embedding_model_device: str = "cuda:0",
+        debug: bool = False,
+    ):
+        self._llm_model_id = llm_model_id
+        self._llm_qlora_model_id = llm_qlora_model_id
+        self._llm_template_name = llm_template_name
+        self._llm_template = get_llm_template(name=self._llm_template_name)
+        self._llm_inference_max_new_tokens = llm_inference_max_new_tokens
+        self._llm_inference_temperature = llm_inference_temperature
+        self._vector_collection_name = vector_collection_name
+        self._vector_db_search_topk = vector_db_search_topk
+        self._debug = debug
+
+        self._qdrant_client = build_qdrant_client()
+
+        self._embd_model = EmbeddingModelSingleton(
+            cache_dir=model_cache_dir, device=embedding_model_device
+        )
+        self._llm_agent, self._streamer = build_huggingface_pipeline(
+            llm_model_id=llm_model_id,
+            llm_lora_model_id=llm_qlora_model_id,
+            max_new_tokens=llm_inference_max_new_tokens,
+            temperature=llm_inference_temperature,
+            use_streamer=streaming,
+            cache_dir=model_cache_dir,
+            debug=debug,
+        )
+        self.finbot_chain = self.build_chain()
+
+    @property
+    def is_streaming(self) -> bool:
+        return self._streamer is not None
+
+    def build_chain(self) -> chains.SequentialChain:
+        """
+        Constructs and returns a financial bot chain.
+        This chain is designed to take the user description (`about_me`) and a `question` as input, connect to the
+        VectorDB, search for financial news related to the user's question and inject it into the payload that is
+        further passed as a prompt to a fine-tuned financial LLM that provides the answer.
+
+        The chain consists of two primary stages:
+        1. Context Extractor: This stage is responsible for embedding the user's question,
+        which means converting the textual question into a numerical representation.
+        This embedded question is then used to retrieve relevant context from the VectorDB.
+        The output of this chain will be a dict payload.
+
+        2. LLM Generator: Once the context is extracted,
+        this stage uses it to format a full prompt for the LLM and
+        then feeds it to the model to get a response that is relevant to the user's question.
+
+        Returns
+        -------
+        chains.SequentialChain
+            The constructed financial bot chain.
+
+        Notes
+        -----
+        The actual processing flow within the chain can be visualized as:
+        [about: str][question: str] > ContextChain >
+        [about: str][question: str] + [context: str] > FinancialChain >
+        [answer: str]
+        """
+
+        logger.info("Building 1/3 - ContextExtractorChain")
+        context_retrieval_chain = ContextExtractorChain(
+            embedding_model=self._embd_model,
+            vector_store=self._qdrant_client,
+            vector_collection=self._vector_collection_name,
+            top_k=self._vector_db_search_topk,
+        )
+
+        logger.info("Building 2/3 - FinancialBotQAChain")
+        if self._debug:
+            callbacks = []
+        else:
+            try:
+                comet_project_name = os.environ["COMET_PROJECT_NAME"]
+            except KeyError:
+                raise RuntimeError(
+                    "Please set the COMET_PROJECT_NAME environment variable."
+                )
+            callbacks = [
+                CometLLMMonitoringHandler(
+                    project_name=f"{comet_project_name}-monitor-prompts",
+                    llm_model_id=self._llm_model_id,
+                    llm_qlora_model_id=self._llm_qlora_model_id,
+                    llm_inference_max_new_tokens=self._llm_inference_max_new_tokens,
+                    llm_inference_temperature=self._llm_inference_temperature,
+                )
+            ]
+        llm_generator_chain = FinancialBotQAChain(
+            hf_pipeline=self._llm_agent,
+            template=self._llm_template,
+            callbacks=callbacks,
+        )
+
+        logger.info("Building 3/3 - Connecting chains into SequentialChain")
+        seq_chain = StatelessMemorySequentialChain(
+            history_input_key="to_load_history",
+            memory=ConversationBufferWindowMemory(
+                memory_key="chat_history",
+                input_key="question",
+                output_key="answer",
+                k=3,
+            ),
+            chains=[context_retrieval_chain, llm_generator_chain],
+            input_variables=["about_me", "question", "to_load_history"],
+            output_variables=["answer"],
+            verbose=True,
+        )
+
+        logger.info("Done building SequentialChain.")
+        logger.info("Workflow:")
+        logger.info(
+            """
+            [about: str][question: str] > ContextChain >
+            [about: str][question: str] + [context: str] > FinancialChain >
+            [answer: str]
+            """
+        )
+
+        return seq_chain
+
+    def answer(
+        self,
+        about_me: str,
+        question: str,
+        to_load_history: Optional[List[Tuple[str, str]]] = None,
+    ) -> str:
+        """
+        Given a short description about the user and a question, make the LLM
+        generate a response.
+
+        Parameters
+        ----------
+        about_me : str
+            Short user description.
+        question : str
+            User question.
+
+        Returns
+        -------
+        str
+            LLM generated response.
+        """
+
+        inputs = {
+            "about_me": about_me,
+            "question": question,
+            "to_load_history": to_load_history if to_load_history else [],
+        }
+        response = self.finbot_chain.run(inputs)
+
+        return response
+
+    def stream_answer(self) -> Iterable[str]:
+        """Stream the answer from the LLM token by token after calling `answer()`."""
+
+        assert (
+            self.is_streaming
+        ), "Stream answer not available. Build the bot with `streaming=True`."
+
+        partial_answer = ""
+        for new_token in self._streamer:
+            if new_token != self._llm_template.eos:
+                partial_answer += new_token
+
+                yield partial_answer
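
An end-to-end usage sketch under stated assumptions: QDRANT_URL and QDRANT_API_KEY are set in the environment, `debug=True` swaps in the mocked text-generation pipeline (so no GPU model is loaded), and the embedding model is forced onto the CPU:

from financial_bot import initialize
from financial_bot.langchain_bot import FinancialBot

initialize()  # loads the logging config and the .env file
bot = FinancialBot(debug=True, embedding_model_device="cpu")
answer = bot.answer(
    about_me="I am a student with a small budget.",       # illustrative
    question="Is now a good time to buy index funds?",    # illustrative
)
print(answer)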
financial_bot/models.py ADDED
@@ -0,0 +1,264 @@
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import torch
+from comet_ml import API
+from langchain.llms import HuggingFacePipeline
+from peft import LoraConfig, PeftConfig, PeftModel
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+    pipeline,
+)
+
+from financial_bot import constants
+from financial_bot.utils import MockedPipeline
+
+logger = logging.getLogger(__name__)
+
+
+def download_from_model_registry(
+    model_id: str, cache_dir: Optional[Path] = None
+) -> Path:
+    """
+    Downloads a model from the Comet ML model registry.
+
+    Args:
+        model_id (str): The ID of the model to download, in the format "workspace/model_name:version".
+        cache_dir (Optional[Path]): The directory to cache the downloaded model in. Defaults to the value of
+            `constants.CACHE_DIR`.
+
+    Returns:
+        Path: The path to the downloaded model directory.
+    """
+
+    if cache_dir is None:
+        cache_dir = constants.CACHE_DIR
+    output_folder = cache_dir / "models" / model_id
+
+    already_downloaded = output_folder.exists()
+    if not already_downloaded:
+        workspace, model_id = model_id.split("/")
+        model_name, version = model_id.split(":")
+
+        api = API()
+        model = api.get_model(workspace=workspace, model_name=model_name)
+        model.download(version=version, output_folder=output_folder, expand=True)
+    else:
+        logger.info(f"Model {model_id=} already downloaded to: {output_folder}")
+
+    subdirs = [d for d in output_folder.iterdir() if d.is_dir()]
+    if len(subdirs) == 1:
+        model_dir = subdirs[0]
+    else:
+        raise RuntimeError(
+            f"There should be only one directory inside the model folder. \
+            Check the downloaded model at: {output_folder}"
+        )
+
+    logger.info(f"Model {model_id=} downloaded from the registry to: {model_dir}")
+
+    return model_dir
+
+
+class StopOnTokens(StoppingCriteria):
+    """
+    A stopping criteria that stops generation when a specific token is generated.
+
+    Args:
+        stop_ids (List[int]): A list of token ids that will trigger the stopping criteria.
+    """
+
+    def __init__(self, stop_ids: List[int]):
+        super().__init__()
+
+        self._stop_ids = stop_ids
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        """
+        Check if the last generated token is in the stop_ids list.
+
+        Args:
+            input_ids (torch.LongTensor): The input token ids.
+            scores (torch.FloatTensor): The scores of the generated tokens.
+
+        Returns:
+            bool: True if the last generated token is in the stop_ids list, False otherwise.
+        """
+
+        for stop_id in self._stop_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+
+        return False
+
+
+def build_huggingface_pipeline(
+    llm_model_id: str,
+    llm_lora_model_id: str,
+    max_new_tokens: int = constants.LLM_INFERENCE_MAX_NEW_TOKENS,
+    temperature: float = constants.LLM_INFERENCE_TEMPERATURE,
+    gradient_checkpointing: bool = False,
+    use_streamer: bool = False,
+    cache_dir: Optional[Path] = None,
+    debug: bool = False,
+) -> Tuple[HuggingFacePipeline, Optional[TextIteratorStreamer]]:
+    """
+    Builds a HuggingFace pipeline for text generation using a custom LLM + fine-tuned checkpoint.
+
+    Args:
+        llm_model_id (str): The ID or path of the LLM model.
+        llm_lora_model_id (str): The ID or path of the LLM LoRA model.
+        max_new_tokens (int, optional): The maximum number of new tokens to generate. Defaults to `constants.LLM_INFERENCE_MAX_NEW_TOKENS`.
+        temperature (float, optional): The temperature to use for sampling. Defaults to `constants.LLM_INFERENCE_TEMPERATURE`.
+        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
+        use_streamer (bool, optional): Whether to use a text iterator streamer. Defaults to False.
+        cache_dir (Optional[Path], optional): The directory to use for caching. Defaults to None.
+        debug (bool, optional): Whether to use a mocked pipeline for debugging. Defaults to False.
+
+    Returns:
+        Tuple[HuggingFacePipeline, Optional[TextIteratorStreamer]]: A tuple containing the HuggingFace pipeline
+            and the text iterator streamer (if used).
+    """
+
+    if debug is True:
+        return (
+            HuggingFacePipeline(
+                pipeline=MockedPipeline(f=lambda _: "You are doing great!")
+            ),
+            None,
+        )
+
+    model, tokenizer, _ = build_qlora_model(
+        pretrained_model_name_or_path=llm_model_id,
+        peft_pretrained_model_name_or_path=llm_lora_model_id,
+        gradient_checkpointing=gradient_checkpointing,
+        cache_dir=cache_dir,
+    )
+    model.eval()
+
+    if use_streamer:
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+        )
+        stop_on_tokens = StopOnTokens(stop_ids=[tokenizer.eos_token_id])
+        stopping_criteria = StoppingCriteriaList([stop_on_tokens])
+    else:
+        streamer = None
+        stopping_criteria = StoppingCriteriaList([])
+
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        streamer=streamer,
+        stopping_criteria=stopping_criteria,
+    )
+    hf = HuggingFacePipeline(pipeline=pipe)
+
+    return hf, streamer
+
+
+def build_qlora_model(
+    pretrained_model_name_or_path: str = "tiiuae/falcon-7b-instruct",
+    peft_pretrained_model_name_or_path: Optional[str] = None,
+    gradient_checkpointing: bool = True,
+    cache_dir: Optional[Path] = None,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer, PeftConfig]:
+    """
+    Function that builds a QLoRA LLM model based on the given HuggingFace name:
+    1. Create and prepare the bitsandbytes configuration for QLoRA's quantization
+    2. Download, load, and quantize on-the-fly the given base model (default: Falcon-7B-Instruct)
+    3. Create and prepare the LoRA configuration
+    4. Load and configure the base model's tokenizer
+
+    Args:
+        pretrained_model_name_or_path (str): The name or path of the pretrained model to use.
+        peft_pretrained_model_name_or_path (Optional[str]): The name or path of the PEFT pretrained model to use.
+        gradient_checkpointing (bool): Whether to use gradient checkpointing or not.
+        cache_dir (Optional[Path]): The directory to cache the downloaded models.
+
+    Returns:
+        Tuple[AutoModelForCausalLM, AutoTokenizer, PeftConfig]:
+            A tuple containing the QLoRA LLM model, tokenizer, and PEFT config.
+    """
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+
+    model = AutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path,
+        revision="main",
+        quantization_config=bnb_config,
+        load_in_4bit=True,
+        device_map="auto",
+        trust_remote_code=False,
+        cache_dir=str(cache_dir) if cache_dir else None,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path,
+        trust_remote_code=False,
+        truncation=True,
+        cache_dir=str(cache_dir) if cache_dir else None,
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+        with torch.no_grad():
+            model.resize_token_embeddings(len(tokenizer))
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+    if peft_pretrained_model_name_or_path:
+        is_model_name = not os.path.isdir(peft_pretrained_model_name_or_path)
+        if is_model_name:
+            logger.info(
+                f"Downloading {peft_pretrained_model_name_or_path} from the Comet ML model registry:"
+            )
+            peft_pretrained_model_name_or_path = download_from_model_registry(
+                model_id=peft_pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+            )
+
+        logger.info(f"Loading LoRA config from: {peft_pretrained_model_name_or_path}")
+        lora_config = LoraConfig.from_pretrained(peft_pretrained_model_name_or_path)
+        assert (
+            lora_config.base_model_name_or_path == pretrained_model_name_or_path
+        ), f"LoRA model trained on a different base model than the one requested: \
+        {lora_config.base_model_name_or_path} != {pretrained_model_name_or_path}"
+
+        logger.info(f"Loading PEFT model from: {peft_pretrained_model_name_or_path}")
+        model = PeftModel.from_pretrained(model, peft_pretrained_model_name_or_path)
+    else:
+        lora_config = LoraConfig(
+            lora_alpha=16,
+            lora_dropout=0.1,
+            r=64,
+            bias="none",
+            task_type="CAUSAL_LM",
+            target_modules=["query_key_value"],
+        )
+
+    if gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+        model.config.use_cache = (
+            False  # Gradient checkpointing is not compatible with caching.
+        )
+    else:
+        model.gradient_checkpointing_disable()
+        model.config.use_cache = True  # It is good practice to enable caching when using the model for inference.
+
+    return model, tokenizer, lora_config
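
A sketch of the debug path only, which skips the quantized model entirely and wires in the mocked pipeline (the model IDs below are placeholders and are ignored on this branch):

from financial_bot.models import build_huggingface_pipeline

hf_pipeline, streamer = build_huggingface_pipeline(
    llm_model_id="ignored-in-debug",
    llm_lora_model_id="ignored-in-debug",
    debug=True,
)
assert streamer is None
print(hf_pipeline("How is the market doing?"))  # -> "You are doing great!"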
financial_bot/qdrant.py ADDED
@@ -0,0 +1,49 @@
+import logging
+import os
+from typing import Optional
+
+import qdrant_client
+
+logger = logging.getLogger(__name__)
+
+
+def build_qdrant_client(
+    url: Optional[str] = None,
+    api_key: Optional[str] = None,
+):
+    """
+    Builds a Qdrant client object using the provided URL and API key.
+
+    Args:
+        url (Optional[str]): The URL of the Qdrant server. If not provided, the function will attempt
+            to read it from the QDRANT_URL environment variable.
+        api_key (Optional[str]): The API key to use for authentication. If not provided, the function will attempt
+            to read it from the QDRANT_API_KEY environment variable.
+
+    Raises:
+        KeyError: If the URL or API key is not provided and cannot be read from the environment variables.
+
+    Returns:
+        qdrant_client.QdrantClient: A Qdrant client object.
+    """
+
+    logger.info("Building QDrant Client")
+    if url is None:
+        try:
+            url = os.environ["QDRANT_URL"]
+        except KeyError:
+            raise KeyError(
+                "QDRANT_URL must be set as environment variable or manually passed as an argument."
+            )
+
+    if api_key is None:
+        try:
+            api_key = os.environ["QDRANT_API_KEY"]
+        except KeyError:
+            raise KeyError(
+                "QDRANT_API_KEY must be set as environment variable or manually passed as an argument."
+            )
+
+    client = qdrant_client.QdrantClient(url, api_key=api_key)
+
+    return client
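
A usage sketch; the URL and key below are placeholders, not real credentials:

import os

from financial_bot.qdrant import build_qdrant_client

os.environ.setdefault("QDRANT_URL", "https://your-cluster.cloud.qdrant.io:6333")
os.environ.setdefault("QDRANT_API_KEY", "your-api-key")

client = build_qdrant_client()  # reads both values from the environment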
financial_bot/template.py ADDED
@@ -0,0 +1,132 @@
+"""
+This script defines a PromptTemplate class that assists in generating
+conversation/prompt templates. The script facilitates formatting prompts
+for inference and training by combining various context elements and user inputs.
+"""
+
+
+import dataclasses
+from typing import Dict, List, Union
+
+
+@dataclasses.dataclass
+class PromptTemplate:
+    """A class that manages prompt templates"""
+
+    # The name of this template
+    name: str
+    # The template of the system prompt
+    system_template: str = "{system_message}"
+    # The template for the system context
+    context_template: str = "{user_context}\n{news_context}"
+    # The template for the conversation history
+    chat_history_template: str = "{chat_history}"
+    # The template of the user question
+    question_template: str = "{question}"
+    # The template of the system answer
+    answer_template: str = "{answer}"
+    # The system message
+    system_message: str = ""
+    # Separator
+    sep: str = "\n"
+    eos: str = "</s>"
+
+    @property
+    def input_variables(self) -> List[str]:
+        """Returns a list of input variables for the prompt template"""
+
+        return ["user_context", "news_context", "chat_history", "question", "answer"]
+
+    @property
+    def train_raw_template(self):
+        """Returns the training prompt template format"""
+
+        system = self.system_template.format(system_message=self.system_message)
+        context = f"{self.sep}{self.context_template}"
+        chat_history = f"{self.sep}{self.chat_history_template}"
+        question = f"{self.sep}{self.question_template}"
+        answer = f"{self.sep}{self.answer_template}"
+
+        return f"{system}{context}{chat_history}{question}{answer}{self.eos}"
+
+    @property
+    def infer_raw_template(self):
+        """Returns the inference prompt template format"""
+
+        system = self.system_template.format(system_message=self.system_message)
+        context = f"{self.sep}{self.context_template}"
+        chat_history = f"{self.sep}{self.chat_history_template}"
+        question = f"{self.sep}{self.question_template}"
+
+        return f"{system}{context}{chat_history}{question}{self.eos}"
+
+    def format_train(self, sample: Dict[str, str]) -> Dict[str, Union[str, Dict]]:
+        """Formats the data sample to a training sample"""
+
+        prompt = self.train_raw_template.format(
+            user_context=sample["user_context"],
+            news_context=sample["news_context"],
+            chat_history=sample.get("chat_history", ""),
+            question=sample["question"],
+            answer=sample["answer"],
+        )
+        return {"prompt": prompt, "payload": sample}
+
+    def format_infer(self, sample: Dict[str, str]) -> Dict[str, Union[str, Dict]]:
+        """Formats the data sample to an inference sample"""
+
+        prompt = self.infer_raw_template.format(
+            user_context=sample["user_context"],
+            news_context=sample["news_context"],
+            chat_history=sample.get("chat_history", ""),
+            question=sample["question"],
+        )
+        return {"prompt": prompt, "payload": sample}
+
+
+# Global Templates registry
+templates: Dict[str, PromptTemplate] = {}
+
+
+def register_llm_template(template: PromptTemplate):
+    """Register a new template to the global templates registry"""
+
+    templates[template.name] = template
+
+
+def get_llm_template(name: str) -> PromptTemplate:
+    """Returns the template assigned to the given name"""
+
+    return templates[name]
+
+
+##### Register Templates #####
+# - Mistral 7B Instruct v0.2 Template
+register_llm_template(
+    PromptTemplate(
+        name="mistral",
+        system_template="<s>{system_message}",
+        system_message="You are a helpful assistant, with financial expertise.",
+        context_template="{user_context}\n{news_context}",
+        chat_history_template="Summary: {chat_history}",
+        question_template="[INST] {question} [/INST]",
+        answer_template="{answer}",
+        sep="\n",
+        eos=" </s>",
+    )
+)
+
+# - FALCON (spec: https://huggingface.co/tiiuae/falcon-7b/blob/main/tokenizer.json)
+register_llm_template(
+    PromptTemplate(
+        name="falcon",
+        system_template=">>INTRODUCTION<< {system_message}",
+        system_message="You are a helpful assistant, with financial expertise.",
+        context_template=">>DOMAIN<< {user_context}\n{news_context}",
+        chat_history_template=">>SUMMARY<< {chat_history}",
+        question_template=">>QUESTION<< {question}",
+        answer_template=">>ANSWER<< {answer}",
+        sep="\n",
+        eos="<|endoftext|>",
+    )
+)
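
A quick sketch of how the registered "mistral" template is used to build an inference prompt (the sample values are illustrative):

from financial_bot.template import get_llm_template

template = get_llm_template("mistral")
result = template.format_infer(
    {
        "user_context": "I am a long-term investor.",
        "news_context": "Tesla beat earnings expectations this quarter.",
        "chat_history": "",
        "question": "Should I hold my Tesla shares?",
    }
)
print(result["prompt"])  # system message, context, history summary and "[INST] ... [/INST]"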
financial_bot/utils.py ADDED
@@ -0,0 +1,106 @@
+import logging
+import os
+import subprocess
+from typing import Callable, Dict, List
+
+import psutil
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def log_available_gpu_memory():
+    """
+    Logs the available GPU memory for each available GPU device.
+
+    If no GPUs are available, logs "No GPUs available".
+
+    Returns:
+        None
+    """
+
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            memory_info = subprocess.check_output(
+                f"nvidia-smi -i {i} --query-gpu=memory.free --format=csv,nounits,noheader",
+                shell=True,
+            )
+            memory_info = str(memory_info).split("\\")[0][2:]
+
+            logger.info(f"GPU {i} memory available: {memory_info} MiB")
+    else:
+        logger.info("No GPUs available")
+
+
+def log_available_ram():
+    """
+    Logs the amount of available RAM in gigabytes.
+
+    Returns:
+        None
+    """
+
+    memory_info = psutil.virtual_memory()
+
+    # Convert bytes to GB.
+    logger.info(f"Available RAM: {memory_info.available / (1024.0 ** 3):.2f} GB")
+
+
+def log_files_and_subdirs(directory_path: str):
+    """
+    Logs all files and subdirectories in the specified directory.
+
+    Args:
+        directory_path (str): The path to the directory to log.
+
+    Returns:
+        None
+    """
+
+    # Check if the directory exists.
+    if os.path.exists(directory_path) and os.path.isdir(directory_path):
+        for dirpath, dirnames, filenames in os.walk(directory_path):
+            logger.info(f"Directory: {dirpath}")
+            for filename in filenames:
+                logger.info(f"File: {os.path.join(dirpath, filename)}")
+            for dirname in dirnames:
+                logger.info(f"Sub-directory: {os.path.join(dirpath, dirname)}")
+    else:
+        logger.info(f"The directory '{directory_path}' does not exist")
+
+
+class MockedPipeline:
+    """
+    A mocked pipeline class that is used as a replacement for the HF pipeline class.
+
+    Attributes:
+    -----------
+    task : str
+        The task of the pipeline, which is text-generation.
+    f : Callable[[str], str]
+        A function that takes a prompt string as input and returns a generated text string.
+    """
+
+    task: str = "text-generation"
+
+    def __init__(self, f: Callable[[str], str]):
+        self.f = f
+
+    def __call__(self, prompt: str) -> List[Dict[str, str]]:
+        """
+        Calls the pipeline with a given prompt and returns a list of generated text.
+
+        Parameters:
+        -----------
+        prompt : str
+            The prompt string to generate text from.
+
+        Returns:
+        --------
+        List[Dict[str, str]]
+            A list of dictionaries, where each dictionary contains a generated_text key with the generated text string.
+        """
+
+        result = self.f(prompt)
+
+        return [{"generated_text": f"{prompt}{result}"}]
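
A small sketch of the mocked pipeline that `build_huggingface_pipeline(..., debug=True)` wraps; the lambda is illustrative:

from financial_bot.utils import MockedPipeline

pipe = MockedPipeline(f=lambda prompt: " ...a mocked completion.")
print(pipe("The market today:"))
# -> [{'generated_text': 'The market today: ...a mocked completion.'}]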