# -*- coding: utf-8 -*-
"""lastbot.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1opFBOe_TDrO_j2naogZ3uT8KGTa-fdLT
"""

#!pip install pypdf
## Embedding
#!pip install sentence_transformers
#!pip install llama_index

"""Import the key LlamaIndex components: VectorStoreIndex, SimpleDirectoryReader, and ServiceContext, along with the HuggingFaceLLM wrapper and SimpleInputPrompt."""

import llama_index
import huggingface_hub
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt

#!pip uninstall langchain llama_index -y
#!pip install langchain==0.0.150 llama_index==0.5.1

# Load the source PDF. SimpleDirectoryReader expects a directory by default,
# so a single file is passed through input_files.
documents = SimpleDirectoryReader(
    input_files=["/content/sample_data/data/insurance-2030-the-impact-of-ai-on-the-future-of-insurance-f.pdf"]
).load_data()

documents

"""Next, build a system prompt template. Llama 2 needs this to control the context and to manage the response."""

system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
"""

## Default prompt format supported by Llama 2
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

"""Log in to Hugging Face with an access token. This is required to authenticate with the Hugging Face API and download the Llama 2 weights."""

!huggingface-cli login

"""Next, we load the Llama 2 model, using the torch framework."""

import torch

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    # reduces GPU memory usage via half precision and 8-bit loading
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

"""llm = HuggingFaceLLM(...) creates the LLM object:

- context_window=4096: the maximum number of tokens of context the model considers when generating a response.
- max_new_tokens=256: limits each response to at most 256 newly generated tokens.
- generate_kwargs={"temperature": 0.0, "do_sample": False}: disables sampling, so the model decodes greedily and produces deterministic, predictable answers.
- system_prompt, query_wrapper_prompt: templates that guide how the model interprets the context and formats its responses.
- tokenizer_name, model_name: the Llama 2 chat model and tokenizer to pull from the Hugging Face Hub.
- device_map="auto": places the model on a GPU if one is available, otherwise on the CPU.
- model_kwargs: reduces GPU (CUDA) memory usage by loading the weights in half precision (torch.float16) and in 8-bit format.

It's time to build the embeddings using LlamaIndex.
"""

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext
from llama_index.embeddings import LangchainEmbedding

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

"""from langchain.embeddings.huggingface import HuggingFaceEmbeddings imports the HuggingFaceEmbeddings class from the LangChain library, which lets you use pre-trained models from the Hugging Face Hub to embed text data.

from llama_index import ServiceContext imports the ServiceContext class from LlamaIndex, which manages and coordinates the different components of the application.

from llama_index.embeddings import LangchainEmbedding imports the LangchainEmbedding class from LlamaIndex, a wrapper for using LangChain embedding models within the LlamaIndex framework.

Embedding model configuration: embed_model = LangchainEmbedding(...) creates an instance of the LangchainEmbedding class. The nested HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") specifies the Hugging Face embedding model to use; all-mpnet-base-v2 from sentence-transformers is known for strong performance on sentence-embedding tasks.
"""
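"""As an optional sanity check (an assumed snippet, not part of the original notebook), we can embed a sample sentence directly and inspect the vector size; all-mpnet-base-v2 returns 768-dimensional embeddings."""

# Hypothetical check: embed one sentence through the LlamaIndex wrapper and
# confirm the embedding dimensionality before building the index.
sample_vector = embed_model.get_text_embedding("Insurance will change by 2030.")
print(len(sample_vector))  # expected: 768 for all-mpnet-base-v2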
# Configure the service context
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model,
)

"""Set up the vector store. This line transforms the text documents into numerical representations (embeddings) using the embedding model and stores them in the index object, which lets the RAG application efficiently find the documents relevant to a user's query during conversation."""

index = VectorStoreIndex.from_documents(documents, service_context=service_context)

"""index.as_query_engine() creates a "query engine" object called query_engine from the existing index. This engine lets us easily search and retrieve information from the dataset we built earlier.

We can use query_engine to ask questions about the data stored in the index. It uses the stored embeddings to find the relevant documents and retrieve information from them.
"""

query_engine = index.as_query_engine()

"""Time to test the RAG application."""

response = query_engine.query("How will insurance look in 2030?")
print(response)

"""Second test."""
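"""A second query is sketched below as a hypothetical example (the question text is assumed, not from the original notebook); any question grounded in the indexed PDF should work."""

# Second test query against the same index.
response = query_engine.query("What role will AI play in claims processing by 2030?")
print(response)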