# -*- coding: utf-8 -*-
import torch

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

# Load the documents to index from the ./data directory
documents = SimpleDirectoryReader("./data").load_data()

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url="https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf",
    # Optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Set the context window below the model's maximum to leave some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    # (-1 offloads all layers)
    model_kwargs={"n_gpu_layers": -1},
    # Transform inputs into the Llama-2 prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

from llama_index.embeddings import HuggingFaceEmbedding

# embed_model = HuggingFaceEmbedding()  # loads BAAI/bge-small-en by default
# Initialize our custom embedding model (BAAI/bge-small-en-v1.5)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

service_context = ServiceContext.from_defaults(
    chunk_size=512, llm=llm, embed_model=embed_model
)

"""Advanced RAG with a cross-encoder reranker.

Referred from:
https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy
"""

from llama_index.indices.postprocessor import SentenceTransformerRerank

# Initialize the reranker; it retrieves the top 3 chunks after re-scoring
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3
)

# Create a baseline query engine (without the reranker)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()


def predict(input, history):
    response = query_engine.query(input)
    return str(response)


# Create a query engine with the cross-encoder reranker.
# Note: we first select the top 10 chunks by similarity, then the reranker keeps the best 3.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])


def predict(input, history):
    response = query_engine.query(input)
    return str(response)


import time

import gradio as gr


# This final definition of predict (used by the chat UI below) also reports latency.
def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken

    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)


# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)
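
"""Optional sanity check (not part of the original notebook): a minimal sketch, run
separately or before the Gradio launch, that builds both engines from the same index
and compares their answers on one sample question, so the effect of the cross-encoder
reranker is visible. The question string is a hypothetical placeholder."""

# Assumption: `index` and `rerank` are already defined as above.
baseline_engine = index.as_query_engine()  # retrieval only, no reranking
reranked_engine = index.as_query_engine(
    similarity_top_k=10, node_postprocessors=[rerank]
)

sample_question = "What are the key topics covered in these documents?"  # hypothetical
print("Baseline:", baseline_engine.query(sample_question))
print("Reranked:", reranked_engine.query(sample_question))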