import spaces
import gradio as gr
import torch
from llama_index.core import (
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

PERSIST_DIR = "./storage"

# Configure the global llama_index settings.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # currently unused; the LLM places itself via device_map

# Embeddings: OpenAI by default; the HuggingFace alternative is kept for local runs.
# Whichever is active must match the model the persisted index was built with.
Settings.embed_model = OpenAIEmbedding()
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")

Settings.llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    device_map="auto",
)

# Load the persisted vector index from ./storage.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

# Rerank retrieved nodes with a cross-encoder, keeping the best 5.
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-large", top_n=5)

# Retrieve a candidate pool larger than the reranker's top_n so reranking has
# an effect (the original similarity_top_k=1 gave it a single candidate).
query_engine = index.as_query_engine(
    streaming=True,
    similarity_top_k=10,
    node_postprocessors=[rerank],
)


@spaces.GPU
def chatbot_response(message, history):
    # str() drains the streaming response into the full answer text; iterate
    # over response.response_gen instead for token-by-token output.
    response = query_engine.query(message)
    return str(response)


iface = gr.ChatInterface(
    fn=chatbot_response,
    title="UESP Lore Chatbot: running on top of Meta-Llama-3-8B-Instruct (currently). It works 'okay'.",
    description="GitHub page for use case, general information, local installs, etc.: https://github.com/emarron/UESP-lore",
    examples=[
        "Who is Zaraphus?",
        "What is the relation between the Dragon Break and CHIM?",
        "What is the Lunar Lorkhan?",
    ],
    cache_examples=True,
)

if __name__ == "__main__":
    iface.launch()
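
# ---------------------------------------------------------------------------
# Index-building sketch (an assumption, not part of the running app): the app
# expects a persisted index in ./storage. If reproducing this Space locally,
# something like the following could build one. The "./documents" folder of
# UESP lore text is a hypothetical name, and Settings.embed_model must be set
# to the same embedding model before both building and loading the index.
#
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
#
# documents = SimpleDirectoryReader("./documents").load_data()
# index = VectorStoreIndex.from_documents(documents)  # embeds via Settings.embed_model
# index.storage_context.persist(persist_dir=PERSIST_DIR)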