import spaces
import gradio as gr
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from pydantic import BaseModel

PERSIST_DIR = './storage'

# Configure the settings
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pydantic config to avoid protected namespace warning
class Config(BaseModel):
    model_config = {'protected_namespaces': ()}


# @spaces.GPU(duration=240)
def setup():
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device=DEVICE)
    Settings.llm = HuggingFaceLLM(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        context_window=2048,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        device_map="auto",
    )


setup()


# Load the existing index
# @spaces.GPU
def load_context():
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    query_engine = index.as_query_engine()
    return query_engine


query_engine = None


def initialize_query_engine():
    global query_engine
    query_engine = load_context()


# Initialize query engine at the start
initialize_query_engine()


# Chatbot response function
@spaces.GPU
def chatbot_response(message, history):
    if query_engine is None:
        initialize_query_engine()
    response = query_engine.query(message)
    return str(response)


# Initialize Gradio interface
iface = gr.ChatInterface(
    fn=chatbot_response,
    title="UESP Lore Chatbot: CPU bound version of Phi-3-mini",
    description=(
        "Low quality and extremely slow version of the ones you can find on the github page: "
        "https://github.com/emarron/UESP-lore. I am not paying to have Llama3 on here."
    ),
    examples=["Who is Zaraphus?"],
    cache_examples=True,
)

if __name__ == "__main__":
    iface.launch()