import spaces
import gradio as gr
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from pydantic import BaseModel
PERSIST_DIR = './storage'
# Run on the GPU when available, otherwise fall back to CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pydantic config to avoid protected namespace warning
class Config(BaseModel):
    model_config = {'protected_namespaces': ()}
# @spaces.GPU(duration=240)
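# Register the embedding model and the chat LLM on LlamaIndex's global Settings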
def setup():
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device=DEVICE)
    Settings.llm = HuggingFaceLLM(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        context_window=2048,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        device_map="auto",
    )
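
# Configure the models once at import time so they are ready before the first request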
setup()
# Load the existing index
# @spaces.GPU
def load_context():
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    query_engine = index.as_query_engine()
    return query_engine
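
# Module-level query engine, built once and reused across chat requests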
query_engine = None
def initialize_query_engine():
    global query_engine
    query_engine = load_context()
# Initialize query engine at the start
initialize_query_engine()
# Chatbot response function
@spaces.GPU
def chatbot_response(message, history):
    # Re-initialize lazily in case the engine was not set up in this worker
    if query_engine is None:
        initialize_query_engine()
    response = query_engine.query(message)
    return str(response)
# Initialize Gradio interface
iface = gr.ChatInterface(
    fn=chatbot_response,
    title="UESP Lore Chatbot: CPU-bound version running TinyLlama-1.1B-Chat",
    description=(
        "A low-quality and extremely slow version of the chatbots you can find on the GitHub page: "
        "https://github.com/emarron/UESP-lore. I am not paying to have Llama3 on here."
    ),
    examples=["Who is Zaraphus?"],
    cache_examples=True,
)
if __name__ == "__main__":
    iface.launch()