File size: 2,212 Bytes
3d6d112
fa83d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c00107f
fac3e95
2253763
fa83d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dde5236
fa83d85
 
 
 
 
 
 
 
 
 
dde5236
fa83d85
 
fb333f9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

#https://docs.google.com/document/d/1hY5ItC8Mewyk-90Q--CGr50wBbZBjPrkYu4NtiBVre4/edit?usp=sharing
# Note: inference takes 6-7 minutes per query.
import logging
import sys
import gradio as gr
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Set up logging
# basicConfig(stream=sys.stdout) already attaches a stdout StreamHandler to the
# root logger; adding a second stdout handler by hand would emit every record
# twice, so only basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

def configure_llama_model():
    """Create the LlamaCPP language model used by the query engine.

    The model is the StableLM Zephyr 3B GGUF file (Q4_K_M quantization)
    referenced by URL below.

    Returns:
        The configured ``LlamaCPP`` instance.
    """
    # Alternative GGUF model, kept for reference:
    # https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf
    gguf_url = 'https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_K_M.gguf'

    llama_settings = {
        "model_url": gguf_url,
        "temperature": 0.3,
        "max_new_tokens": 256,
        "context_window": 3900,
        # presumably -1 offloads all layers to the GPU -- confirm against
        # the llama-cpp-python documentation
        "model_kwargs": {"n_gpu_layers": -1},
        # NOTE(review): these formatters come from llama_index's llama_utils;
        # confirm their prompt format suits the StableLM Zephyr model.
        "messages_to_prompt": messages_to_prompt,
        "completion_to_prompt": completion_to_prompt,
        "verbose": True,
    }
    return LlamaCPP(**llama_settings)

def configure_embeddings():
    """Create the embedding model used to vectorize documents and queries.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for the "all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def configure_service_context(llm, embed_model):
    """Bundle the LLM and embedding model into a ServiceContext.

    Args:
        llm: Language model to hand to ``ServiceContext.from_defaults``.
        embed_model: Embedding model to hand to ``ServiceContext.from_defaults``.

    Returns:
        The ``ServiceContext`` created with a chunk size of 250.
    """
    context = ServiceContext.from_defaults(
        chunk_size=250,
        llm=llm,
        embed_model=embed_model,
    )
    return context

def initialize_vector_store_index(data_path, service_context):
    """Load documents from ``data_path`` and build a vector index over them.

    Args:
        data_path: Directory handed to ``SimpleDirectoryReader`` as the
            location of the documents to load.
        service_context: Passed through to
            ``VectorStoreIndex.from_documents`` as ``service_context``.

    Returns:
        The ``VectorStoreIndex`` built from the loaded documents.
    """
    # Read from the caller-supplied directory rather than a hard-coded "./",
    # so the data_path argument actually takes effect.
    documents = SimpleDirectoryReader(data_path).load_data()
    return VectorStoreIndex.from_documents(documents, service_context=service_context)

# Configure and initialize components
# These statements run at module import time, in dependency order: the LLM and
# embedding model feed the service context, which feeds the index, which
# produces the query engine that get_response() uses.

llm = configure_llama_model()
embed_model = configure_embeddings()
service_context = configure_service_context(llm, embed_model)
# "./" is passed as the data_path argument (the current working directory).
index = initialize_vector_store_index("./", service_context)
# Module-level query engine; get_response() reads this global.
query_engine = index.as_query_engine()

# Define a function for Gradio to use
def get_response(text, username):
    """Answer a chat message by querying the module-level query engine.

    Args:
        text: The message to send to ``query_engine.query``.
        username: Accepted but unused.
            NOTE(review): gr.ChatInterface appears to pass the chat history
            as the second positional argument, so this slot likely receives
            the history rather than a user name -- confirm against the
            Gradio documentation.

    Returns:
        The query result converted to ``str``.
    """
    answer = query_engine.query(text)
    return str(answer)


# Wrap get_response in a Gradio chat interface and start serving it.
chat_ui = gr.ChatInterface(get_response)
chat_ui.launch(debug=True, share=True)