# https://docs.google.com/document/d/1hY5ItC8Mewyk-90Q--CGr50wBbZBjPrkYu4NtiBVre4/edit?usp=sharing
# Inference takes 6-7 mins per query
import logging
import sys

import gradio as gr
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# NOTE(review): the three langchain_community imports below are not referenced
# anywhere in this file; kept in case something outside this file relies on
# them. Prefer dropping the wildcard import once confirmed unused.
from langchain_community.llms import Cohere
from langchain_community.llms import FakeListLLM
from langchain_community.llms import *

# Set up logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): basicConfig() above already attaches a stdout handler to the
# root logger, so this second stdout handler makes every record print twice.
# Confirm whether that is intended before removing it.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


def configure_llama_model():
    """Build the LlamaCPP LLM used to answer queries.

    Returns:
        A ``LlamaCPP`` instance configured with the StableLM Zephyr 3B
        (Q4_0 GGUF) model URL and the generation settings below.
    """
    model_url = 'https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_0.gguf'
    llm = LlamaCPP(
        model_url=model_url,
        temperature=0.3,
        max_new_tokens=256,
        context_window=3900,
        model_kwargs={"n_gpu_layers": 1},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    return llm


def configure_embeddings():
    """Return the HuggingFace embedding model (``all-MiniLM-L6-v2``)."""
    embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return embed_model


def configure_service_context(llm, embed_model):
    """Bundle the LLM and embedding model into a ``ServiceContext``.

    Args:
        llm: The language model to use for response generation.
        embed_model: The embedding model to use for indexing and retrieval.

    Returns:
        A ``ServiceContext`` created with ``chunk_size=250``.
    """
    return ServiceContext.from_defaults(
        chunk_size=250,
        llm=llm,
        embed_model=embed_model,
    )


def initialize_vector_store_index(data_path, service_context):
    """Load documents from ``data_path`` and index them.

    Args:
        data_path: Directory handed to ``SimpleDirectoryReader``.
        service_context: ``ServiceContext`` used while building the index.

    Returns:
        A ``VectorStoreIndex`` built from the loaded documents.
    """
    # Previously the reader was hard-coded to "./" and ``data_path`` was
    # silently ignored; honour the argument so callers can index another
    # directory. The existing call site passes "./", so it is unaffected.
    documents = SimpleDirectoryReader(data_path).load_data()
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index


# Configure and initialize components
llm = configure_llama_model()
embed_model = configure_embeddings()
service_context = configure_service_context(llm, embed_model)
index = initialize_vector_store_index("./", service_context)
query_engine = index.as_query_engine()


# Define a function for Gradio to use
def get_response(text, username):
    """Answer a chat message by querying the module-level ``query_engine``.

    Args:
        text: The user's message; forwarded to ``query_engine.query``.
        username: Unused. NOTE(review): per the Gradio docs,
            ``gr.ChatInterface`` calls its function as ``fn(message, history)``,
            so this parameter receives the chat history rather than a
            user name. The name is kept for backward compatibility.

    Returns:
        The query engine's response converted to ``str``.
    """
    # For simplicity, we are only using the 'text' argument
    response = str(query_engine.query(text))
    return response


gr.ChatInterface(get_response).launch(debug=True, share=True)