# https://docs.google.com/document/d/1hY5ItC8Mewyk-90Q--CGr50wBbZBjPrkYu4NtiBVre4/edit?usp=sharing
# Inference takes 6-7 minutes per query.
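# Pipeline overview: load local documents, embed them with a HuggingFace
# sentence-transformers model, index them with llama_index, and answer
# questions via a llama.cpp-hosted LLM behind a Gradio chat interface.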
import logging
import sys

import gradio as gr
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Set up logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
def configure_llama_model():
    # model_url = 'https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf'
    model_url = 'https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_K_M.gguf'
    llm = LlamaCPP(
        model_url=model_url,
        temperature=0.3,
        max_new_tokens=256,
        context_window=3900,
        model_kwargs={"n_gpu_layers": -1},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    return llm
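# Optional smoke test of the LLM on its own (illustrative sketch; the first
# call also downloads the GGUF weights, so it is slow). Uncomment to try:
# llm = configure_llama_model()
# print(llm.complete("Summarise retrieval-augmented generation in one sentence."))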
def configure_embeddings():
    embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return embed_model
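# Note: all-MiniLM-L6-v2 is a small sentence-transformers model (384-dimensional
# embeddings), which keeps CPU-only embedding of the documents reasonably fast.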
def configure_service_context(llm, embed_model):
    return ServiceContext.from_defaults(chunk_size=250, llm=llm, embed_model=embed_model)
def initialize_vector_store_index(data_path, service_context):
    documents = SimpleDirectoryReader(data_path).load_data()
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    return index
# Configure and initialize components
llm = configure_llama_model()
embed_model = configure_embeddings()
service_context = configure_service_context(llm, embed_model)
index = initialize_vector_store_index("./", service_context)
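# Optional (illustrative sketch, using the standard llama_index storage API):
# persist the index so it does not have to be rebuilt on every restart.
# index.storage_context.persist(persist_dir="./storage")
# from llama_index import StorageContext, load_index_from_storage
# index = load_index_from_storage(
#     StorageContext.from_defaults(persist_dir="./storage"),
#     service_context=service_context,
# )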
query_engine = index.as_query_engine()
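# Optional (illustrative): the query engine accepts retrieval settings, e.g.
# fetching fewer chunks per query, which may shorten the long inference time:
# query_engine = index.as_query_engine(similarity_top_k=2)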
# Define the function Gradio calls for each chat message.
def get_response(text, history):
    # Only the 'text' argument is used; the chat history is ignored.
    response = str(query_engine.query(text))
    return response
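# Quick sanity check without the web UI (illustrative):
# print(get_response("What is this document about?", []))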
gr.ChatInterface(get_response).launch(debug=True, share=True)