Spaces:

DanyaalMajid
/

NLP-Final-LocalLLM-RAG

Running

App Files Files Community

DanyaalMajid committed on Jan 8

Commit

0ff7c7e

•

1 Parent(s): f21cd9a

test app

Browse files

Files changed (2) hide show

app.py +105 -0
requirements.txt +5 -0

app.py CHANGED Viewed

	@@ -0,0 +1,105 @@

+import time
+import streamlit as st
+from llama_index import Document, ServiceContext, StorageContext, set_global_service_context, VectorStoreIndex
+from llama_index.embeddings import LangchainEmbedding
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from llama_index.llms import LlamaCPP
+from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
+from PyPDF2 import PdfReader
+# LLM Initialization
+@st.cache_resource
+def _load_llm():
+    """Build the llama.cpp-backed Phi-2 LLM.
+
+    Streamlit re-executes this script from the top on every user
+    interaction; caching the instance as a resource means the GGUF weights
+    are read from disk once instead of on every chat message.
+    """
+    return LlamaCPP(
+        model_url=None,  # We'll load locally.
+        # Trying a small (4-bit quantized) version of an already small model
+        model_path='./Models/phi-2.Q4_K_M.gguf',
+        temperature=0.1,
+        max_new_tokens=512,
+        # Phi-2 has a 2K context window - this could be a limitation for RAG,
+        # as the retrieved content has to fit into this window
+        context_window=2048,
+        generate_kwargs={},
+        # Set n_gpu_layers to at least 1 to use the GPU; 0 keeps everything on the CPU
+        model_kwargs={"n_gpu_layers": 0},
+        messages_to_prompt=messages_to_prompt,
+        completion_to_prompt=completion_to_prompt,
+        verbose=True
+    )
+llm = _load_llm()
+# Embedding Initialization
+@st.cache_resource
+def _load_embed_model():
+    """Build the BGE-small embedding model wrapped for LlamaIndex.
+
+    Cached as a Streamlit resource so the HuggingFace model is not
+    re-instantiated each time the script re-runs on a user interaction.
+    """
+    return LangchainEmbedding(
+        HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
+    )
+embed_model = _load_embed_model()
+# Service Context
+# Bundles the LLM, the embedding model and the chunking/prompt-budget settings
+# and registers them as the global default for LlamaIndex.
+service_context = ServiceContext.from_defaults(
+    chunk_size=128,  # Number of tokens in each chunk
+    chunk_overlap=20,  # Number of tokens shared by consecutive chunks
+    # This should be set automatically from the model metadata, but we force it to be sure it is
+    context_window=2048,
+    # Space LlamaIndex reserves for the LLM's output. Kept equal to the LLM's
+    # max_new_tokens (512): reserving more only shrinks the room left for
+    # retrieved context in the 2K window, since the LLM cannot emit more than that.
+    num_output=512,
+    llm=llm,
+    embed_model=embed_model
+)
+set_global_service_context(service_context)
+# Storage Context (default stores; holds the parsed nodes for the index)
+storage_context = StorageContext.from_defaults()
+# Model Specific Prompt
+def modelspecific_prompt(promptmessage):
+    """Wrap *promptmessage* in the Phi-2 instruct template.
+
+    Template as per https://huggingface.co/TheBloke/phi-2-GGUF:
+    an "Instruct:" line carrying the message, followed by an "Output:" line
+    for the model to complete.
+    """
+    template = "Instruct: {}\nOutput:"
+    return template.format(promptmessage)
+# PDF to Text
+def extract_text_from_pdf(pdf):
+    """Return the text of every page of *pdf* as a single string.
+
+    Args:
+        pdf: Anything ``PdfReader`` accepts (a path or a binary file-like
+            object such as a Streamlit upload).
+
+    Returns:
+        The extracted text of all pages, in page order, separated by newlines.
+    """
+    pdf_reader = PdfReader(pdf)
+    # Join with a newline so the last word of one page is not fused with the
+    # first word of the next; `or ''` guards pages that yield no text.
+    return '\n'.join(page.extract_text() or '' for page in pdf_reader.pages)
+# Streamlit UI: upload a PDF, index it, then chat with the LLM about it.
+st.title("Llama-CPP Local LLM with RAG (Phi-2 + TinyLlama)")
+pdf = st.file_uploader("Upload a PDF file", type=["pdf"])
+if pdf is not None:
+    # Streamlit re-runs this whole script on every interaction. Build the
+    # index and chat engine only when a different file is uploaded and keep
+    # them in session state; otherwise every chat message would re-embed the
+    # PDF and start from a fresh chat engine with no conversation memory.
+    pdf_key = (pdf.name, pdf.size)
+    if st.session_state.get("pdf_key") != pdf_key:
+        # LlamaIndex works on a list of Document objects, not on a raw string
+        documents = [Document(text=extract_text_from_pdf(pdf))]
+        nodes = service_context.node_parser.get_nodes_from_documents(documents)
+        storage_context.docstore.add_documents(nodes)
+        index = VectorStoreIndex.from_documents(
+            documents, service_context=service_context, storage_context=storage_context, llm=llm)
+        # NOTE(review): LlamaIndex documents the "simple" chat mode as talking to
+        # the LLM directly without querying the index - confirm whether a
+        # retrieval mode such as "context" was intended for this RAG demo.
+        st.session_state.chat_engine = index.as_chat_engine(chat_mode="simple", verbose=True)
+        st.session_state.pdf_key = pdf_key
+    chat_engine = st.session_state.chat_engine
+    # Initialize chat history
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # Display chat messages from history on app rerun
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Accept user input
+    if prompt := st.chat_input("What is up?"):
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # Display user message in chat message container
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Display assistant response in chat message container
+        with st.chat_message("assistant"):
+            message_placeholder = st.empty()
+            full_response = ""
+            assistant_response = chat_engine.chat(modelspecific_prompt(prompt))
+            # chat() returns a response object rather than a str; str() yields its text.
+            # Simulate stream of response with milliseconds delay
+            for chunk in str(assistant_response).split():
+                full_response += chunk + " "
+                time.sleep(0.05)
+                # Add a blinking cursor to simulate typing
+                message_placeholder.markdown(full_response + "▌")
+            # Final render without the cursor
+            message_placeholder.markdown(full_response)
+        # Add assistant response to chat history
+        st.session_state.messages.append(
+            {"role": "assistant", "content": full_response})

requirements.txt CHANGED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+llama-index
+langchain
+PyPDF2
+sentence_transformers