import time

import streamlit as st
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import (
    Document,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    set_global_service_context,
)
from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine
from llama_index.embeddings import LangchainEmbedding
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt
from llama_index.prompts import PromptTemplate
from PyPDF2 import PdfReader
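
# Streamlit chat app: answers questions about an uploaded PDF with a local
# Phi-2 model (served by llama-cpp-python) and a LlamaIndex RAG pipeline.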


def modelspecific_prompt(promptmessage):
    # Phi-2 expects its "Instruct: ...\nOutput:" prompt format, so wrap the
    # user's message accordingly before handing it to the chat engine.
    return f"Instruct: {promptmessage}\nOutput:"


def extract_text_from_pdf(pdf):
    # Return one string per page so each page becomes its own Document;
    # splitting the joined text on whitespace would yield one-word Documents
    # that defeat chunked retrieval.
    pdf_reader = PdfReader(pdf)
    return [page.extract_text() for page in pdf_reader.pages]


def main():
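    # Load the local Phi-2 GGUF checkpoint through llama-cpp-python.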
    llm = LlamaCPP(
        model_url=None,  # no download; use the local model_path below
        model_path='models/phi-2.Q4_K_M.gguf',
        temperature=0.1,
        max_new_tokens=512,
        context_window=2048,
        generate_kwargs={},
        # Offload 32 layers to the GPU; set to 0 for CPU-only inference.
        model_kwargs={"n_gpu_layers": 32},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
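
    # Embed chunks with the small BGE English model, wrapped for LlamaIndex.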
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    )
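
    # Bundle chunking settings, the LLM, and the embedder, then register the
    # service context globally for all LlamaIndex components.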
    service_context = ServiceContext.from_defaults(
        chunk_size=128,
        chunk_overlap=20,
        context_window=2048,
        num_output=768,  # tokens reserved in the window for model output
        llm=llm,
        embed_model=embed_model,
    )
    set_global_service_context(service_context)

    storage_context = StorageContext.from_defaults()

    st.title("Llama-CPP Local LLM with RAG (Phi-2 RAG)")
    st.markdown(
        "Made with ❤️ by Danyaal Majid & Muhammad Bin Asif using "
        "[HF Spaces](https://huggingface.co/spaces/DanyaalMajid/NLP-Final-LocalLLM-RAG)")
    pdf = st.file_uploader("Upload a PDF file", type=["pdf"])

    if pdf is not None:
        text_list = extract_text_from_pdf(pdf)
        documents = [Document(text=t) for t in text_list]
        nodes = service_context.node_parser.get_nodes_from_documents(documents)
        storage_context.docstore.add_documents(nodes)
        # Build the vector index from the already-parsed nodes; the LLM and
        # embed model come from the global service context.
        index = VectorStoreIndex(
            nodes, service_context=service_context, storage_context=storage_context)

        # The condense prompt must expose {chat_history} and {question};
        # an empty template would condense every follow-up into an empty
        # query. This template follows the engine's default wording.
        custom_prompt = PromptTemplate(
            "Given a conversation (between Human and Assistant) and a follow-up "
            "message from Human, rewrite the message to be a standalone question "
            "that captures all relevant context from the conversation.\n\n"
            "<Chat History>\n{chat_history}\n\n"
            "<Follow Up Message>\n{question}\n\n"
            "<Standalone question>\n"
        )
        query_engine = index.as_query_engine()
        chat_engine = CondenseQuestionChatEngine.from_defaults(
            query_engine=query_engine,
            condense_question_prompt=custom_prompt,
            verbose=True,
        )
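
        # Streamlit reruns the script on every interaction, so chat history
        # must live in session_state.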
        if "messages" not in st.session_state:
            st.session_state.messages = []

        # Replay the conversation so far.
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        if prompt := st.chat_input("What is up?"):
            st.session_state.messages.append(
                {"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)
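
            # Query the chat engine; the condense step folds chat history
            # into a standalone question before retrieval.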
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                full_response = ""
                assistant_response = str(chat_engine.chat(
                    modelspecific_prompt(str(prompt))))

                # Simulate streaming by revealing the reply word by word,
                # showing a cursor while "typing".
                for chunk in assistant_response.split():
                    full_response += chunk + " "
                    time.sleep(0.05)
                    message_placeholder.markdown(full_response + "▌")
                message_placeholder.markdown(full_response)

            st.session_state.messages.append(
                {"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()