code complete for smu lib bot
Browse files
- .env.example +6 -10
- .gitattributes +2 -0
- README.md +2 -2
- app.py +7 -6
- app_modules/llm_inference.py +23 -4
- app_modules/utils.py +4 -3
- data/questions.txt +6 -5
- data/smu_lib_index/index.faiss +3 -0
- data/smu_lib_index/index.pkl +3 -0
- ingest-pdf-html.py +114 -0
- test.py +1 -1
.env.example
CHANGED
@@ -28,10 +28,9 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
-CHAT_HISTORY_ENABLED=
+CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
-PDF_FILE_BASE_URL=https://chat-with-llama-2.netlify.app/pdfs/books/
 
 # if unset, default to "hkunlp/instructor-xl"
 HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
@@ -75,11 +74,7 @@ LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/reso
 CTRANSFORMERS_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 CTRANSFORMERS_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 
-
-# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
-FAISS_INDEX_PATH="./data/ai_books/"
-
-CHAT_QUESTION="What's the capital city of Malaysia?"
+CHAT_QUESTION="What's the capital city of Malaysia?"
 QA_QUESTION="What's deep learning?"
 
 QUESTIONS_FILE_PATH="./data/questions.txt"
@@ -87,10 +82,12 @@ QUESTIONS_FILE_PATH="./data/questions.txt"
 TOKENIZERS_PARALLELISM=true
 
 # env variables for ingesting source PDF files
-SOURCE_PDFS_PATH="./data/pdfs/"
-SOURCE_URLS=
 CHUNCK_SIZE=1024
 CHUNK_OVERLAP=512
+SOURCE_PATH="data/pdfs/smu_lib_html/"
+
+# Index for SMU LibBot PDF files - chunk_size=1024 chunk_overlap=512
+FAISS_INDEX_PATH="data/smu_lib_index/"
 
 # telegram bot
 TELEGRAM_API_TOKEN=
@@ -104,4 +101,3 @@ export NGROK_EDGE=
 
 export HUGGINGFACE_HUB_CACHE=$HOME/.cache/huggingface/hub/
 export HUGGING_FACE_HUB_TOKEN=
-ß
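The new ingestion settings above are read straight from the environment by `ingest-pdf-html.py`. A minimal sketch of how they resolve (not part of the commit; it assumes the `.env` values have already been loaded into the process environment, e.g. via python-dotenv):

```python
# Sketch only: shows how the ingestion-related variables above are interpreted.
import os

# FAISS_INDEX_PATH takes precedence over CHROMADB_INDEX_PATH when both are set.
index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None

source_path = os.environ.get("SOURCE_PATH")           # "data/pdfs/smu_lib_html/"
chunk_size = int(os.environ.get("CHUNCK_SIZE"))       # note the CHUNCK_ spelling in .env
chunk_overlap = int(os.environ.get("CHUNK_OVERLAP"))

print(using_faiss, index_path, source_path, chunk_size, chunk_overlap)
```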
.gitattributes
CHANGED
@@ -37,3 +37,5 @@ data/ai_books/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/ai_books/index.pkl filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.pkl filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Chat with
+title: Chat with SMU LibBot
 emoji: 👀
 colorFrom: indigo
 colorTo: blue
@@ -87,7 +87,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Talk to Your Own PDF Files
 
-- The sample PDF books & documents are downloaded from the internet (for
+- The sample PDF books & documents are downloaded from the internet (for SMU LibBot) and [PCI DSS official website](https://www.pcisecuritystandards.org/document_library/?category=pcidss) and the corresponding embeddings are stored in folders `data/ai_books` and `data/pci_dss_v4` respectively, which allows you to run locally without any additional effort.
 
 - You can also put your own PDF files into any folder specified in `SOURCE_PDFS_PATH` and run the command below to generate embeddings which will be stored in folder `FAISS_INDEX_PATH` or `CHROMADB_INDEX_PATH`. If both `*_INDEX_PATH` env vars are set, `FAISS_INDEX_PATH` takes precedence. Make sure the folder specified by `*_INDEX_PATH` doesn't exist; other wise the command will simply try to load index from the folder and do a simple similarity search, as a way to verify if embeddings are generated and stored properly. Please note the HuggingFace Embedding model specified by `HF_EMBEDDINGS_MODEL_NAME` will be used to generate the embeddings.
 
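As the README text above notes, running the ingestion against an existing `*_INDEX_PATH` only loads the index and performs a simple similarity search as a sanity check. A minimal standalone sketch of that verification step (defaults and query are assumptions; the loader calls mirror what `ingest-pdf-html.py` does):

```python
# Sketch: load a previously generated FAISS index and run a quick similarity search.
import os

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

index_path = os.environ.get("FAISS_INDEX_PATH", "data/smu_lib_index/")
model_name = os.environ.get("HF_EMBEDDINGS_MODEL_NAME", "hkunlp/instructor-large")

embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
index = FAISS.load_local(index_path, embeddings)

# Retrieve documents for a sample question to confirm the embeddings are usable.
for doc in index.as_retriever().get_relevant_documents("What are the library opening hours?"):
    print(doc.metadata["source"])
```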
app.py
CHANGED
@@ -38,7 +38,7 @@ if chat_with_llama_2:
     qa_chain = ChatChain(llm_loader)
     name = "Llama-2"
 else:
-    name = "
+    name = "SMU LibBot"
 
 title = f"""<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with {name} </h1>"""
 
@@ -111,10 +111,11 @@ def qa(chatbot):
         ret = result.get()
         titles = []
         for doc in ret["source_documents"]:
-
-
-
-
+            url = doc.metadata["url"]
+            if "page" in doc.metadata:
+                page = doc.metadata["page"] + 1
+                url = f"{url}#page={page}"
+            title = url
             if title not in titles:
                 titles.append(title)
                 chatbot[-1][1] += f"1. [{title}]({url})\n"
@@ -209,5 +210,5 @@ with gr.Blocks(css=customCSS) as demo:
         api_name="reset",
     )
 
-    demo.title = "Chat with
+    demo.title = "Chat with SMU LibBot" if chat_with_llama_2 else "Chat with Llama-2"
    demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
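For reference, the new source-listing logic in `qa()` turns each retrieved document into a markdown link, appending a `#page=` fragment for PDF sources. A small illustrative sketch with hypothetical metadata (not part of the commit):

```python
# Sketch: how a source document's metadata becomes a markdown link in the chat reply.
def format_source_link(metadata: dict) -> str:
    url = metadata["url"]
    if "page" in metadata:  # PDF loaders store 0-based page numbers
        url = f"{url}#page={metadata['page'] + 1}"
    return f"1. [{url}]({url})\n"

# Hypothetical examples:
print(format_source_link({"url": "https://library.smu.edu.sg/services"}))
print(format_source_link({"url": "https://example.org/guide.pdf", "page": 2}))
```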
app_modules/llm_inference.py
CHANGED
@@ -10,6 +10,7 @@ from langchain.chains.base import Chain
 
 from app_modules.llm_loader import LLMLoader, TextIteratorStreamer
 from app_modules.utils import remove_extra_spaces
+from urllib.parse import urlparse, urlunparse, quote
 
 
 class LLMInference(metaclass=abc.ABCMeta):
@@ -59,13 +60,31 @@
             if "answer" in result:
                 result["answer"] = remove_extra_spaces(result["answer"])
 
-
-            if
+            source_path = os.environ.get("SOURCE_PATH")
+            if source_path is not None and len(source_path) > 0:
                 documents = result["source_documents"]
                 for doc in documents:
                     source = doc.metadata["source"]
-
-
+                    url = source.replace(source_path, "https://")
+                    url = url.replace(".html", "")
+                    parsed_url = urlparse(url)
+
+                    # Encode path, query, and fragment
+                    encoded_path = quote(parsed_url.path)
+                    encoded_query = quote(parsed_url.query)
+                    encoded_fragment = quote(parsed_url.fragment)
+
+                    # Construct the encoded URL
+                    doc.metadata["url"] = urlunparse(
+                        (
+                            parsed_url.scheme,
+                            parsed_url.netloc,
+                            encoded_path,
+                            parsed_url.params,
+                            encoded_query,
+                            encoded_fragment,
+                        )
+                    )
 
             return result
         finally:
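The effect of the new block is to rewrite each document's local `source` path into a public URL: the `SOURCE_PATH` prefix is swapped for `https://`, the `.html` suffix is dropped, and the path, query, and fragment are percent-encoded. A worked example with a hypothetical crawled file path (assuming the mirrored folder names match the site's hostname and paths):

```python
# Sketch: the same transformation as above, applied to one hypothetical source path.
from urllib.parse import urlparse, urlunparse, quote

source_path = "data/pdfs/smu_lib_html/"                                   # SOURCE_PATH from .env
source = "data/pdfs/smu_lib_html/library.smu.edu.sg/about us/hours.html"  # hypothetical

url = source.replace(source_path, "https://").replace(".html", "")
p = urlparse(url)
encoded = urlunparse(
    (p.scheme, p.netloc, quote(p.path), p.params, quote(p.query), quote(p.fragment))
)
print(encoded)  # https://library.smu.edu.sg/about%20us/hours
```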
app_modules/utils.py
CHANGED
@@ -74,10 +74,11 @@ def print_llm_response(llm_response):
     print("\nSources:")
     for source in source_documents:
         metadata = source["metadata"] if "metadata" in source else source.metadata
+        if "page" in metadata:
+            print(f"  Page: {metadata['page']}", end="")
+
         print(
-            "
-            + str(metadata["page"])
-            + " Source: "
+            " Source: "
             + str(metadata["url"] if "url" in metadata else metadata["source"])
         )
         print(
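With this change, sources without page metadata (HTML pages) print only the source line, while PDF sources print something like `  Page: 3 Source: https://example.org/guide.pdf` (values hypothetical), and the `url` set in `llm_inference.py` is preferred over the raw file path when present.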
data/questions.txt
CHANGED
@@ -1,5 +1,6 @@
-What
-
-
-
-
+What are the library opening hours?
+I'm an undergrad. How many books can I borrow from libraries?
+Can you list some of recommended resources on generative AI?
+Hi, is it necessary to book a terminal first before being able to use the bloomberg computer in the library? or can i just show up?
+Hi, I am an alumni of SMU (batch of 2018). I wanted to enquire for SMU Alumni rates for access to library resources (databases, investment studio) etc
+I've overdue fine of $4.00. Could you advise on how I can go about paying the fine?
data/smu_lib_index/index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f922ef2c87a9ab83f3a6ddc5c83f63607c51b3c3557c639e1fcc65b1d5071ee
+size 15009837
data/smu_lib_index/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986b3308523752e17623ee5832bdca1637a8d3fde7bad1928466ee1ef885d69
+size 4510879
ingest-pdf-html.py
ADDED
@@ -0,0 +1,114 @@
+# setting device on GPU if available, else CPU
+import os
+from timeit import default_timer as timer
+from typing import List
+
+from langchain.document_loaders import DirectoryLoader
+from langchain.document_loaders import PyPDFDirectoryLoader
+
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.init import *
+
+
+def load_documents(source_path) -> List:
+    loader = PyPDFDirectoryLoader(source_path, silent_errors=True)
+    documents = loader.load()
+
+    loader = DirectoryLoader(
+        source_path, glob="**/*.html", silent_errors=True, show_progress=True
+    )
+    documents.extend(loader.load())
+    return documents
+
+
+def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return text_splitter.split_documents(documents)
+
+
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )
+
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings
+
+
+# Constants
+device_type, hf_pipeline_device_type = get_device_types()
+hf_embeddings_model_name = (
+    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+)
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+source_path = os.environ.get("SOURCE_PATH")
+chunk_size = os.environ.get("CHUNCK_SIZE")
+chunk_overlap = os.environ.get("CHUNK_OVERLAP")
+
+start = timer()
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=hf_embeddings_model_name, model_kwargs={"device": device_type}
+)
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+
+if not os.path.isdir(index_path):
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
+    os.mkdir(index_path)
+
+    print(f"Loading PDF & HTML files from {source_path}")
+    sources = load_documents(source_path)
+    # print(sources[359])
+
+    print(f"Splitting {len(sources)} HTML pages in to chunks ...")
+
+    chunks = split_chunks(
+        sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
+    )
+    print(chunks[3])
+    print(f"Generating index for {len(chunks)} chunks ...")
+
+    index = generate_index(chunks, embeddings)
+else:
+    print(f"The index persist directory {index_path} is present. Loading index ...")
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )
+    query = "hi"
+    print(f"Load relevant documents for standalone question: {query}")
+
+    start2 = timer()
+    docs = index.as_retriever().get_relevant_documents(query)
+    end = timer()
+
+    print(f"Completed in {end - start2:.3f}s")
+    print(docs)
+
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
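The script keys off whether `index_path` already exists: a missing folder triggers a full ingest (load PDFs and HTML from `SOURCE_PATH`, split with the `CHUNCK_SIZE`/`CHUNK_OVERLAP` settings, and save the FAISS or Chroma index), while an existing folder is only loaded and probed with a test query. To rebuild the SMU LibBot index, the `FAISS_INDEX_PATH` folder presumably has to be removed first, as the README section above also notes.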
test.py
CHANGED
@@ -77,7 +77,7 @@ while True:
     end = timer()
     print(f"Completed in {end - start:.3f}s")
 
-    print_llm_response(result)
+    # print_llm_response(result)
 
     if len(chat_history) == 0:
         standalone_question = query