code complete for smu lib bot
Browse files
- .env.example +6 -10
- .gitattributes +2 -0
- README.md +2 -2
- app.py +7 -6
- app_modules/llm_inference.py +23 -4
- app_modules/utils.py +4 -3
- data/questions.txt +6 -5
- data/smu_lib_index/index.faiss +3 -0
- data/smu_lib_index/index.pkl +3 -0
- ingest-pdf-html.py +114 -0
- test.py +1 -1
.env.example
CHANGED
@@ -28,10 +28,9 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
-CHAT_HISTORY_ENABLED=
+CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
-PDF_FILE_BASE_URL=https://chat-with-llama-2.netlify.app/pdfs/books/
 
 # if unset, default to "hkunlp/instructor-xl"
 HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
@@ -75,11 +74,7 @@ LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/reso
 CTRANSFORMERS_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 CTRANSFORMERS_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 
-
-# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
-FAISS_INDEX_PATH="./data/ai_books/"
-
-CHAT_QUESTION="What's the capital city of Malaysia?"
+CHAT_QUESTION="What's the capital city of Malaysia?"
 QA_QUESTION="What's deep learning?"
 
 QUESTIONS_FILE_PATH="./data/questions.txt"
@@ -87,10 +82,12 @@ QUESTIONS_FILE_PATH="./data/questions.txt"
 TOKENIZERS_PARALLELISM=true
 
 # env variables for ingesting source PDF files
-SOURCE_PDFS_PATH="./data/pdfs/"
-SOURCE_URLS=
 CHUNCK_SIZE=1024
 CHUNK_OVERLAP=512
+SOURCE_PATH="data/pdfs/smu_lib_html/"
+
+# Index for SMU LibBot PDF files - chunk_size=1024 chunk_overlap=512
+FAISS_INDEX_PATH="data/smu_lib_index/"
 
 # telegram bot
 TELEGRAM_API_TOKEN=
@@ -104,4 +101,3 @@ export NGROK_EDGE=
 
 export HUGGINGFACE_HUB_CACHE=$HOME/.cache/huggingface/hub/
 export HUGGING_FACE_HUB_TOKEN=
-ß
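The new ingestion settings above are read straight from the environment by `ingest-pdf-html.py`. A minimal sketch of how they resolve (not part of the commit; it assumes the `.env` values have already been loaded into the process environment, e.g. via python-dotenv):

```python
# Sketch only: shows how the ingestion-related variables above are interpreted.
import os

# FAISS_INDEX_PATH takes precedence over CHROMADB_INDEX_PATH when both are set.
index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None

source_path = os.environ.get("SOURCE_PATH")           # "data/pdfs/smu_lib_html/"
chunk_size = int(os.environ.get("CHUNCK_SIZE"))       # note the CHUNCK_ spelling in .env
chunk_overlap = int(os.environ.get("CHUNK_OVERLAP"))

print(using_faiss, index_path, source_path, chunk_size, chunk_overlap)
```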
.gitattributes
CHANGED
@@ -37,3 +37,5 @@ data/ai_books/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/ai_books/index.pkl filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.pkl filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Chat with
+title: Chat with SMU LibBot
 emoji: 👀
 colorFrom: indigo
 colorTo: blue
@@ -87,7 +87,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Talk to Your Own PDF Files
 
-- The sample PDF books & documents are downloaded from the internet (for
+- The sample PDF books & documents are downloaded from the internet (for SMU LibBot) and [PCI DSS official website](https://www.pcisecuritystandards.org/document_library/?category=pcidss) and the corresponding embeddings are stored in folders `data/ai_books` and `data/pci_dss_v4` respectively, which allows you to run locally without any additional effort.
 
 - You can also put your own PDF files into any folder specified in `SOURCE_PDFS_PATH` and run the command below to generate embeddings which will be stored in folder `FAISS_INDEX_PATH` or `CHROMADB_INDEX_PATH`. If both `*_INDEX_PATH` env vars are set, `FAISS_INDEX_PATH` takes precedence. Make sure the folder specified by `*_INDEX_PATH` doesn't exist; other wise the command will simply try to load index from the folder and do a simple similarity search, as a way to verify if embeddings are generated and stored properly. Please note the HuggingFace Embedding model specified by `HF_EMBEDDINGS_MODEL_NAME` will be used to generate the embeddings.
 
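As the README text above notes, running the ingestion against an existing `*_INDEX_PATH` only loads the index and performs a simple similarity search as a sanity check. A minimal standalone sketch of that verification step (defaults and query are assumptions; the loader calls mirror what `ingest-pdf-html.py` does):

```python
# Sketch: load a previously generated FAISS index and run a quick similarity search.
import os

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

index_path = os.environ.get("FAISS_INDEX_PATH", "data/smu_lib_index/")
model_name = os.environ.get("HF_EMBEDDINGS_MODEL_NAME", "hkunlp/instructor-large")

embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
index = FAISS.load_local(index_path, embeddings)

# Retrieve documents for a sample question to confirm the embeddings are usable.
for doc in index.as_retriever().get_relevant_documents("What are the library opening hours?"):
    print(doc.metadata["source"])
```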
app.py
CHANGED
@@ -38,7 +38,7 @@ if chat_with_llama_2:
     qa_chain = ChatChain(llm_loader)
     name = "Llama-2"
 else:
-    name = "
+    name = "SMU LibBot"
 
 title = f"""<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with {name} </h1>"""
 
@@ -111,10 +111,11 @@ def qa(chatbot):
         ret = result.get()
         titles = []
         for doc in ret["source_documents"]:
-
-
-
-
+            url = doc.metadata["url"]
+            if "page" in doc.metadata:
+                page = doc.metadata["page"] + 1
+                url = f"{url}#page={page}"
+            title = url
             if title not in titles:
                 titles.append(title)
                 chatbot[-1][1] += f"1. [{title}]({url})\n"
@@ -209,5 +210,5 @@ with gr.Blocks(css=customCSS) as demo:
         api_name="reset",
     )
 
-    demo.title = "Chat with
+    demo.title = "Chat with SMU LibBot" if chat_with_llama_2 else "Chat with Llama-2"
    demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
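For reference, the new source-listing logic in `qa()` turns each retrieved document into a markdown link, appending a `#page=` fragment for PDF sources. A small illustrative sketch with hypothetical metadata (not part of the commit):

```python
# Sketch: how a source document's metadata becomes a markdown link in the chat reply.
def format_source_link(metadata: dict) -> str:
    url = metadata["url"]
    if "page" in metadata:  # PDF loaders store 0-based page numbers
        url = f"{url}#page={metadata['page'] + 1}"
    return f"1. [{url}]({url})\n"

# Hypothetical examples:
print(format_source_link({"url": "https://library.smu.edu.sg/services"}))
print(format_source_link({"url": "https://example.org/guide.pdf", "page": 2}))
```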
app_modules/llm_inference.py
CHANGED
@@ -10,6 +10,7 @@ from langchain.chains.base import Chain
 
 from app_modules.llm_loader import LLMLoader, TextIteratorStreamer
 from app_modules.utils import remove_extra_spaces
+from urllib.parse import urlparse, urlunparse, quote
 
 
 class LLMInference(metaclass=abc.ABCMeta):
@@ -59,13 +60,31 @@
             if "answer" in result:
                 result["answer"] = remove_extra_spaces(result["answer"])
 
-
-            if
+            source_path = os.environ.get("SOURCE_PATH")
+            if source_path is not None and len(source_path) > 0:
                 documents = result["source_documents"]
                 for doc in documents:
                     source = doc.metadata["source"]
-
-
+                    url = source.replace(source_path, "https://")
+                    url = url.replace(".html", "")
+                    parsed_url = urlparse(url)
+
+                    # Encode path, query, and fragment
+                    encoded_path = quote(parsed_url.path)
+                    encoded_query = quote(parsed_url.query)
+                    encoded_fragment = quote(parsed_url.fragment)
+
+                    # Construct the encoded URL
+                    doc.metadata["url"] = urlunparse(
+                        (
+                            parsed_url.scheme,
+                            parsed_url.netloc,
+                            encoded_path,
+                            parsed_url.params,
+                            encoded_query,
+                            encoded_fragment,
+                        )
+                    )
 
             return result
         finally:
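The effect of the new block is to rewrite each document's local `source` path into a public URL: the `SOURCE_PATH` prefix is swapped for `https://`, the `.html` suffix is dropped, and the path, query, and fragment are percent-encoded. A worked example with a hypothetical crawled file path (assuming the mirrored folder names match the site's hostname and paths):

```python
# Sketch: the same transformation as above, applied to one hypothetical source path.
from urllib.parse import urlparse, urlunparse, quote

source_path = "data/pdfs/smu_lib_html/"                                   # SOURCE_PATH from .env
source = "data/pdfs/smu_lib_html/library.smu.edu.sg/about us/hours.html"  # hypothetical

url = source.replace(source_path, "https://").replace(".html", "")
p = urlparse(url)
encoded = urlunparse(
    (p.scheme, p.netloc, quote(p.path), p.params, quote(p.query), quote(p.fragment))
)
print(encoded)  # https://library.smu.edu.sg/about%20us/hours
```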
app_modules/utils.py
CHANGED
@@ -74,10 +74,11 @@ def print_llm_response(llm_response):
     print("\nSources:")
     for source in source_documents:
         metadata = source["metadata"] if "metadata" in source else source.metadata
+        if "page" in metadata:
+            print(f"  Page: {metadata['page']}", end="")
+
         print(
-            "
-            + str(metadata["page"])
-            + " Source: "
+            " Source: "
             + str(metadata["url"] if "url" in metadata else metadata["source"])
         )
         print(
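With this change, sources without page metadata (HTML pages) print only the source line, while PDF sources print something like `  Page: 3 Source: https://example.org/guide.pdf` (values hypothetical), and the `url` set in `llm_inference.py` is preferred over the raw file path when present.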
data/questions.txt
CHANGED
@@ -1,5 +1,6 @@
-What
-
-
-
-
+What are the library opening hours?
+I'm an undergrad. How many books can I borrow from libraries?
+Can you list some of recommended resources on generative AI?
+Hi, is it necessary to book a terminal first before being able to use the bloomberg computer in the library? or can i just show up?
+Hi, I am an alumni of SMU (batch of 2018). I wanted to enquire for SMU Alumni rates for access to library resources (databases, investment studio) etc
+I've overdue fine of $4.00. Could you advise on how I can go about paying the fine?
data/smu_lib_index/index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f922ef2c87a9ab83f3a6ddc5c83f63607c51b3c3557c639e1fcc65b1d5071ee
+size 15009837
data/smu_lib_index/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986b3308523752e17623ee5832bdca1637a8d3fde7bad1928466ee1ef885d69
+size 4510879
ingest-pdf-html.py
ADDED
@@ -0,0 +1,114 @@
+# setting device on GPU if available, else CPU
+import os
+from timeit import default_timer as timer
+from typing import List
+
+from langchain.document_loaders import DirectoryLoader
+from langchain.document_loaders import PyPDFDirectoryLoader
+
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.init import *
+
+
+def load_documents(source_path) -> List:
+    loader = PyPDFDirectoryLoader(source_path, silent_errors=True)
+    documents = loader.load()
+
+    loader = DirectoryLoader(
+        source_path, glob="**/*.html", silent_errors=True, show_progress=True
+    )
+    documents.extend(loader.load())
+    return documents
+
+
+def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return text_splitter.split_documents(documents)
+
+
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )
+
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings
+
+
+# Constants
+device_type, hf_pipeline_device_type = get_device_types()
+hf_embeddings_model_name = (
+    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+)
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+source_path = os.environ.get("SOURCE_PATH")
+chunk_size = os.environ.get("CHUNCK_SIZE")
+chunk_overlap = os.environ.get("CHUNK_OVERLAP")
+
+start = timer()
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=hf_embeddings_model_name, model_kwargs={"device": device_type}
+)
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+
+if not os.path.isdir(index_path):
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
+    os.mkdir(index_path)
+
+    print(f"Loading PDF & HTML files from {source_path}")
+    sources = load_documents(source_path)
+    # print(sources[359])
+
+    print(f"Splitting {len(sources)} HTML pages in to chunks ...")
+
+    chunks = split_chunks(
+        sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
+    )
+    print(chunks[3])
+    print(f"Generating index for {len(chunks)} chunks ...")
+
+    index = generate_index(chunks, embeddings)
+else:
+    print(f"The index persist directory {index_path} is present. Loading index ...")
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )
+    query = "hi"
+    print(f"Load relevant documents for standalone question: {query}")
+
+    start2 = timer()
+    docs = index.as_retriever().get_relevant_documents(query)
+    end = timer()
+
+    print(f"Completed in {end - start2:.3f}s")
+    print(docs)
+
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
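The script keys off whether `index_path` already exists: a missing folder triggers a full ingest (load PDFs and HTML from `SOURCE_PATH`, split with the `CHUNCK_SIZE`/`CHUNK_OVERLAP` settings, and save the FAISS or Chroma index), while an existing folder is only loaded and probed with a test query. To rebuild the SMU LibBot index, the `FAISS_INDEX_PATH` folder presumably has to be removed first, as the README section above also notes.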
test.py
CHANGED
@@ -77,7 +77,7 @@ while True:
     end = timer()
     print(f"Completed in {end - start:.3f}s")
 
-    print_llm_response(result)
+    # print_llm_response(result)
 
     if len(chat_history) == 0:
         standalone_question = query