dh-mc committed
Commit 71f3335 · 1 Parent(s): 675cf55

code complete for smu lib bot

.env.example CHANGED
@@ -28,10 +28,9 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
-CHAT_HISTORY_ENABLED=true
+CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
-PDF_FILE_BASE_URL=https://chat-with-llama-2.netlify.app/pdfs/books/
 
 # if unset, default to "hkunlp/instructor-xl"
 HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
@@ -75,11 +74,7 @@ LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/reso
 CTRANSFORMERS_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 CTRANSFORMERS_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 
-# Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
-# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
-FAISS_INDEX_PATH="./data/ai_books/"
-
-CHAT_QUESTION="What's the capital city of Malaysia?"
+CHAT_QUESTION="What's the capital city of Malaysia?"
 QA_QUESTION="What's deep learning?"
 
 QUESTIONS_FILE_PATH="./data/questions.txt"
@@ -87,10 +82,12 @@ QUESTIONS_FILE_PATH="./data/questions.txt"
 TOKENIZERS_PARALLELISM=true
 
 # env variables for ingesting source PDF files
-SOURCE_PDFS_PATH="./data/pdfs/"
-SOURCE_URLS=
 CHUNCK_SIZE=1024
 CHUNK_OVERLAP=512
+SOURCE_PATH="data/pdfs/smu_lib_html/"
+
+# Index for SMU LibBot PDF files - chunk_size=1024 chunk_overlap=512
+FAISS_INDEX_PATH="data/smu_lib_index/"
 
 # telegram bot
 TELEGRAM_API_TOKEN=
@@ -104,4 +101,3 @@ export NGROK_EDGE=
 
 export HUGGINGFACE_HUB_CACHE=$HOME/.cache/huggingface/hub/
 export HUGGING_FACE_HUB_TOKEN=
-ß

.gitattributes CHANGED
@@ -37,3 +37,5 @@ data/ai_books/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/ai_books/index.pkl filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/pci_dss_v4/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/index.pkl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Chat with AI Books
+title: Chat with SMU LibBot
 emoji: 👀
 colorFrom: indigo
 colorTo: blue
@@ -87,7 +87,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Talk to Your Own PDF Files
 
-- The sample PDF books & documents are downloaded from the internet (for AI Books) and [PCI DSS official website](https://www.pcisecuritystandards.org/document_library/?category=pcidss) and the corresponding embeddings are stored in folders `data/ai_books` and `data/pci_dss_v4` respectively, which allows you to run locally without any additional effort.
+- The sample PDF books & documents are downloaded from the internet (for SMU LibBot) and from the [PCI DSS official website](https://www.pcisecuritystandards.org/document_library/?category=pcidss), and the corresponding embeddings are stored in the folders `data/ai_books` and `data/pci_dss_v4` respectively, which allows you to run locally without any additional effort.
 
 - You can also put your own PDF files into any folder specified in `SOURCE_PDFS_PATH` and run the command below to generate embeddings, which will be stored in the folder `FAISS_INDEX_PATH` or `CHROMADB_INDEX_PATH`. If both `*_INDEX_PATH` env vars are set, `FAISS_INDEX_PATH` takes precedence. Make sure the folder specified by `*_INDEX_PATH` doesn't exist; otherwise the command will simply try to load the index from that folder and do a simple similarity search, as a way to verify whether the embeddings were generated and stored properly. Please note the HuggingFace embedding model specified by `HF_EMBEDDINGS_MODEL_NAME` will be used to generate the embeddings.
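
For reference, a minimal sketch (not part of the repo) of the verification step described above, assuming the FAISS defaults this commit adds to .env.example: it loads an existing index with the configured embedding model and runs a quick similarity search.

import os

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

# Load the embedding model named by HF_EMBEDDINGS_MODEL_NAME (default assumed here).
embeddings = HuggingFaceInstructEmbeddings(
    model_name=os.environ.get("HF_EMBEDDINGS_MODEL_NAME", "hkunlp/instructor-large")
)

# Load the pre-built FAISS index from FAISS_INDEX_PATH and run a similarity search
# to confirm the embeddings were generated and stored properly.
index = FAISS.load_local(
    os.environ.get("FAISS_INDEX_PATH", "data/smu_lib_index/"), embeddings
)
for doc in index.as_retriever().get_relevant_documents("What are the library opening hours?"):
    print(doc.metadata["source"])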
 
app.py CHANGED
@@ -38,7 +38,7 @@ if chat_with_llama_2:
     qa_chain = ChatChain(llm_loader)
     name = "Llama-2"
 else:
-    name = "AI Books"
+    name = "SMU LibBot"
 
 title = f"""<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with {name} </h1>"""
 
@@ -111,10 +111,11 @@ def qa(chatbot):
     ret = result.get()
     titles = []
     for doc in ret["source_documents"]:
-        page = doc.metadata["page"] + 1
-        url = f"{doc.metadata['url']}#page={page}"
-        file_name = doc.metadata["source"].split("/")[-1]
-        title = f"{file_name} Page: {page}"
+        url = doc.metadata["url"]
+        if "page" in doc.metadata:
+            page = doc.metadata["page"] + 1
+            url = f"{url}#page={page}"
+        title = url
         if title not in titles:
             titles.append(title)
             chatbot[-1][1] += f"1. [{title}]({url})\n"
@@ -209,5 +210,5 @@ with gr.Blocks(css=customCSS) as demo:
         api_name="reset",
     )
 
-    demo.title = "Chat with AI Books" if chat_with_llama_2 else "Chat with Llama-2"
+    demo.title = "Chat with SMU LibBot" if chat_with_llama_2 else "Chat with Llama-2"
     demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
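
The effect of the qa() change, sketched below with a made-up helper and hypothetical metadata: HTML chunks now link straight to their reconstructed URL, while PDF chunks still get a #page= anchor (the stored page number is 0-based).

def format_source_link(metadata: dict) -> str:
    # Mirrors the updated qa() logic: only PDF chunks carry a "page" entry.
    url = metadata["url"]
    if "page" in metadata:
        url = f"{url}#page={metadata['page'] + 1}"
    return f"1. [{url}]({url})\n"

print(format_source_link({"url": "https://example.org/borrowing"}))             # HTML chunk
print(format_source_link({"url": "https://example.org/guide.pdf", "page": 4}))  # PDF chunk -> #page=5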
app_modules/llm_inference.py CHANGED
@@ -10,6 +10,7 @@ from langchain.chains.base import Chain
 
 from app_modules.llm_loader import LLMLoader, TextIteratorStreamer
 from app_modules.utils import remove_extra_spaces
+from urllib.parse import urlparse, urlunparse, quote
 
 
 class LLMInference(metaclass=abc.ABCMeta):
@@ -59,13 +60,31 @@ class LLMInference(metaclass=abc.ABCMeta):
             if "answer" in result:
                 result["answer"] = remove_extra_spaces(result["answer"])
 
-            base_url = os.environ.get("PDF_FILE_BASE_URL")
-            if base_url is not None and len(base_url) > 0:
+            source_path = os.environ.get("SOURCE_PATH")
+            if source_path is not None and len(source_path) > 0:
                 documents = result["source_documents"]
                 for doc in documents:
                     source = doc.metadata["source"]
-                    title = source.split("/")[-1]
-                    doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
+                    url = source.replace(source_path, "https://")
+                    url = url.replace(".html", "")
+                    parsed_url = urlparse(url)
+
+                    # Encode path, query, and fragment
+                    encoded_path = quote(parsed_url.path)
+                    encoded_query = quote(parsed_url.query)
+                    encoded_fragment = quote(parsed_url.fragment)
+
+                    # Construct the encoded URL
+                    doc.metadata["url"] = urlunparse(
+                        (
+                            parsed_url.scheme,
+                            parsed_url.netloc,
+                            encoded_path,
+                            parsed_url.params,
+                            encoded_query,
+                            encoded_fragment,
+                        )
+                    )
 
             return result
         finally:
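
To make the new mapping concrete: the loop above rewrites each crawled HTML file path under SOURCE_PATH into a live URL and percent-encodes it. A standalone sketch with a hypothetical file path (the real paths depend on how the site was scraped):

from urllib.parse import urlparse, urlunparse, quote

source_path = "data/pdfs/smu_lib_html/"  # value set in .env.example
source = source_path + "library.smu.edu.sg/about/opening hours.html"  # hypothetical path

# Same transformation as above: swap the local prefix for https://, drop .html,
# then percent-encode the path, query and fragment.
url = source.replace(source_path, "https://").replace(".html", "")
p = urlparse(url)
print(urlunparse((p.scheme, p.netloc, quote(p.path), p.params, quote(p.query), quote(p.fragment))))
# -> https://library.smu.edu.sg/about/opening%20hours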
app_modules/utils.py CHANGED
@@ -74,10 +74,11 @@ def print_llm_response(llm_response):
         print("\nSources:")
         for source in source_documents:
             metadata = source["metadata"] if "metadata" in source else source.metadata
+            if "page" in metadata:
+                print(f" Page: {metadata['page']}", end="")
+
             print(
-                " Page: "
-                + str(metadata["page"])
-                + " Source: "
+                " Source: "
                 + str(metadata["url"] if "url" in metadata else metadata["source"])
             )
             print(
data/questions.txt CHANGED
@@ -1,5 +1,6 @@
-What's AI?
-life in AI era
-machine learning
-generative model
-graph attention network
+What are the library opening hours?
+I'm an undergrad. How many books can I borrow from libraries?
+Can you list some of recommended resources on generative AI?
+Hi, is it necessary to book a terminal first before being able to use the bloomberg computer in the library? or can i just show up?
+Hi, I am an alumni of SMU (batch of 2018). I wanted to enquire for SMU Alumni rates for access to library resources (databases, investment studio) etc
+I've overdue fine of $4.00. Could you advise on how I can go about paying the fine?
data/smu_lib_index/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f922ef2c87a9ab83f3a6ddc5c83f63607c51b3c3557c639e1fcc65b1d5071ee
+size 15009837
data/smu_lib_index/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986b3308523752e17623ee5832bdca1637a8d3fde7bad1928466ee1ef885d69
+size 4510879
ingest-pdf-html.py ADDED
@@ -0,0 +1,114 @@
+# setting device on GPU if available, else CPU
+import os
+from timeit import default_timer as timer
+from typing import List
+
+from langchain.document_loaders import DirectoryLoader
+from langchain.document_loaders import PyPDFDirectoryLoader
+
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.init import *
+
+
+def load_documents(source_path) -> List:
+    loader = PyPDFDirectoryLoader(source_path, silent_errors=True)
+    documents = loader.load()
+
+    loader = DirectoryLoader(
+        source_path, glob="**/*.html", silent_errors=True, show_progress=True
+    )
+    documents.extend(loader.load())
+    return documents
+
+
+def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return text_splitter.split_documents(documents)
+
+
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )
+
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings
+
+
+# Constants
+device_type, hf_pipeline_device_type = get_device_types()
+hf_embeddings_model_name = (
+    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+)
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+source_path = os.environ.get("SOURCE_PATH")
+chunk_size = os.environ.get("CHUNCK_SIZE")
+chunk_overlap = os.environ.get("CHUNK_OVERLAP")
+
+start = timer()
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=hf_embeddings_model_name, model_kwargs={"device": device_type}
+)
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+
+if not os.path.isdir(index_path):
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
+    os.mkdir(index_path)
+
+    print(f"Loading PDF & HTML files from {source_path}")
+    sources = load_documents(source_path)
+    # print(sources[359])
+
+    print(f"Splitting {len(sources)} HTML pages in to chunks ...")
+
+    chunks = split_chunks(
+        sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
+    )
+    print(chunks[3])
+    print(f"Generating index for {len(chunks)} chunks ...")
+
+    index = generate_index(chunks, embeddings)
+else:
+    print(f"The index persist directory {index_path} is present. Loading index ...")
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )
+    query = "hi"
+    print(f"Load relevant documents for standalone question: {query}")
+
+    start2 = timer()
+    docs = index.as_retriever().get_relevant_documents(query)
+    end = timer()
+
+    print(f"Completed in {end - start2:.3f}s")
+    print(docs)
+
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
test.py CHANGED
@@ -77,7 +77,7 @@ while True:
     end = timer()
     print(f"Completed in {end - start:.3f}s")
 
-    print_llm_response(result)
+    # print_llm_response(result)
 
     if len(chat_history) == 0:
         standalone_question = query