Corentin committed on
Commit
ea4e986
·
1 Parent(s): f8a66c4
.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ data/dnd_spell.pdf filter=lfs diff=lfs merge=lfs -text
36
+ db_spells/index/index_abb61b80-18ff-4301-8b99-627ee62ef944.bin filter=lfs diff=lfs merge=lfs -text
37
+ db_spells/chroma-collections.parquet filter=lfs diff=lfs merge=lfs -text
38
+ db_spells/chroma-embeddings.parquet filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Harrison Chase
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,53 @@
1
  ---
2
- title: Dnd Qa
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: streamlit
7
  sdk_version: 1.19.0
8
- app_file: app.py
9
- pinned: false
10
  license: agpl-3.0
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: DnD QA Bot
3
+ emoji: 🗡️
4
+ colorFrom: purple
5
+ colorTo: yellow
6
  sdk: streamlit
7
  sdk_version: 1.19.0
8
+ app_file: main.py
 
9
  license: agpl-3.0
10
+ python: 3.10.9
11
  ---
12
 
13
+ # 🗡️D&D Spell QA Bot🗡️
14
+
15
+ This is a chatbot that can answer questions about **Dungeons & Dragons spells**, based on this [database](https://www.aidedd.org/dnd-filters/spells-5e.php) and built with LangChain and the OpenAI API. It is useful for finding information quickly instead of browsing through 50 pages of PDF.
16
+ The creator of this bot is **[Corentin Meyer (@corentinm_py)](https://twitter.com/corentinm_py)**.
17
+ 💪 This bot is based on the Notion Question-Answering demo from [LangChain](https://github.com/hwchase17/langchain).
18
+
19
+ # 🌲 Environment Setup
20
+
21
+ To set up your environment and run the code, first install the requirements and then launch the Streamlit app:
22
+
23
+ ```shell
24
+ python -m venv .venv
25
+ source .venv/bin/activate
26
+ pip install -r requirements.txt
27
+ streamlit run main.py
28
+ ```
29
+
30
+ The app reads your OpenAI API key from the environment, so set it in the same shell before running `streamlit run main.py` (if you don't have a key, get one [here](https://beta.openai.com/playground)):
31
+
32
+ ```shell
33
+ export OPENAI_API_KEY=....
34
+ ```
35
+
36
+ ## 🚀 Code to deploy on Streamlit
37
+
38
+ The code to run the Streamlit app is in `main.py`.
39
+ Note that when setting up your Streamlit app, you should make sure to add `OPENAI_API_KEY` as a secret environment variable.
40
+
41
+ ## 🧑 Reproduce the embeddings and the vector store
42
+
43
+ Run the following command to ingest the data.
44
+
45
+ ```shell
46
+ python ingest.py
47
+ ```
48
+
49
+ Boom! Now you're done, and you can ask it questions like:
50
+
51
+ ```shell
52
+ python qa.py "What's the size of tsunami spell ?"
53
+ ```
awk.sh ADDED
@@ -0,0 +1 @@
 
 
1
# Reads dnd_spell.txt and writes dnd_spell_split.txt (both relative to the
# current directory).
# B is a 3-slot ring buffer, so every input line is printed two lines late.
# When the current line starts with "level", an empty line is printed right
# after the delayed line, i.e. just before the line that precedes the
# "level" line. The END block flushes the last two buffered lines.
# NOTE(review): presumably the line before each "level ..." line is the spell
# name and the blank lines are what ingest.py splits on ("\n\n") - confirm.
+ awk '{B[NR%3]=$0} NR>2{ print B[(NR+1)%3]} /^level/ {print ""} END {print B[(NR+2)%3]; print B[(NR+3)%3]}' dnd_spell.txt > dnd_spell_split.txt
data/dnd_spell.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b89814c7fe82a5b318fbf0aace794533ed9a4501e28dec72bb1eccd113b5027
3
+ size 2307284
data/dnd_spell.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/dnd_spell_split.txt ADDED
The diff for this file is too large to render. See raw diff
 
db_spells/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:085aa69d06c9189b9a861beb67583680dc3ada6e910077896432aed1739558c5
3
+ size 557
db_spells/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15ac51b1f91a063e3e4b3ddfd8e22d4fd0df42d527e5b8459f26d0edc486de6
3
+ size 3148551
db_spells/index/id_to_uuid_abb61b80-18ff-4301-8b99-627ee62ef944.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc6dbe52b6e9d5c12281a1d253f9950a4dbaf6cba9d5b421ea0608e1d3817b3
3
+ size 10498
db_spells/index/index_abb61b80-18ff-4301-8b99-627ee62ef944.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ed23c2bb32432023b2665ee2f0eb9b812f24959266df681a128a5aec1679ec
3
+ size 2076836
db_spells/index/index_metadata_abb61b80-18ff-4301-8b99-627ee62ef944.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a774357132affe4f362595e475f07233eed4702f998ad5cd1e3b67455dbaa4
3
+ size 74
db_spells/index/uuid_to_id_abb61b80-18ff-4301-8b99-627ee62ef944.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08c067aeda07cadf4c22b364e1b009e6352806af610923970c00c9bc227b5a9b
3
+ size 12300
ingest.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+
7
# Build the Chroma vector store for the spell corpus and persist it to disk.
persist_directory = "db_spells"

# Read the whole pre-split corpus. The encoding is stated explicitly so the
# result does not depend on the platform's default locale.
with open("data/dnd_spell_split.txt", encoding="utf-8") as f:
    dnd_spell = f.read()

# Chunks are cut on blank lines only.
# NOTE(review): CharacterTextSplitter may merge several blank-line-separated
# entries into one chunk while it stays under chunk_size; in that case only
# the first entry's first line ends up in the "source" metadata - confirm
# this is acceptable.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
)
texts = text_splitter.split_text(dnd_spell)

embeddings = OpenAIEmbeddings()

# The first line of each chunk is used as the label in its "source" string.
first_lines = [chunk.split("\n")[0] for chunk in texts]
metadatas = [
    {"source": f"Spell {first_line} in dnd_spell_split.txt"}
    for first_line in first_lines
]
#%%
docsearch = Chroma.from_texts(
    texts,
    embeddings,
    persist_directory=persist_directory,
    metadatas=metadatas,
)
docsearch.persist()
# Drop the reference to the store once it has been persisted.
docsearch = None
34
+
35
+ # %%
main.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Python file to serve as the frontend"""
2
+ import streamlit as st
3
+ from streamlit_chat import message
4
+
5
+ from langchain.chains import VectorDBQAWithSourcesChain
6
+ from langchain.embeddings.openai import OpenAIEmbeddings
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.prompts.chat import (
10
+ ChatPromptTemplate,
11
+ SystemMessagePromptTemplate,
12
+ HumanMessagePromptTemplate,
13
+ )
14
+
15
st.set_page_config(page_title="D&D 🗡️ Spell QA Bot", page_icon="🗡️")

# Prompt setup: the system message carries the answering rules and the
# retrieved context ({summaries}); the human message carries the question.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.

Example of your response should be:

```
The answer is foo
SOURCES: xyz
```

Begin!
----------------
{summaries}"""
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]
prompt = ChatPromptTemplate.from_messages(messages)
38
+
39
+
40
@st.cache_resource
def load_chroma():
    """Open the Chroma store persisted in ``db_spells``.

    The store is created with an ``OpenAIEmbeddings`` instance as its
    embedding function. The function is wrapped in ``st.cache_resource``.

    Returns:
        The ``Chroma`` vector store object.
    """
    return Chroma(
        persist_directory="db_spells",
        embedding_function=OpenAIEmbeddings(),
    )
48
+
49
+
50
# Wire the question-answering chain: the chat model answers from documents
# retrieved out of the vector store, using the custom prompt defined above.
vectordb = load_chroma()
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
chain_type_kwargs = dict(prompt=prompt)
chain = VectorDBQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    vectorstore=vectordb,
    chain_type_kwargs=chain_type_kwargs,
)
58
+
59
+
60
# Everything below this point is the Streamlit UI.
intro_markdown = """
This is a chatbot that can answer questions about **Dungeon and Dragons spells** based on this [database](https://www.aidedd.org/dnd-filters/spells-5e.php) and built with LangChain and OpenAI API.
The creator of this bot is **[Corentin Meyer (@corentinm_py)](https://twitter.com/corentinm_py)**.
Try by yourself by typing something like: "What's the size of tsunami spell ?"
"""
st.header("D&D 🗡️ Spell QA Bot")
st.markdown(intro_markdown)
69
+
70
# Create the two chat-history lists on first use: "generated" holds the bot's
# replies and "past" holds the user's questions.
for history_key in ("generated", "past"):
    if history_key not in st.session_state:
        st.session_state[history_key] = []
75
+
76
+
77
def get_text():
    """Render the question text box and return its current content.

    The box is labelled "You: ", is pre-filled with an example question and
    is registered under the widget key "input".
    """
    return st.text_input(
        "You: ",
        "What's the size of tsunami spell ?",
        key="input",
    )
82
+
83
+
84
user_input = get_text()

# A non-empty question is sent to the chain; the question and the formatted
# reply are then appended to the two history lists.
if user_input:
    result = chain({"question": user_input}, return_only_outputs=True)
    answer = result["answer"]
    sources = result["sources"]
    output = f"Answer: {answer}\nSources: {sources}"

    st.session_state.past.append(user_input)
    st.session_state.generated.append(output)

# Render the conversation newest-first; each reply is followed by the
# question that produced it. Nothing is drawn while the history is empty.
generated = st.session_state["generated"]
past = st.session_state["past"]
for idx in reversed(range(len(generated))):
    message(generated[idx], key=str(idx))
    message(past[idx], is_user=True, key=str(idx) + "_user")
qa.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ask a question to the database."""
2
+ #%%
3
+ from langchain.chains import VectorDBQAWithSourcesChain
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.vectorstores import Chroma
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.prompts.chat import (
8
+ ChatPromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ )
12
+ import argparse
13
+
14
# Prompt setup: the system message carries the answering rules and the
# retrieved context ({summaries}); the human message carries the question.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.

Example of your response should be:

```
The answer is foo
SOURCES: xyz
```

Begin!
----------------
{summaries}"""
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]
prompt = ChatPromptTemplate.from_messages(messages)
34
+
35
# Parse the command line first, so that `--help` and usage errors are reported
# before the embeddings client, the vector store and the chain are built.
parser = argparse.ArgumentParser(description="Ask a question to the DB.")
parser.add_argument("question", type=str, help="The question to ask the DB")
args = parser.parse_args()

# Open the persisted Chroma store with OpenAI embeddings.
persist_directory = "db_spells"
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Build the QA-with-sources chain around the chat model and the custom prompt.
chain_type_kwargs = {"prompt": prompt}
chain = VectorDBQAWithSourcesChain.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    vectorstore=vectordb,
    chain_type_kwargs=chain_type_kwargs,
)

# Ask the question and print the answer followed by its sources.
result = chain(
    {"question": args.question},
    return_only_outputs=True,
)
print(f"Answer: {result['answer']}")
print(f"Sources: {result['sources']}")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ streamlit
4
+ streamlit-chat
5
+ ipykernel
6
+ tiktoken
7
+ chromadb
8
+ ipykernel