Adrian Cowham committed
Commit e71c4e6
Parent: e8442b6

restarting

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ resources/design-by-fire.pdf filter=lfs diff=lfs merge=lfs -text
+ resources/lets-talk.pdf filter=lfs diff=lfs merge=lfs -text
+ resources/progit.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: green
  sdk: gradio
  sdk_version: 3.40.1
- app_file: app.py
+ app_file: src/app.py
  pinned: false
  ---
resources/design-by-fire.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cf8176ae0f4873ca6547f22ecad1fcc366b170488782087a3be48801721eba0
+ size 1353204
resources/lets-talk.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1545cbf074f8e363200e32f1da24b0319163bfafd2871078c8e750a32b02098b
+ size 4156635
resources/progit.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dea5f1cce14aabd2d3e7246fd4d0e6fe632c13561060bb47eee069b7f257289a
+ size 18915172
src/.DS_Store ADDED
Binary file (6.15 kB)
 
src/app.py ADDED
@@ -0,0 +1,138 @@
+ import os
+ from threading import Lock
+ from typing import Any, Dict, Optional, Tuple
+
+ import gradio as gr
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.prompts.chat import (ChatPromptTemplate,
+                                     HumanMessagePromptTemplate,
+                                     SystemMessagePromptTemplate)
+
+ from .core.chunking import chunk_file
+ from .core.embedding import embed_files
+ from .core.parsing import read_file
+
+ VECTOR_STORE = "faiss"
+ EMBEDDING = "openai"
+ MODEL = "gpt-3.5-turbo-16k"
+ K = 5
+ USE_VERBOSE = True
+ API_KEY = os.environ["OPENAI_API_KEY"]
+
+ system_template = """
+ Use the context below to answer questions. You must only use the Context to answer questions. If I ask you about 'the book' or 'this book' or similar references, then answer using the Context. If you cannot find the answer from the Context below, you must respond with
+ "I'm sorry, but I can't find the answer to your question in the book, 'Design by Fire,' by Emily Elizabeth Schlickman and Brett Milligan." All answers must be in English unless you are explicitly asked to translate to a different language.
+ ----------------
+ {context}
+ {chat_history}
+ """
+
+ # Create the chat prompt templates
+ messages = [
+     SystemMessagePromptTemplate.from_template(system_template),
+     HumanMessagePromptTemplate.from_template("{question}")
+ ]
+ qa_prompt = ChatPromptTemplate.from_messages(messages)
+
+
+ class AnswerConversationBufferMemory(ConversationBufferMemory):
+     # Store only the "answer" output so the chain's extra keys (source documents)
+     # don't confuse the buffer memory.
+     def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
+         return super(AnswerConversationBufferMemory, self).save_context(inputs, {"response": outputs["answer"]})
+
+
+ def getretriever():
+     """Parse, chunk, and embed the book, then return a similarity retriever."""
+     with open("./resources/design-by-fire.pdf", "rb") as uploaded_file:
+         try:
+             file = read_file(uploaded_file)
+         except Exception as e:
+             print(e)
+             raise
+
+     chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
+     folder_index = embed_files(
+         files=[chunked_file],
+         embedding=EMBEDDING,
+         vector_store=VECTOR_STORE,
+         openai_api_key=API_KEY,
+     )
+     return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})
+
+
+ retriever = getretriever()
+
+
+ def getanswer(chain, question, history):
+     if hasattr(chain, "value"):
+         chain = chain.value
+     if hasattr(history, "value"):
+         history = history.value
+     if hasattr(question, "value"):
+         question = question.value
+
+     history = history or []
+     lock = Lock()
+     lock.acquire()
+     try:
+         output = chain({"question": question})
+         output = output["answer"]
+         history.append((question, output))
+     except Exception as e:
+         raise e
+     finally:
+         lock.release()
+     return history, history, gr.update(value="")
+
+
+ def load_chain(inputs=None):
+     llm = ChatOpenAI(
+         openai_api_key=API_KEY,
+         model_name=MODEL,
+         verbose=True)
+     chain = ConversationalRetrievalChain.from_llm(
+         llm,
+         retriever=retriever,
+         return_source_documents=USE_VERBOSE,
+         memory=AnswerConversationBufferMemory(memory_key="chat_history", return_messages=True),
+         verbose=USE_VERBOSE,
+         combine_docs_chain_kwargs={"prompt": qa_prompt})
+     return chain
+
+
+ CSS = """
+ .contain { display: flex; flex-direction: column; }
+ .gradio-container { height: 100vh !important; }
+ #component-0 { height: 100%; }
+ #chatbot { flex-grow: 1; overflow: auto;}
+ """
+
+ with gr.Blocks(css=CSS) as block:
+     with gr.Row():
+         with gr.Column(scale=0.75):
+             with gr.Row():
+                 gr.Markdown("<h1>Design by Fire</h1>")
+             with gr.Row():
+                 gr.Markdown("by Emily Elizabeth Schlickman and Brett Milligan")
+             chatbot = gr.Chatbot(elem_id="chatbot").style(height=600)
+
+             with gr.Row():
+                 message = gr.Textbox(
+                     label="",
+                     placeholder="Design by Fire",
+                     lines=1,
+                 )
+             with gr.Row():
+                 submit = gr.Button(value="Send", variant="primary", scale=1)
+
+             state = gr.State()
+             chain_state = gr.State(load_chain)
+
+             submit.click(getanswer, inputs=[chain_state, message, state], outputs=[chatbot, state, message])
+             message.submit(getanswer, inputs=[chain_state, message, state], outputs=[chatbot, state, message])
+
+         with gr.Column(scale=0.25):
+             with gr.Row():
+                 gr.Markdown("<h1><center>Suggestions</center></h1>")
+             ex1 = gr.Button(value="What are the main factors and trends discussed in the book that contribute to the changing behavior of wildfires?", variant="primary")
+             ex1.click(getanswer, inputs=[chain_state, ex1, state], outputs=[chatbot, state, message])
+             ex2 = gr.Button(value="How does the book explore the relationship between fire and different landscapes, such as wilderness and urban areas?", variant="primary")
+             ex2.click(getanswer, inputs=[chain_state, ex2, state], outputs=[chatbot, state, message])
+             ex3 = gr.Button(value="What are the three approaches to designing with fire that the book presents?", variant="primary")
+             ex3.click(getanswer, inputs=[chain_state, ex3, state], outputs=[chatbot, state, message])
+
+ block.launch(debug=True)
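
The custom memory class above exists because the chain is built with return_source_documents=True, so each call returns both an "answer" and a "source_documents" key, and the stock ConversationBufferMemory cannot tell which output to buffer. A minimal sketch of that behavior, assuming only langchain is installed; the AnswerOnlyMemory name and the toy inputs are illustrative and not part of this commit:

from langchain.memory import ConversationBufferMemory

class AnswerOnlyMemory(ConversationBufferMemory):
    # Same idea as AnswerConversationBufferMemory in src/app.py: keep only the
    # "answer" output so the multi-key chain result can be stored.
    def save_context(self, inputs, outputs):
        super().save_context(inputs, {"response": outputs["answer"]})

memory = AnswerOnlyMemory(memory_key="chat_history", return_messages=True)
memory.save_context(
    {"question": "What is pyrophilic design?"},
    {"answer": "A stance that works with fire.", "source_documents": []},
)
print(memory.load_memory_variables({}))  # chat_history with one human/AI turn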
src/components/__init__.py ADDED
File without changes
src/components/faq.py ADDED
@@ -0,0 +1,46 @@
+ # flake8: noqa
+ import streamlit as st
+
+
+ def faq():
+     st.markdown(
+         """
+         # FAQ
+         ## How does KnowledgeGPT work?
+         When you upload a document, it will be divided into smaller chunks
+         and stored in a special type of database called a vector index
+         that allows for semantic search and retrieval.
+
+         When you ask a question, KnowledgeGPT will search through the
+         document chunks and find the most relevant ones using the vector index.
+         Then, it will use GPT-3 to generate a final answer.
+
+         ## Is my data safe?
+         Yes, your data is safe. KnowledgeGPT does not store your documents or
+         questions. All uploaded data is deleted after you close the browser tab.
+
+         ## Why does it take so long to index my document?
+         If you are using a free OpenAI API key, it will take a while to index
+         your document. This is because the free API key has strict [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview).
+         To speed up the indexing process, you can use a paid API key.
+
+         ## What do the numbers mean under each source?
+         For a PDF document, you will see a citation number like this: 3-12.
+         The first number is the page number and the second number is
+         the chunk number on that page. For DOCX and TXT documents,
+         the first number is set to 1 and the second number is the chunk number.
+
+         ## Are the answers 100% accurate?
+         No, the answers are not 100% accurate. KnowledgeGPT uses GPT-3 to generate
+         answers. GPT-3 is a powerful language model, but it sometimes makes mistakes
+         and is prone to hallucinations. Also, KnowledgeGPT uses semantic search
+         to find the most relevant chunks and does not see the entire document,
+         which means that it may not be able to find all the relevant information and
+         may not be able to answer all questions (especially summary-type questions
+         or questions that require a lot of context from the document).
+
+         But for most use cases, KnowledgeGPT is very accurate and can answer
+         most questions. Always check with the sources to make sure that the answers
+         are correct.
+         """
+     )
src/core/__init__.py ADDED
File without changes
src/core/caching.py ADDED
@@ -0,0 +1,32 @@
+ import canonical_demo_memory.core.chunking as chunking
+ import canonical_demo_memory.core.embedding as embedding
+ import canonical_demo_memory.core.parsing as parsing
+ import streamlit as st
+ from canonical_demo_memory.core.parsing import File
+ from streamlit.runtime.caching.hashing import HashFuncsDict
+
+
+ def file_hash_func(file: File) -> str:
+     """Get a unique hash for a file"""
+     return file.id
+
+
+ @st.cache_data(show_spinner=False)
+ def bootstrap_caching():
+     """Patch module functions with caching"""
+
+     # Get all subtypes of File from module
+     file_subtypes = [
+         cls
+         for cls in vars(parsing).values()
+         if isinstance(cls, type) and issubclass(cls, File) and cls != File
+     ]
+     file_hash_funcs: HashFuncsDict = {cls: file_hash_func for cls in file_subtypes}
+
+     parsing.read_file = st.cache_data(show_spinner=False)(parsing.read_file)
+     chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(
+         chunking.chunk_file
+     )
+     embedding.embed_files = st.cache_data(
+         show_spinner=False, hash_funcs=file_hash_funcs
+     )(embedding.embed_files)
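
bootstrap_caching wraps the parsing, chunking, and embedding entry points in st.cache_data, supplying hash_funcs so Streamlit keys cache entries on File.id rather than trying to hash the File object itself. A small sketch of the same pattern on a toy type, assuming only streamlit is installed; Blob and expensive_parse are illustrative names, not part of the commit:

import streamlit as st

class Blob:
    def __init__(self, id: str, data: bytes):
        self.id = id
        self.data = data

def expensive_parse(blob: Blob) -> int:
    return len(blob.data)

# Key the cache on blob.id, mirroring file_hash_func above.
expensive_parse = st.cache_data(
    show_spinner=False, hash_funcs={Blob: lambda b: b.id}
)(expensive_parse)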
src/core/chunking.py ADDED
@@ -0,0 +1,61 @@
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from .parsing import File
+
+
+ def chunk_sentences(sentences, chunk_size=512):
+     sents = []
+     current_sent = ""
+
+     for sentence in sentences:
+         # If adding the next sentence doesn't exceed the chunk_size,
+         # we add the sentence to the current chunk.
+         if len(current_sent) + len(sentence) <= chunk_size:
+             current_sent += " " + sentence
+         else:
+             # If adding the sentence would make the chunk too long,
+             # we add the current_sent chunk to the list of chunks and start a new chunk.
+             sents.append(current_sent)
+             current_sent = sentence
+
+     # After going through all the sentences, there may be a chunk that hasn't yet been added to the list.
+     # We add it now:
+     if current_sent:
+         sents.append(current_sent)
+
+     return sents
+
+
+ def chunk_file(
+     file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
+ ) -> File:
+     """Chunks each document in a file into smaller documents
+     according to the specified chunk size and overlap,
+     where the size is measured in tokens for the specified model.
+     """
+
+     # split each document into chunks
+     chunked_docs = []
+     for doc in file.docs:
+         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+             model_name=model_name,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+         )
+
+         chunks = text_splitter.split_text(doc.page_content)
+
+         for i, chunk in enumerate(chunks):
+             # Use a distinct name so the source document isn't shadowed mid-loop.
+             chunk_doc = Document(
+                 page_content=chunk,
+                 metadata={
+                     "page": doc.metadata.get("page", 1),
+                     "chunk": i + 1,
+                     "source": f"{doc.metadata.get('page', 1)}-{i + 1}",
+                 },
+             )
+             chunked_docs.append(chunk_doc)
+
+     chunked_file = file.copy()
+     chunked_file.docs = chunked_docs
+     return chunked_file
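
chunk_sentences is a standalone helper (chunk_file itself relies on RecursiveCharacterTextSplitter): it packs sentences greedily into chunks of roughly chunk_size characters. A quick illustration with made-up sentences:

sentences = [
    "Fire regimes are shifting.",
    "Design can anticipate rather than react.",
    "The book surveys defensive, adaptive, and pyrophilic approaches.",
]
# With an 80-character budget the first two sentences share a chunk and the
# third starts a new one.
print(chunk_sentences(sentences, chunk_size=80))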
src/core/debug.py ADDED
@@ -0,0 +1,49 @@
+ from typing import Any, Iterable, List, Optional
+
+ from langchain.chat_models.fake import FakeListChatModel
+ from langchain.docstore.document import Document
+ from langchain.embeddings.base import Embeddings
+ from langchain.embeddings.fake import FakeEmbeddings as FakeEmbeddingsBase
+ from langchain.vectorstores import VectorStore
+
+
+ class FakeChatModel(FakeListChatModel):
+     def __init__(self, **kwargs):
+         responses = ["The answer is 42. SOURCES: 1, 2, 3, 4"]
+         super().__init__(responses=responses, **kwargs)
+
+
+ class FakeEmbeddings(FakeEmbeddingsBase):
+     def __init__(self, **kwargs):
+         super().__init__(size=4, **kwargs)
+
+
+ class FakeVectorStore(VectorStore):
+     """Fake vector store for testing purposes."""
+
+     def __init__(self, texts: List[str]):
+         self.texts: List[str] = texts
+
+     def add_texts(
+         self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any
+     ) -> List[str]:
+         self.texts.extend(texts)
+         return self.texts
+
+     @classmethod
+     def from_texts(
+         cls,
+         texts: List[str],
+         embedding: Embeddings,
+         metadatas: Optional[List[dict]] = None,
+         **kwargs: Any,
+     ) -> "FakeVectorStore":
+         return cls(texts=list(texts))
+
+     def similarity_search(
+         self, query: str, k: int = 4, **kwargs: Any
+     ) -> List[Document]:
+         return [
+             Document(page_content=text, metadata={"source": f"{i + 1}-{1}"})
+             for i, text in enumerate(self.texts)
+         ]
src/core/embedding.py ADDED
@@ -0,0 +1,76 @@
+ from typing import List, Type
+
+ from langchain.docstore.document import Document
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.embeddings.base import Embeddings
+ from langchain.vectorstores import VectorStore
+ from langchain.vectorstores.faiss import FAISS
+
+ from .debug import FakeEmbeddings, FakeVectorStore
+ from .parsing import File
+
+
+ class FolderIndex:
+     """Index for a collection of files (a folder)"""
+
+     def __init__(self, files: List[File], index: VectorStore):
+         self.name: str = "default"
+         self.files = files
+         self.index: VectorStore = index
+
+     @staticmethod
+     def _combine_files(files: List[File]) -> List[Document]:
+         """Combines all the documents in a list of files into a single list."""
+
+         all_texts = []
+         for file in files:
+             for doc in file.docs:
+                 doc.metadata["file_name"] = file.name
+                 doc.metadata["file_id"] = file.id
+                 all_texts.append(doc)
+
+         return all_texts
+
+     @classmethod
+     def from_files(
+         cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
+     ) -> "FolderIndex":
+         """Creates an index from files."""
+
+         all_docs = cls._combine_files(files)
+
+         index = vector_store.from_documents(
+             documents=all_docs,
+             embedding=embeddings,
+         )
+
+         return cls(files=files, index=index)
+
+
+ def embed_files(
+     files: List[File], embedding: str, vector_store: str, **kwargs
+ ) -> FolderIndex:
+     """Embeds a collection of files and stores them in a FolderIndex."""
+
+     supported_embeddings: dict[str, Type[Embeddings]] = {
+         "openai": OpenAIEmbeddings,
+         "debug": FakeEmbeddings,
+     }
+     supported_vector_stores: dict[str, Type[VectorStore]] = {
+         "faiss": FAISS,
+         "debug": FakeVectorStore,
+     }
+
+     if embedding in supported_embeddings:
+         _embeddings = supported_embeddings[embedding](**kwargs)
+     else:
+         raise NotImplementedError(f"Embedding {embedding} not supported.")
+
+     if vector_store in supported_vector_stores:
+         _vector_store = supported_vector_stores[vector_store]
+     else:
+         raise NotImplementedError(f"Vector store {vector_store} not supported.")
+
+     return FolderIndex.from_files(
+         files=files, embeddings=_embeddings, vector_store=_vector_store
+     )
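
embed_files dispatches on string names, and the "debug" entries resolve to the fakes in debug.py, which should make it possible to exercise the indexing path without an OpenAI key. A sketch under that assumption; DummyFile is an illustrative stand-in for the File subclasses in parsing.py and is not part of the commit:

from langchain.docstore.document import Document

class DummyFile:
    def __init__(self, name, id, docs):
        self.name, self.id, self.docs = name, id, docs

doc = Document(page_content="Prescribed burns reduce fuel loads.")
index = embed_files(files=[DummyFile("notes.txt", "file-1", [doc])],
                    embedding="debug", vector_store="debug")
print(index.index.similarity_search("fuel", k=1))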
src/core/parsing.py ADDED
@@ -0,0 +1,105 @@
+ import re
+ from abc import ABC, abstractmethod
+ from copy import deepcopy
+ from hashlib import md5
+ from io import BytesIO
+ from typing import Any, List, Optional
+
+ import docx2txt
+ import fitz
+ from langchain.docstore.document import Document
+
+
+ class File(ABC):
+     """Represents an uploaded file comprised of Documents"""
+
+     def __init__(
+         self,
+         name: str,
+         id: str,
+         metadata: Optional[dict[str, Any]] = None,
+         docs: Optional[List[Document]] = None,
+     ):
+         self.name = name
+         self.id = id
+         self.metadata = metadata or {}
+         self.docs = docs or []
+
+     @classmethod
+     @abstractmethod
+     def from_bytes(cls, file: BytesIO) -> "File":
+         """Creates a File from a BytesIO object"""
+
+     def __repr__(self) -> str:
+         return (
+             f"File(name={self.name}, id={self.id},"
+             f" metadata={self.metadata}, docs={self.docs})"
+         )
+
+     def __str__(self) -> str:
+         return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"
+
+     def copy(self) -> "File":
+         """Create a deep copy of this File"""
+         return self.__class__(
+             name=self.name,
+             id=self.id,
+             metadata=deepcopy(self.metadata),
+             docs=deepcopy(self.docs),
+         )
+
+
+ def strip_consecutive_newlines(text: str) -> str:
+     """Strips consecutive newlines from a string
+     possibly with whitespace in between
+     """
+     return re.sub(r"\s*\n\s*", "\n", text)
+
+
+ class DocxFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "DocxFile":
+         text = docx2txt.process(file)
+         text = strip_consecutive_newlines(text)
+         doc = Document(page_content=text.strip())
+         # docx2txt leaves the file pointer at the end, so rewind before hashing.
+         file.seek(0)
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
+
+
+ class PdfFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "PdfFile":
+         pdf = fitz.open(stream=file.read(), filetype="pdf")  # type: ignore
+         docs = []
+         for i, page in enumerate(pdf):
+             text = page.get_text(sort=True)
+             text = strip_consecutive_newlines(text)
+             doc = Document(page_content=text.strip())
+             doc.metadata["page"] = i + 1
+             docs.append(doc)
+         # file.read() mutates the file object, which can affect caching
+         # so we need to reset the file pointer to the beginning
+         file.seek(0)
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)
+
+
+ class TxtFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "TxtFile":
+         text = file.read().decode("utf-8")
+         text = strip_consecutive_newlines(text)
+         file.seek(0)
+         doc = Document(page_content=text.strip())
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
+
+
+ def read_file(file: BytesIO) -> File:
+     """Reads an uploaded file and returns a File object"""
+     if file.name.lower().endswith(".docx"):
+         return DocxFile.from_bytes(file)
+     elif file.name.lower().endswith(".pdf"):
+         return PdfFile.from_bytes(file)
+     elif file.name.lower().endswith(".txt"):
+         return TxtFile.from_bytes(file)
+     else:
+         raise NotImplementedError(f"File type {file.name.split('.')[-1]} not supported")
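
read_file dispatches on the .name of the file-like object, which app.py satisfies by passing an open file handle; a bare BytesIO has no name, so a sketch that feeds in-memory text needs to attach one. The buffer contents and filename below are illustrative only:

from io import BytesIO

buf = BytesIO(b"A short note about fire-adapted design.")
buf.name = "note.txt"  # hypothetical filename so read_file picks TxtFile
parsed = read_file(buf)
print(parsed.id, parsed.docs[0].page_content)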
src/core/prompts.py ADDED
@@ -0,0 +1,31 @@
+ # flake8: noqa
+ from langchain.prompts import PromptTemplate
+
+ ## Use a shorter template to reduce the number of tokens in the prompt
+ template = """Create a final answer to the given questions using the provided document excerpts (in no particular order) as references. ALWAYS include a "SOURCES" section in your answer including only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not know. Do not attempt to fabricate an answer and leave the SOURCES section empty.
+
+ ---------
+
+ QUESTION: What is the purpose of ARPA-H?
+ =========
+ Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
+ Source: 1-32
+ Content: While we’re at it, let’s make sure every American can get the health care they need. \n\nWe’ve already made historic investments in health care. \n\nWe’ve made it easier for Americans to get the care they need, when they need it. \n\nWe’ve made it easier for Americans to get the treatments they need, when they need them. \n\nWe’ve made it easier for Americans to get the medications they need, when they need them.
+ Source: 1-33
+ Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat’s why I’m calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
+ Source: 1-30
+ =========
+ FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
+ SOURCES: 1-32
+
+ ---------
+
+ QUESTION: {question}
+ =========
+ {summaries}
+ =========
+ FINAL ANSWER:"""
+
+ STUFF_PROMPT = PromptTemplate(
+     template=template, input_variables=["summaries", "question"]
+ )
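
STUFF_PROMPT leaves two slots, {summaries} and {question}, which the stuff chain fills with the retrieved excerpts and the user's query. A formatting sketch with an invented excerpt, just to show the expected shape of the inputs:

filled = STUFF_PROMPT.format(
    question="What does the book mean by pyrophilic design?",
    summaries=(
        "Content: Pyrophilic design treats fire as a recurring landscape process.\n"
        "Source: 12-3"
    ),
)
print(filled)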
src/core/qa.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Any, List
+
+ from canonical_demo_memory.core.debug import FakeChatModel
+ from canonical_demo_memory.core.embedding import FolderIndex
+ from canonical_demo_memory.core.prompts import STUFF_PROMPT
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.docstore.document import Document
+ from pydantic import BaseModel
+
+
+ class AnswerWithSources(BaseModel):
+     answer: str
+     sources: List[Document]
+
+
+ def query_folder(
+     query: str,
+     folder_index: FolderIndex,
+     return_all: bool = False,
+     model: str = "openai",
+     **model_kwargs: Any,
+ ) -> AnswerWithSources:
+     """Queries a folder index for an answer.
+
+     Args:
+         query (str): The query to search for.
+         folder_index (FolderIndex): The folder index to search.
+         return_all (bool): Whether to return all the documents from the embedding or
+             just the sources for the answer.
+         model (str): The model to use for the answer generation.
+         **model_kwargs (Any): Keyword arguments for the model.
+
+     Returns:
+         AnswerWithSources: The answer and the source documents.
+     """
+     supported_models = {
+         "openai": ChatOpenAI,
+         "debug": FakeChatModel,
+     }
+
+     if model in supported_models:
+         llm = supported_models[model](**model_kwargs)
+     else:
+         raise ValueError(f"Model {model} not supported.")
+
+     chain = load_qa_with_sources_chain(
+         llm=llm,
+         chain_type="stuff",
+         prompt=STUFF_PROMPT,
+     )
+
+     relevant_docs = folder_index.index.similarity_search(query, k=5)
+     result = chain(
+         {"input_documents": relevant_docs, "question": query}, return_only_outputs=True
+     )
+     sources = relevant_docs
+
+     if not return_all:
+         sources = get_sources(result["output_text"], folder_index)
+
+     answer = result["output_text"].split("SOURCES: ")[0]
+
+     return AnswerWithSources(answer=answer, sources=sources)
+
+
+ def get_sources(answer: str, folder_index: FolderIndex) -> List[Document]:
+     """Retrieves the documents that were used to generate the answer."""
+
+     source_keys = answer.split("SOURCES: ")[-1].split(", ")
+
+     source_docs = []
+     for file in folder_index.files:
+         for doc in file.docs:
+             if doc.metadata["source"] in source_keys:
+                 source_docs.append(doc)
+
+     return source_docs
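
get_sources maps the model's trailing "SOURCES:" list back onto document metadata, which is why chunking.py stamps every chunk with a page-chunk source key. A self-contained sketch with toy stand-ins for File and FolderIndex (illustrative names and text, not part of the commit):

from langchain.docstore.document import Document

class ToyFile:
    def __init__(self, docs):
        self.docs = docs

class ToyFolderIndex:
    def __init__(self, files):
        self.files = files

docs = [
    Document(page_content="ARPA-H will drive breakthroughs.", metadata={"source": "1-32"}),
    Document(page_content="Unrelated excerpt.", metadata={"source": "1-33"}),
]
answer = "The purpose of ARPA-H is to drive breakthroughs.\nSOURCES: 1-32"
print(get_sources(answer, ToyFolderIndex([ToyFile(docs)])))  # only the 1-32 chunk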