darthPanda committed
Commit b7bb8ad
1 Parent(s): d9b26bd
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Nima Mahmoudi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -9,4 +9,4 @@ app_file: app.py
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/embed_pdf.cpython-310.pyc ADDED
Binary file (2.8 kB)
__pycache__/llm_helper.cpython-310.pyc ADDED
Binary file (10.2 kB)
agent_helper.py ADDED
@@ -0,0 +1,67 @@
+ # from langchain.callbacks import StreamlitCallbackHandler
+ from langchain.callbacks.streamlit.streamlit_callback_handler import StreamlitCallbackHandler
+ from tenacity import retry, wait_exponential, stop_after_attempt, RetryError
+
+ def bind_logger(toolClass):
+     class newToolClass(toolClass):
+         def __init__(self, tool_name: str, st_cb: StreamlitCallbackHandler, *args, **kwargs):
+             super().__init__(*args, **kwargs)
+             self.st_cb = st_cb
+             self.tool_name = tool_name
+
+         def run(self, *args, **kwargs):
+             print(f"Running {toolClass.__name__} {[*args]}, {kwargs}")
+
+             if self.st_cb._current_thought is None:
+                 self.st_cb.on_llm_start({}, [])
+
+             args_str = ' '.join(args) + ' ' + ' '.join([f'{k}=`{v}`' for k, v in kwargs.items()])
+             self.st_cb.on_tool_start({'name': self.tool_name}, args_str)
+
+             try:
+                 ret_val = retry(
+                     wait=wait_exponential(min=2, max=20),
+                     stop=stop_after_attempt(5),
+                 )(super().run)(*args, **kwargs)
+                 self.st_cb.on_tool_end(ret_val)
+                 return ret_val
+             except RetryError as e:
+                 # .exception() returns the underlying error without re-raising it
+                 original_exception = e.last_attempt.exception()
+                 print(f"Exception {original_exception} in {toolClass.__name__} {[*args]}, {kwargs}")
+                 raise original_exception
+
+     return newToolClass
+
+ from functools import wraps
+
+ def retry_and_streamlit_callback(st_cb: StreamlitCallbackHandler, tool_name: str):
+     if st_cb is None:
+         return lambda x: x
+
+     def decorator(tool_func):
+         @wraps(tool_func)
+         def decorated_func(*args, **kwargs):
+             print(f"Running {tool_name} {args}, {kwargs}")
+
+             if st_cb._current_thought is None:
+                 st_cb.on_llm_start({}, [])
+
+             args_str = ' '.join(args) + ' ' + ' '.join([f'{k}=`{v}`' for k, v in kwargs.items()])
+             st_cb.on_tool_start({'name': tool_name}, args_str)
+
+             @retry(wait=wait_exponential(min=2, max=20), stop=stop_after_attempt(5))
+             def retry_wrapper():
+                 return tool_func(*args, **kwargs)
+
+             try:
+                 ret_val = retry_wrapper()
+                 st_cb.on_tool_end(ret_val)
+                 return ret_val
+             except Exception as e:
+                 print(f"Exception {e} in {tool_name} {args}, {kwargs}")
+                 raise e
+
+         return decorated_func
+
+     return decorator
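
Note: a minimal sketch of how retry_and_streamlit_callback could wrap a plain tool function. The echo tool and its container are hypothetical, shown only to illustrate the decorator's call shape:

import streamlit as st
from langchain.callbacks.streamlit.streamlit_callback_handler import StreamlitCallbackHandler
from agent_helper import retry_and_streamlit_callback

st_cb = StreamlitCallbackHandler(st.container())

@retry_and_streamlit_callback(st_cb=st_cb, tool_name="Echo Tool")
def echo(query: str) -> str:
    # hypothetical tool body; a real tool would query an index or an API
    return f"You said: {query}"

st.write(echo("hello"))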
app-agent.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+
+ from langchain.agents import initialize_agent, AgentType
+ from langchain.callbacks import StreamlitCallbackHandler
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+ from llm_helper import get_agent_chain, get_lc_oai_tools
+
+ with st.sidebar:
+     openai_api_key = st.secrets["OPENAI_API_KEY"]
+     "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
+     "[View the source code](https://github.com/streamlit/llm-examples/blob/main/pages/2_Chat_with_search.py)"
+     "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"
+
+ st.title("🔎 LangChain - Chat with search")
+
+ """
+ In this example, we're using `StreamlitCallbackHandler` to display the thoughts and actions of an agent in an interactive Streamlit app.
+ Try more LangChain 🤝 Streamlit Agent examples at [github.com/langchain-ai/streamlit-agent](https://github.com/langchain-ai/streamlit-agent).
+ """
+
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = [
+         {"role": "assistant", "content": "Hi, I'm a chatbot who can search the web. How can I help you?"}
+     ]
+
+ for msg in st.session_state.messages:
+     st.chat_message(msg["role"]).write(msg["content"])
+
+ if prompt := st.chat_input(placeholder="Who won the Women's U.S. Open in 2018?"):
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     st.chat_message("user").write(prompt)
+
+     if not openai_api_key:
+         st.info("Please add your OpenAI API key to continue.")
+         st.stop()
+
+     llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", openai_api_key=openai_api_key, streaming=True)
+     lc_tools, _ = get_lc_oai_tools()
+     search_agent = initialize_agent(lc_tools, llm, agent=AgentType.OPENAI_FUNCTIONS, handle_parsing_errors=True, verbose=True)
+
+     agent_prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", "You are a helpful assistant, use the search tool to answer the user's question and cite only the page number when you use information coming (like [p1]) from the source document. Always use the content from the source document to answer the user's question. If you need to compare multiple subjects, search them one by one."),
+             ("user", "{input}"),
+             MessagesPlaceholder(variable_name="agent_scratchpad"),
+         ]
+     )
+     search_agent.agent.prompt = agent_prompt
+     with st.chat_message("assistant"):
+         st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=False)
+         response = search_agent.run(prompt, callbacks=[st_cb])
+         # search_agent = get_agent_chain(callbacks=[st_cb])
+         # response = search_agent.invoke({"input": prompt})
+         # response = response["output"]
+
+         st.session_state.messages.append({"role": "assistant", "content": response})
+         st.write(response)
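
Note: the chat loop above follows the standard Streamlit session-state idiom. A stripped-down sketch with the agent call replaced by a placeholder echo (no LLM, no tools; names are illustrative):

import streamlit as st

if "messages" not in st.session_state:
    st.session_state["messages"] = []

# replay prior turns on every rerun
for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input("Say something"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)
    reply = f"echo: {prompt}"  # placeholder for the agent call
    st.session_state.messages.append({"role": "assistant", "content": reply})
    st.chat_message("assistant").write(reply)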
app-agent2.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+
+ from langchain.agents import initialize_agent, AgentType
+ from langchain.callbacks import StreamlitCallbackHandler
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+ from langchain.agents.format_scratchpad import format_to_openai_function_messages
+ from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
+
+ from llm_helper import get_agent_chain, get_lc_oai_tools, convert_message
+ from langchain.agents import AgentExecutor
+
+ with st.sidebar:
+     openai_api_key = st.secrets["OPENAI_API_KEY"]
+     "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
+     "[View the source code](https://github.com/streamlit/llm-examples/blob/main/pages/2_Chat_with_search.py)"
+     "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"
+
+ st.title("🔎 LangChain - Chat with search")
+
+ """
+ In this example, we're using `StreamlitCallbackHandler` to display the thoughts and actions of an agent in an interactive Streamlit app.
+ Try more LangChain 🤝 Streamlit Agent examples at [github.com/langchain-ai/streamlit-agent](https://github.com/langchain-ai/streamlit-agent).
+ """
+
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = [
+         {"role": "assistant", "content": "Hi, I'm a chatbot who can search the web. How can I help you?"}
+     ]
+
+ for msg in st.session_state.messages:
+     st.chat_message(msg["role"]).write(msg["content"])
+
+ if prompt := st.chat_input(placeholder="Who won the Women's U.S. Open in 2018?"):
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     st.chat_message("user").write(prompt)
+
+     if not openai_api_key:
+         st.info("Please add your OpenAI API key to continue.")
+         st.stop()
+
+     if "messages" in st.session_state:
+         chat_history = [convert_message(m) for m in st.session_state.messages[:-1]]
+     else:
+         chat_history = []
+
+     with st.chat_message("assistant"):
+         st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=False)
+         agent = get_agent_chain(st_cb=st_cb)
+
+         response = agent.invoke({
+             "input": prompt,
+             "chat_history": chat_history,
+         })
+         response = response["output"]
+
+         st.session_state.messages.append({"role": "assistant", "content": response})
+         st.write(response)
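
Note: convert_message, imported above from llm_helper, maps session-state dicts into LangChain message objects before they are passed as chat_history. A quick illustration:

from langchain.schema.messages import HumanMessage, AIMessage
from llm_helper import convert_message

history = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello!"},
]
messages = [convert_message(m) for m in history]
assert isinstance(messages[0], HumanMessage)
assert isinstance(messages[1], AIMessage)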
app.py ADDED
@@ -0,0 +1,168 @@
+ import streamlit as st
+ import os
+ import embed_pdf
+ import shutil
+
+ def clear_directory(directory):
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         try:
+             if os.path.isfile(file_path) or os.path.islink(file_path):
+                 os.unlink(file_path)
+             elif os.path.isdir(file_path):
+                 shutil.rmtree(file_path)
+         except Exception as e:
+             print(f'Failed to delete {file_path}. Reason: {e}')
+
+ def clear_pdf_files(directory):
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         try:
+             if os.path.isfile(file_path) and file_path.endswith('.pdf'):
+                 os.remove(file_path)
+         except Exception as e:
+             print(f'Failed to delete {file_path}. Reason: {e}')
+
+ # clear_pdf_files("pdf")
+ # clear_directory("index")
+
+
+ # create sidebar and ask for openai api key if not set in secrets
+ secrets_file_path = os.path.join(".streamlit", "secrets.toml")
+ # if os.path.exists(secrets_file_path):
+ #     try:
+ #         if "OPENAI_API_KEY" in st.secrets:
+ #             os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
+ #         else:
+ #             print("OpenAI API Key not found in environment variables")
+ #     except FileNotFoundError:
+ #         print('Secrets file not found')
+ # else:
+ #     print('Secrets file not found')
+
+ # if not os.getenv('OPENAI_API_KEY', '').startswith("sk-"):
+ #     os.environ["OPENAI_API_KEY"] = st.sidebar.text_input(
+ #         "OpenAI API Key", type="password"
+ #     )
+ # else:
+ #     if st.sidebar.button("Embed Documents"):
+ #         st.sidebar.info("Embedding documents...")
+ #         try:
+ #             embed_pdf.embed_all_pdf_docs()
+ #             st.sidebar.info("Done!")
+ #         except Exception as e:
+ #             st.sidebar.error(e)
+ #             st.sidebar.error("Failed to embed documents.")
+
+ os.environ["OPENAI_API_KEY"] = st.sidebar.text_input(
+     "OpenAI API Key", type="password"
+ )
+
+ uploaded_file = st.sidebar.file_uploader("Upload Document", type=['pdf', 'docx'], disabled=False)
+
+ if uploaded_file is None:
+     file_uploaded_bool = False
+ else:
+     file_uploaded_bool = True
+
+ if st.sidebar.button("Embed Documents", disabled=not file_uploaded_bool):
+     st.sidebar.info("Embedding documents...")
+     try:
+         embed_pdf.embed_all_inputed_pdf_docs(uploaded_file)
+         # embed_pdf.embed_all_pdf_docs()
+         st.sidebar.info("Done!")
+     except Exception as e:
+         st.sidebar.error(e)
+         st.sidebar.error("Failed to embed documents.")
+
+ # create the app
+ st.title("Chat with your PDF")
+
+ # chosen_file = st.radio(
+ #     "Choose a file to search", embed_pdf.get_all_index_files(), index=0
+ # )
+
+ # check if openai api key is set
+ if not os.getenv('OPENAI_API_KEY', '').startswith("sk-"):
+     st.warning("Please enter your OpenAI API key!", icon="⚠")
+     st.stop()
+
+ # load the agent
+ from llm_helper import convert_message, get_rag_chain, get_rag_fusion_chain
+
+ rag_method_map = {
+     'Basic RAG': get_rag_chain,
+     'RAG Fusion': get_rag_fusion_chain
+ }
+ chosen_rag_method = st.radio(
+     "Choose a RAG method", rag_method_map.keys(), index=0
+ )
+ get_rag_chain_func = rag_method_map[chosen_rag_method]
+ ## get the chain WITHOUT the retrieval callback (not used)
+ # custom_chain = get_rag_chain_func(chosen_file)
+
+ # create the message history state
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # render older messages
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # render the chat input
+ prompt = st.chat_input("Enter your message...")
+ if prompt:
+     st.session_state.messages.append({"role": "user", "content": prompt})
+
+     # render the user's new message
+     with st.chat_message("user"):
+         st.markdown(prompt)
+
+     # render the assistant's response
+     with st.chat_message("assistant"):
+         retrival_container = st.container()
+         message_placeholder = st.empty()
+
+         # retrieval_status = retrival_container.status("**Context Retrieval**")
+         queried_questions = []
+         rendered_questions = set()
+         def update_retrieval_status():
+             for q in queried_questions:
+                 if q in rendered_questions:
+                     continue
+                 rendered_questions.add(q)
+                 # retrieval_status.markdown(f"\n\n`- {q}`")
+                 retrival_container.markdown(f"\n\n`- {q}`")
+         def retrieval_cb(qs):
+             for q in qs:
+                 if q not in queried_questions:
+                     queried_questions.append(q)
+             return qs
+
+         # get the chain with the retrieval callback
+         custom_chain = get_rag_chain_func(uploaded_file.name, retrieval_cb=retrieval_cb)
+
+         if "messages" in st.session_state:
+             chat_history = [convert_message(m) for m in st.session_state.messages[:-1]]
+         else:
+             chat_history = []
+
+         full_response = ""
+         for response in custom_chain.stream(
+             {"input": prompt, "chat_history": chat_history}
+         ):
+             if "output" in response:
+                 full_response += response["output"]
+             else:
+                 full_response += response.content
+
+             message_placeholder.markdown(full_response + "▌")
+             update_retrieval_status()
+
+         # retrival_container.update(state="complete")
+         # retrieval_status.update(state="complete")
+         message_placeholder.markdown(full_response)
+
+     # add the full response to the message history
+     st.session_state.messages.append({"role": "assistant", "content": full_response})
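
Note: the retrieval_cb hook above is a passthrough that records queries as the chain executes. The same idea in isolation, with a stand-in retriever (fake_retriever is illustrative only):

queried = []

def retrieval_cb(qs):
    # record each query as it flows through the pipeline, then pass it on unchanged
    for q in qs:
        if q not in queried:
            queried.append(q)
    return qs

def fake_retriever(qs):
    return [f"doc for {q}" for q in qs]

docs = fake_retriever(retrieval_cb(["what is serverless?", "faas vs baas"]))
print(queried)  # ['what is serverless?', 'faas vs baas']
print(docs)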
embed_pdf.py ADDED
@@ -0,0 +1,95 @@
+ from langchain.document_loaders import PagedPDFSplitter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+
+ import os
+
+
+ def embed_document(file_name, file_folder="pdf", embedding_folder="index"):
+     file_path = f"{file_folder}/{file_name}"
+     loader = PagedPDFSplitter(file_path)
+     source_pages = loader.load_and_split()
+
+     embedding_func = OpenAIEmbeddings()
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=100,
+         length_function=len,
+         is_separator_regex=False,
+         separators=["\n\n", "\n", " ", ""],
+     )
+     source_chunks = text_splitter.split_documents(source_pages)
+     search_index = FAISS.from_documents(source_chunks, embedding_func)
+     search_index.save_local(
+         folder_path=embedding_folder, index_name=file_name + ".index"
+     )
+
+ def embed_all_inputed_pdf_docs(uploaded_document):
+     # Define the directory path
+     pdf_directory = "pdf"
+     pdf_file_path = os.path.join(pdf_directory, uploaded_document.name)
+
+     with open(pdf_file_path, 'wb') as file:
+         file.write(uploaded_document.getbuffer())
+
+     # Check if the directory exists
+     if os.path.exists(pdf_directory):
+         # List all PDF files in the directory
+         pdf_files = [
+             file for file in os.listdir(pdf_directory) if file.endswith(".pdf")
+         ]
+
+         if pdf_files:
+             for pdf_file in pdf_files:
+                 print(f"Embedding {pdf_file}...")
+                 embed_document(file_name=pdf_file, file_folder=pdf_directory)
+                 print("Done!")
+         else:
+             raise Exception("No PDF files found in the directory.")
+     else:
+         raise Exception(f"Directory '{pdf_directory}' does not exist.")
+
+
+ def embed_all_pdf_docs():
+     # Define the directory path
+     pdf_directory = "pdf"
+
+     # Check if the directory exists
+     if os.path.exists(pdf_directory):
+         # List all PDF files in the directory
+         pdf_files = [
+             file for file in os.listdir(pdf_directory) if file.endswith(".pdf")
+         ]
+
+         if pdf_files:
+             for pdf_file in pdf_files:
+                 print(f"Embedding {pdf_file}...")
+                 embed_document(file_name=pdf_file, file_folder=pdf_directory)
+                 print("Done!")
+         else:
+             raise Exception("No PDF files found in the directory.")
+     else:
+         raise Exception(f"Directory '{pdf_directory}' does not exist.")
+
+
+ def get_all_index_files():
+     # Define the directory path
+     index_directory = "index"
+
+     # Check if the directory exists
+     if os.path.exists(index_directory):
+         # List all index files in the directory
+         postfix = ".index.faiss"
+         index_files = [
+             file.replace(postfix, "")
+             for file in os.listdir(index_directory)
+             if file.endswith(postfix)
+         ]
+
+         if index_files:
+             return index_files
+         else:
+             raise Exception("No index files found in the directory.")
+     else:
+         raise Exception(f"Directory '{index_directory}' does not exist.")
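
Note: the indexes written by embed_document are read back by FAISS.load_local in llm_helper.py. A minimal sketch of that roundtrip, assuming a valid OPENAI_API_KEY and a hypothetical pdf/example.pdf:

import embed_pdf
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# writes index/example.pdf.index.faiss (and .pkl) under the default folders
embed_pdf.embed_document(file_name="example.pdf", file_folder="pdf", embedding_folder="index")

search_index = FAISS.load_local(
    folder_path="index",
    index_name="example.pdf.index",
    embeddings=OpenAIEmbeddings(),
)
print(search_index.similarity_search("what is this document about?", k=2))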
index/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.faiss
+ *.pkl
llm_helper.py ADDED
@@ -0,0 +1,289 @@
+ from typing import Optional
+
+ # langchain imports
+ from langchain.chat_models import ChatOpenAI
+ from langchain.schema.runnable import RunnableMap
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain.schema.output_parser import StrOutputParser
+ from operator import itemgetter
+ from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
+ from langchain.callbacks.streamlit.streamlit_callback_handler import StreamlitCallbackHandler
+
+
+ def format_docs(docs):
+     res = ""
+     # res = str(docs)
+     for doc in docs:
+         escaped_page_content = doc.page_content.replace("\n", "\\n")
+         res += "<doc>\n"
+         res += f" <content>{escaped_page_content}</content>\n"
+         for m in doc.metadata:
+             res += f" <{m}>{doc.metadata[m]}</{m}>\n"
+         res += "</doc>\n"
+     return res
+
+
+ def get_search_index(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index"):
+     # load embeddings
+     from langchain.vectorstores import FAISS
+     from langchain.embeddings.openai import OpenAIEmbeddings
+
+     search_index = FAISS.load_local(
+         folder_path=index_folder,
+         index_name=file_name + ".index",
+         embeddings=OpenAIEmbeddings(),
+     )
+     return search_index
+
+
+ def convert_message(m):
+     if m["role"] == "user":
+         return HumanMessage(content=m["content"])
+     elif m["role"] == "assistant":
+         return AIMessage(content=m["content"])
+     elif m["role"] == "system":
+         return SystemMessage(content=m["content"])
+     else:
+         raise ValueError(f"Unknown role {m['role']}")
+
+
+ _condense_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
+
+ Chat History:
+ {chat_history}
+ Follow Up Input: {input}
+ Standalone question:"""
+ CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_condense_template)
+
+ _rag_template = """Answer the question based only on the following context, citing the page number(s) of the document(s) you used to answer the question:
+ {context}
+
+ Question: {question}
+ """
+ ANSWER_PROMPT = ChatPromptTemplate.from_template(_rag_template)
+
+
+ def _format_chat_history(chat_history):
+     def format_single_chat_message(m):
+         if type(m) is HumanMessage:
+             return "Human: " + m.content
+         elif type(m) is AIMessage:
+             return "Assistant: " + m.content
+         elif type(m) is SystemMessage:
+             return "System: " + m.content
+         else:
+             raise ValueError(f"Unknown message type {type(m)}")
+
+     return "\n".join([format_single_chat_message(m) for m in chat_history])
+
+ def get_standalone_question_from_chat_history_chain():
+     _inputs = RunnableMap(
+         standalone_question=RunnablePassthrough.assign(
+             chat_history=lambda x: _format_chat_history(x["chat_history"])
+         )
+         | CONDENSE_QUESTION_PROMPT
+         | ChatOpenAI(temperature=0)
+         | StrOutputParser(),
+     )
+     return _inputs
+
+ def get_rag_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", retrieval_cb=None):
+     vectorstore = get_search_index(file_name, index_folder)
+     retriever = vectorstore.as_retriever()
+
+     if retrieval_cb is None:
+         retrieval_cb = lambda x: x
+
+     def context_update_fn(q):
+         retrieval_cb([q])
+         return q
+
+     _inputs = RunnableMap(
+         standalone_question=RunnablePassthrough.assign(
+             chat_history=lambda x: _format_chat_history(x["chat_history"])
+         )
+         | CONDENSE_QUESTION_PROMPT
+         | ChatOpenAI(temperature=0)
+         | StrOutputParser(),
+     )
+     _context = {
+         "context": itemgetter("standalone_question") | RunnablePassthrough(context_update_fn) | retriever | format_docs,
+         "question": lambda x: x["standalone_question"],
+     }
+     conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
+     return conversational_qa_chain
+
+
+ # RAG fusion chain
+ # source1: https://youtu.be/GchC5WxeXGc?si=6i7J0rPZI7SNwFYZ
+ # source2: https://towardsdatascience.com/forget-rag-the-future-is-rag-fusion-1147298d8ad1
+ def reciprocal_rank_fusion(results: list[list], k=60):
+     from langchain.load import dumps, loads
+     fused_scores = {}
+     for docs in results:
+         # Assumes the docs are returned in sorted order of relevance
+         for rank, doc in enumerate(docs):
+             doc_str = dumps(doc)
+             if doc_str not in fused_scores:
+                 fused_scores[doc_str] = 0
+             fused_scores[doc_str] += 1 / (rank + k)
+
+     reranked_results = [
+         (loads(doc), score)
+         for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
+     ]
+     return reranked_results
+
+
+ def get_search_query_generation_chain():
+     from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
+     prompt = ChatPromptTemplate(
+         input_variables=['original_query'],
+         messages=[
+             SystemMessagePromptTemplate(
+                 prompt=PromptTemplate(
+                     input_variables=[],
+                     template='You are a helpful assistant that generates multiple search queries based on a single input query.'
+                 )
+             ),
+             HumanMessagePromptTemplate(
+                 prompt=PromptTemplate(
+                     input_variables=['original_query'],
+                     template='Generate multiple search queries related to: {original_query} \n OUTPUT (4 queries):'
+                 )
+             )
+         ]
+     )
+
+     generate_queries = (
+         prompt |
+         ChatOpenAI(temperature=0) |
+         StrOutputParser() |
+         (lambda x: x.split("\n"))
+     )
+
+     return generate_queries
+
+ def get_rag_fusion_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", retrieval_cb=None):
+     vectorstore = get_search_index(file_name, index_folder)
+     retriever = vectorstore.as_retriever()
+     query_generation_chain = get_search_query_generation_chain()
+     _inputs = RunnableMap(
+         standalone_question=RunnablePassthrough.assign(
+             chat_history=lambda x: _format_chat_history(x["chat_history"])
+         )
+         | CONDENSE_QUESTION_PROMPT
+         | ChatOpenAI(temperature=0)
+         | StrOutputParser(),
+     )
+
+     if retrieval_cb is None:
+         retrieval_cb = lambda x: x
+
+     _context = {
+         "context":
+             RunnablePassthrough.assign(
+                 original_query=lambda x: x["standalone_question"]
+             )
+             | query_generation_chain
+             | retrieval_cb
+             | retriever.map()
+             | reciprocal_rank_fusion
+             | (lambda x: [item[0] for item in x])
+             | format_docs,
+         "question": lambda x: x["standalone_question"],
+     }
+     conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
+     return conversational_qa_chain
+
+
+ ####################
+ # Adding agent chain with OpenAI function calling
+
+ def get_search_tool_from_index(search_index, st_cb: Optional[StreamlitCallbackHandler] = None):
+     from langchain.agents import tool
+     from agent_helper import retry_and_streamlit_callback
+
+     @tool
+     @retry_and_streamlit_callback(st_cb=st_cb, tool_name="Content Search Tool")
+     def search(query: str) -> str:
+         """Search the contents of the source document for the queries."""
+
+         docs = search_index.similarity_search(query, k=5)
+         return format_docs(docs)
+
+     return search
+
+ def get_lc_oai_tools(file_name: str = "Mahmoudi_Nima_202202_PhD.pdf", index_folder: str = "index", st_cb: Optional[StreamlitCallbackHandler] = None):
+     from langchain.tools.render import format_tool_to_openai_tool
+     search_index = get_search_index(file_name, index_folder)
+     lc_tools = [get_search_tool_from_index(search_index=search_index, st_cb=st_cb)]
+     oai_tools = [format_tool_to_openai_tool(t) for t in lc_tools]
+     return lc_tools, oai_tools
+
+ def get_agent_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", callbacks=None, st_cb: Optional[StreamlitCallbackHandler] = None):
+     if callbacks is None:
+         callbacks = []
+
+     from langchain.agents import initialize_agent, AgentType
+     from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+     from langchain.agents.format_scratchpad.openai_tools import (
+         format_to_openai_tool_messages,
+     )
+     from langchain.agents import AgentExecutor
+     from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
+
+     lc_tools, oai_tools = get_lc_oai_tools(file_name, index_folder, st_cb)
+
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", "You are a helpful assistant, use the search tool to answer the user's question and cite only the page number when you use information coming (like [p1]) from the source document.\nchat history: {chat_history}"),
+             ("user", "{input}"),
+             MessagesPlaceholder(variable_name="agent_scratchpad"),
+         ]
+     )
+     llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")
+
+     agent = (
+         {
+             "input": lambda x: x["input"],
+             "agent_scratchpad": lambda x: format_to_openai_tool_messages(
+                 x["intermediate_steps"]
+             ),
+             "chat_history": lambda x: _format_chat_history(x["chat_history"]),
+         }
+         | prompt
+         | llm.bind(tools=oai_tools)
+         | OpenAIToolsAgentOutputParser()
+     )
+
+     agent_executor = AgentExecutor(agent=agent, tools=lc_tools, verbose=True, callbacks=callbacks)
+     return agent_executor
+
+
+ if __name__ == "__main__":
+     question_generation_chain = get_search_query_generation_chain()
+     print('='*50)
+     print('RAG Chain')
+     chain = get_rag_chain()
+     print(chain.invoke({'input': 'serverless computing', 'chat_history': []}))
+
+     print('='*50)
+     print('Question Generation Chain')
+     print(question_generation_chain.invoke({'original_query': 'serverless computing'}))
+
+     print('-'*50)
+     print('RAG Fusion Chain')
+     chain = get_rag_fusion_chain()
+     print(chain.invoke({'input': 'serverless computing', 'chat_history': []}))
+
+     agent_executor = get_agent_chain()
+     print(
+         agent_executor.invoke({
+             "input": "based on the source document, compare FaaS with BaaS??",
+             "chat_history": [],
+         })
+     )
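
Note: the reciprocal_rank_fusion scoring is easy to verify in isolation. A sketch over plain strings (dumps/loads dropped, since the inputs here are already hashable):

def rrf(results, k=60):
    # each inner list is one ranked result list; fuse with 1 / (rank + k)
    fused = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            fused[doc] = fused.get(doc, 0) + 1 / (rank + k)
    return sorted(fused.items(), key=lambda x: x[1], reverse=True)

ranked = rrf([["a", "b", "c"], ["b", "a", "d"]])
print(ranked)  # "a" and "b" appear in both lists, so they outrank "c" and "d"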
pdf/.gitignore ADDED
@@ -0,0 +1 @@
+ *.pdf
poetry.lock ADDED
The diff for this file is too large to render.
pyproject.toml ADDED
@@ -0,0 +1,29 @@
+ [tool.poetry]
+ name = "llm-streamlit-demo-basic"
+ version = "0.1.0"
+ description = ""
+ authors = ["Nima Mahmoudi <nima.mahmoudi.w@gmail.com>"]
+
+ [tool.poetry.dependencies]
+ python = ">=3.10.0,<3.11"
+ langchain = "^0.0.321"
+ openai = "^0.28.1"
+ streamlit = "^1.27.2"
+ faiss-cpu = "^1.7.4"
+ tiktoken = "^0.5.1"
+ langchainhub = "^0.1.13"
+ pypdf = "^3.17.0"
+
+ [tool.pyright]
+ # https://github.com/microsoft/pyright/blob/main/docs/configuration.md
+ useLibraryCodeForTypes = true
+ exclude = [".cache"]
+
+ [tool.ruff]
+ # https://beta.ruff.rs/docs/configuration/
+ select = ['E', 'W', 'F', 'I', 'B', 'C4', 'ARG', 'SIM']
+ ignore = ['W291', 'W292', 'W293']
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ langchain
+ openai
+ streamlit==1.25.0
+ faiss-cpu
+ tiktoken
+ langchainhub
+ pypdf