SaiChaitanya committed on
Commit
25773cf
1 Parent(s): 1518573

Upload 106 files

.env CHANGED
@@ -1 +1 @@
1
- OPENAI_API_KEY="sk-proj-2FjtAue570qdHBcvCiTGT3BlbkFJNV8SYnSFHm6j8wjI0ARe"
 
1
+ OPENAI_API_KEY=""
.gitmodules CHANGED
@@ -1,3 +1,3 @@
1
- [submodule "web-services-samples"]
2
- path = web-services-samples
3
- url = https://github.com/onetcenter/web-services-samples.git
 
1
+ [submodule "web-services-samples"]
2
+ path = web-services-samples
3
+ url = https://github.com/onetcenter/web-services-samples.git
bot/bot-langchain-chat.py CHANGED
@@ -1,96 +1,96 @@
1
- import streamlit as st
2
- from langchain.agents import AgentExecutor, create_openai_functions_agent,Tool
3
- from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
4
- from langchain_openai import ChatOpenAI
5
- from langchain.memory import ConversationBufferMemory
6
- from langchain.prompts import MessagesPlaceholder
7
- from langchain.schema.messages import SystemMessage
8
- import asyncio
9
- from dotenv import load_dotenv
10
- load_dotenv()
11
-
12
- st.set_page_config(page_title="Chatbot")
13
- st.header('Basic Chatbot')
14
- st.write('Allows users to interact with the Career Roadmap Generator')
15
- agent_type = st.selectbox(
16
- 'How would you like to be contacted?',
17
- ('Code Assistant', 'General Assistant', 'Roadmap Generator'))
18
- st.write('You selected:', agent_type)
19
- #st.write('[![view source code ](https://img.shields.io/badge/view_source_code-gray?logo=github)](https://github.com/shashankdeshpande/langchain-chatbot/blob/master/pages/1_%F0%9F%92%AC_basic_chatbot.py)')
20
-
21
- def create_agent(model_name,agent_type):
22
- print('model:',model_name)
23
- print('Assistant Type:',agent_type)
24
- # agent_kwargs = {
25
- # "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
26
- # }
27
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
28
-
29
- increment = Tool(
30
- name="Increment",
31
- func=lambda x:x+'1',
32
- description="useful to add one to input.",
33
- )
34
- tools = [increment]
35
- llm = ChatOpenAI(model=model_name, temperature=0)
36
- # agent_exe = initialize_agent(
37
- # tools=[increment],
38
- # llm=llm,
39
- # agent=AgentType.OPENAI_FUNCTIONS,
40
- # agent_kwargs=agent_kwargs,
41
- # memory = memory,
42
- # verbose=True,
43
- # )
44
- system_message = ""
45
- if agent_type == 'Code Assistant':
46
- system_message = """You are a code assistant.
47
- Answer questions in code with minimal to no explanation.
48
- Put brief one line comments on the code for explanation.
49
- """
50
- elif agent_type == 'General Assistant':
51
- system_message = """You are a general AI assistant.
52
- Answer questions with minimal and to the point explanation.
53
- Don't put safety and cultural warnings. Only warn about security.
54
- """
55
- else:
56
- system_message = """You are a Career Roadmap Generator.
57
- Answer questions with the help of the given job description and create brief step-by-step solutions for every job description the user provides to get that role at that company.
58
- Put together a step-by-step process to get the job for the specific job description. List as many of the most relevant skills as possible for that role at that company.
59
- If possible, provide a few projects to work on before applying for that role which will increase the chance of getting selected.
60
- Add resources to learn, watch, and practice for each step where possible. Don't give a generic roadmap; provide an in-depth roadmap.
61
- Link all the related skills and indicate which skill to learn first and which to learn next in the roadmap.
62
- """
63
-
64
- prompt = ChatPromptTemplate.from_messages(
65
- [
66
- ("system", system_message),
67
- MessagesPlaceholder("chat_history", optional=True),
68
- ("human", "{input}"),
69
- MessagesPlaceholder("agent_scratchpad"),
70
- ]
71
- )
72
- agent = create_openai_functions_agent(llm, tools, prompt)
73
- agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory)
74
- return agent_exe
75
-
76
- async def main():
77
- #chain = self.setup_chain()
78
- #agent_exe = create_agent("gpt-3.5-turbo","General Assistant")
79
- agent_exe = create_agent("gpt-4-turbo-preview", agent_type)
80
- user_query = st.chat_input(placeholder="Ask your Question!")
81
- if "messages" not in st.session_state:
82
- st.session_state["messages"] = [{"role": "assistant", "content": "Hello! I'm here to help with your career progression needs."}]
83
- if user_query:
84
- st.session_state["messages"].append({"role": 'user', "content": user_query})
85
- #with st.chat_message("assistant"):
86
- #st_cb = StreamHandler(st.empty())
87
- #response = chain.run(user_query, callbacks=[st_cb])
88
- response = await agent_exe.ainvoke(input={"input":user_query})
89
- st.session_state["messages"].append({"role": "assistant", "content": response['output']})
90
-
91
- for msg in st.session_state["messages"]:
92
- st.chat_message(msg["role"]).write(msg["content"])
93
- print(st.session_state["messages"])
94
-
95
- if __name__ == "__main__":
96
  asyncio.run(main())
 
1
+ import streamlit as st
2
+ from langchain.agents import AgentExecutor, create_openai_functions_agent,Tool
3
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.prompts import MessagesPlaceholder
7
+ from langchain.schema.messages import SystemMessage
8
+ import asyncio
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+
12
+ st.set_page_config(page_title="Chatbot")
13
+ st.header('Basic Chatbot')
14
+ st.write('Allows users to interact with the Career Roadmap Generator')
15
+ agent_type = st.selectbox(
16
+ 'How would you like to be contacted?',
17
+ ('Code Assistant', 'General Assistant', 'Roadmap Generator'))
18
+ st.write('You selected:', agent_type)
19
+ #st.write('[![view source code ](https://img.shields.io/badge/view_source_code-gray?logo=github)](https://github.com/shashankdeshpande/langchain-chatbot/blob/master/pages/1_%F0%9F%92%AC_basic_chatbot.py)')
20
+
21
+ def create_agent(model_name,agent_type):
22
+ print('model:',model_name)
23
+ print('Assistant Type:',agent_type)
24
+ # agent_kwargs = {
25
+ # "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
26
+ # }
27
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
28
+
29
+ increment = Tool(
30
+ name="Increment",
31
+ func=lambda x:x+'1',
32
+ description="useful to add one to input.",
33
+ )
34
+ tools = [increment]
35
+ llm = ChatOpenAI(model=model_name, temperature=0)
36
+ # agent_exe = initialize_agent(
37
+ # tools=[increment],
38
+ # llm=llm,
39
+ # agent=AgentType.OPENAI_FUNCTIONS,
40
+ # agent_kwargs=agent_kwargs,
41
+ # memory = memory,
42
+ # verbose=True,
43
+ # )
44
+ system_message = ""
45
+ if agent_type == 'Code Assistant':
46
+ system_message = """You are a code assistant.
47
+ Answer questions in code with minimal to no explanation.
48
+ Put brief one line comments on the code for explanation.
49
+ """
50
+ elif agent_type == 'General Assistant':
51
+ system_message = """You are a general AI assistant.
52
+ Answer questions with minimal and to the point explanation.
53
+ Don't put safety and cultural warnings. Only warn about security.
54
+ """
55
+ else:
56
+ system_message = """You are a Career Roadmap Generator.
57
+ Answer questions with the help of the given job description and create brief step-by-step solutions for every job description the user provides to get that role at that company.
58
+ Put together a step-by-step process to get the job for the specific job description. List as many of the most relevant skills as possible for that role at that company.
59
+ If possible, provide a few projects to work on before applying for that role which will increase the chance of getting selected.
60
+ Add resources to learn, watch, and practice for each step where possible. Don't give a generic roadmap; provide an in-depth roadmap.
61
+ Link all the related skills and indicate which skill to learn first and which to learn next in the roadmap.
62
+ """
63
+
64
+ prompt = ChatPromptTemplate.from_messages(
65
+ [
66
+ ("system", system_message),
67
+ MessagesPlaceholder("chat_history", optional=True),
68
+ ("human", "{input}"),
69
+ MessagesPlaceholder("agent_scratchpad"),
70
+ ]
71
+ )
72
+ agent = create_openai_functions_agent(llm, tools, prompt)
73
+ agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory)
74
+ return agent_exe
75
+
76
+ async def main():
77
+ #chain = self.setup_chain()
78
+ #agent_exe = create_agent("gpt-3.5-turbo","General Assistant")
79
+ agent_exe = create_agent("gpt-4-turbo-preview", agent_type)
80
+ user_query = st.chat_input(placeholder="Ask your Question!")
81
+ if "messages" not in st.session_state:
82
+ st.session_state["messages"] = [{"role": "assistant", "content": "Hello! I'm here to help with your career progression needs."}]
83
+ if user_query:
84
+ st.session_state["messages"].append({"role": 'user', "content": user_query})
85
+ #with st.chat_message("assistant"):
86
+ #st_cb = StreamHandler(st.empty())
87
+ #response = chain.run(user_query, callbacks=[st_cb])
88
+ response = await agent_exe.ainvoke(input={"input":user_query})
89
+ st.session_state["messages"].append({"role": "assistant", "content": response['output']})
90
+
91
+ for msg in st.session_state["messages"]:
92
+ st.chat_message(msg["role"]).write(msg["content"])
93
+ print(st.session_state["messages"])
94
+
95
+ if __name__ == "__main__":
96
  asyncio.run(main())
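For reference, a minimal sketch (not part of the commit) of exercising `create_agent` outside Streamlit; it assumes `create_agent` from the script above is importable and that a valid `OPENAI_API_KEY` is configured. It mainly illustrates the contract the UI relies on: `ainvoke` takes `{"input": ...}` and returns a dict whose `"output"` key is appended to the chat.

```python
# Sketch: drive the agent without the Streamlit UI to check the input/output contract.
import asyncio
from dotenv import load_dotenv

load_dotenv()

async def smoke_test():
    # create_agent is the function defined in the script above (hypothetical import path)
    agent_exe = create_agent("gpt-4-turbo-preview", "General Assistant")
    result = await agent_exe.ainvoke(input={"input": "Use the Increment tool on 41."})
    print(result["output"])  # the Streamlit code renders exactly this field in the chat

asyncio.run(smoke_test())
```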
bot/langchain-bot/bot-langchain-chat.py CHANGED
@@ -1,50 +1,54 @@
1
- import streamlit as st
2
- import asyncio
3
- from engine.langchain_agent import create_agent, run_agent
4
- from dotenv import load_dotenv
5
- #from openai import OpenAI
6
-
7
- load_dotenv()
8
- import os
9
- # print(os.environ["OPENAI_API_KEY"])
10
- # Client = OpenAI()
11
-
12
- st.title("Langchain Agent")
13
- if "memory" not in st.session_state:
14
- st.session_state["memory"] = [{"role":"system","content":""}]
15
-
16
- if "agent" not in st.session_state:
17
- #model_name = "gpt-3.5-turbo"
18
- model_name = "gpt-4-turbo-preview"
19
- st.session_state['agent'] = create_agent(model_name)
20
-
21
- # updating the chat page with messages
22
- for message in st.session_state["memory"]:
23
- if message["role"] == "assistant":
24
- with st.chat_message(message["role"]):
25
- msg = message["content"]["output"]
26
- if "/bot/images/dall-e" in msg:
27
- address = msg.split("(")[1][:-1]
28
- print("address:",address)
29
- st.image(address)
30
- else:
31
- st.markdown(msg)
32
- elif message["role"] == "system":
33
- pass
34
- else:
35
- with st.chat_message(message["role"]):
36
- st.markdown(message["content"])
37
-
38
- # entering new message event handle
39
- if prompt := st.chat_input("Your message ..."):
40
- st.session_state['memory'].append({"role":"user","content":prompt})
41
- with st.chat_message("user"):
42
- st.markdown(prompt)
43
- response = asyncio.run(run_agent(st.session_state["agent"],prompt))
44
- st.session_state['memory'].append({"role":"assistant","content":response})
45
- with st.chat_message("assistant"):
46
- if "/bot/images/dall-e" in response["output"]:
47
- address = response["output"].split("(")[1][:-1]
48
- st.image(address)
49
- else:
 
 
 
 
50
  st.markdown(response["output"])
 
1
+ import streamlit as st
2
+ import asyncio
3
+ from engine.langchain_agent import create_agent, run_agent
4
+ from dotenv import load_dotenv
5
+ #from openai import OpenAI
6
+
7
+ load_dotenv()
8
+ import os
9
+ # print(os.environ["OPENAI_API_KEY"])
10
+ # Client = OpenAI()
11
+
12
+ st.set_page_config(page_title="Career Roadmap Generator")
13
+ st.header("Paste a job description and get a roadmap")
14
+ st.write('Allows users to interact with the LLM')
15
+
16
+ if "memory" not in st.session_state:
17
+ st.session_state["memory"] = [{"role":"system","content":""}]
18
+
19
+ if "agent" not in st.session_state:
20
+ #model_name = "gpt-3.5-turbo"
21
+ #model_name = "gpt-4-turbo-preview"
22
+ model_name = "gpt-4o"
23
+ st.session_state['agent'] = create_agent(model_name)
24
+
25
+ # updading the chat page with messages
26
+ for message in st.session_state["memory"]:
27
+ if message["role"] == "assistant":
28
+ with st.chat_message(message["role"]):
29
+ msg = message["content"]["output"]
30
+ if "/bot/images/dall-e" in msg:
31
+ address = msg.split("(")[1][:-1]
32
+ print("address:",address)
33
+ st.image(address)
34
+ else:
35
+ st.markdown(msg)
36
+ elif message["role"] == "system":
37
+ pass
38
+ else:
39
+ with st.chat_message(message["role"]):
40
+ st.markdown(message["content"])
41
+
42
+ # entering new message event handle
43
+ if prompt := st.chat_input("Your message ..."):
44
+ st.session_state['memory'].append({"role":"user","content":prompt})
45
+ with st.chat_message("user"):
46
+ st.markdown(prompt)
47
+ response = asyncio.run(run_agent(st.session_state["agent"],prompt))
48
+ st.session_state['memory'].append({"role":"assistant","content":response})
49
+ with st.chat_message("assistant"):
50
+ if "/bot/images/dall-e" in response["output"]:
51
+ address = response["output"].split("(")[1][:-1]
52
+ st.image(address)
53
+ else:
54
  st.markdown(response["output"])
bot/langchain-bot/engine/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes).
 
bot/langchain-bot/engine/__pycache__/langchain_agent.cpython-310.pyc ADDED
Binary file (2.09 kB).
 
bot/langchain-bot/engine/__pycache__/langchain_agent.cpython-39.pyc CHANGED
Binary files a/bot/langchain-bot/engine/__pycache__/langchain_agent.cpython-39.pyc and b/bot/langchain-bot/engine/__pycache__/langchain_agent.cpython-39.pyc differ
 
bot/langchain-bot/engine/__pycache__/tools.cpython-39.pyc CHANGED
Binary files a/bot/langchain-bot/engine/__pycache__/tools.cpython-39.pyc and b/bot/langchain-bot/engine/__pycache__/tools.cpython-39.pyc differ
 
bot/langchain-bot/engine/langchain_agent.py CHANGED
@@ -1,48 +1,52 @@
1
- from langchain.agents import AgentExecutor, create_openai_tools_agent,Tool
2
- from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
3
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
4
- from langchain_community.vectorstores import Chroma
5
- from langchain import hub
6
- from langchain.memory import ConversationBufferMemory
7
- from langchain.prompts import MessagesPlaceholder
8
- from engine.tools import GPT35TCodeGen, GPT4TAssistant, GPT4TCodeGen, DalleImageGen,RAGTool, CombinedTool, CareerRoadmapGenerator
9
-
10
- def create_agent(model_name):
11
-
12
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
13
- rag_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
14
- #rag_llm = ChatOpenAI(model_name="gpt-4-turbo-preview", temperature=0)
15
- rag_prompt = hub.pull("rlm/rag-prompt")
16
- rag_db = Chroma(persist_directory="../../chroma_db",
17
- embedding_function=OpenAIEmbeddings())
18
- rag_retriever = rag_db.as_retriever()
19
-
20
- #tools = [GPT35TCodeGen(),GPT4TAssistant(),GPT4TCodeGen(), DalleImageGen(), RAGTool(rag_retriever,rag_llm,rag_prompt), CombinedTool(rag_retriever,rag_llm,rag_prompt), CareerRoadmapGenerator(rag_retriever,rag_llm,rag_prompt)]
21
- tools = [GPT4TAssistant(),GPT4TCodeGen(), DalleImageGen(), RAGTool(rag_retriever,rag_llm,rag_prompt)]
22
-
23
- llm = ChatOpenAI(model=model_name, temperature=0)
24
-
25
- system_message = "You are a general AI assistant.\n" + \
26
- "Don't answer the question if you are not getting the answer from a tool.\n" + \
27
- "Don't change the answers you receive from a tool. Just pass them to the user."
28
- "Don't put safety and cultural warnings. Only warn about security."
29
-
30
- prompt = ChatPromptTemplate.from_messages(
31
- [
32
- ("system", system_message),
33
- MessagesPlaceholder("chat_history", optional=True),
34
- ("human", "{input}"),
35
- MessagesPlaceholder("agent_scratchpad"),
36
- ]
37
- )
38
- agent = create_openai_tools_agent(llm, tools, prompt)
39
- agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory,verbose=True)
40
- return agent_exe
41
-
42
- async def run_agent(agent,user_query):
43
- #print(agent.memory.chat_memory.messages[-2:] if len(agent.memory.chat_memory.messages) > 1 else "")
44
- #set_verbose(True)
45
- print(agent.memory.chat_memory)
46
- print('********************')
47
- print()
 
 
 
 
48
  return await agent.ainvoke(input={"input":user_query},verbose=True)
 
1
+ from langchain.agents import AgentExecutor, create_openai_tools_agent,Tool
2
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
3
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain import hub
6
+ from langchain.memory import ConversationBufferMemory
7
+ from langchain.prompts import MessagesPlaceholder
8
+ from engine.tools import GPT35TCodeGen, GPT4TAssistant, GPT4TCodeGen, DalleImageGen,RAGTool, CombinedTool, CareerRoadmapGenerator
9
+
10
+ def create_agent(model_name):
11
+
12
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
13
+ #rag_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
14
+ rag_llm = ChatOpenAI(model_name="gpt-4o", temperature=0.1)
15
+ #rag_llm = ChatOpenAI(model_name="gpt-4-turbo-preview", temperature=0)
16
+ rag_prompt = hub.pull("rlm/rag-prompt")
17
+ rag_db = Chroma(persist_directory="../../chroma_db",
18
+ embedding_function=OpenAIEmbeddings())
19
+ rag_retriever = rag_db.as_retriever()
20
+
21
+ #tools = [GPT35TCodeGen(),GPT4TAssistant(),GPT4TCodeGen(), DalleImageGen(), RAGTool(rag_retriever,rag_llm,rag_prompt), CombinedTool(rag_retriever,rag_llm,rag_prompt), CareerRoadmapGenerator(rag_retriever,rag_llm,rag_prompt)]
22
+ tools = [RAGTool(rag_retriever,rag_llm,rag_prompt)]
23
+
24
+ llm = ChatOpenAI(model=model_name, temperature=0)
25
+
26
+ system_message = "You are a Career Roadmap Generator.\n" + \
27
+ "Answer questions with the help of given job description and create breif step by step solutions for every job description user provides to get that role in that company.\n" + \
28
+ "Put step by step process to get the job for the specific job description. List as many most relevant skills as possble for that role at that company.\n" + \
29
+ "If possible provide few projects to work on before applying for that role which will increace the chance of getting selected.\n" + \
30
+ "Add the resources to learn, watch, practice if possible for each step. Don't give me generic roadmap. Provide in-depth roadmap.\n" + \
31
+ "Link all the realatd skills and give what skill to learn first followed by another in the roadmap."
32
+
33
+
34
+ prompt = ChatPromptTemplate.from_messages(
35
+ [
36
+ ("system", system_message),
37
+ MessagesPlaceholder("chat_history", optional=True),
38
+ ("human", "{input}"),
39
+ MessagesPlaceholder("agent_scratchpad"),
40
+ ]
41
+ )
42
+ agent = create_openai_tools_agent(llm, tools, prompt)
43
+ agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory,verbose=True)
44
+ return agent_exe
45
+
46
+ async def run_agent(agent,user_query):
47
+ #print(agent.memory.chat_memory.messages[-2:] if len(agent.memory.chat_memory.messages) > 1 else "")
48
+ #set_verbose(True)
49
+ print(agent.memory.chat_memory)
50
+ print('********************')
51
+ print()
52
  return await agent.ainvoke(input={"input":user_query},verbose=True)
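The rebuilt agent exposes a single tool, `RAGTool(rag_retriever, rag_llm, rag_prompt)`, whose implementation lives in `engine/tools.py` and is only partially shown in this diff. Purely as an illustration of the general shape of such a tool (an assumption, not the repository's actual code):

```python
# Sketch: a minimal retrieval tool wired the same way create_agent() wires RAGTool.
from typing import Any
from langchain.tools import BaseTool

class SimpleRAGTool(BaseTool):
    name: str = "job_knowledge_search"
    description: str = "Answers questions using the indexed job/occupation data."
    retriever: Any = None  # e.g. Chroma(...).as_retriever()
    llm: Any = None        # e.g. ChatOpenAI(model_name="gpt-4o", temperature=0.1)
    prompt: Any = None     # e.g. hub.pull("rlm/rag-prompt")

    def _run(self, query: str) -> str:
        docs = self.retriever.get_relevant_documents(query)
        context = "\n\n".join(doc.page_content for doc in docs)
        messages = self.prompt.invoke({"context": context, "question": query})
        return self.llm.invoke(messages).content

# Constructed with keyword arguments, e.g.:
# tool = SimpleRAGTool(retriever=rag_retriever, llm=rag_llm, prompt=rag_prompt)
```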
bot/langchain-bot/engine/tools.py CHANGED
@@ -178,8 +178,6 @@ class RAGTool(BaseTool):
178
  "[question]\n" + \
179
  "Don't change the input question from the user and don't change answer from this tool\n." +\
180
  "Just pass it through to the user."
181
-
182
-
183
  retriever: Type[VectorStoreRetriever] = None
184
  llm: Type[ChatOpenAI] = None
185
  prompt: Type[ChatPromptTemplate] = None
 
178
  "[question]\n" + \
179
  "Don't change the input question from the user and don't change answer from this tool\n." +\
180
  "Just pass it through to the user."
 
 
181
  retriever: Type[VectorStoreRetriever] = None
182
  llm: Type[ChatOpenAI] = None
183
  prompt: Type[ChatPromptTemplate] = None
bot/openai-bot/bot-openai-chat.py CHANGED
@@ -1,68 +1,68 @@
1
- import streamlit as st
2
- from dotenv import load_dotenv
3
- from openai_engine import OpenAIEngine
4
-
5
- load_dotenv()
6
- engine = OpenAIEngine()
7
-
8
- st.title("OpenAI API")
9
-
10
- # Define options for the dropdown lists
11
- chat_model_list = ["gpt-3.5-turbo","gpt-4-turbo-preview","gpt-4-vision-preview"]
12
- image_model_list = ["dall-e-3","dall-e-2"]
13
- chat_prompt_dictionary = {
14
- "Code Assistant":
15
- """You are a code assistant.
16
- Answer questions in code with minimal to no explanation.
17
- Put brief one line comments on the code for explanation\
18
- """,
19
- "General Assistant":
20
- """You are a general AI assistant.
21
- Answer questions with minimal and to the point explanation.
22
- Don't put safety and cultural warnings. Only warn about security."""
23
- }
24
-
25
- # Create the first dropdown in the sidebar and update session state: generation type
26
- st.session_state["app_type_option"] = st.sidebar.selectbox("Generation Type:",["Chatting","Image Generation"])
27
- st.sidebar.write(f'You are in {st.session_state.app_type_option} mode.')
28
-
29
- # list of models is changed based on the type of generation
30
- model_list = chat_model_list if st.session_state.app_type_option == "Chatting" else image_model_list
31
- # second dropdown: list of models dropdown
32
- st.session_state["selected_option_1"] = st.sidebar.selectbox('Models:', model_list )
33
-
34
- # third dropdown in the sidebar and update session state: assistant type
35
- if st.session_state.app_type_option == "Chatting":
36
- st.session_state.selected_option_2 = st.sidebar.selectbox('Prompts:', chat_prompt_dictionary.keys())
37
- # Display the selected options
38
- st.sidebar.write(f'You are using "{st.session_state.selected_option_1}\
39
- " together with "{st.session_state.selected_option_2}" prompt.')
40
- else:
41
- st.sidebar.write(f'You are using "{st.session_state.selected_option_1}".')
42
-
43
- # defining openai engine
44
- engine.change(st.session_state.app_type_option,
45
- st.session_state.selected_option_1,
46
- chat_prompt_dictionary[st.session_state.selected_option_2])
47
-
48
- # updading the chat page with messages
49
- for message in st.session_state["memory"]:
50
- if message["role"] == "image assistant":
51
- with st.chat_message("assistant"):
52
- st.image(message["content"])
53
- elif message["role"] == "system":
54
- pass
55
- else:
56
- with st.chat_message(message["role"]):
57
- st.markdown(message["content"])
58
-
59
- # entering new message event handle
60
- if prompt := st.chat_input("Start chat ..."):
61
- with st.chat_message("user"):
62
- st.markdown(prompt)
63
-
64
- with st.chat_message("assistant"):
65
- if st.session_state.app_type_option == "Chatting":
66
- engine.generate_answer(prompt)
67
- else:
68
  engine.generate_image(prompt)
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from openai_engine import OpenAIEngine
4
+
5
+ load_dotenv()
6
+ engine = OpenAIEngine()
7
+
8
+ st.title("OpenAI API")
9
+
10
+ # Define options for the dropdown lists
11
+ chat_model_list = ["gpt-3.5-turbo","gpt-4-turbo-preview","gpt-4-vision-preview"]
12
+ image_model_list = ["dall-e-3","dall-e-2"]
13
+ chat_prompt_dictionary = {
14
+ "Code Assistant":
15
+ """You are a code assistant.
16
+ Answer questions in code with minimal to no explanation.
17
+ Put brief one line comments on the code for explanation\
18
+ """,
19
+ "General Assistant":
20
+ """You are a general AI assistant.
21
+ Answer questions with minimal and to the point explanation.
22
+ Don't put safety and cultural warnings. Only warn about security."""
23
+ }
24
+
25
+ # Create the first dropdown in the sidebar and update session state: generation type
26
+ st.session_state["app_type_option"] = st.sidebar.selectbox("Generation Type:",["Chatting","Image Generation"])
27
+ st.sidebar.write(f'You are in {st.session_state.app_type_option} mode.')
28
+
29
+ # list of models is changed based on the type of generation
30
+ model_list = chat_model_list if st.session_state.app_type_option == "Chatting" else image_model_list
31
+ # second dropdown: list of models dropdown
32
+ st.session_state["selected_option_1"] = st.sidebar.selectbox('Models:', model_list )
33
+
34
+ # third dropdown in the sidebar and update session state: assistant type
35
+ if st.session_state.app_type_option == "Chatting":
36
+ st.session_state.selected_option_2 = st.sidebar.selectbox('Prompts:', chat_prompt_dictionary.keys())
37
+ # Display the selected options
38
+ st.sidebar.write(f'You are using "{st.session_state.selected_option_1}\
39
+ " together with "{st.session_state.selected_option_2}" prompt.')
40
+ else:
41
+ st.sidebar.write(f'You are using "{st.session_state.selected_option_1}".')
42
+
43
+ # defining openai engine
44
+ engine.change(st.session_state.app_type_option,
45
+ st.session_state.selected_option_1,
46
+ chat_prompt_dictionary[st.session_state.selected_option_2])
47
+
48
+ # updating the chat page with messages
49
+ for message in st.session_state["memory"]:
50
+ if message["role"] == "image assistant":
51
+ with st.chat_message("assistant"):
52
+ st.image(message["content"])
53
+ elif message["role"] == "system":
54
+ pass
55
+ else:
56
+ with st.chat_message(message["role"]):
57
+ st.markdown(message["content"])
58
+
59
+ # entering new message event handle
60
+ if prompt := st.chat_input("Start chat ..."):
61
+ with st.chat_message("user"):
62
+ st.markdown(prompt)
63
+
64
+ with st.chat_message("assistant"):
65
+ if st.session_state.app_type_option == "Chatting":
66
+ engine.generate_answer(prompt)
67
+ else:
68
  engine.generate_image(prompt)
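One thing to watch in this page: `engine.change(...)` always indexes `chat_prompt_dictionary[st.session_state.selected_option_2]`, but `selected_option_2` is only assigned inside the Chatting branch. A hedged guard sketch (an assumption about intent, not part of the commit):

```python
# Sketch: only look up a chat prompt when one was actually selected.
system_prompt = ""
if st.session_state.app_type_option == "Chatting":
    system_prompt = chat_prompt_dictionary[st.session_state.selected_option_2]

engine.change(st.session_state.app_type_option,
              st.session_state.selected_option_1,
              system_prompt)
```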
bot/openai-bot/openai_engine.py CHANGED
@@ -1,61 +1,61 @@
1
- from openai import OpenAI
2
- import streamlit as st
3
- import requests
4
- import os
5
-
6
- class OpenAIEngine():
7
-
8
- def __init__(self,mode="",model="",prompt=""):
9
- self.client = OpenAI()
10
- if "memory" not in st.session_state:
11
- st.session_state["memory"] = [{}]
12
- if "image_number" not in st.session_state:
13
- st.session_state['image_number'] = 1
14
- if "image_folder" not in st.session_state:
15
- st.session_state["image_folder"] = os.path.join("..","images")
16
- self.change(mode,model,prompt)
17
-
18
- def change(self,mode,model,prompt):
19
- self.mode = mode
20
- self.model = model
21
- st.session_state["memory"][0] = {"role":"system","content":prompt}
22
-
23
- def generate_answer(self,prompt):
24
- st.session_state["memory"].append({"role":"user","content":prompt})
25
- memory = []
26
- for mem in st.session_state['memory']:
27
- if mem['role'] != "image assistant":
28
- memory.append(mem)
29
- else:
30
- memory = memory[:-1]
31
- stream = self.client.chat.completions.create(
32
- model=self.model,
33
- messages=memory,
34
- stream=True,
35
- temperature=0,
36
- )
37
- response = st.write_stream(stream)
38
- response = {"role": "assistant", "content": response}
39
- st.session_state["memory"].append(response)
40
- print(st.session_state["memory"])
41
- print()
42
-
43
- def generate_image(self,prompt):
44
- image_data = self.client.images.generate(
45
- model=self.model,
46
- prompt=prompt,
47
- size="1024x1024",
48
- quality="standard",
49
- n=1,
50
- )
51
- image = requests.get(image_data.data[0].url,stream=True)
52
- if image.status_code == 200:
53
- image_path = os.path.join(st.session_state["image_folder"],
54
- f"{st.session_state['image_number']}.png")
55
- st.session_state["image_number"] += 1
56
- with open(image_path, 'wb') as f:
57
- for chunk in image:
58
- f.write(chunk)
59
- st.session_state["memory"].append({"role":"user","content":prompt})
60
- st.session_state["memory"].append({"role": "image assistant", "content": image_path})
61
  st.image(image_path)
 
1
+ from openai import OpenAI
2
+ import streamlit as st
3
+ import requests
4
+ import os
5
+
6
+ class OpenAIEngine():
7
+
8
+ def __init__(self,mode="",model="",prompt=""):
9
+ self.client = OpenAI()
10
+ if "memory" not in st.session_state:
11
+ st.session_state["memory"] = [{}]
12
+ if "image_number" not in st.session_state:
13
+ st.session_state['image_number'] = 1
14
+ if "image_folder" not in st.session_state:
15
+ st.session_state["image_folder"] = os.path.join("..","images")
16
+ self.change(mode,model,prompt)
17
+
18
+ def change(self,mode,model,prompt):
19
+ self.mode = mode
20
+ self.model = model
21
+ st.session_state["memory"][0] = {"role":"system","content":prompt}
22
+
23
+ def generate_answer(self,prompt):
24
+ st.session_state["memory"].append({"role":"user","content":prompt})
25
+ memory = []
26
+ for mem in st.session_state['memory']:
27
+ if mem['role'] != "image assistant":
28
+ memory.append(mem)
29
+ else:
30
+ memory = memory[:-1]
31
+ stream = self.client.chat.completions.create(
32
+ model=self.model,
33
+ messages=memory,
34
+ stream=True,
35
+ temperature=0,
36
+ )
37
+ response = st.write_stream(stream)
38
+ response = {"role": "assistant", "content": response}
39
+ st.session_state["memory"].append(response)
40
+ print(st.session_state["memory"])
41
+ print()
42
+
43
+ def generate_image(self,prompt):
44
+ image_data = self.client.images.generate(
45
+ model=self.model,
46
+ prompt=prompt,
47
+ size="1024x1024",
48
+ quality="standard",
49
+ n=1,
50
+ )
51
+ image = requests.get(image_data.data[0].url,stream=True)
52
+ if image.status_code == 200:
53
+ image_path = os.path.join(st.session_state["image_folder"],
54
+ f"{st.session_state['image_number']}.png")
55
+ st.session_state["image_number"] += 1
56
+ with open(image_path, 'wb') as f:
57
+ for chunk in image:
58
+ f.write(chunk)
59
+ st.session_state["memory"].append({"role":"user","content":prompt})
60
+ st.session_state["memory"].append({"role": "image assistant", "content": image_path})
61
  st.image(image_path)
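`generate_image()` writes the downloaded PNG to `os.path.join("..", "images")`, so that folder has to exist before the first image is saved. A small sketch of creating it up front (an assumption about deployment, not part of the commit):

```python
# Sketch: make sure the images folder exists before generate_image() tries to write to it.
import os
import streamlit as st

if "image_folder" not in st.session_state:
    st.session_state["image_folder"] = os.path.join("..", "images")
os.makedirs(st.session_state["image_folder"], exist_ok=True)
```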
bot/rag_indexing/indexing.py CHANGED
@@ -1,169 +1,169 @@
1
- import bs4
2
- import pandas as pd
3
- from langchain import hub
4
- from langchain_community.document_loaders import WebBaseLoader
5
- from langchain_community.vectorstores import Chroma
6
- from langchain_core.output_parsers import StrOutputParser
7
- from langchain_core.runnables import RunnablePassthrough
8
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
9
- from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- #from langchain.document_loaders import PyPDFLoader, CSVLoader, ExcelLoader
11
- from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
12
- from langchain_community.document_loaders import UnstructuredExcelLoader
13
- #from langchain.text_splitter import CharacterTextSplitter
14
- from langchain_community.embeddings import HuggingFaceEmbeddings
15
- from dotenv import load_dotenv
16
- import sys
17
- import shutil
18
- import os
19
- import uuid
20
- import csv
21
-
22
- def from_web(url):
23
- loader = WebBaseLoader(web_paths=(url,),
24
- bs_kwargs=dict(parse_only=bs4.SoupStrainer(
25
- class_=("post-content", "post-title", "post-header")
26
- )),)
27
- docs = loader.load()
28
- return docs
29
-
30
- def from_excel(file_address):
31
- if file_address.endswith(".xlsx"):
32
- loader = UnstructuredExcelLoader(file_path=file_address)
33
- docs = loader.load()
34
- return docs
35
- else:
36
- docs = []
37
- for file_name in os.listdir(file_address):
38
- file_path = os.path.join(file_address, file_name)
39
- if os.path.isfile(file_path) and file_name.endswith(".xlsx"):
40
- # Load the Excel file
41
- loader = UnstructuredExcelLoader(file_path=file_address)
42
- docs.extend(loader.load())
43
- return docs
44
-
45
- def from_csv(file_address):
46
- docs = []
47
- #Load the CSV file
48
- if file_address.endswith(".csv"):
49
- loader = CSVLoader(file_path=file_address, encoding='utf-8')
50
- docs = loader.load()
51
- return docs
52
-
53
- def from_pdf(file_address):
54
- loader = PyPDFLoader(file_path=file_address)
55
- docs = loader.load()
56
- return docs
57
-
58
- def from_text_files(file_address):
59
- docs = []
60
- for file_name in os.listdir(file_address):
61
- file_path = os.path.join(file_address, file_name)
62
- if os.path.isfile(file_path) and file_name.endswith(".txt"):
63
- loader = TextLoader(file_path)
64
- docs.extend(loader.load())
65
- return docs
66
-
67
- def retriever_from_docs(docs):
68
- if not docs:
69
- print("No documents to process.")
70
- return
71
- #print("Documents:", docs)
72
-
73
- # Split the documents into smaller chunks
74
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
75
- splits = text_splitter.split_documents(docs)
76
- print(f"Number of document chunks: {len(splits)}")
77
-
78
- # Create embeddings for the document chunks
79
- #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # 384 dimensionality embeddings
80
- embeddings = OpenAIEmbeddings() # 1536 dimensionality
81
- #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # 768 embedded dimension
82
- #embeddings = HuggingFaceEmbeddings(model_name="bert-large-uncased") # 1024 dim
83
-
84
- embeddings_list = embeddings.embed_documents([t.page_content for t in splits])
85
-
86
- # Generate unique IDs for each document chunk
87
- doc_ids = [str(uuid.uuid4()) for _ in range(len(splits))]
88
- print(f"Number of IDs generated: {len(doc_ids)}")
89
-
90
- # Create or load the Chroma vector store
91
- persist_directory="../../chroma_db"
92
-
93
- # Check if the directory exists
94
- if os.path.exists(persist_directory):
95
- # Remove the directory and its contents
96
- #shutil.rmtree(persist_directory)
97
- #print(f"Deleted {persist_directory}")
98
-
99
- # Load the existing vector store
100
- #chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
101
-
102
- #print()
103
-
104
- # Create a new vector store
105
- chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings,
106
- persist_directory=persist_directory)
107
-
108
- # Load the existing vector store
109
- chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
110
-
111
- chroma_store.add_texts([t.page_content for t in splits], embeddings=embeddings_list, ids=doc_ids)
112
-
113
-
114
- else:
115
- print(f"{persist_directory} does not exist")
116
- # Create a new vector store
117
- chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings,
118
- persist_directory=persist_directory)
119
-
120
- #Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(),persist_directory="../../chroma_db")
121
- #chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="../../chroma_db")
122
-
123
- # Is used to add new documents and their corresponding embeddings to an existing Chroma vector store.
124
- #chroma_store.add_texts([t.page_content for t in splits], embeddings=embeddings_list, ids=doc_ids)
125
-
126
- print("Embeddings are added to vector store.")
127
-
128
-
129
- def main():
130
- print(sys.argv)
131
- load_dotenv()
132
-
133
- #file_address = "../../../db_28_2_text/db_28_2_text/"
134
- #file_address = "../../../db_28_2_excel/db_28_2_excel/"
135
- file_address = "../../../International Job Dataset/allJobs.xlsx"
136
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Technology Skills.xlsx"
137
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Tools Used.xlsx"
138
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Alternate Titles.xlsx"
139
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Emerging Tasks.xlsx" Job Zone Reference
140
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zone Reference.xlsx"
141
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zones.xlsx"
142
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Occupation Data.xlsx"
143
- #file_address = "../../../db_28_2_excel/db_28_2_excel/Related Occupations.xlsx"
144
-
145
- # Check if the file_address exists
146
- if not os.path.exists(file_address):
147
- print("File address does not exist.")
148
- return
149
-
150
- # Determine the input type and load the documents accordingly
151
- if 'http' in sys.argv[1].lower():
152
- retriever_from_docs(from_web(sys.argv[1]))
153
- elif '.xls' in sys.argv[1].lower():
154
- retriever_from_docs(from_excel(sys.argv[1]))
155
- elif '.csv' in sys.argv[1].lower():
156
- retriever_from_docs(from_csv(sys.argv[1]))
157
- elif '.pdf' in sys.argv[1].lower():
158
- retriever_from_docs(from_pdf(sys.argv[1]))
159
- elif '.txt' in sys.argv[1].lower():
160
- retriever_from_docs(from_text_files(sys.argv[1]))
161
- elif 'excel' in sys.argv[1].lower():
162
- retriever_from_docs(from_excel(sys.argv[1]))
163
- elif 'text' in sys.argv[1].lower():
164
- retriever_from_docs(from_text_files(sys.argv[1]))
165
- else:
166
- print(f"Unsupported file format for file.")
167
-
168
- if __name__ == "__main__":
169
  main()
 
1
+ import bs4
2
+ import pandas as pd
3
+ from langchain import hub
4
+ from langchain_community.document_loaders import WebBaseLoader
5
+ from langchain_community.vectorstores import Chroma
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.runnables import RunnablePassthrough
8
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ #from langchain.document_loaders import PyPDFLoader, CSVLoader, ExcelLoader
11
+ from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
12
+ from langchain_community.document_loaders import UnstructuredExcelLoader
13
+ #from langchain.text_splitter import CharacterTextSplitter
14
+ from langchain_community.embeddings import HuggingFaceEmbeddings
15
+ from dotenv import load_dotenv
16
+ import sys
17
+ import shutil
18
+ import os
19
+ import uuid
20
+ import csv
21
+
22
+ def from_web(url):
23
+ loader = WebBaseLoader(web_paths=(url,),
24
+ bs_kwargs=dict(parse_only=bs4.SoupStrainer(
25
+ class_=("post-content", "post-title", "post-header")
26
+ )),)
27
+ docs = loader.load()
28
+ return docs
29
+
30
+ def from_excel(file_address):
31
+ if file_address.endswith(".xlsx"):
32
+ loader = UnstructuredExcelLoader(file_path=file_address)
33
+ docs = loader.load()
34
+ return docs
35
+ else:
36
+ docs = []
37
+ for file_name in os.listdir(file_address):
38
+ file_path = os.path.join(file_address, file_name)
39
+ if os.path.isfile(file_path) and file_name.endswith(".xlsx"):
40
+ # Load the Excel file
41
+ loader = UnstructuredExcelLoader(file_path=file_address)
42
+ docs.extend(loader.load())
43
+ return docs
44
+
45
+ def from_csv(file_address):
46
+ docs = []
47
+ #Load the CSV file
48
+ if file_address.endswith(".csv"):
49
+ loader = CSVLoader(file_path=file_address, encoding='utf-8')
50
+ docs = loader.load()
51
+ return docs
52
+
53
+ def from_pdf(file_address):
54
+ loader = PyPDFLoader(file_path=file_address)
55
+ docs = loader.load()
56
+ return docs
57
+
58
+ def from_text_files(file_address):
59
+ docs = []
60
+ for file_name in os.listdir(file_address):
61
+ file_path = os.path.join(file_address, file_name)
62
+ if os.path.isfile(file_path) and file_name.endswith(".txt"):
63
+ loader = TextLoader(file_path)
64
+ docs.extend(loader.load())
65
+ return docs
66
+
67
+ def retriever_from_docs(docs):
68
+ if not docs:
69
+ print("No documents to process.")
70
+ return
71
+ #print("Documents:", docs)
72
+
73
+ # Split the documents into smaller chunks
74
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
75
+ splits = text_splitter.split_documents(docs)
76
+ print(f"Number of document chunks: {len(splits)}")
77
+
78
+ # Create embeddings for the document chunks
79
+ #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # 384 dimensionality embeddings
80
+ embeddings = OpenAIEmbeddings() # 1536 dimensionality
81
+ #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # 768 embedded dimension
82
+ #embeddings = HuggingFaceEmbeddings(model_name="bert-large-uncased") # 1024 dim
83
+
84
+ embeddings_list = embeddings.embed_documents([t.page_content for t in splits])
85
+
86
+ # Generate unique IDs for each document chunk
87
+ doc_ids = [str(uuid.uuid4()) for _ in range(len(splits))]
88
+ print(f"Number of IDs generated: {len(doc_ids)}")
89
+
90
+ # Create or load the Chroma vector store
91
+ persist_directory="../../chroma_db"
92
+
93
+ # Check if the directory exists
94
+ if os.path.exists(persist_directory):
95
+ # Remove the directory and its contents
96
+ #shutil.rmtree(persist_directory)
97
+ #print(f"Deleted {persist_directory}")
98
+
99
+ # Load the existing vector store
100
+ #chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
101
+
102
+ #print()
103
+
104
+ # Create a new vector store
105
+ chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings,
106
+ persist_directory=persist_directory)
107
+
108
+ # Load the existing vector store
109
+ chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
110
+
111
+ chroma_store.add_texts([t.page_content for t in splits], embeddings=embeddings_list, ids=doc_ids)
112
+
113
+
114
+ else:
115
+ print(f"{persist_directory} does not exist")
116
+ # Create a new vector store
117
+ chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings,
118
+ persist_directory=persist_directory)
119
+
120
+ #Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(),persist_directory="../../chroma_db")
121
+ #chroma_store = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="../../chroma_db")
122
+
123
+ # Is used to add new documents and their corresponding embeddings to an existing Chroma vector store.
124
+ #chroma_store.add_texts([t.page_content for t in splits], embeddings=embeddings_list, ids=doc_ids)
125
+
126
+ print("Embeddings are added to vector store.")
127
+
128
+
129
+ def main():
130
+ print(sys.argv)
131
+ load_dotenv()
132
+
133
+ #file_address = "../../../db_28_2_text/db_28_2_text/"
134
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/"
135
+ file_address = "../../../International Job Dataset/allJobs.xlsx"
136
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Technology Skills.xlsx"
137
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Tools Used.xlsx"
138
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Alternate Titles.xlsx"
139
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Emerging Tasks.xlsx" Job Zone Reference
140
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zone Reference.xlsx"
141
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zones.xlsx"
142
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Occupation Data.xlsx"
143
+ #file_address = "../../../db_28_2_excel/db_28_2_excel/Related Occupations.xlsx"
144
+
145
+ # Check if the file_address exists
146
+ if not os.path.exists(file_address):
147
+ print("File address does not exist.")
148
+ return
149
+
150
+ # Determine the input type and load the documents accordingly
151
+ if 'http' in sys.argv[1].lower():
152
+ retriever_from_docs(from_web(sys.argv[1]))
153
+ elif '.xls' in sys.argv[1].lower():
154
+ retriever_from_docs(from_excel(sys.argv[1]))
155
+ elif '.csv' in sys.argv[1].lower():
156
+ retriever_from_docs(from_csv(sys.argv[1]))
157
+ elif '.pdf' in sys.argv[1].lower():
158
+ retriever_from_docs(from_pdf(sys.argv[1]))
159
+ elif '.txt' in sys.argv[1].lower():
160
+ retriever_from_docs(from_text_files(sys.argv[1]))
161
+ elif 'excel' in sys.argv[1].lower():
162
+ retriever_from_docs(from_excel(sys.argv[1]))
163
+ elif 'text' in sys.argv[1].lower():
164
+ retriever_from_docs(from_text_files(sys.argv[1]))
165
+ else:
166
+ print(f"Unsupported file format for file.")
167
+
168
+ if __name__ == "__main__":
169
  main()
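Two things worth noting about this script as committed: `main()` checks the hard-coded `file_address` for existence but then dispatches on `sys.argv[1]`, and when `../../chroma_db` already exists the same splits appear to be written twice (once via `Chroma.from_documents`, once via `add_texts`). For the read side, a minimal sketch of querying the persisted store the way `langchain_agent.py` opens it (paths assumed, query illustrative):

```python
# Sketch: query the Chroma store that indexing.py persists to ../../chroma_db.
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

load_dotenv()
db = Chroma(persist_directory="../../chroma_db", embedding_function=OpenAIEmbeddings())
retriever = db.as_retriever(search_kwargs={"k": 4})

for doc in retriever.get_relevant_documents("skills required for a machine learning engineer"):
    print(doc.page_content[:200], "\n---")
```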
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01014d173911620f29e57c04fc1ff8e29816f5044fe8d0045acfb9312e9965ed
3
- size 1619869696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a18d7993408eda3ff5ce877beff0766b906a263a36108fcb788e7444827bb96e
3
+ size 5914836992
langchain.md CHANGED
@@ -1,62 +1,62 @@
1
- # Langchain
2
- **Source: https://python.langchain.com/docs**
3
- LangChain is a framework for developing applications powered by language models. It enables applications that:
4
- **Are context-aware** i.e. connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.)
5
- **Reason** i.e. rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)
6
-
7
- **Other Packages:**
8
- * **LangServe:** A library for deploying LangChain chains as a REST API.
9
- * **LangSmith:** A developer platform that lets you debug, test, evaluate, and monitor chains built on any LLM framework and seamlessly integrates with LangChain.
10
-
11
- **LangChain Libraries:**
12
- The LangChain libraries themselves are made up of several different packages.
13
- - **langchain-core:** Base abstractions and LangChain Expression Language.
14
- - **langchain-community:** Third party integrations.
15
- - **langchain:** Chains, agents, and retrieval strategies that make up an application's cognitive architecture.
16
-
17
- **LangChain Components:**
18
- 1. LangChain Expression Language
19
- 2. Modules:
20
- * Model I/O: Interface with language models
21
- * Retrieval: Interface with application-specific data
22
- * Agents: Let chains choose which tools to use given high-level directives
23
-
24
- Additional:
25
- * Chains: Common, building block compositions
26
- * Memory: Persist application state between runs of a chain
27
- * Callbacks: Log and stream intermediate steps of any chain
28
-
29
- ## LangChain Expression Language (LCEL)
30
- LangChain Expression Language, or LCEL, is a declarative way to easily compose chains together. LCEL was designed from day 1 to support putting prototypes in production, with no code changes, from the simplest “prompt + LLM” chain to the most complex chains (we’ve seen folks successfully run LCEL chains with 100s of steps in production). To highlight a few of the reasons you might want to use LCEL:
31
- * Streaming support
32
- * Async support
33
- * Optimized parallel execution
34
- * Retries and fallbacks
35
- * Access intermediate results
36
- * Input and output schemas
37
- * Seamless LangSmith tracing integration
38
- * Seamless LangServe deployment integration
39
-
40
- ## Modules:
41
- * **Model I/O:** The core element of any language model application is the model. LangChain gives you the building blocks to interface with any language model.
42
- * **Prompts:** different types of prompt templates.
43
- * **LLMs:** functionality related to the LLM class. This is a type of model that takes a text string as input and returns a text string.
44
- * **ChatModels:** functionality related to the ChatModel class. This is a type of model that takes a list of messages as input and returns a message.
45
- * **Output Parsers:** responsible for transforming the output of LLMs and ChatModels into more structured data.
46
- * **Retrieval:** Many LLM applications require user-specific data that is not part of the model's training set. The primary way of accomplishing this is through Retrieval Augmented Generation (RAG). In this process, external data is retrieved and then passed to the LLM when doing the generation step.
47
- * **Document loaders:** load documents from many different sources. LangChain provides over 100 different document loaders as well as integrations with other major providers. LangChain provides integrations to load all types of documents (HTML, PDF, code).
48
- * **Text Splitting:** splitting (or chunking) a large document into smaller chunks. LangChain provides several transformation algorithms for doing this.
49
- * **Text embedding models:** integrations with over 25 different embedding providers and methods, from open-source to proprietary API. LangChain provides a standard interface, allowing you to easily swap between models.
50
- * **Vector stores:** integrations with over 50 different vector stores, from open-source local ones to cloud-hosted proprietary ones. LangChain exposes a standard interface, allowing you to easily swap between vector stores.
51
- * **Retrievers:** LangChain supports basic methods that are easy to get started - namely simple semantic search. However, we have also added a collection of algorithms on top of this to increase performance.
52
- * **Indexing:** The LangChain Indexing API syncs your data from any source into a vector store.
53
- * **Agents:** The core idea of agents is to use a language model to choose a sequence of actions to take. In chains, a sequence of actions is hardcoded (in code). In agents, a language model is used as a reasoning engine to determine which actions to take and in which order. (Action Generation)
54
- * **Agent Types:** There are many different types of agents to use.
55
- * **Tools:** Agents are only as good as the tools they have.
56
- * **Chains:** refer to sequences of calls - whether to an LLM, a tool, or a data preprocessing step. The primary supported way to do this is with LCEL.
57
-
58
- LCEL is great for constructing your own chains, but it’s also nice to have chains that you can use off-the-shelf. There are two types of off-the-shelf chains that LangChain supports:
59
-
60
- * Chains that are built with LCEL. In this case, LangChain offers a higher-level constructor method. However, all that is being done under the hood is constructing a chain with LCEL.
61
-
62
  * [Legacy] Chains constructed by subclassing from a legacy Chain class. These chains do not use LCEL under the hood but are rather standalone classes.
 
1
+ # Langchain
2
+ **Source: https://python.langchain.com/docs**
3
+ LangChain is a framework for developing applications powered by language models. It enables applications that:
4
+ **Are context-aware** i.e. connect a language model to sources of context (prompt instructions, few shot examples, content to ground its response in, etc.)
5
+ **Reason** i.e. rely on a language model to reason (about how to answer based on provided context, what actions to take, etc.)
6
+
7
+ **Other Packages:**
8
+ * **LangServe:** A library for deploying LangChain chains as a REST API.
9
+ * **LangSmith:** A developer platform that lets you debug, test, evaluate, and monitor chains built on any LLM framework and seamlessly integrates with LangChain.
10
+
11
+ **LangChain Libraries:**
12
+ The LangChain libraries themselves are made up of several different packages.
13
+ - **langchain-core:** Base abstractions and LangChain Expression Language.
14
+ - **langchain-community:** Third party integrations.
15
+ - **langchain:** Chains, agents, and retrieval strategies that make up an application's cognitive architecture.
16
+
17
+ **LangChain Components:**
18
+ 1. LangChain Expression Language
19
+ 2. Modules:
20
+ * Model I/O: Interface with language models
21
+ * Retrieval: Interface with application-specific data
22
+ * Agents: Let chains choose which tools to use given high-level directives
23
+
24
+ Additional:
25
+ * Chains: Common, building block compositions
26
+ * Memory: Persist application state between runs of a chain
27
+ * Callbacks: Log and stream intermediate steps of any chain
28
+
29
+ ## LangChain Expression Language (LCEL)
30
+ LangChain Expression Language, or LCEL, is a declarative way to easily compose chains together. LCEL was designed from day 1 to support putting prototypes in production, with no code changes, from the simplest “prompt + LLM” chain to the most complex chains (we’ve seen folks successfully run LCEL chains with 100s of steps in production). To highlight a few of the reasons you might want to use LCEL:
31
+ * Streaming support
32
+ * Async support
33
+ * Optimized parallel execution
34
+ * Retries and fallbacks
35
+ * Access intermediate results
36
+ * Input and output schemas
37
+ * Seamless LangSmith tracing integration
38
+ * Seamless LangServe deployment integration
39
+
40
+ ## Modules:
41
+ * **Model I/O:** The core element of any language model application is the model. LangChain gives you the building blocks to interface with any language model.
42
+ * **Prompts:** different types of prompt templates.
43
+ * **LLMs:** functionality related to the LLM class. This is a type of model that takes a text string as input and returns a text string.
44
+ * **ChatModels:** functionality related to the ChatModel class. This is a type of model that takes a list of messages as input and returns a message.
45
+ * **Output Parsers:** responsible for transforming the output of LLMs and ChatModels into more structured data.
46
+ * **Retrieval:** Many LLM applications require user-specific data that is not part of the model's training set. The primary way of accomplishing this is through Retrieval Augmented Generation (RAG). In this process, external data is retrieved and then passed to the LLM when doing the generation step.
47
+ * **Document loaders:** load documents from many different sources. LangChain provides over 100 different document loaders as well as integrations with other major providers. LangChain provides integrations to load all types of documents (HTML, PDF, code).
48
+ * **Text Splitting:** splitting (or chunking) a large document into smaller chunks. LangChain provides several transformation algorithms for doing this.
49
+ * **Text embedding models:** integrations with over 25 different embedding providers and methods, from open-source to proprietary API. LangChain provides a standard interface, allowing you to easily swap between models.
50
+ * **Vector stores:** integrations with over 50 different vector stores, from open-source local ones to cloud-hosted proprietary ones. LangChain exposes a standard interface, allowing you to easily swap between vector stores.
51
+ * **Retrievers:** LangChain supports basic methods that are easy to get started - namely simple semantic search. However, we have also added a collection of algorithms on top of this to increase performance.
52
+ * **Indexing:** The LangChain Indexing API syncs your data from any source into a vector store.
53
+ * **Agents:** The core idea of agents is to use a language model to choose a sequence of actions to take. In chains, a sequence of actions is hardcoded (in code). In agents, a language model is used as a reasoning engine to determine which actions to take and in which order. (Action Generation)
54
+ * **Agent Types:** There are many different types of agents to use.
55
+ * **Tools:** Agents are only as good as the tools they have.
56
+ * **Chains:** refer to sequences of calls - whether to an LLM, a tool, or a data preprocessing step. The primary supported way to do this is with LCEL.
57
+
58
+ LCEL is great for constructing your own chains, but it’s also nice to have chains that you can use off-the-shelf. There are two types of off-the-shelf chains that LangChain supports:
59
+
60
+ * Chains that are built with LCEL. In this case, LangChain offers a higher-level constructor method. However, all that is being done under the hood is constructing a chain with LCEL.
61
+
62
  * [Legacy] Chains constructed by subclassing from a legacy Chain class. These chains do not use LCEL under the hood but are rather standalone classes.
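To make the LCEL description above concrete, a minimal "prompt + LLM" chain sketch (model name and prompt are illustrative, not from this repository):

```python
# Sketch: the simplest LCEL composition described in langchain.md: prompt | model | parser.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template("Summarize the role of a {job_title} in one sentence.")
chain = prompt | ChatOpenAI(model="gpt-3.5-turbo", temperature=0) | StrOutputParser()

print(chain.invoke({"job_title": "data engineer"}))  # sync; .ainvoke and .stream also exist
```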
notebooks/app.ipynb CHANGED
@@ -1,116 +1,116 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "'''\n",
10
- "requirements.txt file contents:\n",
11
- " \n",
12
- "langchain==0.0.154\n",
13
- "PyPDF2==3.0.1\n",
14
- "python-dotenv==1.0.0\n",
15
- "streamlit==1.18.1\n",
16
- "faiss-cpu==1.7.4\n",
17
- "streamlit-extras\n",
18
- "'''\n",
19
- " \n",
20
- " \n",
21
- "import streamlit as st\n",
22
- "from dotenv import load_dotenv\n",
23
- "import pickle\n",
24
- "from PyPDF2 import PdfReader\n",
25
- "from streamlit_extras.add_vertical_space import add_vertical_space\n",
26
- "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
27
- "from langchain.embeddings.openai import OpenAIEmbeddings\n",
28
- "from langchain.vectorstores import FAISS\n",
29
- "from langchain.llms import OpenAI\n",
30
- "from langchain.chains.question_answering import load_qa_chain\n",
31
- "from langchain.callbacks import get_openai_callback\n",
32
- "import os\n",
33
- " \n",
34
- "# Sidebar contents\n",
35
- "with st.sidebar:\n",
36
- " st.title('🤗💬 LLM Chat App')\n",
37
- " st.markdown('''\n",
38
- " ## About\n",
39
- " This app is an LLM-powered chatbot built using:\n",
40
- " - [Streamlit](https://streamlit.io/)\n",
41
- " - [LangChain](https://python.langchain.com/)\n",
42
- " - [OpenAI](https://platform.openai.com/docs/models) LLM model\n",
43
- " \n",
44
- " ''')\n",
45
- " add_vertical_space(5)\n",
46
- " st.write('Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)')\n",
47
- " \n",
48
- "load_dotenv()\n",
49
- " \n",
50
- "def main():\n",
51
- " st.header(\"Chat with PDF 💬\")\n",
52
- " \n",
53
- " \n",
54
- " # upload a PDF file\n",
55
- " pdf = st.file_uploader(\"Upload your PDF\", type='pdf')\n",
56
- " \n",
57
- " # st.write(pdf)\n",
58
- " if pdf is not None:\n",
59
- " pdf_reader = PdfReader(pdf)\n",
60
- " \n",
61
- " text = \"\"\n",
62
- " for page in pdf_reader.pages:\n",
63
- " text += page.extract_text()\n",
64
- " \n",
65
- " text_splitter = RecursiveCharacterTextSplitter(\n",
66
- " chunk_size=1000,\n",
67
- " chunk_overlap=200,\n",
68
- " length_function=len\n",
69
- " )\n",
70
- " chunks = text_splitter.split_text(text=text)\n",
71
- " \n",
72
- " # # embeddings\n",
73
- " store_name = pdf.name[:-4]\n",
74
- " st.write(f'{store_name}')\n",
75
- " # st.write(chunks)\n",
76
- " \n",
77
- " if os.path.exists(f\"{store_name}.pkl\"):\n",
78
- " with open(f\"{store_name}.pkl\", \"rb\") as f:\n",
79
- " VectorStore = pickle.load(f)\n",
80
- " # st.write('Embeddings Loaded from the Disk')s\n",
81
- " else:\n",
82
- " embeddings = OpenAIEmbeddings()\n",
83
- " VectorStore = FAISS.from_texts(chunks, embedding=embeddings)\n",
84
- " with open(f\"{store_name}.pkl\", \"wb\") as f:\n",
85
- " pickle.dump(VectorStore, f)\n",
86
- " \n",
87
- " # embeddings = OpenAIEmbeddings()\n",
88
- " # VectorStore = FAISS.from_texts(chunks, embedding=embeddings)\n",
89
- " \n",
90
- " # Accept user questions/query\n",
91
- " query = st.text_input(\"Ask questions about your PDF file:\")\n",
92
- " # st.write(query)\n",
93
- " \n",
94
- " if query:\n",
95
- " docs = VectorStore.similarity_search(query=query, k=3)\n",
96
- " \n",
97
- " llm = OpenAI()\n",
98
- " chain = load_qa_chain(llm=llm, chain_type=\"stuff\")\n",
99
- " with get_openai_callback() as cb:\n",
100
- " response = chain.run(input_documents=docs, question=query)\n",
101
- " print(cb)\n",
102
- " st.write(response)\n",
103
- " \n",
104
- "if __name__ == '__main__':\n",
105
- " main()"
106
- ]
107
- }
108
- ],
109
- "metadata": {
110
- "language_info": {
111
- "name": "python"
112
- }
113
- },
114
- "nbformat": 4,
115
- "nbformat_minor": 2
116
- }
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "'''\n",
10
+ "requirements.txt file contents:\n",
11
+ " \n",
12
+ "langchain==0.0.154\n",
13
+ "PyPDF2==3.0.1\n",
14
+ "python-dotenv==1.0.0\n",
15
+ "streamlit==1.18.1\n",
16
+ "faiss-cpu==1.7.4\n",
17
+ "streamlit-extras\n",
18
+ "'''\n",
19
+ " \n",
20
+ " \n",
21
+ "import streamlit as st\n",
22
+ "from dotenv import load_dotenv\n",
23
+ "import pickle\n",
24
+ "from PyPDF2 import PdfReader\n",
25
+ "from streamlit_extras.add_vertical_space import add_vertical_space\n",
26
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
27
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
28
+ "from langchain.vectorstores import FAISS\n",
29
+ "from langchain.llms import OpenAI\n",
30
+ "from langchain.chains.question_answering import load_qa_chain\n",
31
+ "from langchain.callbacks import get_openai_callback\n",
32
+ "import os\n",
33
+ " \n",
34
+ "# Sidebar contents\n",
35
+ "with st.sidebar:\n",
36
+ " st.title('🤗💬 LLM Chat App')\n",
37
+ " st.markdown('''\n",
38
+ " ## About\n",
39
+ " This app is an LLM-powered chatbot built using:\n",
40
+ " - [Streamlit](https://streamlit.io/)\n",
41
+ " - [LangChain](https://python.langchain.com/)\n",
42
+ " - [OpenAI](https://platform.openai.com/docs/models) LLM model\n",
43
+ " \n",
44
+ " ''')\n",
45
+ " add_vertical_space(5)\n",
46
+ " st.write('Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)')\n",
47
+ " \n",
48
+ "load_dotenv()\n",
49
+ " \n",
50
+ "def main():\n",
51
+ " st.header(\"Chat with PDF 💬\")\n",
52
+ " \n",
53
+ " \n",
54
+ " # upload a PDF file\n",
55
+ " pdf = st.file_uploader(\"Upload your PDF\", type='pdf')\n",
56
+ " \n",
57
+ " # st.write(pdf)\n",
58
+ " if pdf is not None:\n",
59
+ " pdf_reader = PdfReader(pdf)\n",
60
+ " \n",
61
+ " text = \"\"\n",
62
+ " for page in pdf_reader.pages:\n",
63
+ " text += page.extract_text()\n",
64
+ " \n",
65
+ " text_splitter = RecursiveCharacterTextSplitter(\n",
66
+ " chunk_size=1000,\n",
67
+ " chunk_overlap=200,\n",
68
+ " length_function=len\n",
69
+ " )\n",
70
+ " chunks = text_splitter.split_text(text=text)\n",
71
+ " \n",
72
+ " # # embeddings\n",
73
+ " store_name = pdf.name[:-4]\n",
74
+ " st.write(f'{store_name}')\n",
75
+ " # st.write(chunks)\n",
76
+ " \n",
77
+ " if os.path.exists(f\"{store_name}.pkl\"):\n",
78
+ " with open(f\"{store_name}.pkl\", \"rb\") as f:\n",
79
+ " VectorStore = pickle.load(f)\n",
80
+ " # st.write('Embeddings Loaded from the Disk')s\n",
81
+ " else:\n",
82
+ " embeddings = OpenAIEmbeddings()\n",
83
+ " VectorStore = FAISS.from_texts(chunks, embedding=embeddings)\n",
84
+ " with open(f\"{store_name}.pkl\", \"wb\") as f:\n",
85
+ " pickle.dump(VectorStore, f)\n",
86
+ " \n",
87
+ " # embeddings = OpenAIEmbeddings()\n",
88
+ " # VectorStore = FAISS.from_texts(chunks, embedding=embeddings)\n",
89
+ " \n",
90
+ " # Accept user questions/query\n",
91
+ " query = st.text_input(\"Ask questions about your PDF file:\")\n",
92
+ " # st.write(query)\n",
93
+ " \n",
94
+ " if query:\n",
95
+ " docs = VectorStore.similarity_search(query=query, k=3)\n",
96
+ " \n",
97
+ " llm = OpenAI()\n",
98
+ " chain = load_qa_chain(llm=llm, chain_type=\"stuff\")\n",
99
+ " with get_openai_callback() as cb:\n",
100
+ " response = chain.run(input_documents=docs, question=query)\n",
101
+ " print(cb)\n",
102
+ " st.write(response)\n",
103
+ " \n",
104
+ "if __name__ == '__main__':\n",
105
+ " main()"
106
+ ]
107
+ }
108
+ ],
109
+ "metadata": {
110
+ "language_info": {
111
+ "name": "python"
112
+ }
113
+ },
114
+ "nbformat": 4,
115
+ "nbformat_minor": 2
116
+ }
notebooks/langchain_agent.ipynb CHANGED
@@ -1,165 +1,165 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 19,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "from typing import Optional, Type\n",
10
- "from langchain_core.pydantic_v1 import BaseModel, Field\n",
11
- "from langchain_core.tools import BaseTool\n",
12
- "\n",
13
- "class MultiplierInput(BaseModel):\n",
14
- " a: int = Field(description=\"first number\")\n",
15
- " b: int = Field(description=\"second number\")\n",
16
- "\n",
17
- "class CustomMultiplierTool(BaseTool):\n",
18
- " name = \"Calculator\"\n",
19
- " description = \"useful for when you need to answer questions about math\"\n",
20
- " args_schema: Type[BaseModel] = MultiplierInput\n",
21
- "\n",
22
- " def _run(\n",
23
- " self, a: int, b: int\n",
24
- " ) -> str:\n",
25
- " \"\"\"Use the tool.\"\"\"\n",
26
- " return a * b\n",
27
- "\n",
28
- " async def _arun(\n",
29
- " self,\n",
30
- " a: int,\n",
31
- " b: int,\n",
32
- " ) -> str:\n",
33
- " \"\"\"Use the tool asynchronously.\"\"\"\n",
34
- " return self._run(a,b)"
35
- ]
36
- },
37
- {
38
- "cell_type": "code",
39
- "execution_count": 26,
40
- "metadata": {},
41
- "outputs": [],
42
- "source": [
43
- "from langchain.agents import AgentExecutor, create_openai_tools_agent,Tool\n",
44
- "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
45
- "from langchain_openai import ChatOpenAI\n",
46
- "from langchain.memory import ConversationBufferMemory\n",
47
- "from langchain.prompts import MessagesPlaceholder\n",
48
- "from langchain.globals import set_verbose\n",
49
- "import dotenv\n",
50
- "\n",
51
- "dotenv.load_dotenv()\n",
52
- "\n",
53
- "def create_agent(model_name):\n",
54
- "\n",
55
- " memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
56
- " \n",
57
- " tools = [CustomMultiplierTool()]\n",
58
- " \n",
59
- " llm = ChatOpenAI(model=model_name, temperature=0)\n",
60
- "\n",
61
- " system_message = \"\"\"You are a general AI assistant.\n",
62
- " You have access to several tool. Don't answer the question if you are not getting the answer from a tool.\n",
63
- " Don't put safety and cultural warnings. Only warn about security. \n",
64
- " \"\"\"\n",
65
- "\n",
66
- " prompt = ChatPromptTemplate.from_messages(\n",
67
- " [\n",
68
- " (\"system\", system_message),\n",
69
- " MessagesPlaceholder(\"chat_history\", optional=True),\n",
70
- " (\"human\", \"{input}\"),\n",
71
- " MessagesPlaceholder(\"agent_scratchpad\"),\n",
72
- " ]\n",
73
- " )\n",
74
- " agent = create_openai_tools_agent(llm, tools, prompt)\n",
75
- " agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory)\n",
76
- " return agent_exe\n",
77
- "\n",
78
- "async def run_agent(agent,user_query):\n",
79
- " #print(agent.memory.chat_memory.messages[-2:] if len(agent.memory.chat_memory.messages) > 1 else \"\")\n",
80
- " #set_verbose(True)\n",
81
- " # print(agent.memory.chat_memory)\n",
82
- " print('********************')\n",
83
- " # print()\n",
84
- " return await agent.ainvoke(input={\"input\":user_query},verbose=True)"
85
- ]
86
- },
87
- {
88
- "cell_type": "code",
89
- "execution_count": 29,
90
- "metadata": {},
91
- "outputs": [],
92
- "source": [
93
- "agent_exe = create_agent(\"gpt-3.5-turbo\")"
94
- ]
95
- },
96
- {
97
- "cell_type": "code",
98
- "execution_count": 30,
99
- "metadata": {},
100
- "outputs": [
101
- {
102
- "name": "stdout",
103
- "output_type": "stream",
104
- "text": [
105
- "********************\n",
106
- "\n",
107
- "\n",
108
- "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
109
- "\u001b[32;1m\u001b[1;3m\n",
110
- "Invoking: `Calculator` with `{'a': 2, 'b': 3}`\n",
111
- "\n",
112
- "\n",
113
- "\u001b[0m\u001b[36;1m\u001b[1;3m6\u001b[0m\u001b[32;1m\u001b[1;3m2 multiplied by 3 is equal to 6.\u001b[0m\n",
114
- "\n",
115
- "\u001b[1m> Finished chain.\u001b[0m\n"
116
- ]
117
- },
118
- {
119
- "data": {
120
- "text/plain": [
121
- "{'input': '2 multiply by 3',\n",
122
- " 'chat_history': [HumanMessage(content='2 multiply by 3'),\n",
123
- " AIMessage(content='2 multiplied by 3 is equal to 6.')],\n",
124
- " 'output': '2 multiplied by 3 is equal to 6.'}"
125
- ]
126
- },
127
- "execution_count": 30,
128
- "metadata": {},
129
- "output_type": "execute_result"
130
- }
131
- ],
132
- "source": [
133
- "await run_agent(agent_exe,\"2 multiply by 3\")"
134
- ]
135
- },
136
- {
137
- "cell_type": "code",
138
- "execution_count": null,
139
- "metadata": {},
140
- "outputs": [],
141
- "source": []
142
- }
143
- ],
144
- "metadata": {
145
- "kernelspec": {
146
- "display_name": "stevens-chat-py311-v1",
147
- "language": "python",
148
- "name": "python3"
149
- },
150
- "language_info": {
151
- "codemirror_mode": {
152
- "name": "ipython",
153
- "version": 3
154
- },
155
- "file_extension": ".py",
156
- "mimetype": "text/x-python",
157
- "name": "python",
158
- "nbconvert_exporter": "python",
159
- "pygments_lexer": "ipython3",
160
- "version": "3.11.6"
161
- }
162
- },
163
- "nbformat": 4,
164
- "nbformat_minor": 2
165
- }
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from typing import Optional, Type\n",
10
+ "from langchain_core.pydantic_v1 import BaseModel, Field\n",
11
+ "from langchain_core.tools import BaseTool\n",
12
+ "\n",
13
+ "class MultiplierInput(BaseModel):\n",
14
+ " a: int = Field(description=\"first number\")\n",
15
+ " b: int = Field(description=\"second number\")\n",
16
+ "\n",
17
+ "class CustomMultiplierTool(BaseTool):\n",
18
+ " name = \"Calculator\"\n",
19
+ " description = \"useful for when you need to answer questions about math\"\n",
20
+ " args_schema: Type[BaseModel] = MultiplierInput\n",
21
+ "\n",
22
+ " def _run(\n",
23
+ " self, a: int, b: int\n",
24
+ " ) -> str:\n",
25
+ " \"\"\"Use the tool.\"\"\"\n",
26
+ " return a * b\n",
27
+ "\n",
28
+ " async def _arun(\n",
29
+ " self,\n",
30
+ " a: int,\n",
31
+ " b: int,\n",
32
+ " ) -> str:\n",
33
+ " \"\"\"Use the tool asynchronously.\"\"\"\n",
34
+ " return self._run(a,b)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 26,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "from langchain.agents import AgentExecutor, create_openai_tools_agent,Tool\n",
44
+ "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
45
+ "from langchain_openai import ChatOpenAI\n",
46
+ "from langchain.memory import ConversationBufferMemory\n",
47
+ "from langchain.prompts import MessagesPlaceholder\n",
48
+ "from langchain.globals import set_verbose\n",
49
+ "import dotenv\n",
50
+ "\n",
51
+ "dotenv.load_dotenv()\n",
52
+ "\n",
53
+ "def create_agent(model_name):\n",
54
+ "\n",
55
+ " memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
56
+ " \n",
57
+ " tools = [CustomMultiplierTool()]\n",
58
+ " \n",
59
+ " llm = ChatOpenAI(model=model_name, temperature=0)\n",
60
+ "\n",
61
+ " system_message = \"\"\"You are a general AI assistant.\n",
62
+ " You have access to several tool. Don't answer the question if you are not getting the answer from a tool.\n",
63
+ " Don't put safety and cultural warnings. Only warn about security. \n",
64
+ " \"\"\"\n",
65
+ "\n",
66
+ " prompt = ChatPromptTemplate.from_messages(\n",
67
+ " [\n",
68
+ " (\"system\", system_message),\n",
69
+ " MessagesPlaceholder(\"chat_history\", optional=True),\n",
70
+ " (\"human\", \"{input}\"),\n",
71
+ " MessagesPlaceholder(\"agent_scratchpad\"),\n",
72
+ " ]\n",
73
+ " )\n",
74
+ " agent = create_openai_tools_agent(llm, tools, prompt)\n",
75
+ " agent_exe = AgentExecutor(agent=agent, tools=tools,memory=memory)\n",
76
+ " return agent_exe\n",
77
+ "\n",
78
+ "async def run_agent(agent,user_query):\n",
79
+ " #print(agent.memory.chat_memory.messages[-2:] if len(agent.memory.chat_memory.messages) > 1 else \"\")\n",
80
+ " #set_verbose(True)\n",
81
+ " # print(agent.memory.chat_memory)\n",
82
+ " print('********************')\n",
83
+ " # print()\n",
84
+ " return await agent.ainvoke(input={\"input\":user_query},verbose=True)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 29,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "agent_exe = create_agent(\"gpt-3.5-turbo\")"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 30,
99
+ "metadata": {},
100
+ "outputs": [
101
+ {
102
+ "name": "stdout",
103
+ "output_type": "stream",
104
+ "text": [
105
+ "********************\n",
106
+ "\n",
107
+ "\n",
108
+ "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
109
+ "\u001b[32;1m\u001b[1;3m\n",
110
+ "Invoking: `Calculator` with `{'a': 2, 'b': 3}`\n",
111
+ "\n",
112
+ "\n",
113
+ "\u001b[0m\u001b[36;1m\u001b[1;3m6\u001b[0m\u001b[32;1m\u001b[1;3m2 multiplied by 3 is equal to 6.\u001b[0m\n",
114
+ "\n",
115
+ "\u001b[1m> Finished chain.\u001b[0m\n"
116
+ ]
117
+ },
118
+ {
119
+ "data": {
120
+ "text/plain": [
121
+ "{'input': '2 multiply by 3',\n",
122
+ " 'chat_history': [HumanMessage(content='2 multiply by 3'),\n",
123
+ " AIMessage(content='2 multiplied by 3 is equal to 6.')],\n",
124
+ " 'output': '2 multiplied by 3 is equal to 6.'}"
125
+ ]
126
+ },
127
+ "execution_count": 30,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "await run_agent(agent_exe,\"2 multiply by 3\")"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": []
142
+ }
143
+ ],
144
+ "metadata": {
145
+ "kernelspec": {
146
+ "display_name": "stevens-chat-py311-v1",
147
+ "language": "python",
148
+ "name": "python3"
149
+ },
150
+ "language_info": {
151
+ "codemirror_mode": {
152
+ "name": "ipython",
153
+ "version": 3
154
+ },
155
+ "file_extension": ".py",
156
+ "mimetype": "text/x-python",
157
+ "name": "python",
158
+ "nbconvert_exporter": "python",
159
+ "pygments_lexer": "ipython3",
160
+ "version": "3.11.6"
161
+ }
162
+ },
163
+ "nbformat": 4,
164
+ "nbformat_minor": 2
165
+ }
notebooks/streamlit_bot.ipynb CHANGED
@@ -1,444 +1,444 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "### Image Generation Response Analysis"
8
- ]
9
- },
10
- {
11
- "cell_type": "code",
12
- "execution_count": 2,
13
- "metadata": {},
14
- "outputs": [],
15
- "source": [
16
- "from openai import OpenAI\n",
17
- "from dotenv import load_dotenv\n",
18
- "\n",
19
- "load_dotenv()\n",
20
- "\n",
21
- "client = OpenAI()\n",
22
- "\n",
23
- "response = client.images.generate(\n",
24
- " model=\"dall-e-2\",\n",
25
- " prompt=\"A Persian cat sitting on a grand Persian carpet\",\n",
26
- " size=\"1024x1024\",\n",
27
- " quality=\"standard\",\n",
28
- " n=1,\n",
29
- ")"
30
- ]
31
- },
32
- {
33
- "cell_type": "code",
34
- "execution_count": 4,
35
- "metadata": {},
36
- "outputs": [
37
- {
38
- "data": {
39
- "text/plain": [
40
- "openai.types.images_response.ImagesResponse"
41
- ]
42
- },
43
- "execution_count": 4,
44
- "metadata": {},
45
- "output_type": "execute_result"
46
- }
47
- ],
48
- "source": [
49
- "type(response)"
50
- ]
51
- },
52
- {
53
- "cell_type": "code",
54
- "execution_count": 5,
55
- "metadata": {},
56
- "outputs": [
57
- {
58
- "data": {
59
- "text/plain": [
60
- "['__abstractmethods__',\n",
61
- " '__annotations__',\n",
62
- " '__class__',\n",
63
- " '__class_getitem__',\n",
64
- " '__class_vars__',\n",
65
- " '__copy__',\n",
66
- " '__deepcopy__',\n",
67
- " '__delattr__',\n",
68
- " '__dict__',\n",
69
- " '__dir__',\n",
70
- " '__doc__',\n",
71
- " '__eq__',\n",
72
- " '__fields__',\n",
73
- " '__fields_set__',\n",
74
- " '__format__',\n",
75
- " '__ge__',\n",
76
- " '__get_pydantic_core_schema__',\n",
77
- " '__get_pydantic_json_schema__',\n",
78
- " '__getattr__',\n",
79
- " '__getattribute__',\n",
80
- " '__getstate__',\n",
81
- " '__gt__',\n",
82
- " '__hash__',\n",
83
- " '__init__',\n",
84
- " '__init_subclass__',\n",
85
- " '__iter__',\n",
86
- " '__le__',\n",
87
- " '__lt__',\n",
88
- " '__module__',\n",
89
- " '__ne__',\n",
90
- " '__new__',\n",
91
- " '__pretty__',\n",
92
- " '__private_attributes__',\n",
93
- " '__pydantic_complete__',\n",
94
- " '__pydantic_core_schema__',\n",
95
- " '__pydantic_custom_init__',\n",
96
- " '__pydantic_decorators__',\n",
97
- " '__pydantic_extra__',\n",
98
- " '__pydantic_fields_set__',\n",
99
- " '__pydantic_generic_metadata__',\n",
100
- " '__pydantic_init_subclass__',\n",
101
- " '__pydantic_parent_namespace__',\n",
102
- " '__pydantic_post_init__',\n",
103
- " '__pydantic_private__',\n",
104
- " '__pydantic_root_model__',\n",
105
- " '__pydantic_serializer__',\n",
106
- " '__pydantic_validator__',\n",
107
- " '__reduce__',\n",
108
- " '__reduce_ex__',\n",
109
- " '__repr__',\n",
110
- " '__repr_args__',\n",
111
- " '__repr_name__',\n",
112
- " '__repr_str__',\n",
113
- " '__rich_repr__',\n",
114
- " '__setattr__',\n",
115
- " '__setstate__',\n",
116
- " '__signature__',\n",
117
- " '__sizeof__',\n",
118
- " '__slots__',\n",
119
- " '__str__',\n",
120
- " '__subclasshook__',\n",
121
- " '__weakref__',\n",
122
- " '_abc_impl',\n",
123
- " '_calculate_keys',\n",
124
- " '_check_frozen',\n",
125
- " '_copy_and_set_values',\n",
126
- " '_get_value',\n",
127
- " '_iter',\n",
128
- " 'construct',\n",
129
- " 'copy',\n",
130
- " 'created',\n",
131
- " 'data',\n",
132
- " 'dict',\n",
133
- " 'from_orm',\n",
134
- " 'json',\n",
135
- " 'model_computed_fields',\n",
136
- " 'model_config',\n",
137
- " 'model_construct',\n",
138
- " 'model_copy',\n",
139
- " 'model_dump',\n",
140
- " 'model_dump_json',\n",
141
- " 'model_extra',\n",
142
- " 'model_fields',\n",
143
- " 'model_fields_set',\n",
144
- " 'model_json_schema',\n",
145
- " 'model_parametrized_name',\n",
146
- " 'model_post_init',\n",
147
- " 'model_rebuild',\n",
148
- " 'model_validate',\n",
149
- " 'model_validate_json',\n",
150
- " 'model_validate_strings',\n",
151
- " 'parse_file',\n",
152
- " 'parse_obj',\n",
153
- " 'parse_raw',\n",
154
- " 'schema',\n",
155
- " 'schema_json',\n",
156
- " 'update_forward_refs',\n",
157
- " 'validate']"
158
- ]
159
- },
160
- "execution_count": 5,
161
- "metadata": {},
162
- "output_type": "execute_result"
163
- }
164
- ],
165
- "source": [
166
- "dir(response)"
167
- ]
168
- },
169
- {
170
- "cell_type": "code",
171
- "execution_count": 6,
172
- "metadata": {},
173
- "outputs": [
174
- {
175
- "data": {
176
- "text/plain": [
177
- "ImagesResponse(created=1709567373, data=[Image(b64_json=None, revised_prompt=None, url='https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-qHtW8sFZmN0zQAdnArkbjJTj.png?st=2024-03-04T14%3A49%3A33Z&se=2024-03-04T16%3A49%3A33Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T09%3A37%3A21Z&ske=2024-03-05T09%3A37%3A21Z&sks=b&skv=2021-08-06&sig=8FAlknmnBl7aCOyoZo%2BKsy%2BIrmSSrPv9lfQBo5mmDYk%3D')])"
178
- ]
179
- },
180
- "execution_count": 6,
181
- "metadata": {},
182
- "output_type": "execute_result"
183
- }
184
- ],
185
- "source": [
186
- "response"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": 3,
192
- "metadata": {},
193
- "outputs": [
194
- {
195
- "data": {
196
- "text/plain": [
197
- "{'b64_json': None,\n",
198
- " 'revised_prompt': None,\n",
199
- " 'url': 'https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-3uw1d2UZQ6Dd8F4EAENGHZoS.png?st=2024-03-04T22%3A23%3A13Z&se=2024-03-05T00%3A23%3A13Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T14%3A54%3A11Z&ske=2024-03-05T14%3A54%3A11Z&sks=b&skv=2021-08-06&sig=b1h2vJTXMMFQaBa2vMc4eCyPIfJMEn3LfiV6WLxhto8%3D'}"
200
- ]
201
- },
202
- "execution_count": 3,
203
- "metadata": {},
204
- "output_type": "execute_result"
205
- }
206
- ],
207
- "source": [
208
- "response.data[0].__dict__"
209
- ]
210
- },
211
- {
212
- "cell_type": "code",
213
- "execution_count": 4,
214
- "metadata": {},
215
- "outputs": [
216
- {
217
- "data": {
218
- "text/plain": [
219
- "'https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-3uw1d2UZQ6Dd8F4EAENGHZoS.png?st=2024-03-04T22%3A23%3A13Z&se=2024-03-05T00%3A23%3A13Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T14%3A54%3A11Z&ske=2024-03-05T14%3A54%3A11Z&sks=b&skv=2021-08-06&sig=b1h2vJTXMMFQaBa2vMc4eCyPIfJMEn3LfiV6WLxhto8%3D'"
220
- ]
221
- },
222
- "execution_count": 4,
223
- "metadata": {},
224
- "output_type": "execute_result"
225
- }
226
- ],
227
- "source": [
228
- "response.data[0].url"
229
- ]
230
- },
231
- {
232
- "cell_type": "code",
233
- "execution_count": 8,
234
- "metadata": {},
235
- "outputs": [],
236
- "source": [
237
- "import requests\n",
238
- "import os \n",
239
- "\n",
240
- "image = requests.get(response.data[0].url,stream=True)\n",
241
- "image_path = os.path.join(\"../bot/images\",\"test2.png\")\n",
242
- "if image.status_code == 200:\n",
243
- " with open(image_path, 'wb') as f:\n",
244
- " for chunk in image:\n",
245
- " f.write(chunk)"
246
- ]
247
- },
248
- {
249
- "cell_type": "code",
250
- "execution_count": 13,
251
- "metadata": {},
252
- "outputs": [
253
- {
254
- "data": {
255
- "text/plain": [
256
- "requests.models.Response"
257
- ]
258
- },
259
- "execution_count": 13,
260
- "metadata": {},
261
- "output_type": "execute_result"
262
- }
263
- ],
264
- "source": [
265
- "type(image)"
266
- ]
267
- },
268
- {
269
- "cell_type": "code",
270
- "execution_count": 17,
271
- "metadata": {},
272
- "outputs": [
273
- {
274
- "name": "stdout",
275
- "output_type": "stream",
276
- "text": [
277
- "200\n"
278
- ]
279
- }
280
- ],
281
- "source": [
282
- "print(image.status_code)"
283
- ]
284
- },
285
- {
286
- "cell_type": "markdown",
287
- "metadata": {},
288
- "source": [
289
- "### OpenAI Package Test"
290
- ]
291
- },
292
- {
293
- "cell_type": "code",
294
- "execution_count": 10,
295
- "metadata": {},
296
- "outputs": [
297
- {
298
- "data": {
299
- "text/plain": [
300
- "['__annotations__',\n",
301
- " '__class__',\n",
302
- " '__class_getitem__',\n",
303
- " '__delattr__',\n",
304
- " '__dict__',\n",
305
- " '__dir__',\n",
306
- " '__doc__',\n",
307
- " '__enter__',\n",
308
- " '__eq__',\n",
309
- " '__exit__',\n",
310
- " '__format__',\n",
311
- " '__ge__',\n",
312
- " '__getattribute__',\n",
313
- " '__gt__',\n",
314
- " '__hash__',\n",
315
- " '__init__',\n",
316
- " '__init_subclass__',\n",
317
- " '__le__',\n",
318
- " '__lt__',\n",
319
- " '__module__',\n",
320
- " '__ne__',\n",
321
- " '__new__',\n",
322
- " '__orig_bases__',\n",
323
- " '__parameters__',\n",
324
- " '__reduce__',\n",
325
- " '__reduce_ex__',\n",
326
- " '__repr__',\n",
327
- " '__setattr__',\n",
328
- " '__sizeof__',\n",
329
- " '__slots__',\n",
330
- " '__str__',\n",
331
- " '__subclasshook__',\n",
332
- " '__weakref__',\n",
333
- " '_base_url',\n",
334
- " '_build_headers',\n",
335
- " '_build_request',\n",
336
- " '_calculate_retry_timeout',\n",
337
- " '_client',\n",
338
- " '_custom_headers',\n",
339
- " '_custom_query',\n",
340
- " '_default_stream_cls',\n",
341
- " '_enforce_trailing_slash',\n",
342
- " '_idempotency_header',\n",
343
- " '_idempotency_key',\n",
344
- " '_is_protocol',\n",
345
- " '_limits',\n",
346
- " '_make_status_error',\n",
347
- " '_make_status_error_from_response',\n",
348
- " '_maybe_override_cast_to',\n",
349
- " '_parse_retry_after_header',\n",
350
- " '_prepare_options',\n",
351
- " '_prepare_request',\n",
352
- " '_prepare_url',\n",
353
- " '_process_response',\n",
354
- " '_process_response_data',\n",
355
- " '_proxies',\n",
356
- " '_remaining_retries',\n",
357
- " '_request',\n",
358
- " '_request_api_list',\n",
359
- " '_retry_request',\n",
360
- " '_serialize_multipartform',\n",
361
- " '_should_retry',\n",
362
- " '_should_stream_response_body',\n",
363
- " '_strict_response_validation',\n",
364
- " '_transport',\n",
365
- " '_validate_headers',\n",
366
- " '_version',\n",
367
- " 'api_key',\n",
368
- " 'audio',\n",
369
- " 'auth_headers',\n",
370
- " 'base_url',\n",
371
- " 'beta',\n",
372
- " 'chat',\n",
373
- " 'close',\n",
374
- " 'completions',\n",
375
- " 'copy',\n",
376
- " 'custom_auth',\n",
377
- " 'default_headers',\n",
378
- " 'delete',\n",
379
- " 'embeddings',\n",
380
- " 'files',\n",
381
- " 'fine_tuning',\n",
382
- " 'get',\n",
383
- " 'get_api_list',\n",
384
- " 'images',\n",
385
- " 'is_closed',\n",
386
- " 'max_retries',\n",
387
- " 'models',\n",
388
- " 'moderations',\n",
389
- " 'organization',\n",
390
- " 'patch',\n",
391
- " 'platform_headers',\n",
392
- " 'post',\n",
393
- " 'put',\n",
394
- " 'qs',\n",
395
- " 'request',\n",
396
- " 'timeout',\n",
397
- " 'user_agent',\n",
398
- " 'with_options',\n",
399
- " 'with_raw_response',\n",
400
- " 'with_streaming_response']"
401
- ]
402
- },
403
- "execution_count": 10,
404
- "metadata": {},
405
- "output_type": "execute_result"
406
- }
407
- ],
408
- "source": [
409
- "from openai import OpenAI\n",
410
- "\n",
411
- "client = OpenAI()\n",
412
- "dir(client)"
413
- ]
414
- },
415
- {
416
- "cell_type": "code",
417
- "execution_count": null,
418
- "metadata": {},
419
- "outputs": [],
420
- "source": []
421
- }
422
- ],
423
- "metadata": {
424
- "kernelspec": {
425
- "display_name": "stevens-chat-v2-py310",
426
- "language": "python",
427
- "name": "python3"
428
- },
429
- "language_info": {
430
- "codemirror_mode": {
431
- "name": "ipython",
432
- "version": 3
433
- },
434
- "file_extension": ".py",
435
- "mimetype": "text/x-python",
436
- "name": "python",
437
- "nbconvert_exporter": "python",
438
- "pygments_lexer": "ipython3",
439
- "version": "3.10.6"
440
- }
441
- },
442
- "nbformat": 4,
443
- "nbformat_minor": 2
444
- }
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Image Generation Response Analysis"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from openai import OpenAI\n",
17
+ "from dotenv import load_dotenv\n",
18
+ "\n",
19
+ "load_dotenv()\n",
20
+ "\n",
21
+ "client = OpenAI()\n",
22
+ "\n",
23
+ "response = client.images.generate(\n",
24
+ " model=\"dall-e-2\",\n",
25
+ " prompt=\"A Persian cat sitting on a grand Persian carpet\",\n",
26
+ " size=\"1024x1024\",\n",
27
+ " quality=\"standard\",\n",
28
+ " n=1,\n",
29
+ ")"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 4,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "data": {
39
+ "text/plain": [
40
+ "openai.types.images_response.ImagesResponse"
41
+ ]
42
+ },
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "output_type": "execute_result"
46
+ }
47
+ ],
48
+ "source": [
49
+ "type(response)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 5,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "data": {
59
+ "text/plain": [
60
+ "['__abstractmethods__',\n",
61
+ " '__annotations__',\n",
62
+ " '__class__',\n",
63
+ " '__class_getitem__',\n",
64
+ " '__class_vars__',\n",
65
+ " '__copy__',\n",
66
+ " '__deepcopy__',\n",
67
+ " '__delattr__',\n",
68
+ " '__dict__',\n",
69
+ " '__dir__',\n",
70
+ " '__doc__',\n",
71
+ " '__eq__',\n",
72
+ " '__fields__',\n",
73
+ " '__fields_set__',\n",
74
+ " '__format__',\n",
75
+ " '__ge__',\n",
76
+ " '__get_pydantic_core_schema__',\n",
77
+ " '__get_pydantic_json_schema__',\n",
78
+ " '__getattr__',\n",
79
+ " '__getattribute__',\n",
80
+ " '__getstate__',\n",
81
+ " '__gt__',\n",
82
+ " '__hash__',\n",
83
+ " '__init__',\n",
84
+ " '__init_subclass__',\n",
85
+ " '__iter__',\n",
86
+ " '__le__',\n",
87
+ " '__lt__',\n",
88
+ " '__module__',\n",
89
+ " '__ne__',\n",
90
+ " '__new__',\n",
91
+ " '__pretty__',\n",
92
+ " '__private_attributes__',\n",
93
+ " '__pydantic_complete__',\n",
94
+ " '__pydantic_core_schema__',\n",
95
+ " '__pydantic_custom_init__',\n",
96
+ " '__pydantic_decorators__',\n",
97
+ " '__pydantic_extra__',\n",
98
+ " '__pydantic_fields_set__',\n",
99
+ " '__pydantic_generic_metadata__',\n",
100
+ " '__pydantic_init_subclass__',\n",
101
+ " '__pydantic_parent_namespace__',\n",
102
+ " '__pydantic_post_init__',\n",
103
+ " '__pydantic_private__',\n",
104
+ " '__pydantic_root_model__',\n",
105
+ " '__pydantic_serializer__',\n",
106
+ " '__pydantic_validator__',\n",
107
+ " '__reduce__',\n",
108
+ " '__reduce_ex__',\n",
109
+ " '__repr__',\n",
110
+ " '__repr_args__',\n",
111
+ " '__repr_name__',\n",
112
+ " '__repr_str__',\n",
113
+ " '__rich_repr__',\n",
114
+ " '__setattr__',\n",
115
+ " '__setstate__',\n",
116
+ " '__signature__',\n",
117
+ " '__sizeof__',\n",
118
+ " '__slots__',\n",
119
+ " '__str__',\n",
120
+ " '__subclasshook__',\n",
121
+ " '__weakref__',\n",
122
+ " '_abc_impl',\n",
123
+ " '_calculate_keys',\n",
124
+ " '_check_frozen',\n",
125
+ " '_copy_and_set_values',\n",
126
+ " '_get_value',\n",
127
+ " '_iter',\n",
128
+ " 'construct',\n",
129
+ " 'copy',\n",
130
+ " 'created',\n",
131
+ " 'data',\n",
132
+ " 'dict',\n",
133
+ " 'from_orm',\n",
134
+ " 'json',\n",
135
+ " 'model_computed_fields',\n",
136
+ " 'model_config',\n",
137
+ " 'model_construct',\n",
138
+ " 'model_copy',\n",
139
+ " 'model_dump',\n",
140
+ " 'model_dump_json',\n",
141
+ " 'model_extra',\n",
142
+ " 'model_fields',\n",
143
+ " 'model_fields_set',\n",
144
+ " 'model_json_schema',\n",
145
+ " 'model_parametrized_name',\n",
146
+ " 'model_post_init',\n",
147
+ " 'model_rebuild',\n",
148
+ " 'model_validate',\n",
149
+ " 'model_validate_json',\n",
150
+ " 'model_validate_strings',\n",
151
+ " 'parse_file',\n",
152
+ " 'parse_obj',\n",
153
+ " 'parse_raw',\n",
154
+ " 'schema',\n",
155
+ " 'schema_json',\n",
156
+ " 'update_forward_refs',\n",
157
+ " 'validate']"
158
+ ]
159
+ },
160
+ "execution_count": 5,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "dir(response)"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 6,
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "data": {
176
+ "text/plain": [
177
+ "ImagesResponse(created=1709567373, data=[Image(b64_json=None, revised_prompt=None, url='https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-qHtW8sFZmN0zQAdnArkbjJTj.png?st=2024-03-04T14%3A49%3A33Z&se=2024-03-04T16%3A49%3A33Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T09%3A37%3A21Z&ske=2024-03-05T09%3A37%3A21Z&sks=b&skv=2021-08-06&sig=8FAlknmnBl7aCOyoZo%2BKsy%2BIrmSSrPv9lfQBo5mmDYk%3D')])"
178
+ ]
179
+ },
180
+ "execution_count": 6,
181
+ "metadata": {},
182
+ "output_type": "execute_result"
183
+ }
184
+ ],
185
+ "source": [
186
+ "response"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 3,
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "data": {
196
+ "text/plain": [
197
+ "{'b64_json': None,\n",
198
+ " 'revised_prompt': None,\n",
199
+ " 'url': 'https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-3uw1d2UZQ6Dd8F4EAENGHZoS.png?st=2024-03-04T22%3A23%3A13Z&se=2024-03-05T00%3A23%3A13Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T14%3A54%3A11Z&ske=2024-03-05T14%3A54%3A11Z&sks=b&skv=2021-08-06&sig=b1h2vJTXMMFQaBa2vMc4eCyPIfJMEn3LfiV6WLxhto8%3D'}"
200
+ ]
201
+ },
202
+ "execution_count": 3,
203
+ "metadata": {},
204
+ "output_type": "execute_result"
205
+ }
206
+ ],
207
+ "source": [
208
+ "response.data[0].__dict__"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 4,
214
+ "metadata": {},
215
+ "outputs": [
216
+ {
217
+ "data": {
218
+ "text/plain": [
219
+ "'https://oaidalleapiprodscus.blob.core.windows.net/private/org-bMkUKmvAIOpUOCuXviHRwAQQ/user-u8VOUy4ltWJcbWslWW6PhwXz/img-3uw1d2UZQ6Dd8F4EAENGHZoS.png?st=2024-03-04T22%3A23%3A13Z&se=2024-03-05T00%3A23%3A13Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-04T14%3A54%3A11Z&ske=2024-03-05T14%3A54%3A11Z&sks=b&skv=2021-08-06&sig=b1h2vJTXMMFQaBa2vMc4eCyPIfJMEn3LfiV6WLxhto8%3D'"
220
+ ]
221
+ },
222
+ "execution_count": 4,
223
+ "metadata": {},
224
+ "output_type": "execute_result"
225
+ }
226
+ ],
227
+ "source": [
228
+ "response.data[0].url"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 8,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "import requests\n",
238
+ "import os \n",
239
+ "\n",
240
+ "image = requests.get(response.data[0].url,stream=True)\n",
241
+ "image_path = os.path.join(\"../bot/images\",\"test2.png\")\n",
242
+ "if image.status_code == 200:\n",
243
+ " with open(image_path, 'wb') as f:\n",
244
+ " for chunk in image:\n",
245
+ " f.write(chunk)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 13,
251
+ "metadata": {},
252
+ "outputs": [
253
+ {
254
+ "data": {
255
+ "text/plain": [
256
+ "requests.models.Response"
257
+ ]
258
+ },
259
+ "execution_count": 13,
260
+ "metadata": {},
261
+ "output_type": "execute_result"
262
+ }
263
+ ],
264
+ "source": [
265
+ "type(image)"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 17,
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "200\n"
278
+ ]
279
+ }
280
+ ],
281
+ "source": [
282
+ "print(image.status_code)"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "markdown",
287
+ "metadata": {},
288
+ "source": [
289
+ "### OpenAI Package Test"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 10,
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "data": {
299
+ "text/plain": [
300
+ "['__annotations__',\n",
301
+ " '__class__',\n",
302
+ " '__class_getitem__',\n",
303
+ " '__delattr__',\n",
304
+ " '__dict__',\n",
305
+ " '__dir__',\n",
306
+ " '__doc__',\n",
307
+ " '__enter__',\n",
308
+ " '__eq__',\n",
309
+ " '__exit__',\n",
310
+ " '__format__',\n",
311
+ " '__ge__',\n",
312
+ " '__getattribute__',\n",
313
+ " '__gt__',\n",
314
+ " '__hash__',\n",
315
+ " '__init__',\n",
316
+ " '__init_subclass__',\n",
317
+ " '__le__',\n",
318
+ " '__lt__',\n",
319
+ " '__module__',\n",
320
+ " '__ne__',\n",
321
+ " '__new__',\n",
322
+ " '__orig_bases__',\n",
323
+ " '__parameters__',\n",
324
+ " '__reduce__',\n",
325
+ " '__reduce_ex__',\n",
326
+ " '__repr__',\n",
327
+ " '__setattr__',\n",
328
+ " '__sizeof__',\n",
329
+ " '__slots__',\n",
330
+ " '__str__',\n",
331
+ " '__subclasshook__',\n",
332
+ " '__weakref__',\n",
333
+ " '_base_url',\n",
334
+ " '_build_headers',\n",
335
+ " '_build_request',\n",
336
+ " '_calculate_retry_timeout',\n",
337
+ " '_client',\n",
338
+ " '_custom_headers',\n",
339
+ " '_custom_query',\n",
340
+ " '_default_stream_cls',\n",
341
+ " '_enforce_trailing_slash',\n",
342
+ " '_idempotency_header',\n",
343
+ " '_idempotency_key',\n",
344
+ " '_is_protocol',\n",
345
+ " '_limits',\n",
346
+ " '_make_status_error',\n",
347
+ " '_make_status_error_from_response',\n",
348
+ " '_maybe_override_cast_to',\n",
349
+ " '_parse_retry_after_header',\n",
350
+ " '_prepare_options',\n",
351
+ " '_prepare_request',\n",
352
+ " '_prepare_url',\n",
353
+ " '_process_response',\n",
354
+ " '_process_response_data',\n",
355
+ " '_proxies',\n",
356
+ " '_remaining_retries',\n",
357
+ " '_request',\n",
358
+ " '_request_api_list',\n",
359
+ " '_retry_request',\n",
360
+ " '_serialize_multipartform',\n",
361
+ " '_should_retry',\n",
362
+ " '_should_stream_response_body',\n",
363
+ " '_strict_response_validation',\n",
364
+ " '_transport',\n",
365
+ " '_validate_headers',\n",
366
+ " '_version',\n",
367
+ " 'api_key',\n",
368
+ " 'audio',\n",
369
+ " 'auth_headers',\n",
370
+ " 'base_url',\n",
371
+ " 'beta',\n",
372
+ " 'chat',\n",
373
+ " 'close',\n",
374
+ " 'completions',\n",
375
+ " 'copy',\n",
376
+ " 'custom_auth',\n",
377
+ " 'default_headers',\n",
378
+ " 'delete',\n",
379
+ " 'embeddings',\n",
380
+ " 'files',\n",
381
+ " 'fine_tuning',\n",
382
+ " 'get',\n",
383
+ " 'get_api_list',\n",
384
+ " 'images',\n",
385
+ " 'is_closed',\n",
386
+ " 'max_retries',\n",
387
+ " 'models',\n",
388
+ " 'moderations',\n",
389
+ " 'organization',\n",
390
+ " 'patch',\n",
391
+ " 'platform_headers',\n",
392
+ " 'post',\n",
393
+ " 'put',\n",
394
+ " 'qs',\n",
395
+ " 'request',\n",
396
+ " 'timeout',\n",
397
+ " 'user_agent',\n",
398
+ " 'with_options',\n",
399
+ " 'with_raw_response',\n",
400
+ " 'with_streaming_response']"
401
+ ]
402
+ },
403
+ "execution_count": 10,
404
+ "metadata": {},
405
+ "output_type": "execute_result"
406
+ }
407
+ ],
408
+ "source": [
409
+ "from openai import OpenAI\n",
410
+ "\n",
411
+ "client = OpenAI()\n",
412
+ "dir(client)"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": []
421
+ }
422
+ ],
423
+ "metadata": {
424
+ "kernelspec": {
425
+ "display_name": "stevens-chat-v2-py310",
426
+ "language": "python",
427
+ "name": "python3"
428
+ },
429
+ "language_info": {
430
+ "codemirror_mode": {
431
+ "name": "ipython",
432
+ "version": 3
433
+ },
434
+ "file_extension": ".py",
435
+ "mimetype": "text/x-python",
436
+ "name": "python",
437
+ "nbconvert_exporter": "python",
438
+ "pygments_lexer": "ipython3",
439
+ "version": "3.10.6"
440
+ }
441
+ },
442
+ "nbformat": 4,
443
+ "nbformat_minor": 2
444
+ }
notebooks/web_scrapping.ipynb CHANGED
@@ -1,348 +1,348 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "from selenium import webdriver\n",
11
- "from selenium.webdriver.common.by import By\n",
12
- "import time\n",
13
- "import spacy"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": null,
19
- "metadata": {},
20
- "outputs": [],
21
- "source": [
22
- "def tokenize(text):\n",
23
- " nlp = spacy.load('en_core_web_sm')\n",
24
- " tokens = [token.text for token in nlp(text) if not token.is_punct and not token.is_space]\n",
25
- " \n",
26
- " return tokens"
27
- ]
28
- },
29
- {
30
- "cell_type": "code",
31
- "execution_count": null,
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "def add_target_columns(data, category, target):\n",
36
- " # Column to store category of questions\n",
37
- " data[\"Category\"] = category\n",
38
- " # Target column to store whether human answer or chatgpt answer\n",
39
- " data[\"Human vs ChatGPT\"] = target\n",
40
- " \n",
41
- " return data"
42
- ]
43
- },
44
- {
45
- "cell_type": "code",
46
- "execution_count": null,
47
- "metadata": {},
48
- "outputs": [],
49
- "source": [
50
- "def store_excel(data, prev_data = None):\n",
51
- " if prev_data:\n",
52
- " # Loading old data into dataframe\n",
53
- " old_data = pd.read_excel(prev_data)\n",
54
- " # Concatenating the two dataframes vertically\n",
55
- " complete_data = pd.concat([old_data, data], ignore_index=True)\n",
56
- " # Storing the combined data to the excel file\n",
57
- " complete_data.to_excel('scraped_data.xlsx', index=False)\n",
58
- " else:\n",
59
- " data.to_excel(\"scraped_data.xlsx\", index=False)"
60
- ]
61
- },
62
- {
63
- "cell_type": "code",
64
- "execution_count": null,
65
- "metadata": {},
66
- "outputs": [],
67
- "source": [
68
- "def getData(page_urls, driver, min_ans_len = 15, limit = 10, scroll_num = 10):\n",
69
- " \n",
70
- " # Empty dataframe to store the scraped content\n",
71
- " scraped_data = pd.DataFrame()\n",
72
- " \n",
73
- " # Initializing variable to track the number of data samples collected \n",
74
- " len_data = 0\n",
75
- " \n",
76
- " # Initializing lists to store the scraped content\n",
77
- " questions = []\n",
78
- " answers = []\n",
79
- " \n",
80
- " count = 1\n",
81
- " count1 = 0\n",
82
- " for page_url in page_urls:\n",
83
- " print(f\"Page {count} of {len(page_urls)}\")\n",
84
- " # Sending a get request to the web page (Navigating to the webpage)\n",
85
- " driver.get(page_url)\n",
86
- " # Wait\n",
87
- " driver.implicitly_wait(10)\n",
88
- "\n",
89
- " # Initializing variables to iterate through the try except block\n",
90
- " max_tries = 10\n",
91
- " retry = 0 \n",
92
- "\n",
93
- " # Initializing variable to check if we've reached the end of the page \n",
94
- " old_content = None\n",
95
- " new_content = None\n",
96
- "\n",
97
- " # Scrolling to get enough answers\n",
98
- " for i in range(scroll_num):\n",
99
- " # Scrolling to access the next page of questions\n",
100
- " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
101
- " # Wait for the content to be loaded \n",
102
- " time.sleep(5) \n",
103
- " # Checking if page is same before and after scrolling\n",
104
- " new_content = driver.page_source\n",
105
- " if new_content == old_content:\n",
106
- " break\n",
107
- " old_content = new_content\n",
108
- "\n",
109
- " # Used while loop to avoid \"StaleElementReferenceException\" error\n",
110
- " while retry<max_tries:\n",
111
- " try:\n",
112
- " # Scraping the question answer blocks on Quora\n",
113
- " data_elements = driver.find_elements(By.CSS_SELECTOR, \"div.dom_annotate_multifeed_bundle_AnswersBundle\")\n",
114
- " retry = 0\n",
115
- " break\n",
116
- " except:\n",
117
- " retry += 1\n",
118
- "\n",
119
- "\n",
120
- " # Used while loop to avoid \"StaleElementReferenceException\" error\n",
121
- " while retry<max_tries:\n",
122
- " try:\n",
123
- " for block in data_elements:\n",
124
- "\n",
125
- " ### --- Questions --- ###\n",
126
- " # Scraping question from the webpage \n",
127
- " ques = block.find_element(By.CSS_SELECTOR, \"div.q-text.puppeteer_test_question_title span\")\n",
128
- "\n",
129
- " ### --- Answers --- ###\n",
130
- " # Checking if \"more\" button is present for an answer\n",
131
- " try: \n",
132
- " # Selecting the \"more\" button\n",
133
- " read_more = block.find_element(By.CSS_SELECTOR, \"div.q-absolute div.qt_read_more\")\n",
134
- " # Checking if the button is clickable\n",
135
- " try:\n",
136
- " # Expanding answer by clicking \"more\" button\n",
137
- " read_more.click()\n",
138
- " except:\n",
139
- " # Discarding data where complete answer cannot be obtained\n",
140
- " continue\n",
141
- " except:\n",
142
- " None\n",
143
- " # Scraping answers from the webpage \n",
144
- " ans = block.find_element(By.CSS_SELECTOR, \"div.q-box.spacing_log_answer_content.puppeteer_test_answer_content span.q-box\")\n",
145
- "\n",
146
- " if ques.text and ans.text:\n",
147
- " # Skipping questions that are already present\n",
148
- " if ques.text in questions:\n",
149
- " continue\n",
150
- " # Skipping the questions where length of answers are less than a given threshold\n",
151
- " ans_tokens = len(tokenize(ans.text))\n",
152
- " if ans_tokens<min_ans_len:\n",
153
- " continue\n",
154
- " # Appending the scraped question\n",
155
- " questions.append(ques.text)\n",
156
- " # Appending the scraped answer\n",
157
- " answers.append(ans.text)\n",
158
- " count1+=1\n",
159
- " print(f\"{count1} of 250\")\n",
160
- " else:\n",
161
- " continue\n",
162
- "\n",
163
- " # Updating the number of data samples collected\n",
164
- " len_data = len(questions) \n",
165
- " # Collecting data until limit is reached \n",
166
- " if len_data == limit:\n",
167
- " break\n",
168
- " retry = 0\n",
169
- " break\n",
170
- " except:\n",
171
- " retry += 1 \n",
172
- " count+=1\n",
173
- " if len_data == limit:\n",
174
- " break\n",
175
- " \n",
176
- " # Warning to give more urls if desired amount of data is not scraped\n",
177
- " if len_data < limit:\n",
178
- " print(\"Warning: Need to provide more webpages to get desired amount of data!\")\n",
179
- " \n",
180
- " # Storing the scraped information in a dataframe \n",
181
- " scraped_data[\"Questions\"] = questions\n",
182
- " scraped_data[\"Answers\"] = answers\n",
183
- " \n",
184
- " return scraped_data"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": 1,
190
- "metadata": {},
191
- "outputs": [],
192
- "source": [
193
- "import requests\n",
194
- "from bs4 import BeautifulSoup"
195
- ]
196
- },
197
- {
198
- "cell_type": "code",
199
- "execution_count": 2,
200
- "metadata": {},
201
- "outputs": [],
202
- "source": [
203
- "url = \"https://boards.greenhouse.io/enveritas/jobs/4001717008\""
204
- ]
205
- },
206
- {
207
- "cell_type": "code",
208
- "execution_count": 3,
209
- "metadata": {},
210
- "outputs": [],
211
- "source": [
212
- "# Send a GET request to the URL\n",
213
- "response = requests.get(url)\n",
214
- "\n",
215
- "# Parse the HTML content with BeautifulSoup\n",
216
- "soup = BeautifulSoup(response.content, \"html.parser\")"
217
- ]
218
- },
219
- {
220
- "cell_type": "code",
221
- "execution_count": 27,
222
- "metadata": {},
223
- "outputs": [
224
- {
225
- "name": "stdout",
226
- "output_type": "stream",
227
- "text": [
228
- "Job Title: Data Scientist\n",
229
- "Company: Enveritas\n",
230
- "Job Location: Global / Remote\n",
231
- "Job Description: \n",
232
- "Data Scientist, Engineering & Data Group\n",
233
- "Do you want to work for a mission-driven non-profit, analyzing data and writing software that will contribute to helping millions of coffee farmers out of poverty? Enveritas is a 501(c)3 non-profit and Y Combinator-backed startup looking to hire a Data Scientist for our Data Team. \n",
234
- "We are looking for a Data Scientist with extensive professional experience to join our Engineering and Data Group on a remote, full-time basis. This position is open globally, based on locations supported by our EOR partner, Deel. You can learn more about this role at https://www.enveritas.org/jobs/data-scientist.\n",
235
- "Our Engineering and Data Group is a quirky, talented, and humble group of about twenty with diverse backgrounds ranging from journalism to academia to international industry.\n",
236
- "About Our Data Team\n",
237
- "The Data Team's mission is to leverage data analytics to drive Enveritas' efforts in improving the livelihoods of smallholder farmers around the world. We are responsible for ensuring that we collect high quality data, we leverage it to generate actionable insights that support smallholder farmers worldwide, and we make the data and insights accessible to other teams, such as the Operations and Partnerships Team.\n",
238
- "To improve the quality of our data collection process, we create tools to support our Country Operations teams. These tools enable the detection of outliers and automating quality control processes while collecting our survey data. \n",
239
- "We are also responsible for the transformation of our raw data into insights. This includes writing the code that scores our standards, and creating models that give new insights about smallholder farming. For example, we estimate the cost farmers face when growing coffee to know if they earn a livable income.\n",
240
- "Our programming languages of choice are Python and SQL (we use PostgreSQL), although some of our analysis is also done in R. We use git for version control, Github for hosting our repositories, and pytest for automated testing. Our internal BI tool is Looker.\n",
241
- "What You’ll Be Doing\n",
242
- "Providing technical oversight and mentorship. You will provide technical oversight to the models the Data Team produces. Additionally, you will offer guidance on and document statistical best practices and methodologies, helping to elevate the overall analytical capabilities within the Data Team and the organization.\n",
243
- "Build models to create new insights from our data. You will identify opportunities for creating models using the data we collect. Engage with the Operations and Partnerships teams to generate ideas for models that would be beneficial for them, and the coffee and cocoa community. This includes using Python to craft these models, incorporating them into our data pipeline, and effectively synthesizing, visualizing, and conveying the results.\n",
244
- "Serve as internal statistics consultant. Undertake the role of an internal statistical advisor, providing expertise across the organization for all statistics-related matters. This includes aiding teams in developing innovative sampling methods, calculating appropriate sample sizes, interpreting data analysis results, and ensuring the statistical integrity of projects and data collection. \n",
245
- "Who You Are\n",
246
- "Our team is fully distributed, so you should be comfortable with remote work. This role is a full-time individual contributor role. While you can be located anywhere, our core hours are 10am to 2pm Eastern Time Monday through Friday, with team members choosing either an early start or later stop as suits them.\n",
247
- "Our work is often ambiguous, so you should have a love for environments with uncertainty. You should have a deep empathy for users of our tools and understand the importance of supporting the work of other teams. You should also be willing to engage in a broad range of tasks beyond standard data science functions, and you are not afraid of using a simple model that gets the job done.\n",
248
- " \n",
249
- "Qualifications\n",
250
- "Read this first: Research shows that people of different backgrounds read job postings differently. If you don’t think you meet all of the qualifications but do think you’d be a great match for us, please consider applying and sharing more in your cover letter. We’d love to talk with you to see what skills you can bring to our team. This said, we are most likely to be interested in your candidacy if you can demonstrate the majority of the qualifications listed below:\n",
251
- "\n",
252
- "Extensive professional experience as a data scientist, a statistician, or equivalent.\n",
253
- "Hands-on experience building and deploying statistical models. Strong theoretical knowledge of probability and statistics.\n",
254
- "Intermediate Python. Experience with good software engineering practices such as reproducible environments, testing, and code quality.\n",
255
- "A desire to apply your skills at a non-profit working to improve the livelihoods of smallholder farmers.\n",
256
- "Have experience or expertise in one or more of the following:\n",
257
- "\n",
258
- "Building sustainability or agricultural models.\n",
259
- "Advanced statistical modeling.\n",
260
- "Survey sampling.\n",
261
- "Survey methodology.\n",
262
- "Bayesian statistical modeling.\n",
263
- "Data Management.\n",
264
- "Causal inference.\n",
265
- "Managing teams.\n",
266
- "\n",
267
- "\n",
268
- "\n",
269
- " \n",
270
- "About Working With Us & Compensation\n",
271
- "Enveritas has teams around the world: we are about 90 people spread over almost two dozen countries, and of all backgrounds, faiths, and identities. To learn more about working at Enveritas, see https://www.enveritas.org/jobs/\n",
272
- "For a US-Based hire, base salary for this position will be between $110,000-$130,000 annually (paid semi-monthly). This is a full-time exempt position. Full benefits include 401k with matching contributions, Medical/Dental/Vision, and Flexible Spending Account (FSA), 4 weeks vacation in addition to 12 standard holidays, and personal/sick time.\n",
273
- "For a hire outside the US, our offer will be competitive; the specific benefits and compensation details will vary as required to account for your region’s laws and requirements. Salary for this position will be paid in relevant local currency.\n",
274
- "For all staff, we are able to offer:\n",
275
- "\n",
276
- "Annual education budget for conferences, books, and other professional development opportunities.\n",
277
- "Annual all-company retreat and annual group retreat.\n",
278
- "Field visits to our Country Ops teams in coffee-growing countries such as Colombia, Costa Rica, Ethiopia, and Indonesia.\n",
279
- "\n",
280
- "Interview Process\n",
281
- "We are committed to fair and equitable hiring. To honor this commitment, we are being transparent about our interview process. We are interested in learning what working with you would be like and believe the below is the fairest method for us to see you at your best — and for you to learn about us! If you feel that a different method would be better for us to learn what working together would be like, please tell us in your application. \n",
282
- "After your introductory interview, we expect your interview process to take four to six weeks (but will depend on scheduling), and consist of four conversations that total about five hours of time. You should plan to also spend about four hours in total preparing for interviews.  See the hiring page at https://www.enveritas.org/jobs/data-scientist/ for details about each of these interviews.\n",
283
- "\n",
284
- "Introductory Interview (30 minutes; Google Meet; audio-only)\n",
285
- "First Technical Interview (60-90 minutes; Google Meet video)\n",
286
- "Second Technical Interview (60-90 minutes; Google Meet video)\n",
287
- "Manager Interview (45 minutes; Google Meet video)\n",
288
- "\n",
289
- "How to Apply\n",
290
- "Please apply here. Feel free to contact us at jobs@enveritas.org should you have any questions about the position, the interview process, or if you require any adjustments to ensure a fair and equitable application process. Questions about this opportunity or process will not reflect negatively on your application.\n",
291
- "A few notes about our communications: We are not able to reply to messages sent to staff outside of either our application process or our jobs email address, as this is unfair to other candidates. Also, Enveritas has been made aware of fake job postings by individuals pretending to hire persons seeking employment. These individuals are looking to collect personal information about you for fraudulent purposes. All legitimate Enveritas job openings are posted on our Greenhouse board at https://boards.greenhouse.io/enveritas or Lever board at https://jobs.lever.co/Enveritas. All recruiting emails from Enveritas team members will come through Lever or from @enveritas.org. If you have any concerns about employment opportunities or contact from someone supposedly representing Enveritas, please reach out to us at jobs@enveritas.org\n"
292
- ]
293
- }
294
- ],
295
- "source": [
296
- "# Extract the job description content\n",
297
- "job_title = soup.find(\"h1\", {\"class\": \"app-title\"})\n",
298
- "\n",
299
- "company_name = soup.find(\"span\",{\"class\": \"company-name\"})\n",
300
- "\n",
301
- "job_location = soup.find(\"div\", {\"class\": \"location\"})\n",
302
- "\n",
303
- "job_description = soup.find(\"div\", {\"id\": \"content\"})\n",
304
- "\n",
305
- "# Print the job description\n",
306
- "if job_description:\n",
307
- " print(\"Job Title: \"+job_title.get_text()+\"\\n\"+\"Company: \"+company_name.get_text().strip().split(\"at \")[1]+\"\\n\"+\"Job Location: \"+job_location.get_text().strip()+\"\\n\"+\"Job Description: \\n\"+job_description.get_text().strip())\n",
308
- "else:\n",
309
- " print(\"Job description not found.\")\n"
310
- ]
311
- },
312
- {
313
- "cell_type": "code",
314
- "execution_count": null,
315
- "metadata": {},
316
- "outputs": [],
317
- "source": []
318
- },
319
- {
320
- "cell_type": "code",
321
- "execution_count": null,
322
- "metadata": {},
323
- "outputs": [],
324
- "source": []
325
- }
326
- ],
327
- "metadata": {
328
- "kernelspec": {
329
- "display_name": "my_env",
330
- "language": "python",
331
- "name": "python3"
332
- },
333
- "language_info": {
334
- "codemirror_mode": {
335
- "name": "ipython",
336
- "version": 3
337
- },
338
- "file_extension": ".py",
339
- "mimetype": "text/x-python",
340
- "name": "python",
341
- "nbconvert_exporter": "python",
342
- "pygments_lexer": "ipython3",
343
- "version": "3.10.6"
344
- }
345
- },
346
- "nbformat": 4,
347
- "nbformat_minor": 2
348
- }
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "from selenium import webdriver\n",
11
+ "from selenium.webdriver.common.by import By\n",
12
+ "import time\n",
13
+ "import spacy"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "def tokenize(text):\n",
23
+ " nlp = spacy.load('en_core_web_sm')\n",
24
+ " tokens = [token.text for token in nlp(text) if not token.is_punct and not token.is_space]\n",
25
+ " \n",
26
+ " return tokens"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "def add_target_columns(data, category, target):\n",
36
+ " # Column to store category of questions\n",
37
+ " data[\"Category\"] = category\n",
38
+ " # Target column to store whether human answer or chatgpt answer\n",
39
+ " data[\"Human vs ChatGPT\"] = target\n",
40
+ " \n",
41
+ " return data"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "def store_excel(data, prev_data = None):\n",
51
+ " if prev_data:\n",
52
+ " # Loading old data into dataframe\n",
53
+ " old_data = pd.read_excel(prev_data)\n",
54
+ " # Concatenating the two dataframes vertically\n",
55
+ " complete_data = pd.concat([old_data, data], ignore_index=True)\n",
56
+ " # Storing the combined data to the excel file\n",
57
+ " complete_data.to_excel('scraped_data.xlsx', index=False)\n",
58
+ " else:\n",
59
+ " data.to_excel(\"scraped_data.xlsx\", index=False)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "def getData(page_urls, driver, min_ans_len = 15, limit = 10, scroll_num = 10):\n",
69
+ " \n",
70
+ " # Empty dataframe to store the scraped content\n",
71
+ " scraped_data = pd.DataFrame()\n",
72
+ " \n",
73
+ " # Initializing variable to track the number of data samples collected \n",
74
+ " len_data = 0\n",
75
+ " \n",
76
+ " # Initializing lists to store the scraped content\n",
77
+ " questions = []\n",
78
+ " answers = []\n",
79
+ " \n",
80
+ " count = 1\n",
81
+ " count1 = 0\n",
82
+ " for page_url in page_urls:\n",
83
+ " print(f\"Page {count} of {len(page_urls)}\")\n",
84
+ " # Sending a get request to the web page (Navigating to the webpage)\n",
85
+ " driver.get(page_url)\n",
86
+ " # Wait\n",
87
+ " driver.implicitly_wait(10)\n",
88
+ "\n",
89
+ " # Initializing variables to iterate through the try except block\n",
90
+ " max_tries = 10\n",
91
+ " retry = 0 \n",
92
+ "\n",
93
+ " # Initializing variable to check if we've reached the end of the page \n",
94
+ " old_content = None\n",
95
+ " new_content = None\n",
96
+ "\n",
97
+ " # Scrolling to get enough answers\n",
98
+ " for i in range(scroll_num):\n",
99
+ " # Scrolling to access the next page of questions\n",
100
+ " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
101
+ " # Wait for the content to be loaded \n",
102
+ " time.sleep(5) \n",
103
+ " # Checking if page is same before and after scrolling\n",
104
+ " new_content = driver.page_source\n",
105
+ " if new_content == old_content:\n",
106
+ " break\n",
107
+ " old_content = new_content\n",
108
+ "\n",
109
+ " # Used while loop to avoid \"StaleElementReferenceException\" error\n",
110
+ " while retry<max_tries:\n",
111
+ " try:\n",
112
+ " # Scraping the question answer blocks on Quora\n",
113
+ " data_elements = driver.find_elements(By.CSS_SELECTOR, \"div.dom_annotate_multifeed_bundle_AnswersBundle\")\n",
114
+ " retry = 0\n",
115
+ " break\n",
116
+ " except:\n",
117
+ " retry += 1\n",
118
+ "\n",
119
+ "\n",
120
+ " # Used while loop to avoid \"StaleElementReferenceException\" error\n",
121
+ " while retry<max_tries:\n",
122
+ " try:\n",
123
+ " for block in data_elements:\n",
124
+ "\n",
125
+ " ### --- Questions --- ###\n",
126
+ " # Scraping question from the webpage \n",
127
+ " ques = block.find_element(By.CSS_SELECTOR, \"div.q-text.puppeteer_test_question_title span\")\n",
128
+ "\n",
129
+ " ### --- Answers --- ###\n",
130
+ " # Checking if \"more\" button is present for an answer\n",
131
+ " try: \n",
132
+ " # Selecting the \"more\" button\n",
133
+ " read_more = block.find_element(By.CSS_SELECTOR, \"div.q-absolute div.qt_read_more\")\n",
134
+ " # Checking if the button is clickable\n",
135
+ " try:\n",
136
+ " # Expanding answer by clicking \"more\" button\n",
137
+ " read_more.click()\n",
138
+ " except:\n",
139
+ " # Discarding data where complete answer cannot be obtained\n",
140
+ " continue\n",
141
+ " except:\n",
142
+ " None\n",
143
+ " # Scraping answers from the webpage \n",
144
+ " ans = block.find_element(By.CSS_SELECTOR, \"div.q-box.spacing_log_answer_content.puppeteer_test_answer_content span.q-box\")\n",
145
+ "\n",
146
+ " if ques.text and ans.text:\n",
147
+ " # Skipping questions that are already present\n",
148
+ " if ques.text in questions:\n",
149
+ " continue\n",
150
+ " # Skipping the questions where length of answers are less than a given threshold\n",
151
+ " ans_tokens = len(tokenize(ans.text))\n",
152
+ " if ans_tokens<min_ans_len:\n",
153
+ " continue\n",
154
+ " # Appending the scraped question\n",
155
+ " questions.append(ques.text)\n",
156
+ " # Appending the scraped answer\n",
157
+ " answers.append(ans.text)\n",
158
+ " count1+=1\n",
159
+ " print(f\"{count1} of 250\")\n",
160
+ " else:\n",
161
+ " continue\n",
162
+ "\n",
163
+ " # Updating the number of data samples collected\n",
164
+ " len_data = len(questions) \n",
165
+ " # Collecting data until limit is reached \n",
166
+ " if len_data == limit:\n",
167
+ " break\n",
168
+ " retry = 0\n",
169
+ " break\n",
170
+ " except:\n",
171
+ " retry += 1 \n",
172
+ " count+=1\n",
173
+ " if len_data == limit:\n",
174
+ " break\n",
175
+ " \n",
176
+ " # Warning to give more urls if desired amount of data is not scraped\n",
177
+ " if len_data < limit:\n",
178
+ " print(\"Warning: Need to provide more webpages to get desired amount of data!\")\n",
179
+ " \n",
180
+ " # Storing the scraped information in a dataframe \n",
181
+ " scraped_data[\"Questions\"] = questions\n",
182
+ " scraped_data[\"Answers\"] = answers\n",
183
+ " \n",
184
+ " return scraped_data"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 1,
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "import requests\n",
194
+ "from bs4 import BeautifulSoup"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 2,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "url = \"https://boards.greenhouse.io/enveritas/jobs/4001717008\""
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 3,
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "# Send a GET request to the URL\n",
213
+ "response = requests.get(url)\n",
214
+ "\n",
215
+ "# Parse the HTML content with BeautifulSoup\n",
216
+ "soup = BeautifulSoup(response.content, \"html.parser\")"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 27,
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "name": "stdout",
226
+ "output_type": "stream",
227
+ "text": [
228
+ "Job Title: Data Scientist\n",
229
+ "Company: Enveritas\n",
230
+ "Job Location: Global / Remote\n",
231
+ "Job Description: \n",
232
+ "Data Scientist, Engineering & Data Group\n",
233
+ "Do you want to work for a mission-driven non-profit, analyzing data and writing software that will contribute to helping millions of coffee farmers out of poverty? Enveritas is a 501(c)3 non-profit and Y Combinator-backed startup looking to hire a Data Scientist for our Data Team. \n",
234
+ "We are looking for a Data Scientist with extensive professional experience to join our Engineering and Data Group on a remote, full-time basis. This position is open globally, based on locations supported by our EOR partner, Deel. You can learn more about this role at https://www.enveritas.org/jobs/data-scientist.\n",
235
+ "Our Engineering and Data Group is a quirky, talented, and humble group of about twenty with diverse backgrounds ranging from journalism to academia to international industry.\n",
236
+ "About Our Data Team\n",
237
+ "The Data Team's mission is to leverage data analytics to drive Enveritas' efforts in improving the livelihoods of smallholder farmers around the world. We are responsible for ensuring that we collect high quality data, we leverage it to generate actionable insights that support smallholder farmers worldwide, and we make the data and insights accessible to other teams, such as the Operations and Partnerships Team.\n",
238
+ "To improve the quality of our data collection process, we create tools to support our Country Operations teams. These tools enable the detection of outliers and automating quality control processes while collecting our survey data. \n",
239
+ "We are also responsible for the transformation of our raw data into insights. This includes writing the code that scores our standards, and creating models that give new insights about smallholder farming. For example, we estimate the cost farmers face when growing coffee to know if they earn a livable income.\n",
240
+ "Our programming languages of choice are Python and SQL (we use PostgreSQL), although some of our analysis is also done in R. We use git for version control, Github for hosting our repositories, and pytest for automated testing. Our internal BI tool is Looker.\n",
241
+ "What You’ll Be Doing\n",
242
+ "Providing technical oversight and mentorship. You will provide technical oversight to the models the Data Team produces. Additionally, you will offer guidance on and document statistical best practices and methodologies, helping to elevate the overall analytical capabilities within the Data Team and the organization.\n",
243
+ "Build models to create new insights from our data. You will identify opportunities for creating models using the data we collect. Engage with the Operations and Partnerships teams to generate ideas for models that would be beneficial for them, and the coffee and cocoa community. This includes using Python to craft these models, incorporating them into our data pipeline, and effectively synthesizing, visualizing, and conveying the results.\n",
244
+ "Serve as internal statistics consultant. Undertake the role of an internal statistical advisor, providing expertise across the organization for all statistics-related matters. This includes aiding teams in developing innovative sampling methods, calculating appropriate sample sizes, interpreting data analysis results, and ensuring the statistical integrity of projects and data collection. \n",
245
+ "Who You Are\n",
246
+ "Our team is fully distributed, so you should be comfortable with remote work. This role is a full-time individual contributor role. While you can be located anywhere, our core hours are 10am to 2pm Eastern Time Monday through Friday, with team members choosing either an early start or later stop as suits them.\n",
247
+ "Our work is often ambiguous, so you should have a love for environments with uncertainty. You should have a deep empathy for users of our tools and understand the importance of supporting the work of other teams. You should also be willing to engage in a broad range of tasks beyond standard data science functions, and you are not afraid of using a simple model that gets the job done.\n",
248
+ " \n",
249
+ "Qualifications\n",
250
+ "Read this first: Research shows that people of different backgrounds read job postings differently. If you don’t think you meet all of the qualifications but do think you’d be a great match for us, please consider applying and sharing more in your cover letter. We’d love to talk with you to see what skills you can bring to our team. This said, we are most likely to be interested in your candidacy if you can demonstrate the majority of the qualifications listed below:\n",
251
+ "\n",
252
+ "Extensive professional experience as a data scientist, a statistician, or equivalent.\n",
253
+ "Hands-on experience building and deploying statistical models. Strong theoretical knowledge of probability and statistics.\n",
254
+ "Intermediate Python. Experience with good software engineering practices such as reproducible environments, testing, and code quality.\n",
255
+ "A desire to apply your skills at a non-profit working to improve the livelihoods of smallholder farmers.\n",
256
+ "Have experience or expertise in one or more of the following:\n",
257
+ "\n",
258
+ "Building sustainability or agricultural models.\n",
259
+ "Advanced statistical modeling.\n",
260
+ "Survey sampling.\n",
261
+ "Survey methodology.\n",
262
+ "Bayesian statistical modeling.\n",
263
+ "Data Management.\n",
264
+ "Causal inference.\n",
265
+ "Managing teams.\n",
266
+ "\n",
267
+ "\n",
268
+ "\n",
269
+ " \n",
270
+ "About Working With Us & Compensation\n",
271
+ "Enveritas has teams around the world: we are about 90 people spread over almost two dozen countries, and of all backgrounds, faiths, and identities. To learn more about working at Enveritas, see https://www.enveritas.org/jobs/\n",
272
+ "For a US-Based hire, base salary for this position will be between $110,000-$130,000 annually (paid semi-monthly). This is a full-time exempt position. Full benefits include 401k with matching contributions, Medical/Dental/Vision, and Flexible Spending Account (FSA), 4 weeks vacation in addition to 12 standard holidays, and personal/sick time.\n",
273
+ "For a hire outside the US, our offer will be competitive; the specific benefits and compensation details will vary as required to account for your region’s laws and requirements. Salary for this position will be paid in relevant local currency.\n",
274
+ "For all staff, we are able to offer:\n",
275
+ "\n",
276
+ "Annual education budget for conferences, books, and other professional development opportunities.\n",
277
+ "Annual all-company retreat and annual group retreat.\n",
278
+ "Field visits to our Country Ops teams in coffee-growing countries such as Colombia, Costa Rica, Ethiopia, and Indonesia.\n",
279
+ "\n",
280
+ "Interview Process\n",
281
+ "We are committed to fair and equitable hiring. To honor this commitment, we are being transparent about our interview process. We are interested in learning what working with you would be like and believe the below is the fairest method for us to see you at your best — and for you to learn about us! If you feel that a different method would be better for us to learn what working together would be like, please tell us in your application. \n",
282
+ "After your introductory interview, we expect your interview process to take four to six weeks (but will depend on scheduling), and consist of four conversations that total about five hours of time. You should plan to also spend about four hours in total preparing for interviews.  See the hiring page at https://www.enveritas.org/jobs/data-scientist/ for details about each of these interviews.\n",
283
+ "\n",
284
+ "Introductory Interview (30 minutes; Google Meet; audio-only)\n",
285
+ "First Technical Interview (60-90 minutes; Google Meet video)\n",
286
+ "Second Technical Interview (60-90 minutes; Google Meet video)\n",
287
+ "Manager Interview (45 minutes; Google Meet video)\n",
288
+ "\n",
289
+ "How to Apply\n",
290
+ "Please apply here. Feel free to contact us at jobs@enveritas.org should you have any questions about the position, the interview process, or if you require any adjustments to ensure a fair and equitable application process. Questions about this opportunity or process will not reflect negatively on your application.\n",
291
+ "A few notes about our communications: We are not able to reply to messages sent to staff outside of either our application process or our jobs email address, as this is unfair to other candidates. Also, Enveritas has been made aware of fake job postings by individuals pretending to hire persons seeking employment. These individuals are looking to collect personal information about you for fraudulent purposes. All legitimate Enveritas job openings are posted on our Greenhouse board at https://boards.greenhouse.io/enveritas or Lever board at https://jobs.lever.co/Enveritas. All recruiting emails from Enveritas team members will come through Lever or from @enveritas.org. If you have any concerns about employment opportunities or contact from someone supposedly representing Enveritas, please reach out to us at jobs@enveritas.org\n"
292
+ ]
293
+ }
294
+ ],
295
+ "source": [
296
+ "# Extract the job description content\n",
297
+ "job_title = soup.find(\"h1\", {\"class\": \"app-title\"})\n",
298
+ "\n",
299
+ "company_name = soup.find(\"span\",{\"class\": \"company-name\"})\n",
300
+ "\n",
301
+ "job_location = soup.find(\"div\", {\"class\": \"location\"})\n",
302
+ "\n",
303
+ "job_description = soup.find(\"div\", {\"id\": \"content\"})\n",
304
+ "\n",
305
+ "# Print the job description\n",
306
+ "if job_description:\n",
307
+ " print(\"Job Title: \"+job_title.get_text()+\"\\n\"+\"Company: \"+company_name.get_text().strip().split(\"at \")[1]+\"\\n\"+\"Job Location: \"+job_location.get_text().strip()+\"\\n\"+\"Job Description: \\n\"+job_description.get_text().strip())\n",
308
+ "else:\n",
309
+ " print(\"Job description not found.\")\n"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": []
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": []
325
+ }
326
+ ],
327
+ "metadata": {
328
+ "kernelspec": {
329
+ "display_name": "my_env",
330
+ "language": "python",
331
+ "name": "python3"
332
+ },
333
+ "language_info": {
334
+ "codemirror_mode": {
335
+ "name": "ipython",
336
+ "version": 3
337
+ },
338
+ "file_extension": ".py",
339
+ "mimetype": "text/x-python",
340
+ "name": "python",
341
+ "nbconvert_exporter": "python",
342
+ "pygments_lexer": "ipython3",
343
+ "version": "3.10.6"
344
+ }
345
+ },
346
+ "nbformat": 4,
347
+ "nbformat_minor": 2
348
+ }
openai.md CHANGED
@@ -1,56 +1,56 @@
1
- # OpenAI APIs
2
- OpenAI API is a simple restful api. But the have **Python** and **Node.js** packages built on top of the api. We are using the Python package.
3
-
4
- ## Installation and Setup
5
- * **Python:**
6
- * **Windows & MacOS:** Go to [Python3.11](https://www.python.org/downloads/release/python-3118/) and find the link in the page and follow the installer.
7
- * **WSL2 & Ubuntu:**
8
- 1. run ```sudo apt update && sudo apt upgrade```.
9
- 2. Check the Python version: ```python -V```.
10
- 3. If you need to update the python, run
11
- ```sudo apt upgrade python3```.
12
- * **Virtual Environment:**
13
- 1. Create a venv:
14
- ```python -m venv path-to-venv```
15
- venv stands for Virtual Environment.
16
- 2. Activate the venv:
17
- **Linux & MacOS:** ```source path-to-venv/bin/activate```.
18
- **Windows:** ```path-to-venv\Scripts\activate```.
19
- * **VS Code:**
20
- 1. Install Python Extension:
21
- ```Extensions > search Python > Install microsoft.com Python extension```.
22
- 2. Choose the Python interpreter from your venv:
23
- ```Settings > Command Palette > Python: Select interpreter > Enter interpreter path > Find > Path-to-env/bin/activate```.
24
- 3. Choose the kernel for notebooks.
25
- * **Set up API Key:** We use ```dotenv``` package and ```.env``` file. This way can work for all similar APIs.
26
- 1. In each venv install ```dotenv``` package by ```pip install dotenv```.
27
- 2. In each project folder create ```.env``` file. In this file add the following line: ```OPENAI_API_KEY=key```.
28
- 3. At the top of the main file of your project, add the following two lines:
29
- ```from dotenv import load_dotenv```
30
- ```load_dotenv()```
31
-
32
- ## Models
33
- * **API Structure:** The API interface is very simple. You just need a client to call any of the models. The following two lines import the needed class and create an instance of it:
34
- ```
35
- from openai import OpenAI
36
- client = OpenAI()
37
- ```
38
- * **Models List**
39
- * **GPT-4 Turbo** accessed by handle ```gpt-4-turbo-preview``` or any of the following handles.
40
- 1. ```gpt-4-0125-preview``` Context Window: 128 k, Training Data: Up to Dec 2023.
41
- 2. ```gpt-4-1106-preview``` Context Window: 128 k, Training Data: Up to Apr 2023.
42
- 3. ```gpt-4-1106-vision-preview``` Context Window: 128 k, Training Data: Up to Apr 2023. Understands images.
43
- * **GPT-3.5 Turbo** accessed by handle ```gpt-3.5-turbo```
44
- 1. ```gpt-3.5-turbo-0125``` Context Window: 32 k, Training Data: Up to Sep 2021.
45
- * **DALL-E**
46
- 1. ```dall-e-3``` released Nov 2023
47
- 2. ```dall-e-2``` released Nov 2022
48
- * **TTS** Text to natural sounding spoken text.
49
- 1. ```tts-1```
50
- 2. ```tts-1-hd```
51
- * **Whisper** general-purpose speech recognition model.
52
- * **Embeddings** numerical representation of text for semantic similarity and semantic search.
53
- 1. ```text-embeddings-3-large``` Output dimension: 3072
54
- 2. ```text-embeddings-3-small``` Output dimension: 1536
55
- 3. ```text-embeddings-ada-002``` Output dimension: 1536 (oldest)
56
-
 
1
+ # OpenAI APIs
2
+ The OpenAI API is a simple RESTful API, but OpenAI also provides **Python** and **Node.js** packages built on top of it. We are using the Python package.
3
+
4
+ ## Installation and Setup
5
+ * **Python:**
6
+ * **Windows & MacOS:** Go to [Python3.11](https://www.python.org/downloads/release/python-3118/), download the installer from that page, and run it.
7
+ * **WSL2 & Ubuntu:**
8
+ 1. Run ```sudo apt update && sudo apt upgrade```.
9
+ 2. Check the Python version: ```python -V```.
10
+ 3. If you need to update Python, run
11
+ ```sudo apt upgrade python3```.
12
+ * **Virtual Environment:**
13
+ 1. Create a venv:
14
+ ```python -m venv path-to-venv```
15
+ venv stands for Virtual Environment.
16
+ 2. Activate the venv:
17
+ **Linux & MacOS:** ```source path-to-venv/bin/activate```.
18
+ **Windows:** ```path-to-venv\Scripts\activate```.
19
+ * **VS Code:**
20
+ 1. Install Python Extension:
21
+ ```Extensions > search Python > install the Python extension published by Microsoft```.
22
+ 2. Choose the Python interpreter from your venv:
23
+ ```Settings > Command Palette > Python: Select Interpreter > Enter interpreter path > Find > path-to-venv/bin/python``` (on Windows: ```path-to-venv\Scripts\python.exe```).
24
+ 3. Choose the kernel for notebooks.
25
+ * **Set up API Key:** We use the ```python-dotenv``` package and a ```.env``` file; the same approach works for other, similar APIs. A minimal sketch follows at the end of this list.
26
+ 1. In each venv, install the package with ```pip install python-dotenv```.
27
+ 2. In each project folder, create a ```.env``` file and add the following line to it: ```OPENAI_API_KEY=key```.
28
+ 3. At the top of the main file of your project, add the following two lines:
29
+ ```from dotenv import load_dotenv```
30
+ ```load_dotenv()```
31
+
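+ Putting the key setup together, a minimal sketch (file and variable names as above; ```OpenAI``` is introduced in the Models section below):
+ ```
+ # load OPENAI_API_KEY from the .env file into the environment, then create a client
+ from dotenv import load_dotenv
+ from openai import OpenAI
+
+ load_dotenv()      # reads .env; requires the python-dotenv package
+ client = OpenAI()  # the client picks the key up from the environment automatically
+ ```
+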
32
+ ## Models
33
+ * **API Structure:** The API interface is very simple: you just need a client to call any of the models. The following two lines import the needed class and create an instance of it (a short usage sketch appears at the end of this section):
34
+ ```
35
+ from openai import OpenAI
36
+ client = OpenAI()
37
+ ```
38
+ * **Models List**
39
+ * **GPT-4 Turbo** accessed by handle ```gpt-4-turbo-preview``` or any of the following handles.
40
+ 1. ```gpt-4-0125-preview``` Context Window: 128 k, Training Data: Up to Dec 2023.
41
+ 2. ```gpt-4-1106-preview``` Context Window: 128 k, Training Data: Up to Apr 2023.
42
+ 3. ```gpt-4-1106-vision-preview``` Context Window: 128 k, Training Data: Up to Apr 2023. Understands images.
43
+ * **GPT-3.5 Turbo** accessed by handle ```gpt-3.5-turbo```
44
+ 1. ```gpt-3.5-turbo-0125``` Context Window: 16 k, Training Data: Up to Sep 2021.
45
+ * **DALL-E**
46
+ 1. ```dall-e-3``` released Nov 2023
47
+ 2. ```dall-e-2``` released Nov 2022
48
+ * **TTS** Converts text to natural-sounding speech.
49
+ 1. ```tts-1```
50
+ 2. ```tts-1-hd```
51
+ * **Whisper** general-purpose speech recognition model.
52
+ * **Embeddings** Numerical representations of text, used for semantic similarity and semantic search.
53
+ 1. ```text-embedding-3-large``` Output dimension: 3072
54
+ 2. ```text-embedding-3-small``` Output dimension: 1536
55
+ 3. ```text-embedding-ada-002``` Output dimension: 1536 (oldest)
56
+
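+ A minimal example of calling two of the handles listed above, using the client created in the API Structure bullet (the prompt and input text are arbitrary placeholders):
+ ```
+ # chat completion with one of the GPT handles
+ response = client.chat.completions.create(
+     model="gpt-3.5-turbo",
+     messages=[{"role": "user", "content": "Say hello in one sentence."}],
+ )
+ print(response.choices[0].message.content)
+
+ # embedding with one of the embedding handles
+ emb = client.embeddings.create(model="text-embedding-3-small", input="Hello world")
+ print(len(emb.data[0].embedding))  # 1536 dimensions for the small model
+ ```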
rag.md CHANGED
@@ -1,49 +1,49 @@
1
- # RAG: Retrieval-Augmented Generation
2
- Paper: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/pdf/2005.11401v4.pdf)
3
- Code: [https://python.langchain.com/docs/use_cases/question_answering/quickstart](https://python.langchain.com/docs/use_cases/question_answering/quickstart)
4
- Similarity Search: [https://arxiv.org/pdf/2403.05440.pdf](https://arxiv.org/pdf/2403.05440.pdf)
5
- Prompt Hub: [https://smith.langchain.com/hub](https://smith.langchain.com/hub)
6
-
7
- ## Premise
8
- LLMs store factual knowledge in their parameters. But accessing and manipulating this knowledge in a precise way is not easy. Instead, that specific knowledge can be accessed through the weights and similarity and then added to the prompt of another model for answer generation:
9
-
10
- ![RAG Architecture](readme_data/rag1.png)
11
- The original paper considers training end-to-end retriever and generator models in one pipeline.
12
- Models like GPT3.5 and GPT4 don't need that training piece, if their are used with the open AI embedding models.
13
-
14
- **Where RAG Is Used?**
15
- * Mitigate hallucination generated by LLMs
16
- * Hallucinations are factually incorrect information generated by an LLM in response to an instruction or question from a user.
17
- * Hallucinations are very hard to capture and need other methodologies to catch them.
18
- * Even with RAG the probability of hallucination is not zero.
19
- * Allow LLMs the consume the data that is not part of their training in their inference.
20
- * LLMs are pre-training on huge amounts of data from public sources.
21
- * Proprietary data is not available to any general pre-trained LLM.
22
-
23
- **How RAG Works**
24
- 1. One can vectorize the semantics of a piece of text using specialized LLMs, called embedding models.
25
- 2. A collection of text can be vectorized to be used for answering the incoming questions.
26
- 3. A question is embedded using the same embedding model, and similar documents from a vector database is retrieved using a similarity search algorithm, like cosine similarity.
27
- 4. Found documents with the question are passed to generator LLM to generate and answer.
28
-
29
-
30
- ## RAG Components
31
- Here are different components present in a RAG pipeline:
32
- 1. **Embedding Model:** Vectorization model which for each string outputs a vector of fixed length.
33
- * The length is the dimension of latent space for this model.
34
- 3. **Vector DB:** Specialized database for saving pairs of (text,embeddings). Each of these pairs are called a document. Usually we put related documents in one collection. This makes the similarity search easier.
35
- 2. **Similarity Metric:** Given two document pairs $(t_1,e_1)$ and $(t_2,e_2), the metric calculates the similarity of $t1$ and $t_2$ by performing some geometric calculation on their respective embeddings.
36
- * **Cosine Similarity:** Calculates the cosine of the angle between embedding1 and embedding2:
37
- $$\cos(\theta)=\frac{e_1 \cdot e_2}{||e_1||\;||e_2||}.$$
38
- * **Inner Product:** Calculates the inner product of $e_1$ and $e_2$:
39
- $$e1\cdot e_2.$$
40
- * **Distance:** Calculates the distance of $e_1$ from $e_2$ using $L_p$ norms:
41
- $$||e_1-e_2||_p.$$
42
- 4. **Generator Model:** Generates the final answer based on the question and found similar text in the database that may contain the answer.
43
-
44
- ## Cosine Similarity Problems
45
- * The motivation for using cosine similarity is that the norm of the learned embedding-vectors is not as important as the directional alignment between the embedding-vectors.
46
- * But cosine similarity "work better but sometimes also worse than the unnormalized dot-product between embedded vectors in practice."
47
- * The paper derives "analytically how cosine-similarity can yield arbitrary and therefore meaningless ‘similarities.’"
48
- * To do this, they "study embeddings derived from regularized linear models, where closed-form solutions facilitate analytical insights."
49
  * "The underlying reason is not cosine similarity itself, but the fact that the learned embeddings have a degree of freedom that can render arbitrary cosine-similarities even though their (unnormalized) dot-products are well-defined and unique."
 
1
+ # RAG: Retrieval-Augmented Generation
2
+ Paper: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/pdf/2005.11401v4.pdf)
3
+ Code: [https://python.langchain.com/docs/use_cases/question_answering/quickstart](https://python.langchain.com/docs/use_cases/question_answering/quickstart)
4
+ Similarity Search: [https://arxiv.org/pdf/2403.05440.pdf](https://arxiv.org/pdf/2403.05440.pdf)
5
+ Prompt Hub: [https://smith.langchain.com/hub](https://smith.langchain.com/hub)
6
+
7
+ ## Premise
8
+ LLMs store factual knowledge in their parameters. But accessing and manipulating this knowledge in a precise way is not easy. Instead, that specific knowledge can be accessed through the weights and similarity and then added to the prompt of another model for answer generation:
9
+
10
+ ![RAG Architecture](readme_data/rag1.png)
11
+ The original paper trains the retriever and the generator end-to-end in one pipeline.
12
+ Models like GPT-3.5 and GPT-4 don't need that training step if they are used with the OpenAI embedding models.
13
+
14
+ **Where Is RAG Used?**
15
+ * Mitigate hallucination generated by LLMs
16
+ * Hallucinations are factually incorrect information generated by an LLM in response to an instruction or question from a user.
17
+ * Hallucinations are very hard to capture and need other methodologies to catch them.
18
+ * Even with RAG the probability of hallucination is not zero.
19
+ * Allow LLMs to consume, at inference time, data that was not part of their training.
20
+ * LLMs are pre-trained on huge amounts of data from public sources.
21
+ * Proprietary data is not available to any general pre-trained LLM.
22
+
23
+ **How RAG Works**
24
+ 1. One can vectorize the semantics of a piece of text using specialized LLMs, called embedding models.
25
+ 2. A collection of texts can be vectorized ahead of time to be used for answering incoming questions.
26
+ 3. A question is embedded with the same embedding model, and similar documents are retrieved from a vector database using a similarity measure such as cosine similarity.
27
+ 4. The retrieved documents, together with the question, are passed to a generator LLM to produce an answer (a minimal sketch follows this list).
28
+
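+ A minimal in-memory sketch of these four steps, assuming the OpenAI Python client and the embedding/chat handles described in openai.md. The texts are placeholders, and a real pipeline would keep the vectors in a vector DB such as Chroma rather than a NumPy array:
+ ```
+ import numpy as np
+ from openai import OpenAI
+
+ client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+ def embed(texts):
+     # step 1: vectorize text with an embedding model
+     resp = client.embeddings.create(model="text-embedding-3-small", input=texts)
+     return np.array([d.embedding for d in resp.data])
+
+ docs = ["First reference text.", "Second reference text."]
+ doc_vecs = embed(docs)        # step 2: vectorize the collection
+
+ question = "A question about the reference texts."
+ q_vec = embed([question])[0]  # step 3: embed the question ...
+
+ # ... and retrieve the most similar document by cosine similarity
+ sims = doc_vecs @ q_vec / (np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(q_vec))
+ context = docs[int(np.argmax(sims))]
+
+ # step 4: pass the question plus the retrieved context to the generator model
+ answer = client.chat.completions.create(
+     model="gpt-3.5-turbo",
+     messages=[{"role": "user",
+                "content": f"Answer using this context:\n{context}\n\nQuestion: {question}"}],
+ ).choices[0].message.content
+ print(answer)
+ ```
+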
29
+
30
+ ## RAG Components
31
+ Here are different components present in a RAG pipeline:
32
+ 1. **Embedding Model:** A vectorization model that outputs a fixed-length vector for each input string.
33
+ * The length is the dimension of latent space for this model.
34
+ 2. **Vector DB:** A specialized database for storing (text, embedding) pairs. Each pair is called a document. Related documents usually go into one collection, which makes similarity search easier.
35
+ 3. **Similarity Metric:** Given two documents $(t_1,e_1)$ and $(t_2,e_2)$, the metric calculates the similarity of $t_1$ and $t_2$ by performing a geometric calculation on their respective embeddings (a small numeric example follows this list).
36
+ * **Cosine Similarity:** Calculates the cosine of the angle between embedding1 and embedding2:
37
+ $$\cos(\theta)=\frac{e_1 \cdot e_2}{||e_1||\;||e_2||}.$$
38
+ * **Inner Product:** Calculates the inner product of $e_1$ and $e_2$:
39
+ $$e_1\cdot e_2.$$
40
+ * **Distance:** Calculates the distance of $e_1$ from $e_2$ using $L_p$ norms:
41
+ $$||e_1-e_2||_p.$$
42
+ 4. **Generator Model:** Generates the final answer from the question and the retrieved text that may contain the answer.
43
+
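+ A small numeric check of the three metrics above on two arbitrary example vectors:
+ ```
+ import numpy as np
+
+ e1 = np.array([1.0, 2.0, 3.0])
+ e2 = np.array([2.0, 0.5, 1.0])
+
+ cosine = e1 @ e2 / (np.linalg.norm(e1) * np.linalg.norm(e2))  # cos(theta)
+ inner  = e1 @ e2                                              # unnormalized dot product
+ dist   = np.linalg.norm(e1 - e2, ord=2)                       # L_2 distance
+ print(cosine, inner, dist)
+ ```
+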
44
+ ## Cosine Similarity Problems
45
+ * The motivation for using cosine similarity is that the norm of the learned embedding-vectors is not as important as the directional alignment between them.
46
+ * But cosine similarity can "work better but sometimes also worse than the unnormalized dot-product between embedded vectors in practice."
47
+ * The paper derives "analytically how cosine-similarity can yield arbitrary and therefore meaningless ‘similarities.’"
48
+ * To do this, they "study embeddings derived from regularized linear models, where closed-form solutions facilitate analytical insights." A small numeric illustration of the problem follows at the end of this section.
49
  * "The underlying reason is not cosine similarity itself, but the fact that the learned embeddings have a degree of freedom that can render arbitrary cosine-similarities even though their (unnormalized) dot-products are well-defined and unique."
requirements.txt CHANGED
@@ -1,17 +1,17 @@
1
- langchain==0.1.8
2
- streamlit==1.31.1
3
- langchain-openai==0.0.8
4
- python-dotenv==1.0.1
5
- openai==1.12.0
6
- beautifulsoup4==4.12.3
7
- langchain_text_splitters==0.0.1
8
- chromadb==0.4.24
9
- langchainhub==0.1.15
10
- langchain-community==0.0.37
11
- langchain-core==0.1.52
12
- sentence-transformers
13
- selenium==4.20.0
14
- openpyxl
15
- bs4
16
- unstructured==0.13.7
17
  openpyxl
 
1
+ langchain==0.1.20
2
+ streamlit==1.31.1
3
+ langchain-openai==0.0.8
4
+ python-dotenv==1.0.1
5
+ openai==1.12.0
6
+ beautifulsoup4==4.12.3
7
+ langchain_text_splitters==0.0.1
8
+ chromadb==0.4.3
9
+ langchainhub==0.1.15
10
+ langchain-community==0.0.38
11
+ langchain-core==0.1.52
12
+ sentence-transformers
13
+ selenium==4.20.0
14
+ openpyxl
15
+ bs4
16
+ unstructured==0.13.7
17
  openpyxl