Spaces:
Sleeping
Sleeping
from llama_index.core import ( | |
VectorStoreIndex, | |
SimpleDirectoryReader, | |
) | |
from typing import List, Dict | |
from llama_index.agent.openai_legacy import FnRetrieverOpenAIAgent | |
from llama_index.core.tools import QueryEngineTool, ToolMetadata | |
from llama_index.llms.openai import OpenAI | |
from llama_index.embeddings.openai import OpenAIEmbedding | |
from llama_index.core import Settings | |
from llama_index.agent.openai import OpenAIAgent | |
from llama_index.core import load_index_from_storage, StorageContext | |
from llama_index.core.node_parser import SentenceSplitter | |
from llama_index.core.postprocessor import LLMRerank | |
from llama_index.core import VectorStoreIndex | |
from llama_index.core.objects import ObjectIndex, SimpleToolNodeMapping | |
import os | |
from pathlib import Path | |
# from llama_index.embeddings | |
from dotenv import load_dotenv | |
# Load OPENAI_API_KEY (and any other secrets) from a local .env file.
load_dotenv()
# Global llama-index defaults: all query engines / agents built below inherit
# this LLM and embedding model unless they override it explicitly.
Settings.llm = OpenAI(temperature=0.5, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding()
def get_pdf_file_names(data_dir: Path) -> List[str]:
    """Return the bare file names of every ``*.pdf`` directly in *data_dir*.

    Args:
        data_dir: Directory to scan (non-recursive); ``str`` or ``Path``.

    Returns:
        List of file names such as ``"resume.pdf"`` (no directory prefix).
    """
    # Path.name is portable. The original split str(path) on "\\", which only
    # strips the directory on Windows — on POSIX it returned the full path.
    return [p.name for p in Path(data_dir).glob("*.pdf")]
def get_relevant_docs(file_names: List[str], data_dir: str = "data") -> Dict[str, list]:
    """Load the documents for each named file, keyed by file name.

    Args:
        file_names: File names located inside *data_dir*.
        data_dir: Directory containing the files. Defaults to ``"data"``,
            matching the path the original hard-coded.

    Returns:
        Mapping of file name -> list of llama-index documents loaded from it.
    """
    relevant_docs: Dict[str, list] = {}
    # One reader per file keeps each file's documents grouped under its own key.
    # (Original iterated with enumerate and re-indexed file_names[i] — same
    # effect, needless double lookup.)
    for file_name in file_names:
        relevant_docs[file_name] = SimpleDirectoryReader(
            input_files=[f"{data_dir}/{file_name}"]
        ).load_data()
    return relevant_docs
def building_document_agents(file_names: List[str],
                             relevant_docs: Dict[str, list]) -> Dict:
    """Build one OpenAI document agent per input file.

    For each file: chunk its documents into nodes, build (or reload from
    ``./vector_index/<file_name>``) a vector index over them, wrap the index
    in a reranking query engine, and hand that engine to a dedicated
    OpenAIAgent as its single tool.

    Args:
        file_names: Files to build agents for. Order must match the
            hard-coded descriptions below (index 0 = interview Q&A file,
            index 1 = resume / work-history file).
        relevant_docs: Mapping of file name -> loaded documents, as produced
            by ``get_relevant_docs``.

    Returns:
        Mapping of file name -> OpenAIAgent.
    """
    # Tool / agent descriptions, aligned by position with file_names.
    descriptions = [
        "Useful for finding relavant information about the common interview questions and how to answer them",
        "Useful for finding relavant information for work history in the resume",
    ]
    agent_descriptions = [
        "You are a specialized agent designed to look for existing answers for an interview question",
        "You are a specialized agent designed to look into work history in the resume",
    ]
    node_parser = SentenceSplitter()
    agents: Dict = {}
    # Removed from the original: a stray no-op expression statement
    # (`desciptions` on its own line) and the unused accumulators
    # `all_nodes` / `query_engines`, which were built but never used
    # or returned.
    for idx, file_name in enumerate(file_names):
        nodes = node_parser.get_nodes_from_documents(relevant_docs[file_name])
        persist_dir = f"./vector_index/{file_name}"
        if os.path.exists(persist_dir):
            # Reuse the index persisted by a previous run.
            vector_index = load_index_from_storage(
                StorageContext.from_defaults(persist_dir=persist_dir),
            )
        else:
            vector_index = VectorStoreIndex(nodes)
            vector_index.storage_context.persist(persist_dir=persist_dir)
        # Rerank retrieved nodes with the LLM, keeping the top 3.
        reranker = LLMRerank(
            choice_batch_size=10,
            top_n=3,
        )
        # BUG FIX: the original passed vector_index.as_chat_engine(...) as the
        # `query_engine` of QueryEngineTool. Chat engines expose `.chat`, not
        # `.query`, so the tool failed when invoked; the intent (per the
        # original "define query engines" comment) is a query engine.
        vector_query_engine = vector_index.as_query_engine(
            node_postprocessors=[reranker]
        )
        query_engine_tools = [
            QueryEngineTool(
                query_engine=vector_query_engine,
                metadata=ToolMetadata(
                    name="vector_tool",
                    description=descriptions[idx],
                ),
            ),
        ]
        # Stronger model for the per-document agent than the global default.
        function_llm = OpenAI(model="gpt-4-0125-preview")
        agent = OpenAIAgent.from_tools(
            query_engine_tools,
            llm=function_llm,
            verbose=True,
            system_prompt=f"""\
{agent_descriptions[idx]}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
        )
        agents[file_name] = agent
    return agents
def define_all_tools(agents: Dict,
                     file_names: List):
    """Wrap every per-file agent in a QueryEngineTool and index the tools.

    Tool names and summaries are hard-coded and aligned by position with
    *file_names* (index 0 = interview Q&A, index 1 = work history).

    Args:
        agents: Mapping of file name -> agent, from ``building_document_agents``.
        file_names: Same ordering used to build *agents*.

    Returns:
        ObjectIndex over all tools, backed by a VectorStoreIndex, for
        retrieval-based tool selection by a top-level agent.
    """
    summaries = [
        "This content contains practical information about the common interview questions and answers.\n",
        "This content contains practical information my resume and work history.\n",
    ]
    short_names = [
        "interview_questions_and_answers",
        "work_history_information",
    ]
    # One tool per file; each tool delegates queries to that file's agent.
    all_tools = [
        QueryEngineTool(
            query_engine=agents[file_name],
            metadata=ToolMetadata(
                name=f"tool_{short_names[idx]}",
                description=summaries[idx],
            ),
        )
        for idx, file_name in enumerate(file_names)
    ]
    tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
    return ObjectIndex.from_objects(all_tools, tool_mapping, VectorStoreIndex)
# Canned opening message shown to the user before any agent interaction.
# NOTE(review): this greeting describes a mental-health support bot ("Newme"),
# while the rest of this file builds an interview/resume agent — looks copied
# from another project; confirm it is actually used and intended here.
GREETING_MESSAGE = "Hi! My name is Newme,I'm a virtual mental health supporter powered by artificial intelligence (AI). I get it, I'm not a human like you, but behind my interactions, there is a group of real mental health experts, who dedicate time and effort to make sure that any of the content and interactions with users like you, are based on the most up-to-date research in the mental health field. However, check with your primary care or mental health provider before implementing any of these practices, as your personal case might require additional strategies or treatment. Please don't take this as professional or medical advice. Are you Ok with that?"
def create_top_agent(obj_index: ObjectIndex):
    """Create the top-level agent that routes questions to document tools.

    Args:
        obj_index: Tool index from ``define_all_tools``; the agent retrieves
            candidate tools from it (top 10 by similarity) per question.

    Returns:
        A streaming FnRetrieverOpenAIAgent acting as the interviewee persona.
    """
    # Prompt typos fixed vs. the original ("back groung", "Response as
    # details as posible", duplicated "Occasionally ... occasionally") —
    # they degrade the quality of the persona the LLM is asked to adopt.
    top_agent = FnRetrieverOpenAIAgent.from_retriever(
        obj_index.as_retriever(similarity_top_k=10),
        system_prompt=""" \
    You are a machine learning developer with a background in chemical engineering who is talking to an interviewer. Please always use ALL tools provided to answer a question. Do not rely on prior knowledge.
    You should answer the question or concern of the interviewer and you will follow ALL of the rules below:
    1/ Respond as if you are in an interview
    2/ Respond with as much detail as possible but no longer than 200 words
    3/ Occasionally provide a link to my social media profiles if the question is about work history:
        1/ LinkedIn profile: https://www.linkedin.com/in/henrythdu/
        2/ Portfolio Website: https://henrydu-portfolio.webflow.io/
    \
    """,
        verbose=True,
        streaming=True,
    )
    return top_agent