# RAG_Work_History / LlamaIndex_helper.py
import os
from pathlib import Path
from typing import Dict, List

from dotenv import load_dotenv

from llama_index.agent.openai import OpenAIAgent
from llama_index.agent.openai_legacy import FnRetrieverOpenAIAgent
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.objects import ObjectIndex, SimpleToolNodeMapping
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
# Load API keys (e.g. OPENAI_API_KEY) from a local .env file
load_dotenv()

# Global LlamaIndex settings: default LLM and embedding model
Settings.llm = OpenAI(temperature=0.5, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding()
def get_pdf_file_names(data_dir: Path) -> List[str]:
    """Return the names of all PDF files in `data_dir`."""
    file_paths = list(Path(data_dir).glob("*.pdf"))
    # Path.name works on both Windows and POSIX paths (the backslash split did not)
    file_names = [path.name for path in file_paths]
    return file_names
def get_relevant_docs(file_names: List[str]) -> Dict:
    """Load each PDF under data/ into a list of Documents, keyed by file name."""
    relevant_docs = {}
    for file_name in file_names:
        relevant_docs[file_name] = SimpleDirectoryReader(
            input_files=[f"data/{file_name}"]
        ).load_data()
    return relevant_docs
def building_document_agents(file_names: List[str],
                             relevant_docs: Dict) -> Dict:
    """Build one OpenAI agent per document, each backed by its own persisted vector index."""
    descriptions = ["Useful for finding relevant information about the common interview questions and how to answer them",
                    "Useful for finding relevant information about work history in the resume"]
    agent_descriptions = ["You are a specialized agent designed to look for existing answers for an interview question",
                          "You are a specialized agent designed to look into work history in the resume"]
    node_parser = SentenceSplitter()

    # Build agents dictionary
    agents = {}
    query_engines = {}

    # this is for the baseline
    all_nodes = []
    for idx, file_name in enumerate(file_names):
        nodes = node_parser.get_nodes_from_documents(relevant_docs[file_name])
        all_nodes.extend(nodes)

        if not os.path.exists(f"./vector_index/{file_name}"):
            # build vector index and persist it to disk
            vector_index = VectorStoreIndex(nodes)
            vector_index.storage_context.persist(
                persist_dir=f"./vector_index/{file_name}"
            )
        else:
            # reload the previously persisted index
            vector_index = load_index_from_storage(
                StorageContext.from_defaults(persist_dir=f"./vector_index/{file_name}"),
            )

        # define query engine with an LLM-based reranker over the retrieved nodes
        reranker = LLMRerank(
            choice_batch_size=10,
            top_n=3,
        )
        vector_query_engine = vector_index.as_query_engine(node_postprocessors=[reranker])
        query_engine_tools = [
            QueryEngineTool(
                query_engine=vector_query_engine,
                metadata=ToolMetadata(
                    name="vector_tool",
                    description=descriptions[idx],
                ),
            ),
        ]

        # build a per-document agent that must answer through its vector tool
        function_llm = OpenAI(model="gpt-4-0125-preview")
        agent = OpenAIAgent.from_tools(
            query_engine_tools,
            llm=function_llm,
            verbose=True,
            system_prompt=f"""\
{agent_descriptions[idx]}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
        )
        agents[file_name] = agent
        query_engines[file_name] = vector_index.as_query_engine(
            similarity_top_k=3
        )
    return agents
def define_all_tools(agents: Dict,
                     file_names: List[str]) -> ObjectIndex:
    """Wrap each document agent in a QueryEngineTool and index the tools for retrieval."""
    file_summaries = ["This content contains practical information about the common interview questions and answers.\n",
                      "This content contains practical information about my resume and work history.\n"]
    tool_names = ["interview_questions_and_answers",
                  "work_history_information"]
    all_tools = []
    for idx, file_name in enumerate(file_names):
        file_summary = file_summaries[idx]
        file_name_format = tool_names[idx]
        doc_tool = QueryEngineTool(
            query_engine=agents[file_name],
            metadata=ToolMetadata(
                name=f"tool_{file_name_format}",
                description=file_summary,
            ),
        )
        all_tools.append(doc_tool)

    # index the tools themselves so the top-level agent can retrieve the right one per question
    tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
    obj_index = ObjectIndex.from_objects(
        all_tools,
        tool_mapping,
        VectorStoreIndex,
    )
    return obj_index
GREETING_MESSAGE = "Hi! My name is Newme. I'm a virtual mental health supporter powered by artificial intelligence (AI). I get it, I'm not a human like you, but behind my interactions there is a group of real mental health experts who dedicate time and effort to make sure that the content and interactions with users like you are based on the most up-to-date research in the mental health field. However, check with your primary care or mental health provider before implementing any of these practices, as your personal case might require additional strategies or treatment. Please don't take this as professional or medical advice. Are you OK with that?"
def create_top_agent(obj_index: ObjectIndex):
    """Create the top-level agent that retrieves the right document tool for each question."""
    top_agent = FnRetrieverOpenAIAgent.from_retriever(
        obj_index.as_retriever(similarity_top_k=10),
        system_prompt="""\
You are a machine learning developer with a background in chemical engineering who is talking to an interviewer. Please always use ALL tools provided to answer a question. Do not rely on prior knowledge.
You should answer the question or concern of the interviewer and you will follow ALL of the rules below:
1/ Respond as if you are in an interview
2/ Respond in as much detail as possible, but in no more than 200 words
3/ Occasionally provide a link to my social media profiles if the question is about work history:
1/ LinkedIn profile: https://www.linkedin.com/in/henrythdu/
2/ Portfolio Website: https://henrydu-portfolio.webflow.io/
\
""",
        verbose=True,
        streaming=True,
    )
    return top_agent
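

# --- Usage sketch (assumptions: a local "data/" directory holding the two PDFs
# described above, interview Q&A and resume/work history, plus an OPENAI_API_KEY
# in .env). A minimal illustration of how these helpers are meant to be chained
# together; not part of the module's original code.
if __name__ == "__main__":
    file_names = get_pdf_file_names(Path("data"))
    relevant_docs = get_relevant_docs(file_names)
    agents = building_document_agents(file_names, relevant_docs)
    obj_index = define_all_tools(agents, file_names)
    top_agent = create_top_agent(obj_index)
    # Ask the top-level agent a sample interview question
    response = top_agent.chat("Tell me about your work history.")
    print(response)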