# RAG_Work_History / LlamaIndex_helper.py
import os
from pathlib import Path
from typing import Dict, List

from dotenv import load_dotenv

from llama_index.agent.openai import OpenAIAgent
from llama_index.agent.openai_legacy import FnRetrieverOpenAIAgent
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.objects import ObjectIndex, SimpleToolNodeMapping
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
# Load API keys (e.g. OPENAI_API_KEY) from a local .env file
load_dotenv()

# Global LlamaIndex settings: default LLM and embedding model
Settings.llm = OpenAI(temperature=0.5, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding()
def get_pdf_file_names(data_dir: Path) -> List[str]:
    """Return the names of all PDF files in `data_dir`."""
    file_paths = list(Path(data_dir).glob("*.pdf"))
    # Path.name works on both Windows and POSIX paths (the backslash split did not)
    file_names = [path.name for path in file_paths]
    return file_names
def get_relevant_docs(file_names: List[str]) -> Dict:
    """Load each PDF under data/ into a list of Documents, keyed by file name."""
    relevant_docs = {}
    for file_name in file_names:
        relevant_docs[file_name] = SimpleDirectoryReader(
            input_files=[f"data/{file_name}"]
        ).load_data()
    return relevant_docs
def building_document_agents(file_names: List[str],
                             relevant_docs: Dict) -> Dict:
    """Build one OpenAI agent per document, each backed by its own persisted vector index."""
    descriptions = ["Useful for finding relevant information about the common interview questions and how to answer them",
                    "Useful for finding relevant information about work history in the resume"]
    agent_descriptions = ["You are a specialized agent designed to look for existing answers for an interview question",
                          "You are a specialized agent designed to look into work history in the resume"]
    node_parser = SentenceSplitter()

    # Build agents dictionary
    agents = {}
    query_engines = {}

    # this is for the baseline
    all_nodes = []
    for idx, file_name in enumerate(file_names):
        nodes = node_parser.get_nodes_from_documents(relevant_docs[file_name])
        all_nodes.extend(nodes)

        if not os.path.exists(f"./vector_index/{file_name}"):
            # build vector index and persist it to disk
            vector_index = VectorStoreIndex(nodes)
            vector_index.storage_context.persist(
                persist_dir=f"./vector_index/{file_name}"
            )
        else:
            # reload the previously persisted index
            vector_index = load_index_from_storage(
                StorageContext.from_defaults(persist_dir=f"./vector_index/{file_name}"),
            )

        # define query engine with an LLM-based reranker over the retrieved nodes
        reranker = LLMRerank(
            choice_batch_size=10,
            top_n=3,
        )
        vector_query_engine = vector_index.as_query_engine(node_postprocessors=[reranker])
        query_engine_tools = [
            QueryEngineTool(
                query_engine=vector_query_engine,
                metadata=ToolMetadata(
                    name="vector_tool",
                    description=descriptions[idx],
                ),
            ),
        ]

        # build a per-document agent that must answer through its vector tool
        function_llm = OpenAI(model="gpt-4-0125-preview")
        agent = OpenAIAgent.from_tools(
            query_engine_tools,
            llm=function_llm,
            verbose=True,
            system_prompt=f"""\
{agent_descriptions[idx]}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
        )
        agents[file_name] = agent
        query_engines[file_name] = vector_index.as_query_engine(
            similarity_top_k=3
        )
    return agents
def define_all_tools(agents: Dict,
                     file_names: List[str]) -> ObjectIndex:
    """Wrap each document agent in a QueryEngineTool and index the tools for retrieval."""
    file_summaries = ["This content contains practical information about the common interview questions and answers.\n",
                      "This content contains practical information about my resume and work history.\n"]
    tool_names = ["interview_questions_and_answers",
                  "work_history_information"]
    all_tools = []
    for idx, file_name in enumerate(file_names):
        file_summary = file_summaries[idx]
        file_name_format = tool_names[idx]
        doc_tool = QueryEngineTool(
            query_engine=agents[file_name],
            metadata=ToolMetadata(
                name=f"tool_{file_name_format}",
                description=file_summary,
            ),
        )
        all_tools.append(doc_tool)

    # index the tools themselves so the top-level agent can retrieve the right one per question
    tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
    obj_index = ObjectIndex.from_objects(
        all_tools,
        tool_mapping,
        VectorStoreIndex,
    )
    return obj_index
GREETING_MESSAGE = "Hi! My name is Newme. I'm a virtual mental health supporter powered by artificial intelligence (AI). I get it, I'm not a human like you, but behind my interactions there is a group of real mental health experts who dedicate time and effort to make sure that the content and interactions with users like you are based on the most up-to-date research in the mental health field. However, check with your primary care or mental health provider before implementing any of these practices, as your personal case might require additional strategies or treatment. Please don't take this as professional or medical advice. Are you OK with that?"
def create_top_agent(obj_index: ObjectIndex):
    """Create the top-level agent that retrieves the right document tool for each question."""
    top_agent = FnRetrieverOpenAIAgent.from_retriever(
        obj_index.as_retriever(similarity_top_k=10),
        system_prompt="""\
You are a machine learning developer with a background in chemical engineering who is talking to an interviewer. Please always use ALL tools provided to answer a question. Do not rely on prior knowledge.
You should answer the question or concern of the interviewer and you will follow ALL of the rules below:
1/ Respond as if you are in an interview
2/ Respond in as much detail as possible, but in no more than 200 words
3/ Occasionally provide a link to my social media profiles if the question is about work history:
1/ LinkedIn profile: https://www.linkedin.com/in/henrythdu/
2/ Portfolio Website: https://henrydu-portfolio.webflow.io/
\
""",
        verbose=True,
        streaming=True,
    )
    return top_agent
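

# --- Usage sketch (assumptions: a local "data/" directory holding the two PDFs
# described above, interview Q&A and resume/work history, plus an OPENAI_API_KEY
# in .env). A minimal illustration of how these helpers are meant to be chained
# together; not part of the module's original code.
if __name__ == "__main__":
    file_names = get_pdf_file_names(Path("data"))
    relevant_docs = get_relevant_docs(file_names)
    agents = building_document_agents(file_names, relevant_docs)
    obj_index = define_all_tools(agents, file_names)
    top_agent = create_top_agent(obj_index)
    # Ask the top-level agent a sample interview question
    response = top_agent.chat("Tell me about your work history.")
    print(response)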