# Router-data construction script: queries each candidate LLM on every QA
# example, scores and costs the responses, and saves the routing dataset.
import yaml
import pandas as pd
from transformers import AutoTokenizer, AutoModel

from llm_engine import LLMEngine
from utils import savejson, loadjson, savepkl, loadpkl, get_embedding
class data_building:
    """Build the router training dataset.

    For every (query, LLM) pair this class queries the LLM, scores the
    response against the ground truth, computes the serving cost, and
    persists the resulting table (CSV) plus the LLM-description embeddings
    (pickle) to the paths given in ``config``.
    """

    # Truncation budget (in BERT tokens) for oversized multi_news queries.
    MAX_MULTI_NEWS_TOKENS = 3000

    def __init__(self, qa_path, llm_path, config):
        """
        Args:
            qa_path: CSV with columns ``task_id``, ``query``,
                ``task_description``, ``ground_truth``, ``metric``.
            llm_path: JSON mapping LLM name -> description dict with a
                ``'feature'`` entry.
            config: dict providing ``'query_response_length'``,
                ``'saved_router_data_path'`` and ``'llm_embedding_path'``.

        Side effects: immediately runs the full data-construction pipeline.
        """
        self.qa_data = pd.read_csv(qa_path)
        self.llm_description = loadjson(llm_path)
        self.llm_names = list(self.llm_description.keys())
        self.all_llm_description = [
            self.llm_description[name]['feature'] for name in self.llm_names
        ]
        self.MyLLMEngine = LLMEngine(llm_names=self.llm_names,
                                     llm_description=self.llm_description)
        self.config = config
        self.construct_data_with_LLM()

    def construct_data_with_LLM(self):
        """Query every LLM on every QA row and write the routing dataset."""
        tokenizer = None  # loaded lazily, once, only if a multi_news row appears
        rows = []
        for row in self.qa_data.itertuples():
            query_t = row.query
            if row.task_id == "multi_news":
                # multi_news inputs can be very long; truncate to a fixed
                # token budget before embedding / prompting.
                if tokenizer is None:
                    # Hoisted out of the loop: the original reloaded the
                    # tokenizer for every multi_news row.
                    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
                tokens = tokenizer.tokenize(query_t)[:self.MAX_MULTI_NEWS_TOKENS]
                query_t = tokenizer.convert_tokens_to_string(tokens)
            query_t_embedding = get_embedding([query_t])
            task_description_embedding = get_embedding([row.task_description])
            for llm_idx, llm_name in enumerate(self.llm_names):
                response_t = self.MyLLMEngine.get_llm_response(query=query_t,
                                                               llm_idx=llm_idx)
                reward_t = self.MyLLMEngine.eval(prediction=response_t,
                                                 ground_truth=row.ground_truth,
                                                 metric=row.metric)
                cost_t = self.MyLLMEngine.compute_cost(
                    llm_idx=llm_idx, input_text=query_t,
                    output_size=self.config['query_response_length'])
                rows.append({'task_id': row.task_id,
                             'query': query_t,
                             'query_embedding': query_t_embedding,
                             'ground_truth': row.ground_truth,
                             'metric': row.metric,
                             'llm': llm_name,
                             'effect': reward_t,
                             'cost': cost_t,
                             'task_description': row.task_description,
                             'task_description_embedding': task_description_embedding})
        # Build the frame once from the collected dicts: avoids the private,
        # pandas>=2.0-removed ``DataFrame._append`` and its O(n^2) copying.
        # Column order matches what the original append-loop produced.
        df = pd.DataFrame(rows, columns=['task_id', 'query', 'query_embedding',
                                         'ground_truth', 'metric', 'llm',
                                         'effect', 'cost', 'task_description',
                                         'task_description_embedding'])
        # Min-max normalize cost per task; guard the degenerate case where a
        # task has a single cost value (original produced NaN via 0/0).
        df['cost'] = df.groupby('task_id')['cost'].transform(
            lambda x: (x - x.min()) / (x.max() - x.min())
            if x.max() > x.min() else x * 0.0)
        df.to_csv(self.config['saved_router_data_path'], index=False)
        llm_description_embedding = get_embedding(self.all_llm_description)
        savepkl(llm_description_embedding, self.config['llm_embedding_path'])
if __name__ == "__main__": | |
import os | |
os.environ["KMP_DUPLICATE_LIB_OK"] = 'True' | |
with open("configs/config.yaml", 'r', encoding='utf-8') as file: | |
config = yaml.safe_load(file) | |
os.environ["TOGETHERAI_API_KEY"] = config["api_key"] | |
data_building(qa_path=config['unified_qa_data_path'],llm_path=config['llm_description_path'],config=config) |