# Source: calahealthgpt — fastchat/serve/monitor/clean_battle_data.py
# (uploaded via huggingface_hub, revision e72aedf, 7.07 kB)
"""
Clean chatbot arena battle log.
"""
import argparse
import datetime
import json
import os
from pytz import timezone
import time
from tqdm import tqdm
from fastchat.serve.monitor.basic_stats import get_log_files
from fastchat.utils import detect_language
# Vote event types that count as a completed battle.
VOTES = ["tievote", "leftvote", "rightvote", "bothbad_vote"]

# Substrings that reveal a model's identity; any conversation containing one
# of these is dropped so anonymous battles stay anonymous.
# NOTE(review): the conversation text is lowercased before matching, so the
# final all-caps entry can only match if the comparison lowercases the probe
# word as well — verify in clean_battle_data.
IDENTITY_WORDS = [
    "vicuna",
    "lmsys",
    "koala",
    "uc berkeley",
    "open assistant",
    "laion",
    "chatglm",
    "chatgpt",
    "openai",
    "anthropic",
    "claude",
    "bard",
    "palm",
    "lamda",
    "google",
    "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**",
]
def get_log_files(max_num_files=None):
    """Collect existing arena conversation logs for April-June 2023.

    Scans ``~/fastchat_logs/server{0..11}/{date}-conv.json`` for every
    calendar date string in the window (impossible dates such as 04-31 are
    harmless — they simply never exist on disk) and returns the paths that
    are present. When ``max_num_files`` is given, only the last that many
    paths are kept.
    """
    candidate_dates = [
        f"2023-{month:02d}-{day:02d}" for month in (4, 5, 6) for day in range(1, 32)
    ]
    found = []
    for date_str in candidate_dates:
        for server_idx in range(12):
            path = os.path.expanduser(
                f"~/fastchat_logs/server{server_idx}/{date_str}-conv.json"
            )
            if os.path.exists(path):
                found.append(path)
    limit = max_num_files or len(found)
    return found[-limit:]
def remove_html(raw):
    """Strip the arena's ``<h3>Label: model</h3>`` wrapper from a model name.

    Strings that do not start with ``<h3>`` are returned unchanged.
    """
    if not raw.startswith("<h3>"):
        return raw
    # Keep only the text between ": " and the trailing "</h3>\n".
    start = raw.find(": ") + 2
    return raw[start : -len("</h3>\n")]
def to_openai_format(messages):
    """Convert ``[(role_label, text), ...]`` pairs to OpenAI chat dicts.

    Even positions become "user" turns and odd positions "assistant" turns;
    only the text (second element) of each pair is kept.
    """
    role_cycle = ("user", "assistant")
    return [
        {"role": role_cycle[idx % 2], "content": pair[1]}
        for idx, pair in enumerate(messages)
    ]
def clean_battle_data(log_files):
    """Clean raw arena vote logs into a sorted list of battle records.

    Reads vote events from ``log_files``, drops malformed or
    identity-leaking conversations, resolves anonymous model names, and
    returns battle dicts sorted by timestamp. Prints summary statistics
    as a side effect.

    Raises:
        FileNotFoundError: if a log file is still missing after retries.
    """
    data = []
    for filename in tqdm(log_files, desc="read files"):
        # Logs may be mid-rotation; retry briefly before giving up.
        lines = None
        for _retry in range(5):
            try:
                with open(filename) as fin:
                    lines = fin.readlines()
                break
            except FileNotFoundError:
                time.sleep(2)
        if lines is None:
            # BUG FIX: the original left `lines` unbound (NameError) when
            # the first file failed all retries, and silently reprocessed
            # the previous file's lines when a later one failed. Fail loudly.
            raise FileNotFoundError(filename)
        for l in lines:
            row = json.loads(l)
            if row["type"] in VOTES:
                data.append(row)

    convert_type = {
        "leftvote": "model_a",
        "rightvote": "model_b",
        "tievote": "tie",
        "bothbad_vote": "tie (bothbad)",
    }

    all_models = set()
    ct_anony = 0
    ct_invalid = 0
    ct_leaked_identity = 0
    battles = []
    for row in data:
        # Resolve model names: the hidden names stored in the states take
        # priority over the public (possibly "Model A"/"Model B") labels.
        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
        if "model_name" in row["states"][0]:
            models_hidden = [
                row["states"][0]["model_name"],
                row["states"][1]["model_name"],
            ]
            if models_hidden[0] is None:
                models_hidden = models_public
        else:
            models_hidden = models_public

        # Exactly one empty public name means a malformed record.
        if (models_public[0] == "" and models_public[1] != "") or (
            models_public[1] == "" and models_public[0] != ""
        ):
            ct_invalid += 1
            continue

        if models_public[0] == "" or models_public[0] == "Model A":
            anony = True
            models = models_hidden
            ct_anony += 1
        else:
            anony = False
            models = models_public
            if models_public != models_hidden:
                ct_invalid += 1
                continue

        # Detect language of the first user message.
        state = row["states"][0]
        if state["offset"] >= len(state["messages"]):
            ct_invalid += 1
            continue
        lang_code = detect_language(state["messages"][state["offset"]][1])
        rounds = (len(state["messages"]) - state["offset"]) // 2

        # Drop conversations if the model names are leaked
        leaked_identity = False
        messages = ""
        for i in range(2):
            state = row["states"][i]
            for role, msg in state["messages"][state["offset"] :]:
                if msg:
                    messages += msg.lower()
        for word in IDENTITY_WORDS:
            # BUG FIX: `messages` is lowercased, so uppercase IDENTITY_WORDS
            # entries (the network-error banner) never matched; lowercase the
            # probe word before comparing.
            if word.lower() in messages:
                leaked_identity = True
                break
        if leaked_identity:
            ct_leaked_identity += 1
            continue

        # Replace bard with palm
        models = [m.replace("bard", "palm-2") for m in models]

        question_id = row["states"][0]["conv_id"]
        conversation_a = to_openai_format(
            row["states"][0]["messages"][row["states"][0]["offset"] :]
        )
        conversation_b = to_openai_format(
            row["states"][1]["messages"][row["states"][1]["offset"] :]
        )

        # Save the result
        battles.append(
            dict(
                question_id=question_id,
                model_a=models[0],
                model_b=models[1],
                winner=convert_type[row["type"]],
                judge="arena_user",
                conversation_a=conversation_a,
                conversation_b=conversation_b,
                turn=len(conversation_a) // 2,
                anony=anony,
                rounds=rounds,
                language=lang_code,
                tstamp=row["tstamp"],
            )
        )
        all_models.update(models_hidden)

    battles.sort(key=lambda x: x["tstamp"])
    last_updated_tstamp = battles[-1]["tstamp"]
    last_updated_datetime = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y-%m-%d %H:%M:%S %Z")

    print(
        f"#votes: {len(data)}, #invalid votes: {ct_invalid}, "
        f"#leaked_identity: {ct_leaked_identity}"
    )
    print(f"#battles: {len(battles)}, #anony: {ct_anony}")
    print(f"#models: {len(all_models)}, {all_models}")
    print(f"last-updated: {last_updated_datetime}")

    return battles
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-num-files", type=int)
    parser.add_argument(
        "--mode", type=str, choices=["simple", "conv_release"], default="simple"
    )
    args = parser.parse_args()

    log_files = get_log_files(args.max_num_files)
    battles = clean_battle_data(log_files)
    last_updated_tstamp = battles[-1]["tstamp"]
    cutoff_date = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y%m%d")

    if args.mode == "simple":
        # Strip heavy / identifying fields for the summary release.
        for x in battles:
            for key in [
                "conversation_a",
                "conversation_b",
                "judge",
                "question_id",
                "turn",
            ]:
                del x[key]
        print("Samples:")
        # BUG FIX: the original always printed 4 samples and raised
        # IndexError when fewer battles survived cleaning.
        for i in range(min(4, len(battles))):
            print(battles[i])
        output = f"clean_battle_{cutoff_date}.json"
    elif args.mode == "conv_release":
        # Only anonymous battles are safe to release with full conversations.
        new_battles = []
        for x in battles:
            if not x["anony"]:
                continue
            for key in ["rounds"]:
                del x[key]
            new_battles.append(x)
        battles = new_battles
        output = f"clean_battle_conv_release_{cutoff_date}.json"

    with open(output, "w") as fout:
        json.dump(battles, fout, indent=2, ensure_ascii=False)
    print(f"Write cleaned data to {output}")