""" | |
Clean chatbot arena battle log. | |
Usage: | |
python3 clean_battle_data.py --mode conv_release | |
""" | |
import argparse
import datetime
import json
import os
import sys
import time

from pytz import timezone
import PIL
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

from tqdm import tqdm

from .basic_stats import get_log_files, NUM_SERVERS, LOG_ROOT_DIR
from .utils import detect_language, get_time_stamp_from_date, get_model_info

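# Vote types recorded by the arena UI; only rows of these types are kept as battles.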
VOTES = ["tievote", "leftvote", "rightvote", "bothbad_vote"]


def parse_model_name(model_name):
    # Intended to return (model_source, model_name, model_type); the parsing is
    # currently done inline in clean_battle_data instead.
    raise NotImplementedError()


def remove_html(raw):
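    """Strip the HTML/Markdown wrapper the arena UI puts around model names."""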
    if raw.startswith("<h3>"):
        return raw[raw.find(": ") + 2 : -len("</h3>\n")]
    if raw.startswith("### Model A: ") or raw.startswith("### Model B: "):
        return raw[13:]
    return raw


def to_openai_format(messages):
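    """Convert (speaker, text) message pairs into OpenAI chat-format dicts,
    alternating user/assistant roles."""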
    roles = ["user", "assistant"]
    ret = []
    for i, x in enumerate(messages):
        ret.append({"role": roles[i % 2], "content": x[1]})
    return ret


def replace_model_name(old_name, tstamp):
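    """Map legacy/internal model names to their canonical simple names.

    (`tstamp` is currently unused.)
    """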
    replace_dict = {
        "PlayGroundV2": "PlayGround V2",
        "PlayGroundV2.5": "PlayGround V2.5",
        "FluxTimestep": "FLUX1schnell",
        "FluxGuidance": "FLUX1dev",
        "CogVideoX": "CogVideoX-2B",
    }
    if old_name in replace_dict:
        old_name = replace_dict[old_name]
    if "Flux" in old_name:
        print(f"Invalid model names: {old_name}")
        exit(1)
    model_info = get_model_info(old_name)
    old_name = model_info.simple_name
    return old_name


def read_file(filename):
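    """Read one JSONL log file and return only the vote rows.

    Retries a few times with a short sleep if the file is not found yet.
    """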
    data = []
    for retry in range(5):
        try:
            with open(filename) as fin:
                for line in fin:
                    row = json.loads(line)
                    if row["type"] in VOTES:
                        data.append(row)
            break
        except FileNotFoundError:
            time.sleep(2)
        except json.JSONDecodeError:
            print(f"Error in reading {filename}")
            print(line)
            exit(0)
    return data


def read_file_parallel(log_files, num_threads=16):
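    """Read all log files, optionally with a multiprocessing pool."""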
    data_all = []
    if num_threads == 1:
        for log_file in tqdm(log_files, desc="Reading"):
            data_all.extend(read_file(log_file))
        return data_all
    else:
        from multiprocessing import Pool

        with Pool(num_threads) as p:
            ret_all = list(tqdm(p.imap(read_file, log_files), total=len(log_files)))
        for ret in ret_all:
            data_all.extend(ret)
        return data_all


def load_image(image_path):
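    """Open an image with PIL; return None if it cannot be read."""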
    try:
        return PIL.Image.open(image_path)
    except Exception:
        # Missing or unreadable images are treated as absent.
        return None


def clean_battle_data(
    log_files, exclude_model_names, ban_ip_list=None, sanitize_ip=False, mode="simple", task_name="image_editing"
):
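    """Turn raw vote logs into a cleaned list of battle records.

    Each returned dict holds the two model names, the shared user inputs, the
    vote outcome, an (optionally sanitized) judge id, and the timestamp.
    """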
    data = read_file_parallel(log_files, num_threads=1)

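    # Map raw vote types to the winner label stored in the released records.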
    convert_type = {
        "leftvote": "model_a",
        "rightvote": "model_b",
        "tievote": "tie",
        "bothbad_vote": "tie (bothbad)",
    }
    all_models = set()
    all_ips = dict()
    ct_anony = 0
    ct_invalid = 0
    ct_leaked_identity = 0
    ct_banned = 0

    battles = []
    for row in tqdm(data, desc="Cleaning"):
        if row["models"][0] is None or row["models"][1] is None:
            print(f"Invalid model names: {row['models']}")
            continue

        # Resolve model names
        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
if "model_name" in row["states"][0]: | |
models_hidden = [ | |
row["states"][0]["model_name"], | |
row["states"][1]["model_name"], | |
] | |
if models_hidden[0] is None: | |
models_hidden = models_public | |
else: | |
models_hidden = models_public | |
if (models_public[0] == "" and models_public[1] != "") or ( | |
models_public[1] == "" and models_public[0] != "" | |
): | |
ct_invalid += 1 | |
print(f"Invalid model names: {models_public}") | |
continue | |
if models_public[0] == "" or models_public[0] == "Model A": | |
anony = True | |
models = models_hidden | |
ct_anony += 1 | |
else: | |
anony = False | |
models = models_public | |
if not models_public == models_hidden: | |
print(f"Model names mismatch: {models_public} vs {models_hidden}") | |
ct_invalid += 1 | |
continue | |
        def preprocess_model_name(m):
            if m == "Playground v2":
                return "playground_PlayGroundV2_generation"
            if m == "Playground v2.5":
                return "playground_PlayGroundV2.5_generation"
            return m

        models = [preprocess_model_name(m) for m in models]

        # Validate and normalize task-specific model identifiers ("platform_model_task").
if task_name == "image_editing": | |
valid = True | |
for _model in models: | |
try: | |
platform, model_name, task = _model.split("_") | |
except ValueError: | |
valid = False | |
break | |
if not (platform in ["playground", "imagenhub"] and task == "edition"): | |
valid = False | |
break | |
if not valid: | |
ct_invalid += 1 | |
continue | |
for i, _model in enumerate(models): | |
platform, model_name, task = _model.split("_") | |
models[i] = model_name | |
elif task_name == "t2i_generation": | |
valid = True | |
for _model in models: | |
try: | |
platform, model_name, task = _model.split("_") | |
except ValueError: | |
valid = False | |
break | |
if not (platform.lower() in ["playground", "imagenhub", 'fal'] and (task == "generation" or task == "text2image")): | |
valid = False | |
break | |
if not valid: | |
ct_invalid += 1 | |
continue | |
for i, _model in enumerate(models): | |
platform, model_name, task = _model.split("_") | |
models[i] = model_name | |
elif task_name == "video_generation": | |
valid = True | |
for _model in models: | |
try: | |
platform, model_name, task = _model.split("_") | |
except ValueError: | |
valid = False | |
break | |
if not (platform in ["videogenhub", "fal"] and task == "generation" or task == "text2video"): | |
valid = False | |
break | |
if not valid: | |
ct_invalid += 1 | |
continue | |
for i, _model in enumerate(models): | |
platform, model_name, task = _model.split("_") | |
models[i] = model_name | |
else: | |
raise ValueError(f"Invalid task_name: {task_name}") | |

        models = [replace_model_name(m, row["tstamp"]) for m in models]

        # Exclude certain models
        if exclude_model_names and any(x in exclude_model_names for x in models):
            ct_invalid += 1
            continue
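
        # In conv_release mode, input images are stored per conversation as
        # {LOG_ROOT_DIR}/<date>-convinput_images/input_image_<conv_id>.png;
        # the battle is dropped unless both sides saw the identical image.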
if mode == "conv_release": | |
# assert the two images are the same | |
date = datetime.datetime.fromtimestamp(row["tstamp"], tz=timezone("US/Pacific")).strftime("%Y-%m-%d") # 2024-02-29 | |
image_path_format = f"{LOG_ROOT_DIR}/{date}-convinput_images/input_image_" | |
image_path_0 = image_path_format + str(row["states"][0]["conv_id"]) + ".png" | |
image_path_1 = image_path_format + str(row["states"][1]["conv_id"]) + ".png" | |
if not os.path.exists(image_path_0) or not os.path.exists(image_path_1): | |
print(f"Image not found for {image_path_0} or {image_path_1}") | |
ct_invalid += 1 | |
continue | |
image_0 = load_image(image_path_0) | |
image_1 = load_image(image_path_1) | |
if image_0 is None or image_1 is None: | |
print(f"Image not found for {image_path_0} or {image_path_1}") | |
ct_invalid += 1 | |
continue | |
if image_0.tobytes() != image_1.tobytes(): | |
print(f"Image not the same for {image_path_0} and {image_path_1}") | |
ct_invalid += 1 | |
continue | |
ip = row["ip"] | |
if ip not in all_ips: | |
all_ips[ip] = {"ip": ip, "count": 0, "sanitized_id": len(all_ips)} | |
all_ips[ip]["count"] += 1 | |
if sanitize_ip: | |
user_id = f"arena_user_{all_ips[ip]['sanitized_id']}" | |
else: | |
user_id = f"{all_ips[ip]['ip']}" | |
if ban_ip_list is not None and ip in ban_ip_list: | |
ct_banned += 1 | |
print(f"User {user_id} is banned") | |
continue | |
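
        # Both model states should have been driven by identical user inputs;
        # conv_id and model_name are the only fields expected to differ.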
        required_keys_each_task = {
            "image_editing": ["source_prompt", "target_prompt", "instruct_prompt"],
            "t2i_generation": ["prompt"],
            "video_generation": ["prompt"],
        }
        model_a_inputs = row["states"][0].copy()
        # pop conv_id and model_name
        model_a_inputs.pop("conv_id")
        model_a_inputs.pop("model_name")
        model_b_inputs = row["states"][1].copy()
        model_b_inputs.pop("conv_id")
        model_b_inputs.pop("model_name")
        inputs_match = True
        for key in model_a_inputs:
            if not (key in model_b_inputs and model_a_inputs[key] == model_b_inputs[key]):
                print(f"Inconsistent inputs: {model_a_inputs} vs {model_b_inputs}")
                ct_invalid += 1
                inputs_match = False
                break
        if not inputs_match:
            continue

        if mode == "conv_release":
            if any(key not in model_a_inputs for key in required_keys_each_task[task_name]):
                print(f"Missing required keys: {model_a_inputs}, {required_keys_each_task[task_name]}")
                ct_invalid += 1
                continue

        inputs = model_a_inputs

        # Save the results
        battles.append(
            dict(
                model_a_conv_id=row["states"][0]["conv_id"],
                model_b_conv_id=row["states"][1]["conv_id"],
                inputs=inputs,
                model_a=models[0],
                model_b=models[1],
                vote_type=row["type"],
                winner=convert_type[row["type"]],
                judge=f"arena_user_{user_id}",
                anony=anony,
                tstamp=row["tstamp"],
            )
        )
        all_models.update(models_hidden)

    battles.sort(key=lambda x: x["tstamp"])
    last_updated_tstamp = battles[-1]["tstamp"]
    last_updated_datetime = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y-%m-%d %H:%M:%S %Z")

    print(
        f"#votes: {len(data)}, #invalid votes: {ct_invalid}, "
        f"#leaked_identity: {ct_leaked_identity} "
        f"#banned: {ct_banned} "
    )
    print(f"#battles: {len(battles)}, #anony: {ct_anony}")
    print(f"#models: {len(all_models)}, {all_models}")
    print(f"last-updated: {last_updated_datetime}")

    if ban_ip_list is not None:
        for ban_ip in ban_ip_list:
            if ban_ip in all_ips:
                del all_ips[ban_ip]
    print("Top 30 IPs:")
    print(sorted(all_ips.values(), key=lambda x: x["count"], reverse=True)[:30])

    return battles


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-num-files", type=int)
    parser.add_argument(
        "--mode", type=str, choices=["simple", "conv_release"], default="simple"
    )
    parser.add_argument(
        "--task_name",
        type=str,
        default="image_editing",
        choices=["image_editing", "t2i_generation", "video_generation"],
    )
    parser.add_argument("--exclude-model-names", type=str, nargs="+")
    parser.add_argument("--ban-ip-file", type=str)
    parser.add_argument("--sanitize-ip", action="store_true", default=False)
    args = parser.parse_args()

    log_files = get_log_files(args.max_num_files)
    ban_ip_list = json.load(open(args.ban_ip_file)) if args.ban_ip_file else None

    battles = clean_battle_data(
        log_files,
        args.exclude_model_names or [],
        ban_ip_list,
        args.sanitize_ip,
        args.mode,
        args.task_name,
    )
    last_updated_tstamp = battles[-1]["tstamp"]
    cutoff_date = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y%m%d")

    if args.mode == "simple":
print("Samples:") | |
for i in range(min(4, len(battles))): | |
print(battles[i]) | |
output = f"clean_battle_{args.task_name}_{cutoff_date}.json" | |
elif args.mode == "conv_release": | |
output = f"clean_battle_{args.task_name}_conv_{cutoff_date}.json" | |
with open(output, "w") as fout: | |
json.dump(battles, fout, indent=2, ensure_ascii=False) | |
print(f"Write cleaned data to {output}") | |
with open("cut_off_date.txt", "w") as fout: | |
fout.write(cutoff_date) |
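
# Example invocations (flags as defined above; the ban-IP path is hypothetical,
# and output files are written to the current working directory):
#
#   python3 clean_battle_data.py --mode simple --task_name t2i_generation
#   python3 clean_battle_data.py --mode conv_release --task_name image_editing \
#       --ban-ip-file ban_ip_list.json --sanitize-ip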