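# NOTE(review): "Spaces: Sleeping" appears to be page-banner text captured along with the code; it is not part of the module.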
from datasets import load_dataset
# Configuration names requested from the results dataset, one per model; each
# name is also used as the column holding that model's predictions.
MODELS = [
    'cmg_codellama13b-instruct',
    'cmg_gpt_4_0613',
    'cmg_deepseek-coder-33b-instruct',
]

# Directory handed to `load_dataset` as `cache_dir`.
CACHE_DIR = 'cache'
def load_data():
    """Load the reference commit messages together with each model's predictions.

    The ``test`` split of ``JetBrains-Research/lca-cmg``
    (``commitchronicle-py-long``) is converted to a DataFrame indexed by
    ``(hash, repo)`` and its ``message`` column is renamed to ``reference``.
    For every configuration name in ``MODELS``, the ``prediction`` column of
    the matching ``JetBrains-Research/lca-results`` split is joined onto that
    index under a column named after the model.

    Returns:
        A ``(records, message_cols)`` tuple. ``records`` is the joined frame
        as a list of per-row dicts, with ``hash`` and ``repo`` restored as
        ordinary keys. ``message_cols`` lists the message columns:
        ``'reference'`` first, then the model names in ``MODELS`` order.
    """
    index_cols = ['hash', 'repo']

    dataset = (
        load_dataset("JetBrains-Research/lca-cmg",
                     "commitchronicle-py-long",
                     split="test",
                     cache_dir=CACHE_DIR)
        .to_pandas()
        .set_index(index_cols)
        .rename(columns={'message': 'reference'})
    )

    message_cols = ['reference']
    for model in MODELS:
        predictions = (
            load_dataset("JetBrains-Research/lca-results",
                         model,
                         split="test",
                         cache_dir=CACHE_DIR)
            .to_pandas()
            .set_index(index_cols)[["prediction"]]
        )
        # Keep only the first prediction per (hash, repo): repeated keys on the
        # joined side would otherwise duplicate rows of the reference frame.
        predictions = predictions[~predictions.index.duplicated(keep='first')]
        # Rename before joining so the incoming column already carries its
        # final, model-specific name when it meets the reference frame.
        dataset = dataset.join(other=predictions.rename(columns={'prediction': model}))
        message_cols.append(model)

    return dataset.reset_index().to_dict("records"), message_cols