Spaces:
Sleeping
Sleeping
File size: 1,124 Bytes
14bb44e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from datasets import load_dataset
# Configuration names requested from "JetBrains-Research/lca-results"; each
# name is also used as the column that holds that model's predictions.
MODELS = [
    "cmg_codellama13b-instruct",
    "cmg_gpt_4_0613",
    "deepseek-coder-33b-instruct",
]

# Directory handed to every ``load_dataset`` call as ``cache_dir``.
CACHE_DIR = "cache"
def load_data():
    """Build the reference-vs-model commit message table.

    Loads the ``test`` split of the ``commitchronicle-py-long`` configuration
    of ``JetBrains-Research/lca-cmg``, indexes it by ``(hash, repo)`` and
    renames its ``message`` column to ``reference``.  Then, for every name in
    ``MODELS``, loads the ``test`` split of the matching
    ``JetBrains-Research/lca-results`` configuration and joins its
    ``prediction`` column onto the table under the model's name.

    Returns:
        A ``(records, message_cols)`` tuple.  ``records`` is the joined table
        with the index turned back into columns, converted with
        ``to_dict("records")``.  ``message_cols`` is ``"reference"`` followed
        by one column name per model, in ``MODELS`` order.
    """
    key_cols = ['hash', 'repo']

    reference_frame = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=CACHE_DIR,
    ).to_pandas()
    table = reference_frame.set_index(key_cols)
    table = table.rename(columns={'message': 'reference'})

    message_cols = ['reference']
    for model in MODELS:
        predictions = load_dataset(
            "JetBrains-Research/lca-results",
            model,
            split="test",
            cache_dir=CACHE_DIR,
        ).to_pandas()
        predictions = predictions.set_index(key_cols)[["prediction"]]

        # Keep only the first prediction for each (hash, repo) key so the
        # join below cannot multiply rows of the table.
        is_repeat = predictions.index.duplicated(keep='first')
        predictions = predictions[~is_repeat]

        # Join first, rename afterwards: the column is called "prediction"
        # while joining and only then takes the model's name.
        table = table.join(other=predictions)
        table = table.rename(columns={'prediction': model})
        message_cols.append(model)

    records = table.reset_index().to_dict("records")
    return records, message_cols
|