from datasets import load_dataset

# Configuration names requested from the "JetBrains-Research/lca-results"
# dataset; load_data() adds one prediction column per entry, named after it.
MODELS = [
    'cmg_codellama13b-instruct',
    'cmg_gpt_4_0613',
    'deepseek-coder-33b-instruct',
]

# Directory handed to every load_dataset call as its cache_dir.
CACHE_DIR = 'cache'


def load_data():
    """Load the reference commit messages and attach each model's predictions.

    The "test" split of the ``commitchronicle-py-long`` configuration of
    ``JetBrains-Research/lca-cmg`` is loaded, indexed by ``(hash, repo)``,
    and its ``message`` column is renamed to ``reference``.  For every name
    in ``MODELS`` the "test" split of the matching configuration of
    ``JetBrains-Research/lca-results`` is loaded, reduced to its
    ``prediction`` column, de-duplicated on ``(hash, repo)`` (first
    occurrence kept) and joined onto the base table under a column named
    after the model.

    Returns:
        A ``(records, message_cols)`` tuple.  ``records`` is the joined
        table converted with ``to_dict("records")`` after the index has been
        reset, so ``hash`` and ``repo`` appear as ordinary keys.
        ``message_cols`` is ``['reference']`` followed by one column name
        per model, in ``MODELS`` order.
    """
    index_keys = ['hash', 'repo']

    reference_frame = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=CACHE_DIR,
    ).to_pandas()
    dataset = reference_frame.set_index(index_keys).rename(
        columns={'message': 'reference'})

    message_cols = ['reference']

    for model in MODELS:
        results_frame = load_dataset(
            "JetBrains-Research/lca-results",
            model,
            split="test",
            cache_dir=CACHE_DIR,
        ).to_pandas()
        predictions = results_frame.set_index(index_keys)[["prediction"]]

        # Keep only the first prediction per (hash, repo) key, so a repeated
        # key on the model side cannot multiply rows in the join below.
        repeated_key = predictions.index.duplicated(keep='first')
        predictions = predictions[~repeated_key]

        column_name = str(model)
        joined = dataset.join(other=predictions)
        dataset = joined.rename(columns={'prediction': column_name})
        message_cols.append(column_name)

    records = dataset.reset_index().to_dict("records")
    return records, message_cols