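"""Helpers that load the commit-message rewriting, full-commit, synthetic, and prediction datasets from the Hugging Face Hub as pandas DataFrames."""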
import json
import os
from datetime import datetime, timedelta

import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download, list_repo_tree

import config
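

# Raw manual-rewriting dataset, loaded with the split/token/cache settings from config.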
def load_raw_rewriting_as_pandas():
    return load_dataset(
        config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
    ).to_pandas()
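

# Full-commit dataset; the "message" column is renamed to "reference".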
def load_full_commit_as_pandas():
    return (
        load_dataset(
            path=config.HF_FULL_COMMITS_DATASET_NAME,
            name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
            split=config.HF_FULL_COMMITS_DATASET_SPLIT,
            cache_dir=config.CACHE_DIR,
        )
        .to_pandas()
        .rename(columns={"message": "reference"})
    )
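

# Editing time in milliseconds, computed from the recorded commit_msg_history (latest minus earliest event timestamp).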
def edit_time_from_history(history_str):
    history = json.loads(history_str)
    if len(history) == 0:
        return 0
    timestamps = [datetime.fromisoformat(entry["ts"]) for entry in history]
    delta = max(timestamps) - min(timestamps)
    return delta // timedelta(milliseconds=1)
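

# Editing time in milliseconds from the loaded/submitted timestamps; negative deltas are treated as invalid and mapped to None.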
def edit_time_from_timestamps(row):
    loaded_ts = datetime.fromisoformat(row["loaded_ts"])
    submitted_ts = datetime.fromisoformat(row["submitted_ts"])
    delta = submitted_ts - loaded_ts
    result = delta // timedelta(milliseconds=1)
    return result if result >= 0 else None
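

# Manual-rewriting rows with both edit-time measures, joined with the "mods" column of the full-commit dataset on (hash, repo).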
def load_processed_rewriting_as_pandas():
    # .copy() so that the new columns are assigned to a real DataFrame, not a slice of the cached dataset.
    manual_rewriting = load_raw_rewriting_as_pandas()[
        [
            "hash",
            "repo",
            "commit_msg_start",
            "commit_msg_end",
            "session",
            "commit_msg_history",
            "loaded_ts",
            "submitted_ts",
        ]
    ].copy()
    manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
    manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
    # drop() is not in-place, so the result has to be assigned back for the intermediate columns to be removed.
    manual_rewriting = manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"])
    manual_rewriting.set_index(["hash", "repo"], inplace=True)
    mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
    mods_dataset.set_index(["hash", "repo"], inplace=True)
    return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
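

# Synthetic dataset, "all_pairs_with_metrics" configuration.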
def load_synthetic_as_pandas():
    return load_dataset(
        config.HF_SYNTHETIC_DATASET_NAME,
        "all_pairs_with_metrics",
        split=config.HF_SYNTHETIC_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    ).to_pandas()
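

# Full-commit dataset with one model prediction per (hash, repo) attached.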
def load_full_commit_with_predictions_as_pandas():
    full_dataset = load_full_commit_as_pandas()
    # Download every prediction file stored for the configured model in the predictions dataset repo.
    predictions_paths = []
    for prediction_file in list_repo_tree(
        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
        path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
        repo_type="dataset",
    ):
        predictions_paths.append(
            hf_hub_download(
                repo_id=config.HF_PREDICTIONS_DATASET_NAME,
                filename=prediction_file.path,
                repo_type="dataset",
                cache_dir=config.CACHE_DIR,
            )
        )
    dfs = []
    for path in predictions_paths:
        dfs.append(pd.read_json(path, orient="records", lines=True))
    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
    # Shuffle, then keep a single prediction per (hash, repo) pair.
    predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
        ["hash", "repo"]
    )[["prediction"]]
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
    dataset = full_dataset.join(other=predictions_dataset, on=["hash", "repo"])
    return dataset.reset_index()
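

if __name__ == "__main__":
    # Minimal smoke-test sketch (an addition, not part of the original module): it assumes the
    # constants in config point at accessible Hugging Face datasets and that HF_TOKEN grants read access.
    processed = load_processed_rewriting_as_pandas()
    print(processed[["hash", "repo", "edit_time", "edit_time_hist"]].head())

    with_predictions = load_full_commit_with_predictions_as_pandas()
    print(with_predictions[["hash", "repo", "reference", "prediction"]].head())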