Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Runtime error

App Files Files Community

Petr Tsvetkov committed on Apr 29, 2024

Commit

f5faae7

1 Parent(s): 6676c5a

Add edit distance and edit time metrics; add GPT-based metric

Browse files

Files changed (8) hide show

api_wrappers/grazie_wrapper.py +34 -2
api_wrappers/hf_data_loader.py +34 -1
config.py +2 -0
custom_metrics/gpt_eval.py +30 -23
generation_steps/metrics_analysis.py +65 -31
generation_steps/synthetic_end_to_start.py +10 -3
generation_steps/synthetic_start_to_end.py +2 -2
generate_synthetic_dataset.py → run_pipeline.py +0 -0

api_wrappers/grazie_wrapper.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import time
 from grazie.api.client.chat.prompt import ChatPrompt
@@ -14,8 +15,19 @@ client = GrazieApiGatewayClient(
     grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN
 )
-def generate_for_prompt(prompt):
     output = None
     while output is None:
@@ -24,7 +36,7 @@ def generate_for_prompt(prompt):
                 chat=ChatPrompt()
                 .add_system("You are a helpful assistant.")
                 .add_user(prompt),
-                profile=LLMProfile("gpt-4-1106-preview")
             ).content
         except:
             time.sleep(config.GRAZIE_TIMEOUT_SEC)
@@ -32,3 +44,23 @@ def generate_for_prompt(prompt):
     assert output is not None
     return output

+import pickle
 import time
 from grazie.api.client.chat.prompt import ChatPrompt
     grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN
 )
+# On-disk cache of LLM responses; the file name contains the configured model name.
+LLM_CACHE_FILE = config.CACHE_DIR / f"{config.LLM_MODEL}.cache.pkl"
+# prompt -> list of responses obtained for that prompt so far.
+LLM_CACHE = {}
+# prompt -> number of cached responses already handed out during this run.
+LLM_CACHE_USED = {}
+# Create an empty cache file on first use so the load below always finds a file.
+if not LLM_CACHE_FILE.exists():
+    with open(LLM_CACHE_FILE, "wb") as file:
+        pickle.dump(obj=LLM_CACHE, file=file)
+# NOTE(review): pickle.load runs whatever the file encodes; presumably the cache dir is trusted - confirm.
+with open(LLM_CACHE_FILE, "rb") as file:
+    LLM_CACHE = pickle.load(file=file)
+def llm_request(prompt):
     output = None
     while output is None:
                 chat=ChatPrompt()
                 .add_system("You are a helpful assistant.")
                 .add_user(prompt),
+                profile=LLMProfile(config.LLM_MODEL)
             ).content
         except:
             time.sleep(config.GRAZIE_TIMEOUT_SEC)
     assert output is not None
     return output
+def generate_for_prompt(prompt):
+    """Return the next response for `prompt`, reusing cached LLM responses when possible.
+
+    Each call for the same prompt hands out the next entry of
+    LLM_CACHE[prompt]; LLM_CACHE_USED[prompt] counts how many entries have
+    already been handed out. When the cached list is exhausted, a new
+    response is requested via llm_request, appended to the list, and the
+    whole cache is re-pickled to LLM_CACHE_FILE.
+    """
+    cached_responses = LLM_CACHE.setdefault(prompt, [])
+    next_index = LLM_CACHE_USED.setdefault(prompt, 0)
+    while next_index >= len(cached_responses):
+        cached_responses.append(llm_request(prompt))
+        # Persist immediately: the new response is written to disk before it is returned.
+        with open(LLM_CACHE_FILE, "wb") as file:
+            pickle.dump(obj=LLM_CACHE, file=file)
+    LLM_CACHE_USED[prompt] = next_index + 1
+    return cached_responses[next_index]

api_wrappers/hf_data_loader.py CHANGED Viewed

@@ -1,3 +1,6 @@
 from datasets import load_dataset
 import config
@@ -18,9 +21,39 @@ def load_full_commit_as_pandas():
         columns={'message': 'reference'})
 def load_processed_rewriting_as_pandas():
     manual_rewriting = load_raw_rewriting_as_pandas()[
-        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]]
     manual_rewriting.set_index(["hash", "repo"], inplace=True)
     mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]

+import json
+from datetime import datetime, timedelta
 from datasets import load_dataset
 import config
         columns={'message': 'reference'})
+def edit_time_from_history(history_str):
+    """Return the time span covered by an edit history, in whole milliseconds.
+
+    :param history_str: JSON text holding a list of entries, each carrying an
+        ISO-format timestamp under the 'ts' key.
+    :return: difference between the latest and the earliest 'ts' as an int
+        number of milliseconds; 0 when the history is empty or JSON null.
+    """
+    history = json.loads(history_str)
+    # Truthiness also covers a JSON `null` history, which len() would reject with TypeError.
+    if not history:
+        return 0
+    timestamps = [datetime.fromisoformat(entry['ts']) for entry in history]
+    return (max(timestamps) - min(timestamps)) // timedelta(milliseconds=1)
+def edit_time_from_timestamps(row):
+    """Return the milliseconds elapsed between row['loaded_ts'] and row['submitted_ts'].
+
+    Both values are parsed with datetime.fromisoformat. A negative interval
+    (submission earlier than load) yields None instead of a number.
+    """
+    started_at = datetime.fromisoformat(row['loaded_ts'])
+    finished_at = datetime.fromisoformat(row['submitted_ts'])
+    elapsed_ms = (finished_at - started_at) // timedelta(milliseconds=1)
+    if elapsed_ms < 0:
+        return None
+    return elapsed_ms
 def load_processed_rewriting_as_pandas():
     manual_rewriting = load_raw_rewriting_as_pandas()[
+        ["hash", "repo", "commit_msg_start", "commit_msg_end", "session", "commit_msg_history", "loaded_ts",
+         "submitted_ts"]]
+    manual_rewriting['edit_time_hist'] = manual_rewriting['commit_msg_history'].apply(edit_time_from_history)
+    manual_rewriting['edit_time'] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
+    manual_rewriting.drop(columns=['commit_msg_history', "loaded_ts", "submitted_ts"])
     manual_rewriting.set_index(["hash", "repo"], inplace=True)
     mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]

config.py CHANGED Viewed

@@ -22,6 +22,8 @@ HF_PREDICTIONS_DATASET_SPLIT = "test"
 HF_SYNTHETIC_DATASET_NAME = "petrtsv-jb/synthetic-commit-msg-rewriting"
 HF_SYNTHETIC_DATASET_SPLIT = 'train'
 CACHE_DIR = Path("cache")
 CACHE_DIR.mkdir(exist_ok=True)

 HF_SYNTHETIC_DATASET_NAME = "petrtsv-jb/synthetic-commit-msg-rewriting"
 HF_SYNTHETIC_DATASET_SPLIT = 'train'
+# Model name handed to LLMProfile by the Grazie wrapper; it also names the LLM response cache file.
+LLM_MODEL = "gpt-4-1106-preview"
 CACHE_DIR = Path("cache")
 CACHE_DIR.mkdir(exist_ok=True)

custom_metrics/gpt_eval.py CHANGED Viewed

@@ -1,46 +1,53 @@
-import time
 from api_wrappers import grazie_wrapper
-def build_prompt(prediction, reference):
-    return f"""Your task is to rate the quality of the generated commit message using the scale from 1 to 5.
-A good commit message has to be concise.
-Assign lower scores for the commit messages that are too verbose for a commit message.
-The generated commit message you have to evaluate:
-START OF THE GENERATED COMMIT MESSAGE
 {prediction}
-END OF THE GENERATED COMMIT MESSAGE
-Here is an example of an ideal reference commit message for the same commit:
-START OF THE REFERENCE COMMIT MESSAGE
 {reference}
 END OF THE REFERENCE COMMIT MESSAGE
-All the information in the reference commit message is true.
-Print only one integer number after the token "OUTPUT" - the rating of the generated commit message.
-Do not print anything that is not an integer.
-OUTPUT
 """
 N_RETRIES = 3
-def compute(prediction, reference):
-    prompt = build_prompt(prediction, reference)
     outputs = []
     for i in range(N_RETRIES):
         try:
-            output = grazie_wrapper.generate_for_prompt(prompt).strip()[-1]
             outputs.append(output)
-            return int(output)
         except ValueError:
             continue
-    raise RuntimeError(f"GPT4 cannot generate a number. Its outputs were: {str(outputs)}")

 from api_wrappers import grazie_wrapper
+def build_prompt_ref(prediction, reference):
+    """Build the LLM prompt asking for a 1-10 rating of `prediction`, quoting `reference` as a good example."""
+    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
+providing any additional feedback or commentary:
+START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
 {prediction}
+END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+For reference, consider this as an example of a good commit message for the same commit that is both concise and
+specific:
+START OF THE REFERENCE COMMIT MESSAGE
 {reference}
 END OF THE REFERENCE COMMIT MESSAGE
+YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
+lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
 """
 N_RETRIES = 3
+def get_number_for_prompt(prompt):
+    """Query the LLM with `prompt` and parse the last whitespace-separated token of its reply as an int.
+
+    Up to N_RETRIES replies are requested; the first reply whose last token
+    is an integer determines the result.
+
+    :raises RuntimeError: if none of the replies ends with an integer; the
+        message lists the tokens that were received.
+    """
+    outputs = []
+    for _ in range(N_RETRIES):
+        tokens = grazie_wrapper.generate_for_prompt(prompt).strip().split()
+        # An empty or whitespace-only reply has no tokens: record it and retry
+        # instead of letting [-1] raise an IndexError that nothing catches.
+        output = tokens[-1] if tokens else ""
+        outputs.append(output)
+        try:
+            return int(output)
+        except ValueError:
+            continue
+    raise RuntimeError(f"LLM cannot generate a number. Its outputs were: {str(outputs)}")
+def compute_ref(prediction, reference, n_requests):
+    """Return the mean of `n_requests` LLM ratings of `prediction`, judged against `reference`."""
+    prompt = build_prompt_ref(prediction, reference)
+    total, count = 0, 0
+    for _ in range(n_requests):
+        total += get_number_for_prompt(prompt)
+        count += 1
+    return total / count

generation_steps/metrics_analysis.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import functools
 import operator
 import evaluate
 import pandas as pd
 from tqdm import tqdm
@@ -12,59 +13,78 @@ from custom_metrics import gpt_eval
 BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)
-def bleu_fn(pred, ref):
     return BLEU.compute(predictions=[pred], references=[ref])["bleu"]
 METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)
-def meteor_fn(pred, ref):
     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
 ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)
-def rouge1_fn(pred, ref):
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
-def rouge2_fn(pred, ref):
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
-def rougeL_fn(pred, ref):
     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
 BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)
-def bertscore_fn(pred, ref):
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
-def gptscore_fn(pred, ref):
-    return gpt_eval.compute(prediction=pred, reference=ref)
 CHRF = evaluate.load("chrf")
-def chrf_fn(pred, ref):
     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
 TER = evaluate.load("ter")
-def ter_fn(pred, ref):
     return TER.compute(predictions=[pred], references=[[ref]])["score"]
-METRICS = {
-    # "gptscore": gptscore_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
@@ -72,7 +92,12 @@ METRICS = {
     "rougeL": rougeL_fn,
     "bertscore": bertscore_fn,
     "chrF": chrf_fn,
-    "ter": ter_fn
 }
@@ -86,11 +111,11 @@ def compute_metrics(df):
     tqdm.pandas()
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
-        return fn(row[col_pred], row[col_ref])
-    for metric in METRICS:
-        print(f"Computing {metric}")
-        metric_fn = METRICS[metric]
         df[f"{metric}_related"] = df.progress_apply(
             lambda row: apply_metric_fn_to_row(row=row,
                                                fn=metric_fn,
@@ -98,6 +123,10 @@ def compute_metrics(df):
                                                col_ref="commit_msg_end"),
             axis=1
         )
         df[f"{metric}_independent"] = df.progress_apply(
             lambda row: apply_metric_fn_to_row(row=row,
                                                fn=metric_fn,
@@ -106,25 +135,30 @@ def compute_metrics(df):
             axis=1
         )
-        df[f"{metric}_pearson"] = df[f"{metric}_related"].corr(df[f"{metric}_independent"], method="pearson")
-        df[f"{metric}_spearman"] = df[f"{metric}_related"].corr(df[f"{metric}_independent"], method="spearman")
     return df
 def correlations_for_group(group):
     correlations = []
-    for metric in METRICS:
-        correlations.append({
-            f"{metric}_pearson": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="pearson"),
-            f"{metric}_spearman": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="spearman")
-        })
-        for other_metric in METRICS:
             correlations.append({
-                f"ind_{metric}_rel_{other_metric}_pearson": group[f"{other_metric}_related"].corr(
-                    group[f"{metric}_independent"], method="pearson"),
-                f"ind_{metric}_rel_{other_metric}_spearman": group[f"{other_metric}_related"].corr(
-                    group[f"{metric}_independent"], method="spearman")
             })
     return pd.Series(functools.reduce(operator.ior, correlations, {}))

 import functools
 import operator
+import Levenshtein
 import evaluate
 import pandas as pd
 from tqdm import tqdm
 BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)
+def bleu_fn(pred, ref, **kwargs):
+    """Return the "bleu" value BLEU computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return BLEU.compute(predictions=[pred], references=[ref])["bleu"]
 METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)
+def meteor_fn(pred, ref, **kwargs):
+    """Return the "meteor" value METEOR computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
 ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)
+def rouge1_fn(pred, ref, **kwargs):
+    """Return the "rouge1" value ROUGE computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
+def rouge2_fn(pred, ref, **kwargs):
+    """Return the "rouge2" value ROUGE computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
+def rougeL_fn(pred, ref, **kwargs):
+    """Return the "rougeL" value ROUGE computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
 BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)
+def bertscore_fn(pred, ref, **kwargs):
+    """Return the first "f1" entry BERTSCORE computes for one prediction/reference pair; extra keyword arguments are ignored."""
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
 CHRF = evaluate.load("chrf")
+def chrf_fn(pred, ref, **kwargs):
+    """Return the "score" value CHRF computes for one prediction and its single reference; extra keyword arguments are ignored."""
     return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
 TER = evaluate.load("ter")
+def ter_fn(pred, ref, **kwargs):
+    """Return the "score" value TER computes for one prediction and its single reference; extra keyword arguments are ignored."""
     return TER.compute(predictions=[pred], references=[[ref]])["score"]
+def edit_distance_fn(pred, ref, **kwargs):
+    """Return Levenshtein.distance between `pred` and `ref`; extra keyword arguments are ignored."""
+    return Levenshtein.distance(pred, ref)
+def edit_time_fn(pred, ref, **kwargs):
+    """Return the value passed as the `edittime` keyword argument; `pred` and `ref` are not used.
+
+    Raises KeyError when `edittime` is not supplied.
+    """
+    return kwargs["edittime"]
+def gptscore_ref_1_fn(pred, ref, **kwargs):
+    """Return gpt_eval.compute_ref for the pair using a single request; extra keyword arguments are ignored."""
+    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)
+def gptscore_ref_3_fn(pred, ref, **kwargs):
+    """Return gpt_eval.compute_ref for the pair using three requests; extra keyword arguments are ignored."""
+    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)
+def gptscore_ref_5_fn(pred, ref, **kwargs):
+    """Return gpt_eval.compute_ref for the pair using five requests; extra keyword arguments are ignored."""
+    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
+IND_METRICS = {
+    "gptscore-ref-1-req": gptscore_ref_1_fn,
+    "gptscore-ref-3-req": gptscore_ref_3_fn,
+    # "gptscore-ref-5-req": gptscore_ref_5_fn,
+    "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
     "rougeL": rougeL_fn,
     "bertscore": bertscore_fn,
     "chrF": chrf_fn,
+    "ter": ter_fn,
+}
+# Metrics evaluated on the related pairs; each key is the prefix of the resulting "<key>_related" column.
+REL_METRICS = {
+    "editdist": edit_distance_fn,
+    "edittime": edit_time_fn,
 }
     tqdm.pandas()
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
+        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'])
+    for metric in REL_METRICS:
+        print(f"Computing {metric} for the related pairs")
+        metric_fn = REL_METRICS[metric]
         df[f"{metric}_related"] = df.progress_apply(
             lambda row: apply_metric_fn_to_row(row=row,
                                                fn=metric_fn,
                                                col_ref="commit_msg_end"),
             axis=1
         )
+    for metric in IND_METRICS:
+        print(f"Computing {metric} for the independent pairs")
+        metric_fn = IND_METRICS[metric]
         df[f"{metric}_independent"] = df.progress_apply(
             lambda row: apply_metric_fn_to_row(row=row,
                                                fn=metric_fn,
             axis=1
         )
+    for rel_metric in REL_METRICS:
+        for ind_metric in IND_METRICS:
+            df[f"rel_{rel_metric}_ind_{ind_metric}_pearson"] = (
+                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="pearson"))
+            df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
+                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
     return df
 def correlations_for_group(group):
+    """Return a pd.Series of Pearson and Spearman correlations for every related/independent metric pair.
+
+    For each name in REL_METRICS and each name in IND_METRICS, the group's
+    "<rel>_related" column is correlated with its "<ind>_independent" column.
+    Entries are keyed "rel_<rel>_ind_<ind>_pearson" and "rel_<rel>_ind_<ind>_spearman".
+    """
+    correlations = {}
+    for rel_metric in REL_METRICS:
+        for ind_metric in IND_METRICS:
+            related = group[f"{rel_metric}_related"]
+            independent = group[f"{ind_metric}_independent"]
+            correlations[f"rel_{rel_metric}_ind_{ind_metric}_pearson"] = related.corr(
+                independent, method="pearson")
+            correlations[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = related.corr(
+                independent, method="spearman")
+    return pd.Series(correlations)

generation_steps/synthetic_end_to_start.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import pandas as pd
 from tqdm import tqdm
@@ -7,9 +9,9 @@ import statistics
 from api_wrappers import grazie_wrapper, hf_data_loader
 from generation_steps import examples
-GENERATION_MULTIPLIER = 2
 REL_INSERTIONS_THRESHOLD = 0.5
-GENERATION_ATTEMPTS = 5
 def build_prompt(reference, diff):
@@ -61,6 +63,8 @@ def generate_start_msg(end_msg, diff):
 COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
 def transform(df):
     print(f"End -> start synthesis:")
@@ -75,7 +79,7 @@ def transform(df):
         "commit_msg_start": []
     }
-    for col in COLS_TO_KEEP:
         generated_data[col] = []
     for _, row in tqdm(df.iterrows(), total=len(df)):
@@ -87,6 +91,9 @@ def transform(df):
             for col in COLS_TO_KEEP:
                 generated_data[col].append(row[col])
     generated_df = pd.DataFrame.from_dict(generated_data)
     generated_df['end_to_start'] = True

+from itertools import chain
 import pandas as pd
 from tqdm import tqdm
 from api_wrappers import grazie_wrapper, hf_data_loader
 from generation_steps import examples
+GENERATION_MULTIPLIER = 3
 REL_INSERTIONS_THRESHOLD = 0.5
+GENERATION_ATTEMPTS = 3
 def build_prompt(reference, diff):
 COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
+COLS_TO_DEFAULT = {"edit_time": None}
 def transform(df):
     print(f"End -> start synthesis:")
         "commit_msg_start": []
     }
+    for col in chain(COLS_TO_KEEP, COLS_TO_DEFAULT):
         generated_data[col] = []
     for _, row in tqdm(df.iterrows(), total=len(df)):
             for col in COLS_TO_KEEP:
                 generated_data[col].append(row[col])
+            for col in COLS_TO_DEFAULT:
+                generated_data[col].append(COLS_TO_DEFAULT[col])
     generated_df = pd.DataFrame.from_dict(generated_data)
     generated_df['end_to_start'] = True

generation_steps/synthetic_start_to_end.py CHANGED Viewed

@@ -7,9 +7,9 @@ import statistics
 from api_wrappers import grazie_wrapper
 from generation_steps import examples
-GENERATION_MULTIPLIER = 2
 REL_DELETIONS_THRESHOLD = 0.75
-GENERATION_ATTEMPTS = 5
 def build_prompt(prediction, diff):

 from api_wrappers import grazie_wrapper
 from generation_steps import examples
+GENERATION_MULTIPLIER = 3
 REL_DELETIONS_THRESHOLD = 0.75
+GENERATION_ATTEMPTS = 3
 def build_prompt(prediction, diff):

generate_synthetic_dataset.py → run_pipeline.py RENAMED Viewed

File without changes