{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "# Data Stats", "id": "694a6cc631d4ab93" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:43:07.644299Z", "start_time": "2024-10-15T18:43:02.316453Z" } }, "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "\n", "df = load_dataset(\"JetBrains-Research/synthetic-commit-msg-edits\", \"all_pairs\", split=\"train\").to_pandas()\n", "df.head()" ], "id": "ed42f4f83199feb2", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading data: 100%|██████████| 6.35M/6.35M [00:00<00:00, 9.95MB/s]\n" ] }, { "data": { "text/plain": [ "Generating train split: 0 examples [00:00, ? examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "1a0523289d424b29974b60d017643280" } }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " hash repo \\\n", "0 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n", "1 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n", "2 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n", "3 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n", "4 2febb99eee8ed71c9122db88ca58dd33be0b9550 mesonbuild/meson \n", "\n", " G_text \\\n", "0 Enhance OptionOverrideProxy and simplify optio... \n", "1 Enhance OptionOverrideProxy and simplify optio... \n", "2 Enhance OptionOverrideProxy and simplify optio... \n", "3 Enhance OptionOverrideProxy and simplify optio... \n", "4 Enhance OptionOverrideProxy and simplify optio... \n", "\n", " E_text G_type \\\n", "0 Enhance OptionOverrideProxy for multiple optio... synthetic_backward \n", "1 Refactor OptionOverrideProxy and Backend class... synthetic_backward \n", "2 Refactor OptionOverrideProxy and backend optio... synthetic_backward \n", "3 Refactor: Enhance OptionOverrideProxy for mult... synthetic_backward \n", "4 Refactor OptionOverrideProxy and add target-sp... 
synthetic_backward \n", "\n", " E_type is_related \n", "0 expert_labeled True \n", "1 synthetic_forward True \n", "2 synthetic_forward True \n", "3 synthetic_forward True \n", "4 synthetic_forward_from_backward False " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hashrepoG_textE_textG_typeE_typeis_related
02febb99eee8ed71c9122db88ca58dd33be0b9550mesonbuild/mesonEnhance OptionOverrideProxy and simplify optio...Enhance OptionOverrideProxy for multiple optio...synthetic_backwardexpert_labeledTrue
12febb99eee8ed71c9122db88ca58dd33be0b9550mesonbuild/mesonEnhance OptionOverrideProxy and simplify optio...Refactor OptionOverrideProxy and Backend class...synthetic_backwardsynthetic_forwardTrue
22febb99eee8ed71c9122db88ca58dd33be0b9550mesonbuild/mesonEnhance OptionOverrideProxy and simplify optio...Refactor OptionOverrideProxy and backend optio...synthetic_backwardsynthetic_forwardTrue
32febb99eee8ed71c9122db88ca58dd33be0b9550mesonbuild/mesonEnhance OptionOverrideProxy and simplify optio...Refactor: Enhance OptionOverrideProxy for mult...synthetic_backwardsynthetic_forwardTrue
42febb99eee8ed71c9122db88ca58dd33be0b9550mesonbuild/mesonEnhance OptionOverrideProxy and simplify optio...Refactor OptionOverrideProxy and add target-sp...synthetic_backwardsynthetic_forward_from_backwardFalse
\n", "
" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 3 }, { "metadata": {}, "cell_type": "markdown", "source": "## Full", "id": "922e7a73f11a4aec" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:43:14.266540Z", "start_time": "2024-10-15T18:43:14.262103Z" } }, "cell_type": "code", "source": "len(df.loc[df.is_related])", "id": "562d9c53da109d1a", "outputs": [ { "data": { "text/plain": [ "656" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 4 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:43:18.073966Z", "start_time": "2024-10-15T18:43:18.069219Z" } }, "cell_type": "code", "source": "df.loc[df.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "b4f3c96a4b676a0d", "outputs": [ { "data": { "text/plain": [ "43.733333333333334" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 5 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:43:19.026689Z", "start_time": "2024-10-15T18:43:19.021680Z" } }, "cell_type": "code", "source": "len(df.loc[~df.is_related])", "id": "54d9f32f1d18844f", "outputs": [ { "data": { "text/plain": [ "5140" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:43:19.484304Z", "start_time": "2024-10-15T18:43:19.480012Z" } }, "cell_type": "code", "source": "df.loc[~df.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "679761631517b9e4", "outputs": [ { "data": { "text/plain": [ "342.6666666666667" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 7 }, { "metadata": {}, "cell_type": "markdown", "source": "## Expert-labeled", "id": "84561ea89717d61a" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:45:52.905631Z", "start_time": "2024-10-15T18:45:52.901913Z" } 
}, "cell_type": "code", "source": "_ = df.loc[(df.G_type == \"initial\") & (df.E_type == \"expert_labeled\")]", "id": "be1c800f45cef26e", "outputs": [], "execution_count": 36 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:45:53.234109Z", "start_time": "2024-10-15T18:45:53.230986Z" } }, "cell_type": "code", "source": "len(_.loc[_.is_related])", "id": "1d092dff4d39bcd1", "outputs": [ { "data": { "text/plain": [ "57" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 37 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:45:53.629311Z", "start_time": "2024-10-15T18:45:53.625620Z" } }, "cell_type": "code", "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "a06a532cd5a29725", "outputs": [ { "data": { "text/plain": [ "3.8" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 38 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:45:53.956790Z", "start_time": "2024-10-15T18:45:53.953842Z" } }, "cell_type": "code", "source": "len(_.loc[~_.is_related])", "id": "5e19c8a6309b62aa", "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 39 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:46:02.554527Z", "start_time": "2024-10-15T18:46:02.551084Z" } }, "cell_type": "code", "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "e43179c5dcab5eb2", "outputs": [ { "data": { "text/plain": [ "nan" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 40 }, { "metadata": {}, "cell_type": "markdown", "source": "## Backward", "id": "70ee052fae2f88e3" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:44:33.559606Z", "start_time": "2024-10-15T18:44:33.556802Z" } }, "cell_type": "code", "source": "_ = df.loc[(df.G_type 
== \"synthetic_backward\") & (~df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"]))]", "id": "99f51ecc55c4db35", "outputs": [], "execution_count": 20 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:44:33.958325Z", "start_time": "2024-10-15T18:44:33.955847Z" } }, "cell_type": "code", "source": "len(_.loc[_.is_related])", "id": "6ff29390c8e127c2", "outputs": [ { "data": { "text/plain": [ "104" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 21 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:44:34.455560Z", "start_time": "2024-10-15T18:44:34.452303Z" } }, "cell_type": "code", "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "e1ae04e1ecfb2040", "outputs": [ { "data": { "text/plain": [ "7.428571428571429" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 22 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:44:34.903849Z", "start_time": "2024-10-15T18:44:34.901226Z" } }, "cell_type": "code", "source": "len(_.loc[~_.is_related])", "id": "125c4c335e7761da", "outputs": [ { "data": { "text/plain": [ "1048" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 23 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:44:35.783538Z", "start_time": "2024-10-15T18:44:35.778676Z" } }, "cell_type": "code", "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "4782f1d6e6863f89", "outputs": [ { "data": { "text/plain": [ "74.85714285714286" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 24 }, { "metadata": {}, "cell_type": "markdown", "source": "## Forward", "id": "bf61a4b422f779fa" }, { "metadata": {}, "cell_type": "markdown", "source": "### From human", "id": "1429f9f99acf75d" }, { "metadata": { "ExecuteTime": { "end_time": 
"2024-10-15T18:46:21.359807Z", "start_time": "2024-10-15T18:46:21.356451Z" } }, "cell_type": "code", "source": "_ = df.loc[(df.G_type == \"initial\") & (df.E_type == \"synthetic_forward\")]", "id": "e13d55b0124f04b3", "outputs": [], "execution_count": 41 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:46:21.798508Z", "start_time": "2024-10-15T18:46:21.795885Z" } }, "cell_type": "code", "source": "len(_.loc[_.is_related])", "id": "b8353390df7da427", "outputs": [ { "data": { "text/plain": [ "177" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 42 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:46:22.163595Z", "start_time": "2024-10-15T18:46:22.160176Z" } }, "cell_type": "code", "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "ac89afde65efd73d", "outputs": [ { "data": { "text/plain": [ "11.8" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 43 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:46:22.552314Z", "start_time": "2024-10-15T18:46:22.549570Z" } }, "cell_type": "code", "source": "len(_.loc[~_.is_related])", "id": "9b6cb335e3bbb7ff", "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 44 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:46:23.237736Z", "start_time": "2024-10-15T18:46:23.234085Z" } }, "cell_type": "code", "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "fe22189a70fc4149", "outputs": [ { "data": { "text/plain": [ "nan" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 45 }, { "metadata": {}, "cell_type": "markdown", "source": "### From backward", "id": "ace7afb876fb88a0" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:47:06.641374Z", "start_time": 
"2024-10-15T18:47:06.637018Z" } }, "cell_type": "code", "source": "_ = df.loc[(df.G_type == \"synthetic_backward\") & (df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"]))]", "id": "88800960dbff619a", "outputs": [], "execution_count": 53 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:47:15.358650Z", "start_time": "2024-10-15T18:47:15.355108Z" } }, "cell_type": "code", "source": "len(_.loc[_.is_related])", "id": "890613156e005c83", "outputs": [ { "data": { "text/plain": [ "318" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 56 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:47:15.579415Z", "start_time": "2024-10-15T18:47:15.576016Z" } }, "cell_type": "code", "source": "_.loc[_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "999f91382a2c8ff6", "outputs": [ { "data": { "text/plain": [ "22.714285714285715" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 57 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:47:15.834218Z", "start_time": "2024-10-15T18:47:15.831258Z" } }, "cell_type": "code", "source": "len(_.loc[~_.is_related])", "id": "d347941cbb4b2db1", "outputs": [ { "data": { "text/plain": [ "3753" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 58 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-15T18:47:16.138798Z", "start_time": "2024-10-15T18:47:16.133397Z" } }, "cell_type": "code", "source": "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()", "id": "2db4d96713a8634d", "outputs": [ { "data": { "text/plain": [ "268.07142857142856" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 59 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }