Petr Tsvetkov
committed on
Commit
•
39950c9
1
Parent(s):
ca11b66
Fix the visualization
Browse files- analysis.ipynb +290 -233
- analysis_util.py +31 -6
- change_visualizer.py +4 -1
- generation_steps/metrics_analysis.py +28 -16
- requirements.txt +0 -1
analysis.ipynb
CHANGED
@@ -5,9 +5,12 @@
|
|
5 |
"id": "initial_id",
|
6 |
"metadata": {
|
7 |
"collapsed": true,
|
|
|
|
|
|
|
8 |
"ExecuteTime": {
|
9 |
-
"end_time": "2024-05-
|
10 |
-
"start_time": "2024-05-
|
11 |
}
|
12 |
},
|
13 |
"source": [
|
@@ -15,41 +18,23 @@
|
|
15 |
"\n",
|
16 |
"import config"
|
17 |
],
|
18 |
-
"outputs": [
|
19 |
-
|
20 |
-
"name": "stderr",
|
21 |
-
"output_type": "stream",
|
22 |
-
"text": [
|
23 |
-
"D:\\petrtsv\\work\\jetbrains\\commit-rewriting-processing\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
24 |
-
" from .autonotebook import tqdm as notebook_tqdm\n",
|
25 |
-
"[nltk_data] Downloading package wordnet to C:\\Users\\Petr\n",
|
26 |
-
"[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
|
27 |
-
"[nltk_data] Package wordnet is already up-to-date!\n",
|
28 |
-
"[nltk_data] Downloading package punkt to C:\\Users\\Petr\n",
|
29 |
-
"[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
|
30 |
-
"[nltk_data] Package punkt is already up-to-date!\n",
|
31 |
-
"[nltk_data] Downloading package omw-1.4 to C:\\Users\\Petr\n",
|
32 |
-
"[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
|
33 |
-
"[nltk_data] Package omw-1.4 is already up-to-date!\n"
|
34 |
-
]
|
35 |
-
}
|
36 |
-
],
|
37 |
-
"execution_count": 20
|
38 |
},
|
39 |
{
|
|
|
|
|
40 |
"metadata": {
|
41 |
"ExecuteTime": {
|
42 |
-
"end_time": "2024-05-
|
43 |
-
"start_time": "2024-05-
|
44 |
}
|
45 |
},
|
46 |
-
"cell_type": "code",
|
47 |
"source": [
|
48 |
"df = pd.read_csv(config.SYNTHETIC_DATASET_ARTIFACT, index_col=0)\n",
|
49 |
"\n",
|
50 |
"df.head()"
|
51 |
],
|
52 |
-
"id": "2ac8757a17e62293",
|
53 |
"outputs": [
|
54 |
{
|
55 |
"data": {
|
@@ -316,6 +301,30 @@
|
|
316 |
"</div>"
|
317 |
]
|
318 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
"execution_count": 6,
|
320 |
"metadata": {},
|
321 |
"output_type": "execute_result"
|
@@ -324,18 +333,15 @@
|
|
324 |
"execution_count": 6
|
325 |
},
|
326 |
{
|
|
|
|
|
|
|
327 |
"metadata": {
|
328 |
"ExecuteTime": {
|
329 |
"end_time": "2024-05-01T13:02:40.761645Z",
|
330 |
"start_time": "2024-05-01T13:02:40.740647Z"
|
331 |
}
|
332 |
},
|
333 |
-
"cell_type": "code",
|
334 |
-
"source": [
|
335 |
-
"rel_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_related\")]\n",
|
336 |
-
"rel_metrics"
|
337 |
-
],
|
338 |
-
"id": "d19c12dd10b25c75",
|
339 |
"outputs": [
|
340 |
{
|
341 |
"data": {
|
@@ -348,21 +354,21 @@
|
|
348 |
"output_type": "execute_result"
|
349 |
}
|
350 |
],
|
351 |
-
"
|
|
|
|
|
|
|
352 |
},
|
353 |
{
|
|
|
|
|
|
|
354 |
"metadata": {
|
355 |
"ExecuteTime": {
|
356 |
"end_time": "2024-05-01T13:02:44.072037Z",
|
357 |
"start_time": "2024-05-01T13:02:44.055039Z"
|
358 |
}
|
359 |
},
|
360 |
-
"cell_type": "code",
|
361 |
-
"source": [
|
362 |
-
"ind_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_independent\")]\n",
|
363 |
-
"ind_metrics"
|
364 |
-
],
|
365 |
-
"id": "79d644cd780b28a1",
|
366 |
"outputs": [
|
367 |
{
|
368 |
"data": {
|
@@ -385,96 +391,24 @@
|
|
385 |
"output_type": "execute_result"
|
386 |
}
|
387 |
],
|
388 |
-
"
|
|
|
|
|
|
|
389 |
},
|
390 |
{
|
|
|
|
|
|
|
391 |
"metadata": {
|
392 |
"ExecuteTime": {
|
393 |
"end_time": "2024-05-01T13:03:52.623346Z",
|
394 |
"start_time": "2024-05-01T13:03:52.577076Z"
|
395 |
}
|
396 |
},
|
397 |
-
"cell_type": "code",
|
398 |
-
"source": [
|
399 |
-
"AGGREGATION = {\"hash\": [\"count\"]}\n",
|
400 |
-
"\n",
|
401 |
-
"for metric in rel_metrics:\n",
|
402 |
-
" AGGREGATION[f\"{metric}_related\"] = [\"mean\"]\n",
|
403 |
-
"\n",
|
404 |
-
"for metric in ind_metrics:\n",
|
405 |
-
" AGGREGATION[f\"{metric}_independent\"] = [\"mean\"]\n",
|
406 |
-
"\n",
|
407 |
-
"df.groupby(by=[\"end_to_start\", \"start_to_end\"]).agg(AGGREGATION)"
|
408 |
-
],
|
409 |
-
"id": "fdc5ae636bffbc8b",
|
410 |
"outputs": [
|
411 |
{
|
412 |
"data": {
|
413 |
-
"text/plain": [
|
414 |
-
" hash editdist_related edittime_related \\\n",
|
415 |
-
" count mean mean \n",
|
416 |
-
"end_to_start start_to_end \n",
|
417 |
-
"False False 43 355.441860 364099.0625 \n",
|
418 |
-
" True 129 406.627907 NaN \n",
|
419 |
-
"True False 129 433.899225 NaN \n",
|
420 |
-
" True 387 444.509044 NaN \n",
|
421 |
-
"\n",
|
422 |
-
" gptscore-ref-1-req_independent \\\n",
|
423 |
-
" mean \n",
|
424 |
-
"end_to_start start_to_end \n",
|
425 |
-
"False False 7.255814 \n",
|
426 |
-
" True 7.217054 \n",
|
427 |
-
"True False 7.356589 \n",
|
428 |
-
" True 7.312661 \n",
|
429 |
-
"\n",
|
430 |
-
" gptscore-noref-1-req_independent \\\n",
|
431 |
-
" mean \n",
|
432 |
-
"end_to_start start_to_end \n",
|
433 |
-
"False False 8.116279 \n",
|
434 |
-
" True 8.178295 \n",
|
435 |
-
"True False 8.302326 \n",
|
436 |
-
" True 8.276486 \n",
|
437 |
-
"\n",
|
438 |
-
" editdist_independent bleu_independent \\\n",
|
439 |
-
" mean mean \n",
|
440 |
-
"end_to_start start_to_end \n",
|
441 |
-
"False False 491.069767 0.012805 \n",
|
442 |
-
" True 491.069767 0.012805 \n",
|
443 |
-
"True False 534.015504 0.009542 \n",
|
444 |
-
" True 534.015504 0.009542 \n",
|
445 |
-
"\n",
|
446 |
-
" meteor_independent rouge1_independent \\\n",
|
447 |
-
" mean mean \n",
|
448 |
-
"end_to_start start_to_end \n",
|
449 |
-
"False False 0.224961 0.202063 \n",
|
450 |
-
" True 0.224961 0.202063 \n",
|
451 |
-
"True False 0.221893 0.205151 \n",
|
452 |
-
" True 0.221893 0.205151 \n",
|
453 |
-
"\n",
|
454 |
-
" rouge2_independent rougeL_independent \\\n",
|
455 |
-
" mean mean \n",
|
456 |
-
"end_to_start start_to_end \n",
|
457 |
-
"False False 0.040718 0.136427 \n",
|
458 |
-
" True 0.040718 0.136427 \n",
|
459 |
-
"True False 0.039033 0.134114 \n",
|
460 |
-
" True 0.039033 0.134114 \n",
|
461 |
-
"\n",
|
462 |
-
" bertscore_independent chrF_independent \\\n",
|
463 |
-
" mean mean \n",
|
464 |
-
"end_to_start start_to_end \n",
|
465 |
-
"False False 0.780266 32.067005 \n",
|
466 |
-
" True 0.780266 32.067005 \n",
|
467 |
-
"True False 0.777162 31.753065 \n",
|
468 |
-
" True 0.777162 31.753065 \n",
|
469 |
-
"\n",
|
470 |
-
" ter_independent \n",
|
471 |
-
" mean \n",
|
472 |
-
"end_to_start start_to_end \n",
|
473 |
-
"False False 312.732989 \n",
|
474 |
-
" True 312.732989 \n",
|
475 |
-
"True False 317.717517 \n",
|
476 |
-
" True 317.717517 "
|
477 |
-
],
|
478 |
"text/html": [
|
479 |
"<div>\n",
|
480 |
"<style scoped>\n",
|
@@ -625,6 +559,71 @@
|
|
625 |
" </tbody>\n",
|
626 |
"</table>\n",
|
627 |
"</div>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
628 |
]
|
629 |
},
|
630 |
"execution_count": 19,
|
@@ -632,100 +631,31 @@
|
|
632 |
"output_type": "execute_result"
|
633 |
}
|
634 |
],
|
635 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
636 |
},
|
637 |
{
|
|
|
|
|
|
|
638 |
"metadata": {
|
639 |
"ExecuteTime": {
|
640 |
"end_time": "2024-05-01T13:42:57.052768Z",
|
641 |
"start_time": "2024-05-01T13:42:56.812556Z"
|
642 |
}
|
643 |
},
|
644 |
-
"cell_type": "code",
|
645 |
-
"source": "",
|
646 |
-
"id": "3429b60eab154b79",
|
647 |
"outputs": [
|
648 |
{
|
649 |
"data": {
|
650 |
-
"text/plain": [
|
651 |
-
" all golden \\\n",
|
652 |
-
" spearman pearson spearman pearson \n",
|
653 |
-
"relative independent \n",
|
654 |
-
"editdist bertscore -0.184962 -0.129057 -0.316215 -0.254700 \n",
|
655 |
-
" bleu 0.260118 0.185995 0.269028 0.259690 \n",
|
656 |
-
" chrF -0.199200 -0.129029 -0.343201 -0.300656 \n",
|
657 |
-
" editdist 0.909934 0.910641 0.710772 0.662808 \n",
|
658 |
-
" gptscore-noref-1-req 0.032048 0.055364 0.155510 0.048588 \n",
|
659 |
-
" gptscore-ref-1-req 0.024550 0.035295 -0.009830 -0.062574 \n",
|
660 |
-
" meteor 0.336016 0.371949 0.068034 0.173237 \n",
|
661 |
-
" rouge1 -0.077574 -0.043738 -0.187349 -0.163230 \n",
|
662 |
-
" rouge2 0.414256 0.340732 0.276139 0.332087 \n",
|
663 |
-
" rougeL 0.006513 -0.008078 -0.041502 -0.034867 \n",
|
664 |
-
" ter 0.618095 0.385515 0.575614 0.501385 \n",
|
665 |
-
"edittime bertscore 0.140481 0.158807 0.140481 0.158807 \n",
|
666 |
-
" bleu 0.302380 0.326167 0.302380 0.326167 \n",
|
667 |
-
" chrF 0.079802 0.184202 0.079802 0.184202 \n",
|
668 |
-
" editdist 0.252645 0.411131 0.252645 0.411131 \n",
|
669 |
-
" gptscore-noref-1-req 0.206465 0.026235 0.206465 0.026235 \n",
|
670 |
-
" gptscore-ref-1-req 0.130419 -0.055218 0.130419 -0.055218 \n",
|
671 |
-
" meteor 0.253380 0.403564 0.253380 0.403564 \n",
|
672 |
-
" rouge1 0.155926 0.136971 0.155926 0.136971 \n",
|
673 |
-
" rouge2 0.218822 0.281944 0.218822 0.281944 \n",
|
674 |
-
" rougeL 0.071344 0.091196 0.071344 0.091196 \n",
|
675 |
-
" ter 0.305601 0.062616 0.305601 0.062616 \n",
|
676 |
-
"\n",
|
677 |
-
" +s2e +e2s \\\n",
|
678 |
-
" spearman pearson spearman pearson \n",
|
679 |
-
"relative independent \n",
|
680 |
-
"editdist bertscore -0.308494 -0.113525 -0.181393 -0.165924 \n",
|
681 |
-
" bleu 0.512841 0.502827 0.109831 0.068138 \n",
|
682 |
-
" chrF -0.238124 -0.064922 -0.233123 -0.201726 \n",
|
683 |
-
" editdist 0.950494 0.935064 0.861930 0.878118 \n",
|
684 |
-
" gptscore-noref-1-req 0.067857 0.047215 -0.029048 -0.013128 \n",
|
685 |
-
" gptscore-ref-1-req -0.015178 -0.036001 0.071345 0.087584 \n",
|
686 |
-
" meteor 0.203616 0.425775 0.372598 0.360051 \n",
|
687 |
-
" rouge1 -0.139874 -0.065543 -0.082093 -0.035603 \n",
|
688 |
-
" rouge2 0.523559 0.537560 0.323911 0.282872 \n",
|
689 |
-
" rougeL -0.022288 -0.004664 0.012409 0.016372 \n",
|
690 |
-
" ter 0.774086 0.462554 0.529338 0.388592 \n",
|
691 |
-
"edittime bertscore NaN NaN NaN NaN \n",
|
692 |
-
" bleu NaN NaN NaN NaN \n",
|
693 |
-
" chrF NaN NaN NaN NaN \n",
|
694 |
-
" editdist NaN NaN NaN NaN \n",
|
695 |
-
" gptscore-noref-1-req NaN NaN NaN NaN \n",
|
696 |
-
" gptscore-ref-1-req NaN NaN NaN NaN \n",
|
697 |
-
" meteor NaN NaN NaN NaN \n",
|
698 |
-
" rouge1 NaN NaN NaN NaN \n",
|
699 |
-
" rouge2 NaN NaN NaN NaN \n",
|
700 |
-
" rougeL NaN NaN NaN NaN \n",
|
701 |
-
" ter NaN NaN NaN NaN \n",
|
702 |
-
"\n",
|
703 |
-
" +e2s+s2e \n",
|
704 |
-
" spearman pearson \n",
|
705 |
-
"relative independent \n",
|
706 |
-
"editdist bertscore -0.135421 -0.091748 \n",
|
707 |
-
" bleu 0.229712 0.145062 \n",
|
708 |
-
" chrF -0.156914 -0.093376 \n",
|
709 |
-
" editdist 0.939318 0.962305 \n",
|
710 |
-
" gptscore-noref-1-req 0.012102 0.066882 \n",
|
711 |
-
" gptscore-ref-1-req 0.013012 0.033618 \n",
|
712 |
-
" meteor 0.392262 0.401802 \n",
|
713 |
-
" rouge1 -0.054034 -0.030799 \n",
|
714 |
-
" rouge2 0.433859 0.324538 \n",
|
715 |
-
" rougeL 0.021983 -0.010644 \n",
|
716 |
-
" ter 0.591684 0.354459 \n",
|
717 |
-
"edittime bertscore NaN NaN \n",
|
718 |
-
" bleu NaN NaN \n",
|
719 |
-
" chrF NaN NaN \n",
|
720 |
-
" editdist NaN NaN \n",
|
721 |
-
" gptscore-noref-1-req NaN NaN \n",
|
722 |
-
" gptscore-ref-1-req NaN NaN \n",
|
723 |
-
" meteor NaN NaN \n",
|
724 |
-
" rouge1 NaN NaN \n",
|
725 |
-
" rouge2 NaN NaN \n",
|
726 |
-
" rougeL NaN NaN \n",
|
727 |
-
" ter NaN NaN "
|
728 |
-
],
|
729 |
"text/html": [
|
730 |
"<div>\n",
|
731 |
"<style scoped>\n",
|
@@ -1077,32 +1007,7 @@
|
|
1077 |
" </tbody>\n",
|
1078 |
"</table>\n",
|
1079 |
"</div>"
|
1080 |
-
]
|
1081 |
-
},
|
1082 |
-
"execution_count": 47,
|
1083 |
-
"metadata": {},
|
1084 |
-
"output_type": "execute_result"
|
1085 |
-
}
|
1086 |
-
],
|
1087 |
-
"execution_count": 47
|
1088 |
-
},
|
1089 |
-
{
|
1090 |
-
"metadata": {
|
1091 |
-
"ExecuteTime": {
|
1092 |
-
"end_time": "2024-05-01T13:49:09.514129Z",
|
1093 |
-
"start_time": "2024-05-01T13:49:09.295101Z"
|
1094 |
-
}
|
1095 |
-
},
|
1096 |
-
"cell_type": "code",
|
1097 |
-
"source": [
|
1098 |
-
"from analysis_util import get_ref_only_correlations_for_groups\n",
|
1099 |
-
"\n",
|
1100 |
-
"get_ref_only_correlations_for_groups(df)"
|
1101 |
-
],
|
1102 |
-
"id": "a3531f28722fa5bc",
|
1103 |
-
"outputs": [
|
1104 |
-
{
|
1105 |
-
"data": {
|
1106 |
"text/plain": [
|
1107 |
" all golden \\\n",
|
1108 |
" spearman pearson spearman pearson \n",
|
@@ -1181,7 +1086,28 @@
|
|
1181 |
" rouge2 NaN NaN \n",
|
1182 |
" rougeL NaN NaN \n",
|
1183 |
" ter NaN NaN "
|
1184 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1185 |
"text/html": [
|
1186 |
"<div>\n",
|
1187 |
"<style scoped>\n",
|
@@ -1533,6 +1459,85 @@
|
|
1533 |
" </tbody>\n",
|
1534 |
"</table>\n",
|
1535 |
"</div>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1536 |
]
|
1537 |
},
|
1538 |
"execution_count": 50,
|
@@ -1540,26 +1545,78 @@
|
|
1540 |
"output_type": "execute_result"
|
1541 |
}
|
1542 |
],
|
1543 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1544 |
}
|
1545 |
],
|
1546 |
"metadata": {
|
1547 |
"kernelspec": {
|
1548 |
-
"display_name": "Python 3",
|
1549 |
"language": "python",
|
1550 |
"name": "python3"
|
1551 |
},
|
1552 |
"language_info": {
|
1553 |
"codemirror_mode": {
|
1554 |
"name": "ipython",
|
1555 |
-
"version":
|
1556 |
},
|
1557 |
"file_extension": ".py",
|
1558 |
"mimetype": "text/x-python",
|
1559 |
"name": "python",
|
1560 |
"nbconvert_exporter": "python",
|
1561 |
-
"pygments_lexer": "
|
1562 |
-
"version": "
|
1563 |
}
|
1564 |
},
|
1565 |
"nbformat": 4,
|
|
|
5 |
"id": "initial_id",
|
6 |
"metadata": {
|
7 |
"collapsed": true,
|
8 |
+
"jupyter": {
|
9 |
+
"outputs_hidden": true
|
10 |
+
},
|
11 |
"ExecuteTime": {
|
12 |
+
"end_time": "2024-05-01T15:23:17.507403Z",
|
13 |
+
"start_time": "2024-05-01T15:23:17.497406Z"
|
14 |
}
|
15 |
},
|
16 |
"source": [
|
|
|
18 |
"\n",
|
19 |
"import config"
|
20 |
],
|
21 |
+
"outputs": [],
|
22 |
+
"execution_count": 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
},
|
24 |
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"id": "2ac8757a17e62293",
|
27 |
"metadata": {
|
28 |
"ExecuteTime": {
|
29 |
+
"end_time": "2024-05-01T15:23:19.365525Z",
|
30 |
+
"start_time": "2024-05-01T15:23:19.120308Z"
|
31 |
}
|
32 |
},
|
|
|
33 |
"source": [
|
34 |
"df = pd.read_csv(config.SYNTHETIC_DATASET_ARTIFACT, index_col=0)\n",
|
35 |
"\n",
|
36 |
"df.head()"
|
37 |
],
|
|
|
38 |
"outputs": [
|
39 |
{
|
40 |
"data": {
|
|
|
301 |
"</div>"
|
302 |
]
|
303 |
},
|
304 |
+
"execution_count": 8,
|
305 |
+
"metadata": {},
|
306 |
+
"output_type": "execute_result"
|
307 |
+
}
|
308 |
+
],
|
309 |
+
"execution_count": 8
|
310 |
+
},
|
311 |
+
{
|
312 |
+
"metadata": {
|
313 |
+
"ExecuteTime": {
|
314 |
+
"end_time": "2024-05-01T15:11:08.418257Z",
|
315 |
+
"start_time": "2024-05-01T15:11:08.408943Z"
|
316 |
+
}
|
317 |
+
},
|
318 |
+
"cell_type": "code",
|
319 |
+
"source": "len(set(df['session'].to_list()))",
|
320 |
+
"id": "4bcbc0f1d3d6d248",
|
321 |
+
"outputs": [
|
322 |
+
{
|
323 |
+
"data": {
|
324 |
+
"text/plain": [
|
325 |
+
"9"
|
326 |
+
]
|
327 |
+
},
|
328 |
"execution_count": 6,
|
329 |
"metadata": {},
|
330 |
"output_type": "execute_result"
|
|
|
333 |
"execution_count": 6
|
334 |
},
|
335 |
{
|
336 |
+
"cell_type": "code",
|
337 |
+
"execution_count": 15,
|
338 |
+
"id": "d19c12dd10b25c75",
|
339 |
"metadata": {
|
340 |
"ExecuteTime": {
|
341 |
"end_time": "2024-05-01T13:02:40.761645Z",
|
342 |
"start_time": "2024-05-01T13:02:40.740647Z"
|
343 |
}
|
344 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
"outputs": [
|
346 |
{
|
347 |
"data": {
|
|
|
354 |
"output_type": "execute_result"
|
355 |
}
|
356 |
],
|
357 |
+
"source": [
|
358 |
+
"rel_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_related\")]\n",
|
359 |
+
"rel_metrics"
|
360 |
+
]
|
361 |
},
|
362 |
{
|
363 |
+
"cell_type": "code",
|
364 |
+
"execution_count": 16,
|
365 |
+
"id": "79d644cd780b28a1",
|
366 |
"metadata": {
|
367 |
"ExecuteTime": {
|
368 |
"end_time": "2024-05-01T13:02:44.072037Z",
|
369 |
"start_time": "2024-05-01T13:02:44.055039Z"
|
370 |
}
|
371 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
"outputs": [
|
373 |
{
|
374 |
"data": {
|
|
|
391 |
"output_type": "execute_result"
|
392 |
}
|
393 |
],
|
394 |
+
"source": [
|
395 |
+
"ind_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_independent\")]\n",
|
396 |
+
"ind_metrics"
|
397 |
+
]
|
398 |
},
|
399 |
{
|
400 |
+
"cell_type": "code",
|
401 |
+
"execution_count": 19,
|
402 |
+
"id": "fdc5ae636bffbc8b",
|
403 |
"metadata": {
|
404 |
"ExecuteTime": {
|
405 |
"end_time": "2024-05-01T13:03:52.623346Z",
|
406 |
"start_time": "2024-05-01T13:03:52.577076Z"
|
407 |
}
|
408 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
409 |
"outputs": [
|
410 |
{
|
411 |
"data": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
"text/html": [
|
413 |
"<div>\n",
|
414 |
"<style scoped>\n",
|
|
|
559 |
" </tbody>\n",
|
560 |
"</table>\n",
|
561 |
"</div>"
|
562 |
+
],
|
563 |
+
"text/plain": [
|
564 |
+
" hash editdist_related edittime_related \\\n",
|
565 |
+
" count mean mean \n",
|
566 |
+
"end_to_start start_to_end \n",
|
567 |
+
"False False 43 355.441860 364099.0625 \n",
|
568 |
+
" True 129 406.627907 NaN \n",
|
569 |
+
"True False 129 433.899225 NaN \n",
|
570 |
+
" True 387 444.509044 NaN \n",
|
571 |
+
"\n",
|
572 |
+
" gptscore-ref-1-req_independent \\\n",
|
573 |
+
" mean \n",
|
574 |
+
"end_to_start start_to_end \n",
|
575 |
+
"False False 7.255814 \n",
|
576 |
+
" True 7.217054 \n",
|
577 |
+
"True False 7.356589 \n",
|
578 |
+
" True 7.312661 \n",
|
579 |
+
"\n",
|
580 |
+
" gptscore-noref-1-req_independent \\\n",
|
581 |
+
" mean \n",
|
582 |
+
"end_to_start start_to_end \n",
|
583 |
+
"False False 8.116279 \n",
|
584 |
+
" True 8.178295 \n",
|
585 |
+
"True False 8.302326 \n",
|
586 |
+
" True 8.276486 \n",
|
587 |
+
"\n",
|
588 |
+
" editdist_independent bleu_independent \\\n",
|
589 |
+
" mean mean \n",
|
590 |
+
"end_to_start start_to_end \n",
|
591 |
+
"False False 491.069767 0.012805 \n",
|
592 |
+
" True 491.069767 0.012805 \n",
|
593 |
+
"True False 534.015504 0.009542 \n",
|
594 |
+
" True 534.015504 0.009542 \n",
|
595 |
+
"\n",
|
596 |
+
" meteor_independent rouge1_independent \\\n",
|
597 |
+
" mean mean \n",
|
598 |
+
"end_to_start start_to_end \n",
|
599 |
+
"False False 0.224961 0.202063 \n",
|
600 |
+
" True 0.224961 0.202063 \n",
|
601 |
+
"True False 0.221893 0.205151 \n",
|
602 |
+
" True 0.221893 0.205151 \n",
|
603 |
+
"\n",
|
604 |
+
" rouge2_independent rougeL_independent \\\n",
|
605 |
+
" mean mean \n",
|
606 |
+
"end_to_start start_to_end \n",
|
607 |
+
"False False 0.040718 0.136427 \n",
|
608 |
+
" True 0.040718 0.136427 \n",
|
609 |
+
"True False 0.039033 0.134114 \n",
|
610 |
+
" True 0.039033 0.134114 \n",
|
611 |
+
"\n",
|
612 |
+
" bertscore_independent chrF_independent \\\n",
|
613 |
+
" mean mean \n",
|
614 |
+
"end_to_start start_to_end \n",
|
615 |
+
"False False 0.780266 32.067005 \n",
|
616 |
+
" True 0.780266 32.067005 \n",
|
617 |
+
"True False 0.777162 31.753065 \n",
|
618 |
+
" True 0.777162 31.753065 \n",
|
619 |
+
"\n",
|
620 |
+
" ter_independent \n",
|
621 |
+
" mean \n",
|
622 |
+
"end_to_start start_to_end \n",
|
623 |
+
"False False 312.732989 \n",
|
624 |
+
" True 312.732989 \n",
|
625 |
+
"True False 317.717517 \n",
|
626 |
+
" True 317.717517 "
|
627 |
]
|
628 |
},
|
629 |
"execution_count": 19,
|
|
|
631 |
"output_type": "execute_result"
|
632 |
}
|
633 |
],
|
634 |
+
"source": [
|
635 |
+
"AGGREGATION = {\"hash\": [\"count\"]}\n",
|
636 |
+
"\n",
|
637 |
+
"for metric in rel_metrics:\n",
|
638 |
+
" AGGREGATION[f\"{metric}_related\"] = [\"mean\"]\n",
|
639 |
+
"\n",
|
640 |
+
"for metric in ind_metrics:\n",
|
641 |
+
" AGGREGATION[f\"{metric}_independent\"] = [\"mean\"]\n",
|
642 |
+
"\n",
|
643 |
+
"df.groupby(by=[\"end_to_start\", \"start_to_end\"]).agg(AGGREGATION)"
|
644 |
+
]
|
645 |
},
|
646 |
{
|
647 |
+
"cell_type": "code",
|
648 |
+
"execution_count": 47,
|
649 |
+
"id": "3429b60eab154b79",
|
650 |
"metadata": {
|
651 |
"ExecuteTime": {
|
652 |
"end_time": "2024-05-01T13:42:57.052768Z",
|
653 |
"start_time": "2024-05-01T13:42:56.812556Z"
|
654 |
}
|
655 |
},
|
|
|
|
|
|
|
656 |
"outputs": [
|
657 |
{
|
658 |
"data": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
"text/html": [
|
660 |
"<div>\n",
|
661 |
"<style scoped>\n",
|
|
|
1007 |
" </tbody>\n",
|
1008 |
"</table>\n",
|
1009 |
"</div>"
|
1010 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1011 |
"text/plain": [
|
1012 |
" all golden \\\n",
|
1013 |
" spearman pearson spearman pearson \n",
|
|
|
1086 |
" rouge2 NaN NaN \n",
|
1087 |
" rougeL NaN NaN \n",
|
1088 |
" ter NaN NaN "
|
1089 |
+
]
|
1090 |
+
},
|
1091 |
+
"execution_count": 47,
|
1092 |
+
"metadata": {},
|
1093 |
+
"output_type": "execute_result"
|
1094 |
+
}
|
1095 |
+
],
|
1096 |
+
"source": []
|
1097 |
+
},
|
1098 |
+
{
|
1099 |
+
"cell_type": "code",
|
1100 |
+
"execution_count": 50,
|
1101 |
+
"id": "a3531f28722fa5bc",
|
1102 |
+
"metadata": {
|
1103 |
+
"ExecuteTime": {
|
1104 |
+
"end_time": "2024-05-01T13:49:09.514129Z",
|
1105 |
+
"start_time": "2024-05-01T13:49:09.295101Z"
|
1106 |
+
}
|
1107 |
+
},
|
1108 |
+
"outputs": [
|
1109 |
+
{
|
1110 |
+
"data": {
|
1111 |
"text/html": [
|
1112 |
"<div>\n",
|
1113 |
"<style scoped>\n",
|
|
|
1459 |
" </tbody>\n",
|
1460 |
"</table>\n",
|
1461 |
"</div>"
|
1462 |
+
],
|
1463 |
+
"text/plain": [
|
1464 |
+
" all golden \\\n",
|
1465 |
+
" spearman pearson spearman pearson \n",
|
1466 |
+
"relative independent \n",
|
1467 |
+
"editdist bertscore -0.184962 -0.129057 -0.316215 -0.254700 \n",
|
1468 |
+
" bleu 0.260118 0.185995 0.269028 0.259690 \n",
|
1469 |
+
" chrF -0.199200 -0.129029 -0.343201 -0.300656 \n",
|
1470 |
+
" editdist 0.909934 0.910641 0.710772 0.662808 \n",
|
1471 |
+
" gptscore-noref-1-req 0.032048 0.055364 0.155510 0.048588 \n",
|
1472 |
+
" gptscore-ref-1-req 0.024550 0.035295 -0.009830 -0.062574 \n",
|
1473 |
+
" meteor 0.336016 0.371949 0.068034 0.173237 \n",
|
1474 |
+
" rouge1 -0.077574 -0.043738 -0.187349 -0.163230 \n",
|
1475 |
+
" rouge2 0.414256 0.340732 0.276139 0.332087 \n",
|
1476 |
+
" rougeL 0.006513 -0.008078 -0.041502 -0.034867 \n",
|
1477 |
+
" ter 0.618095 0.385515 0.575614 0.501385 \n",
|
1478 |
+
"edittime bertscore 0.140481 0.158807 0.140481 0.158807 \n",
|
1479 |
+
" bleu 0.302380 0.326167 0.302380 0.326167 \n",
|
1480 |
+
" chrF 0.079802 0.184202 0.079802 0.184202 \n",
|
1481 |
+
" editdist 0.252645 0.411131 0.252645 0.411131 \n",
|
1482 |
+
" gptscore-noref-1-req 0.206465 0.026235 0.206465 0.026235 \n",
|
1483 |
+
" gptscore-ref-1-req 0.130419 -0.055218 0.130419 -0.055218 \n",
|
1484 |
+
" meteor 0.253380 0.403564 0.253380 0.403564 \n",
|
1485 |
+
" rouge1 0.155926 0.136971 0.155926 0.136971 \n",
|
1486 |
+
" rouge2 0.218822 0.281944 0.218822 0.281944 \n",
|
1487 |
+
" rougeL 0.071344 0.091196 0.071344 0.091196 \n",
|
1488 |
+
" ter 0.305601 0.062616 0.305601 0.062616 \n",
|
1489 |
+
"\n",
|
1490 |
+
" +s2e +e2s \\\n",
|
1491 |
+
" spearman pearson spearman pearson \n",
|
1492 |
+
"relative independent \n",
|
1493 |
+
"editdist bertscore -0.308494 -0.113525 -0.181393 -0.165924 \n",
|
1494 |
+
" bleu 0.512841 0.502827 0.109831 0.068138 \n",
|
1495 |
+
" chrF -0.238124 -0.064922 -0.233123 -0.201726 \n",
|
1496 |
+
" editdist 0.950494 0.935064 0.861930 0.878118 \n",
|
1497 |
+
" gptscore-noref-1-req 0.067857 0.047215 -0.029048 -0.013128 \n",
|
1498 |
+
" gptscore-ref-1-req -0.015178 -0.036001 0.071345 0.087584 \n",
|
1499 |
+
" meteor 0.203616 0.425775 0.372598 0.360051 \n",
|
1500 |
+
" rouge1 -0.139874 -0.065543 -0.082093 -0.035603 \n",
|
1501 |
+
" rouge2 0.523559 0.537560 0.323911 0.282872 \n",
|
1502 |
+
" rougeL -0.022288 -0.004664 0.012409 0.016372 \n",
|
1503 |
+
" ter 0.774086 0.462554 0.529338 0.388592 \n",
|
1504 |
+
"edittime bertscore NaN NaN NaN NaN \n",
|
1505 |
+
" bleu NaN NaN NaN NaN \n",
|
1506 |
+
" chrF NaN NaN NaN NaN \n",
|
1507 |
+
" editdist NaN NaN NaN NaN \n",
|
1508 |
+
" gptscore-noref-1-req NaN NaN NaN NaN \n",
|
1509 |
+
" gptscore-ref-1-req NaN NaN NaN NaN \n",
|
1510 |
+
" meteor NaN NaN NaN NaN \n",
|
1511 |
+
" rouge1 NaN NaN NaN NaN \n",
|
1512 |
+
" rouge2 NaN NaN NaN NaN \n",
|
1513 |
+
" rougeL NaN NaN NaN NaN \n",
|
1514 |
+
" ter NaN NaN NaN NaN \n",
|
1515 |
+
"\n",
|
1516 |
+
" +e2s+s2e \n",
|
1517 |
+
" spearman pearson \n",
|
1518 |
+
"relative independent \n",
|
1519 |
+
"editdist bertscore -0.135421 -0.091748 \n",
|
1520 |
+
" bleu 0.229712 0.145062 \n",
|
1521 |
+
" chrF -0.156914 -0.093376 \n",
|
1522 |
+
" editdist 0.939318 0.962305 \n",
|
1523 |
+
" gptscore-noref-1-req 0.012102 0.066882 \n",
|
1524 |
+
" gptscore-ref-1-req 0.013012 0.033618 \n",
|
1525 |
+
" meteor 0.392262 0.401802 \n",
|
1526 |
+
" rouge1 -0.054034 -0.030799 \n",
|
1527 |
+
" rouge2 0.433859 0.324538 \n",
|
1528 |
+
" rougeL 0.021983 -0.010644 \n",
|
1529 |
+
" ter 0.591684 0.354459 \n",
|
1530 |
+
"edittime bertscore NaN NaN \n",
|
1531 |
+
" bleu NaN NaN \n",
|
1532 |
+
" chrF NaN NaN \n",
|
1533 |
+
" editdist NaN NaN \n",
|
1534 |
+
" gptscore-noref-1-req NaN NaN \n",
|
1535 |
+
" gptscore-ref-1-req NaN NaN \n",
|
1536 |
+
" meteor NaN NaN \n",
|
1537 |
+
" rouge1 NaN NaN \n",
|
1538 |
+
" rouge2 NaN NaN \n",
|
1539 |
+
" rougeL NaN NaN \n",
|
1540 |
+
" ter NaN NaN "
|
1541 |
]
|
1542 |
},
|
1543 |
"execution_count": 50,
|
|
|
1545 |
"output_type": "execute_result"
|
1546 |
}
|
1547 |
],
|
1548 |
+
"source": [
|
1549 |
+
"from analysis_util import get_correlations_for_groups\n",
|
1550 |
+
"\n",
|
1551 |
+
"get_correlations_for_groups(df, right_side=\"ind\")"
|
1552 |
+
]
|
1553 |
+
},
|
1554 |
+
{
|
1555 |
+
"cell_type": "code",
|
1556 |
+
"execution_count": null,
|
1557 |
+
"id": "d5dc33a4251baf9a",
|
1558 |
+
"metadata": {},
|
1559 |
+
"outputs": [],
|
1560 |
+
"source": [
|
1561 |
+
"get_correlations_for_groups(df, right_side=\"aggr\")"
|
1562 |
+
]
|
1563 |
+
},
|
1564 |
+
{
|
1565 |
+
"metadata": {
|
1566 |
+
"ExecuteTime": {
|
1567 |
+
"end_time": "2024-05-01T15:25:18.226195Z",
|
1568 |
+
"start_time": "2024-05-01T15:25:17.464762Z"
|
1569 |
+
}
|
1570 |
+
},
|
1571 |
+
"cell_type": "code",
|
1572 |
+
"source": [
|
1573 |
+
"from matplotlib import pyplot as plt\n",
|
1574 |
+
"\n",
|
1575 |
+
"plt.scatter(x=df['edittime_related'], y=df['editdist_related'])"
|
1576 |
+
],
|
1577 |
+
"id": "5df60ac60034b274",
|
1578 |
+
"outputs": [
|
1579 |
+
{
|
1580 |
+
"data": {
|
1581 |
+
"text/plain": [
|
1582 |
+
"<matplotlib.collections.PathCollection at 0x17c179da970>"
|
1583 |
+
]
|
1584 |
+
},
|
1585 |
+
"execution_count": 11,
|
1586 |
+
"metadata": {},
|
1587 |
+
"output_type": "execute_result"
|
1588 |
+
},
|
1589 |
+
{
|
1590 |
+
"data": {
|
1591 |
+
"text/plain": [
|
1592 |
+
"<Figure size 640x480 with 1 Axes>"
|
1593 |
+
],
|
1594 |
+
"image/png": ""
|
1595 |
+
},
|
1596 |
+
"metadata": {},
|
1597 |
+
"output_type": "display_data"
|
1598 |
+
}
|
1599 |
+
],
|
1600 |
+
"execution_count": 11
|
1601 |
}
|
1602 |
],
|
1603 |
"metadata": {
|
1604 |
"kernelspec": {
|
1605 |
+
"display_name": "Python 3 (ipykernel)",
|
1606 |
"language": "python",
|
1607 |
"name": "python3"
|
1608 |
},
|
1609 |
"language_info": {
|
1610 |
"codemirror_mode": {
|
1611 |
"name": "ipython",
|
1612 |
+
"version": 3
|
1613 |
},
|
1614 |
"file_extension": ".py",
|
1615 |
"mimetype": "text/x-python",
|
1616 |
"name": "python",
|
1617 |
"nbconvert_exporter": "python",
|
1618 |
+
"pygments_lexer": "ipython3",
|
1619 |
+
"version": "3.9.5"
|
1620 |
}
|
1621 |
},
|
1622 |
"nbformat": 4,
|
analysis_util.py
CHANGED
@@ -1,6 +1,31 @@
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
def split_metrics_string(s):
|
@@ -8,10 +33,10 @@ def split_metrics_string(s):
|
|
8 |
return tokens[1], tokens[3]
|
9 |
|
10 |
|
11 |
-
def
|
12 |
correlations_raw = correlations_for_group(df)
|
13 |
|
14 |
-
idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index))
|
15 |
|
16 |
data = []
|
17 |
for metrics in idx:
|
@@ -29,8 +54,8 @@ def get_ref_only_correlations_df(df):
|
|
29 |
return result
|
30 |
|
31 |
|
32 |
-
def
|
33 |
-
noref_correlations = {"all":
|
34 |
|
35 |
for e2s in (False, True):
|
36 |
for s2e in (False, True):
|
@@ -43,7 +68,7 @@ def get_ref_only_correlations_for_groups(df):
|
|
43 |
suffix = "golden"
|
44 |
|
45 |
subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
|
46 |
-
subdf_noref_corr =
|
47 |
noref_correlations[suffix] = subdf_noref_corr
|
48 |
|
49 |
noref_correlations = pd.concat(noref_correlations, axis=1)
|
|
|
1 |
+
import functools
|
2 |
+
import operator
|
3 |
+
|
4 |
import pandas as pd
|
5 |
|
6 |
+
|
7 |
+
def correlations_for_group(group):
    """Correlate every ``*_related`` metric with the other metric columns.

    Metric names are taken from the part of a column name before the first
    underscore, for columns ending in ``_related``, ``_independent`` and
    ``_aggr``. Each related metric is correlated with each independent
    metric and each aggregated metric, using both Pearson and Spearman.

    Args:
        group: DataFrame holding the metric columns described above.

    Returns:
        A ``pd.Series`` whose index entries look like
        ``rel_<rel>_ind_<ind>_<method>`` or ``rel_<rel>_aggr_<aggr>_<method>``
        where ``<method>`` is ``pearson`` or ``spearman``. The series is
        empty when no matching pairs of columns exist.
    """

    def metric_names(suffix):
        # NOTE: uses ``columns``; the previous ``colmns`` spelling raised
        # AttributeError on every call.
        return [col.split("_")[0] for col in group.columns if col.endswith(suffix)]

    rel_metrics = metric_names("_related")
    # (key tag, column suffix, metric names) for each right-hand side family.
    right_sides = (
        ("ind", "independent", metric_names("_independent")),
        ("aggr", "aggr", metric_names("_aggr")),
    )

    correlations = {}
    for rel_metric in rel_metrics:
        rel_values = group[f"{rel_metric}_related"]
        # Keep the original ordering: all independent pairs first, then all
        # aggregated pairs, Pearson before Spearman within each pair.
        for tag, suffix, metrics in right_sides:
            for metric in metrics:
                other_values = group[f"{metric}_{suffix}"]
                for method in ("pearson", "spearman"):
                    key = f"rel_{rel_metric}_{tag}_{metric}_{method}"
                    correlations[key] = rel_values.corr(other_values, method=method)
    return pd.Series(correlations)
|
29 |
|
30 |
|
31 |
def split_metrics_string(s):
|
|
|
33 |
return tokens[1], tokens[3]
|
34 |
|
35 |
|
36 |
+
def get_correlations_df(df, right_side):
|
37 |
correlations_raw = correlations_for_group(df)
|
38 |
|
39 |
+
idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index if right_side in col))
|
40 |
|
41 |
data = []
|
42 |
for metrics in idx:
|
|
|
54 |
return result
|
55 |
|
56 |
|
57 |
+
def get_correlations_for_groups(df, right_side):
|
58 |
+
noref_correlations = {"all": get_correlations_df(df, right_side=right_side)}
|
59 |
|
60 |
for e2s in (False, True):
|
61 |
for s2e in (False, True):
|
|
|
68 |
suffix = "golden"
|
69 |
|
70 |
subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
|
71 |
+
subdf_noref_corr = get_correlations_for_groups(subdf, right_side=right_side)
|
72 |
noref_correlations[suffix] = subdf_noref_corr
|
73 |
|
74 |
noref_correlations = pd.concat(noref_correlations, axis=1)
|
change_visualizer.py
CHANGED
@@ -108,7 +108,10 @@ if __name__ == '__main__':
|
|
108 |
layout_for_statistics("synthetic")
|
109 |
|
110 |
gr.Markdown(f"### Reference-only correlations")
|
111 |
-
gr.Markdown(value=analysis_util.
|
|
|
|
|
|
|
112 |
|
113 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
114 |
outputs=view_manual)
|
|
|
108 |
layout_for_statistics("synthetic")
|
109 |
|
110 |
gr.Markdown(f"### Reference-only correlations")
|
111 |
+
gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
|
112 |
+
|
113 |
+
gr.Markdown(f"### Aggregated correlations")
|
114 |
+
gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
|
115 |
|
116 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
117 |
outputs=view_manual)
|
generation_steps/metrics_analysis.py
CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
|
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
import config
|
|
|
10 |
from api_wrappers import hf_data_loader
|
11 |
from custom_metrics import gpt_eval
|
12 |
|
@@ -110,6 +111,10 @@ IND_METRICS = {
|
|
110 |
"ter": ter_fn,
|
111 |
}
|
112 |
|
|
|
|
|
|
|
|
|
113 |
REL_METRICS = {
|
114 |
"editdist": edit_distance_fn,
|
115 |
"edittime": edit_time_fn,
|
@@ -128,6 +133,22 @@ def compute_metrics(df):
|
|
128 |
def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
|
129 |
return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
for metric in REL_METRICS:
|
132 |
print(f"Computing {metric} for the related pairs")
|
133 |
metric_fn = REL_METRICS[metric]
|
@@ -158,24 +179,15 @@ def compute_metrics(df):
|
|
158 |
df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
|
159 |
df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
|
160 |
|
161 |
-
|
|
|
|
|
162 |
|
|
|
|
|
|
|
|
|
163 |
|
164 |
-
def correlations_for_group(group):
|
165 |
-
correlations = []
|
166 |
-
for rel_metric in REL_METRICS:
|
167 |
-
# correlations.append({
|
168 |
-
# f"{metric}_pearson": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="pearson"),
|
169 |
-
# f"{metric}_spearman": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="spearman")
|
170 |
-
# })
|
171 |
-
for ind_metric in IND_METRICS:
|
172 |
-
correlations.append({
|
173 |
-
f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
|
174 |
-
group[f"{ind_metric}_independent"], method="pearson"),
|
175 |
-
f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
|
176 |
-
group[f"{ind_metric}_independent"], method="spearman"),
|
177 |
-
})
|
178 |
-
return pd.Series(functools.reduce(operator.ior, correlations, {}))
|
179 |
|
180 |
|
181 |
def compute_correlations(df: pd.DataFrame):
|
|
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
import config
|
10 |
+
from analysis_util import correlations_for_group
|
11 |
from api_wrappers import hf_data_loader
|
12 |
from custom_metrics import gpt_eval
|
13 |
|
|
|
111 |
"ter": ter_fn,
|
112 |
}
|
113 |
|
114 |
+
AGGR_METRICS = IND_METRICS.copy()
|
115 |
+
del AGGR_METRICS["gptscore-ref-1-req"]
|
116 |
+
del AGGR_METRICS["gptscore-noref-1-req"]
|
117 |
+
|
118 |
REL_METRICS = {
|
119 |
"editdist": edit_distance_fn,
|
120 |
"edittime": edit_time_fn,
|
|
|
133 |
def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
|
134 |
return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
|
135 |
|
136 |
+
for metric in AGGR_METRICS:
|
137 |
+
print(f"Computing {metric} for the aggregated independent pairs")
|
138 |
+
values = []
|
139 |
+
for i, row in tqdm(df.iterrows(), total=len(df)):
|
140 |
+
others = df[(df["hash"] == row["hash"]) & (df["repo"] == row["repo"]) & (
|
141 |
+
df["commit_msg_start"] != row["commit_msg_start"])]['commit_msg_end'].to_list()
|
142 |
+
others.append(row["reference"])
|
143 |
+
others = list(set(others))
|
144 |
+
metric_fn = AGGR_METRICS[metric]
|
145 |
+
values.append(
|
146 |
+
metric_fn(
|
147 |
+
row['commit_msg_start'], None, refs=others, edittime=row['edit_time'], diff=str(row['mods'])
|
148 |
+
)
|
149 |
+
)
|
150 |
+
df[f"{metric}_aggr"] = values
|
151 |
+
|
152 |
for metric in REL_METRICS:
|
153 |
print(f"Computing {metric} for the related pairs")
|
154 |
metric_fn = REL_METRICS[metric]
|
|
|
179 |
df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
|
180 |
df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
|
181 |
|
182 |
+
for aggr_metric in AGGR_METRICS:
|
183 |
+
df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
|
184 |
+
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
|
185 |
|
186 |
+
df[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = (
|
187 |
+
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
|
188 |
+
|
189 |
+
return df
|
190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
|
193 |
def compute_correlations(df: pd.DataFrame):
|
requirements.txt
CHANGED
@@ -63,7 +63,6 @@ jsonpointer==2.4
|
|
63 |
jsonschema==4.21.1
|
64 |
jsonschema-specifications==2023.12.1
|
65 |
kiwisolver==1.4.5
|
66 |
-
Levenshtein==0.25.1
|
67 |
lxml==5.2.1
|
68 |
markdown-it-py==3.0.0
|
69 |
MarkupSafe==2.1.5
|
|
|
63 |
jsonschema==4.21.1
|
64 |
jsonschema-specifications==2023.12.1
|
65 |
kiwisolver==1.4.5
|
|
|
66 |
lxml==5.2.1
|
67 |
markdown-it-py==3.0.0
|
68 |
MarkupSafe==2.1.5
|