Petr Tsvetkov committed on
Commit
39950c9
•
1 Parent(s): ca11b66

Fix the visualization

Browse files
analysis.ipynb CHANGED
@@ -5,9 +5,12 @@
5
  "id": "initial_id",
6
  "metadata": {
7
  "collapsed": true,
 
 
 
8
  "ExecuteTime": {
9
- "end_time": "2024-05-01T13:07:35.991719Z",
10
- "start_time": "2024-05-01T13:07:16.672667Z"
11
  }
12
  },
13
  "source": [
@@ -15,41 +18,23 @@
15
  "\n",
16
  "import config"
17
  ],
18
- "outputs": [
19
- {
20
- "name": "stderr",
21
- "output_type": "stream",
22
- "text": [
23
- "D:\\petrtsv\\work\\jetbrains\\commit-rewriting-processing\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
24
- " from .autonotebook import tqdm as notebook_tqdm\n",
25
- "[nltk_data] Downloading package wordnet to C:\\Users\\Petr\n",
26
- "[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
27
- "[nltk_data] Package wordnet is already up-to-date!\n",
28
- "[nltk_data] Downloading package punkt to C:\\Users\\Petr\n",
29
- "[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
30
- "[nltk_data] Package punkt is already up-to-date!\n",
31
- "[nltk_data] Downloading package omw-1.4 to C:\\Users\\Petr\n",
32
- "[nltk_data] Tsvetkov\\AppData\\Roaming\\nltk_data...\n",
33
- "[nltk_data] Package omw-1.4 is already up-to-date!\n"
34
- ]
35
- }
36
- ],
37
- "execution_count": 20
38
  },
39
  {
 
 
40
  "metadata": {
41
  "ExecuteTime": {
42
- "end_time": "2024-05-01T12:57:08.596650Z",
43
- "start_time": "2024-05-01T12:57:08.435650Z"
44
  }
45
  },
46
- "cell_type": "code",
47
  "source": [
48
  "df = pd.read_csv(config.SYNTHETIC_DATASET_ARTIFACT, index_col=0)\n",
49
  "\n",
50
  "df.head()"
51
  ],
52
- "id": "2ac8757a17e62293",
53
  "outputs": [
54
  {
55
  "data": {
@@ -316,6 +301,30 @@
316
  "</div>"
317
  ]
318
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  "execution_count": 6,
320
  "metadata": {},
321
  "output_type": "execute_result"
@@ -324,18 +333,15 @@
324
  "execution_count": 6
325
  },
326
  {
 
 
 
327
  "metadata": {
328
  "ExecuteTime": {
329
  "end_time": "2024-05-01T13:02:40.761645Z",
330
  "start_time": "2024-05-01T13:02:40.740647Z"
331
  }
332
  },
333
- "cell_type": "code",
334
- "source": [
335
- "rel_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_related\")]\n",
336
- "rel_metrics"
337
- ],
338
- "id": "d19c12dd10b25c75",
339
  "outputs": [
340
  {
341
  "data": {
@@ -348,21 +354,21 @@
348
  "output_type": "execute_result"
349
  }
350
  ],
351
- "execution_count": 15
 
 
 
352
  },
353
  {
 
 
 
354
  "metadata": {
355
  "ExecuteTime": {
356
  "end_time": "2024-05-01T13:02:44.072037Z",
357
  "start_time": "2024-05-01T13:02:44.055039Z"
358
  }
359
  },
360
- "cell_type": "code",
361
- "source": [
362
- "ind_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_independent\")]\n",
363
- "ind_metrics"
364
- ],
365
- "id": "79d644cd780b28a1",
366
  "outputs": [
367
  {
368
  "data": {
@@ -385,96 +391,24 @@
385
  "output_type": "execute_result"
386
  }
387
  ],
388
- "execution_count": 16
 
 
 
389
  },
390
  {
 
 
 
391
  "metadata": {
392
  "ExecuteTime": {
393
  "end_time": "2024-05-01T13:03:52.623346Z",
394
  "start_time": "2024-05-01T13:03:52.577076Z"
395
  }
396
  },
397
- "cell_type": "code",
398
- "source": [
399
- "AGGREGATION = {\"hash\": [\"count\"]}\n",
400
- "\n",
401
- "for metric in rel_metrics:\n",
402
- " AGGREGATION[f\"{metric}_related\"] = [\"mean\"]\n",
403
- "\n",
404
- "for metric in ind_metrics:\n",
405
- " AGGREGATION[f\"{metric}_independent\"] = [\"mean\"]\n",
406
- "\n",
407
- "df.groupby(by=[\"end_to_start\", \"start_to_end\"]).agg(AGGREGATION)"
408
- ],
409
- "id": "fdc5ae636bffbc8b",
410
  "outputs": [
411
  {
412
  "data": {
413
- "text/plain": [
414
- " hash editdist_related edittime_related \\\n",
415
- " count mean mean \n",
416
- "end_to_start start_to_end \n",
417
- "False False 43 355.441860 364099.0625 \n",
418
- " True 129 406.627907 NaN \n",
419
- "True False 129 433.899225 NaN \n",
420
- " True 387 444.509044 NaN \n",
421
- "\n",
422
- " gptscore-ref-1-req_independent \\\n",
423
- " mean \n",
424
- "end_to_start start_to_end \n",
425
- "False False 7.255814 \n",
426
- " True 7.217054 \n",
427
- "True False 7.356589 \n",
428
- " True 7.312661 \n",
429
- "\n",
430
- " gptscore-noref-1-req_independent \\\n",
431
- " mean \n",
432
- "end_to_start start_to_end \n",
433
- "False False 8.116279 \n",
434
- " True 8.178295 \n",
435
- "True False 8.302326 \n",
436
- " True 8.276486 \n",
437
- "\n",
438
- " editdist_independent bleu_independent \\\n",
439
- " mean mean \n",
440
- "end_to_start start_to_end \n",
441
- "False False 491.069767 0.012805 \n",
442
- " True 491.069767 0.012805 \n",
443
- "True False 534.015504 0.009542 \n",
444
- " True 534.015504 0.009542 \n",
445
- "\n",
446
- " meteor_independent rouge1_independent \\\n",
447
- " mean mean \n",
448
- "end_to_start start_to_end \n",
449
- "False False 0.224961 0.202063 \n",
450
- " True 0.224961 0.202063 \n",
451
- "True False 0.221893 0.205151 \n",
452
- " True 0.221893 0.205151 \n",
453
- "\n",
454
- " rouge2_independent rougeL_independent \\\n",
455
- " mean mean \n",
456
- "end_to_start start_to_end \n",
457
- "False False 0.040718 0.136427 \n",
458
- " True 0.040718 0.136427 \n",
459
- "True False 0.039033 0.134114 \n",
460
- " True 0.039033 0.134114 \n",
461
- "\n",
462
- " bertscore_independent chrF_independent \\\n",
463
- " mean mean \n",
464
- "end_to_start start_to_end \n",
465
- "False False 0.780266 32.067005 \n",
466
- " True 0.780266 32.067005 \n",
467
- "True False 0.777162 31.753065 \n",
468
- " True 0.777162 31.753065 \n",
469
- "\n",
470
- " ter_independent \n",
471
- " mean \n",
472
- "end_to_start start_to_end \n",
473
- "False False 312.732989 \n",
474
- " True 312.732989 \n",
475
- "True False 317.717517 \n",
476
- " True 317.717517 "
477
- ],
478
  "text/html": [
479
  "<div>\n",
480
  "<style scoped>\n",
@@ -625,6 +559,71 @@
625
  " </tbody>\n",
626
  "</table>\n",
627
  "</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
  ]
629
  },
630
  "execution_count": 19,
@@ -632,100 +631,31 @@
632
  "output_type": "execute_result"
633
  }
634
  ],
635
- "execution_count": 19
 
 
 
 
 
 
 
 
 
 
636
  },
637
  {
 
 
 
638
  "metadata": {
639
  "ExecuteTime": {
640
  "end_time": "2024-05-01T13:42:57.052768Z",
641
  "start_time": "2024-05-01T13:42:56.812556Z"
642
  }
643
  },
644
- "cell_type": "code",
645
- "source": "",
646
- "id": "3429b60eab154b79",
647
  "outputs": [
648
  {
649
  "data": {
650
- "text/plain": [
651
- " all golden \\\n",
652
- " spearman pearson spearman pearson \n",
653
- "relative independent \n",
654
- "editdist bertscore -0.184962 -0.129057 -0.316215 -0.254700 \n",
655
- " bleu 0.260118 0.185995 0.269028 0.259690 \n",
656
- " chrF -0.199200 -0.129029 -0.343201 -0.300656 \n",
657
- " editdist 0.909934 0.910641 0.710772 0.662808 \n",
658
- " gptscore-noref-1-req 0.032048 0.055364 0.155510 0.048588 \n",
659
- " gptscore-ref-1-req 0.024550 0.035295 -0.009830 -0.062574 \n",
660
- " meteor 0.336016 0.371949 0.068034 0.173237 \n",
661
- " rouge1 -0.077574 -0.043738 -0.187349 -0.163230 \n",
662
- " rouge2 0.414256 0.340732 0.276139 0.332087 \n",
663
- " rougeL 0.006513 -0.008078 -0.041502 -0.034867 \n",
664
- " ter 0.618095 0.385515 0.575614 0.501385 \n",
665
- "edittime bertscore 0.140481 0.158807 0.140481 0.158807 \n",
666
- " bleu 0.302380 0.326167 0.302380 0.326167 \n",
667
- " chrF 0.079802 0.184202 0.079802 0.184202 \n",
668
- " editdist 0.252645 0.411131 0.252645 0.411131 \n",
669
- " gptscore-noref-1-req 0.206465 0.026235 0.206465 0.026235 \n",
670
- " gptscore-ref-1-req 0.130419 -0.055218 0.130419 -0.055218 \n",
671
- " meteor 0.253380 0.403564 0.253380 0.403564 \n",
672
- " rouge1 0.155926 0.136971 0.155926 0.136971 \n",
673
- " rouge2 0.218822 0.281944 0.218822 0.281944 \n",
674
- " rougeL 0.071344 0.091196 0.071344 0.091196 \n",
675
- " ter 0.305601 0.062616 0.305601 0.062616 \n",
676
- "\n",
677
- " +s2e +e2s \\\n",
678
- " spearman pearson spearman pearson \n",
679
- "relative independent \n",
680
- "editdist bertscore -0.308494 -0.113525 -0.181393 -0.165924 \n",
681
- " bleu 0.512841 0.502827 0.109831 0.068138 \n",
682
- " chrF -0.238124 -0.064922 -0.233123 -0.201726 \n",
683
- " editdist 0.950494 0.935064 0.861930 0.878118 \n",
684
- " gptscore-noref-1-req 0.067857 0.047215 -0.029048 -0.013128 \n",
685
- " gptscore-ref-1-req -0.015178 -0.036001 0.071345 0.087584 \n",
686
- " meteor 0.203616 0.425775 0.372598 0.360051 \n",
687
- " rouge1 -0.139874 -0.065543 -0.082093 -0.035603 \n",
688
- " rouge2 0.523559 0.537560 0.323911 0.282872 \n",
689
- " rougeL -0.022288 -0.004664 0.012409 0.016372 \n",
690
- " ter 0.774086 0.462554 0.529338 0.388592 \n",
691
- "edittime bertscore NaN NaN NaN NaN \n",
692
- " bleu NaN NaN NaN NaN \n",
693
- " chrF NaN NaN NaN NaN \n",
694
- " editdist NaN NaN NaN NaN \n",
695
- " gptscore-noref-1-req NaN NaN NaN NaN \n",
696
- " gptscore-ref-1-req NaN NaN NaN NaN \n",
697
- " meteor NaN NaN NaN NaN \n",
698
- " rouge1 NaN NaN NaN NaN \n",
699
- " rouge2 NaN NaN NaN NaN \n",
700
- " rougeL NaN NaN NaN NaN \n",
701
- " ter NaN NaN NaN NaN \n",
702
- "\n",
703
- " +e2s+s2e \n",
704
- " spearman pearson \n",
705
- "relative independent \n",
706
- "editdist bertscore -0.135421 -0.091748 \n",
707
- " bleu 0.229712 0.145062 \n",
708
- " chrF -0.156914 -0.093376 \n",
709
- " editdist 0.939318 0.962305 \n",
710
- " gptscore-noref-1-req 0.012102 0.066882 \n",
711
- " gptscore-ref-1-req 0.013012 0.033618 \n",
712
- " meteor 0.392262 0.401802 \n",
713
- " rouge1 -0.054034 -0.030799 \n",
714
- " rouge2 0.433859 0.324538 \n",
715
- " rougeL 0.021983 -0.010644 \n",
716
- " ter 0.591684 0.354459 \n",
717
- "edittime bertscore NaN NaN \n",
718
- " bleu NaN NaN \n",
719
- " chrF NaN NaN \n",
720
- " editdist NaN NaN \n",
721
- " gptscore-noref-1-req NaN NaN \n",
722
- " gptscore-ref-1-req NaN NaN \n",
723
- " meteor NaN NaN \n",
724
- " rouge1 NaN NaN \n",
725
- " rouge2 NaN NaN \n",
726
- " rougeL NaN NaN \n",
727
- " ter NaN NaN "
728
- ],
729
  "text/html": [
730
  "<div>\n",
731
  "<style scoped>\n",
@@ -1077,32 +1007,7 @@
1077
  " </tbody>\n",
1078
  "</table>\n",
1079
  "</div>"
1080
- ]
1081
- },
1082
- "execution_count": 47,
1083
- "metadata": {},
1084
- "output_type": "execute_result"
1085
- }
1086
- ],
1087
- "execution_count": 47
1088
- },
1089
- {
1090
- "metadata": {
1091
- "ExecuteTime": {
1092
- "end_time": "2024-05-01T13:49:09.514129Z",
1093
- "start_time": "2024-05-01T13:49:09.295101Z"
1094
- }
1095
- },
1096
- "cell_type": "code",
1097
- "source": [
1098
- "from analysis_util import get_ref_only_correlations_for_groups\n",
1099
- "\n",
1100
- "get_ref_only_correlations_for_groups(df)"
1101
- ],
1102
- "id": "a3531f28722fa5bc",
1103
- "outputs": [
1104
- {
1105
- "data": {
1106
  "text/plain": [
1107
  " all golden \\\n",
1108
  " spearman pearson spearman pearson \n",
@@ -1181,7 +1086,28 @@
1181
  " rouge2 NaN NaN \n",
1182
  " rougeL NaN NaN \n",
1183
  " ter NaN NaN "
1184
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1185
  "text/html": [
1186
  "<div>\n",
1187
  "<style scoped>\n",
@@ -1533,6 +1459,85 @@
1533
  " </tbody>\n",
1534
  "</table>\n",
1535
  "</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1536
  ]
1537
  },
1538
  "execution_count": 50,
@@ -1540,26 +1545,78 @@
1540
  "output_type": "execute_result"
1541
  }
1542
  ],
1543
- "execution_count": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1544
  }
1545
  ],
1546
  "metadata": {
1547
  "kernelspec": {
1548
- "display_name": "Python 3",
1549
  "language": "python",
1550
  "name": "python3"
1551
  },
1552
  "language_info": {
1553
  "codemirror_mode": {
1554
  "name": "ipython",
1555
- "version": 2
1556
  },
1557
  "file_extension": ".py",
1558
  "mimetype": "text/x-python",
1559
  "name": "python",
1560
  "nbconvert_exporter": "python",
1561
- "pygments_lexer": "ipython2",
1562
- "version": "2.7.6"
1563
  }
1564
  },
1565
  "nbformat": 4,
 
5
  "id": "initial_id",
6
  "metadata": {
7
  "collapsed": true,
8
+ "jupyter": {
9
+ "outputs_hidden": true
10
+ },
11
  "ExecuteTime": {
12
+ "end_time": "2024-05-01T15:23:17.507403Z",
13
+ "start_time": "2024-05-01T15:23:17.497406Z"
14
  }
15
  },
16
  "source": [
 
18
  "\n",
19
  "import config"
20
  ],
21
+ "outputs": [],
22
+ "execution_count": 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  },
24
  {
25
+ "cell_type": "code",
26
+ "id": "2ac8757a17e62293",
27
  "metadata": {
28
  "ExecuteTime": {
29
+ "end_time": "2024-05-01T15:23:19.365525Z",
30
+ "start_time": "2024-05-01T15:23:19.120308Z"
31
  }
32
  },
 
33
  "source": [
34
  "df = pd.read_csv(config.SYNTHETIC_DATASET_ARTIFACT, index_col=0)\n",
35
  "\n",
36
  "df.head()"
37
  ],
 
38
  "outputs": [
39
  {
40
  "data": {
 
301
  "</div>"
302
  ]
303
  },
304
+ "execution_count": 8,
305
+ "metadata": {},
306
+ "output_type": "execute_result"
307
+ }
308
+ ],
309
+ "execution_count": 8
310
+ },
311
+ {
312
+ "metadata": {
313
+ "ExecuteTime": {
314
+ "end_time": "2024-05-01T15:11:08.418257Z",
315
+ "start_time": "2024-05-01T15:11:08.408943Z"
316
+ }
317
+ },
318
+ "cell_type": "code",
319
+ "source": "len(set(df['session'].to_list()))",
320
+ "id": "4bcbc0f1d3d6d248",
321
+ "outputs": [
322
+ {
323
+ "data": {
324
+ "text/plain": [
325
+ "9"
326
+ ]
327
+ },
328
  "execution_count": 6,
329
  "metadata": {},
330
  "output_type": "execute_result"
 
333
  "execution_count": 6
334
  },
335
  {
336
+ "cell_type": "code",
337
+ "execution_count": 15,
338
+ "id": "d19c12dd10b25c75",
339
  "metadata": {
340
  "ExecuteTime": {
341
  "end_time": "2024-05-01T13:02:40.761645Z",
342
  "start_time": "2024-05-01T13:02:40.740647Z"
343
  }
344
  },
 
 
 
 
 
 
345
  "outputs": [
346
  {
347
  "data": {
 
354
  "output_type": "execute_result"
355
  }
356
  ],
357
+ "source": [
358
+ "rel_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_related\")]\n",
359
+ "rel_metrics"
360
+ ]
361
  },
362
  {
363
+ "cell_type": "code",
364
+ "execution_count": 16,
365
+ "id": "79d644cd780b28a1",
366
  "metadata": {
367
  "ExecuteTime": {
368
  "end_time": "2024-05-01T13:02:44.072037Z",
369
  "start_time": "2024-05-01T13:02:44.055039Z"
370
  }
371
  },
 
 
 
 
 
 
372
  "outputs": [
373
  {
374
  "data": {
 
391
  "output_type": "execute_result"
392
  }
393
  ],
394
+ "source": [
395
+ "ind_metrics = [col.split(\"_\")[0] for col in df.columns if col.endswith(\"_independent\")]\n",
396
+ "ind_metrics"
397
+ ]
398
  },
399
  {
400
+ "cell_type": "code",
401
+ "execution_count": 19,
402
+ "id": "fdc5ae636bffbc8b",
403
  "metadata": {
404
  "ExecuteTime": {
405
  "end_time": "2024-05-01T13:03:52.623346Z",
406
  "start_time": "2024-05-01T13:03:52.577076Z"
407
  }
408
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  "outputs": [
410
  {
411
  "data": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  "text/html": [
413
  "<div>\n",
414
  "<style scoped>\n",
 
559
  " </tbody>\n",
560
  "</table>\n",
561
  "</div>"
562
+ ],
563
+ "text/plain": [
564
+ " hash editdist_related edittime_related \\\n",
565
+ " count mean mean \n",
566
+ "end_to_start start_to_end \n",
567
+ "False False 43 355.441860 364099.0625 \n",
568
+ " True 129 406.627907 NaN \n",
569
+ "True False 129 433.899225 NaN \n",
570
+ " True 387 444.509044 NaN \n",
571
+ "\n",
572
+ " gptscore-ref-1-req_independent \\\n",
573
+ " mean \n",
574
+ "end_to_start start_to_end \n",
575
+ "False False 7.255814 \n",
576
+ " True 7.217054 \n",
577
+ "True False 7.356589 \n",
578
+ " True 7.312661 \n",
579
+ "\n",
580
+ " gptscore-noref-1-req_independent \\\n",
581
+ " mean \n",
582
+ "end_to_start start_to_end \n",
583
+ "False False 8.116279 \n",
584
+ " True 8.178295 \n",
585
+ "True False 8.302326 \n",
586
+ " True 8.276486 \n",
587
+ "\n",
588
+ " editdist_independent bleu_independent \\\n",
589
+ " mean mean \n",
590
+ "end_to_start start_to_end \n",
591
+ "False False 491.069767 0.012805 \n",
592
+ " True 491.069767 0.012805 \n",
593
+ "True False 534.015504 0.009542 \n",
594
+ " True 534.015504 0.009542 \n",
595
+ "\n",
596
+ " meteor_independent rouge1_independent \\\n",
597
+ " mean mean \n",
598
+ "end_to_start start_to_end \n",
599
+ "False False 0.224961 0.202063 \n",
600
+ " True 0.224961 0.202063 \n",
601
+ "True False 0.221893 0.205151 \n",
602
+ " True 0.221893 0.205151 \n",
603
+ "\n",
604
+ " rouge2_independent rougeL_independent \\\n",
605
+ " mean mean \n",
606
+ "end_to_start start_to_end \n",
607
+ "False False 0.040718 0.136427 \n",
608
+ " True 0.040718 0.136427 \n",
609
+ "True False 0.039033 0.134114 \n",
610
+ " True 0.039033 0.134114 \n",
611
+ "\n",
612
+ " bertscore_independent chrF_independent \\\n",
613
+ " mean mean \n",
614
+ "end_to_start start_to_end \n",
615
+ "False False 0.780266 32.067005 \n",
616
+ " True 0.780266 32.067005 \n",
617
+ "True False 0.777162 31.753065 \n",
618
+ " True 0.777162 31.753065 \n",
619
+ "\n",
620
+ " ter_independent \n",
621
+ " mean \n",
622
+ "end_to_start start_to_end \n",
623
+ "False False 312.732989 \n",
624
+ " True 312.732989 \n",
625
+ "True False 317.717517 \n",
626
+ " True 317.717517 "
627
  ]
628
  },
629
  "execution_count": 19,
 
631
  "output_type": "execute_result"
632
  }
633
  ],
634
+ "source": [
635
+ "AGGREGATION = {\"hash\": [\"count\"]}\n",
636
+ "\n",
637
+ "for metric in rel_metrics:\n",
638
+ " AGGREGATION[f\"{metric}_related\"] = [\"mean\"]\n",
639
+ "\n",
640
+ "for metric in ind_metrics:\n",
641
+ " AGGREGATION[f\"{metric}_independent\"] = [\"mean\"]\n",
642
+ "\n",
643
+ "df.groupby(by=[\"end_to_start\", \"start_to_end\"]).agg(AGGREGATION)"
644
+ ]
645
  },
646
  {
647
+ "cell_type": "code",
648
+ "execution_count": 47,
649
+ "id": "3429b60eab154b79",
650
  "metadata": {
651
  "ExecuteTime": {
652
  "end_time": "2024-05-01T13:42:57.052768Z",
653
  "start_time": "2024-05-01T13:42:56.812556Z"
654
  }
655
  },
 
 
 
656
  "outputs": [
657
  {
658
  "data": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  "text/html": [
660
  "<div>\n",
661
  "<style scoped>\n",
 
1007
  " </tbody>\n",
1008
  "</table>\n",
1009
  "</div>"
1010
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
  "text/plain": [
1012
  " all golden \\\n",
1013
  " spearman pearson spearman pearson \n",
 
1086
  " rouge2 NaN NaN \n",
1087
  " rougeL NaN NaN \n",
1088
  " ter NaN NaN "
1089
+ ]
1090
+ },
1091
+ "execution_count": 47,
1092
+ "metadata": {},
1093
+ "output_type": "execute_result"
1094
+ }
1095
+ ],
1096
+ "source": []
1097
+ },
1098
+ {
1099
+ "cell_type": "code",
1100
+ "execution_count": 50,
1101
+ "id": "a3531f28722fa5bc",
1102
+ "metadata": {
1103
+ "ExecuteTime": {
1104
+ "end_time": "2024-05-01T13:49:09.514129Z",
1105
+ "start_time": "2024-05-01T13:49:09.295101Z"
1106
+ }
1107
+ },
1108
+ "outputs": [
1109
+ {
1110
+ "data": {
1111
  "text/html": [
1112
  "<div>\n",
1113
  "<style scoped>\n",
 
1459
  " </tbody>\n",
1460
  "</table>\n",
1461
  "</div>"
1462
+ ],
1463
+ "text/plain": [
1464
+ " all golden \\\n",
1465
+ " spearman pearson spearman pearson \n",
1466
+ "relative independent \n",
1467
+ "editdist bertscore -0.184962 -0.129057 -0.316215 -0.254700 \n",
1468
+ " bleu 0.260118 0.185995 0.269028 0.259690 \n",
1469
+ " chrF -0.199200 -0.129029 -0.343201 -0.300656 \n",
1470
+ " editdist 0.909934 0.910641 0.710772 0.662808 \n",
1471
+ " gptscore-noref-1-req 0.032048 0.055364 0.155510 0.048588 \n",
1472
+ " gptscore-ref-1-req 0.024550 0.035295 -0.009830 -0.062574 \n",
1473
+ " meteor 0.336016 0.371949 0.068034 0.173237 \n",
1474
+ " rouge1 -0.077574 -0.043738 -0.187349 -0.163230 \n",
1475
+ " rouge2 0.414256 0.340732 0.276139 0.332087 \n",
1476
+ " rougeL 0.006513 -0.008078 -0.041502 -0.034867 \n",
1477
+ " ter 0.618095 0.385515 0.575614 0.501385 \n",
1478
+ "edittime bertscore 0.140481 0.158807 0.140481 0.158807 \n",
1479
+ " bleu 0.302380 0.326167 0.302380 0.326167 \n",
1480
+ " chrF 0.079802 0.184202 0.079802 0.184202 \n",
1481
+ " editdist 0.252645 0.411131 0.252645 0.411131 \n",
1482
+ " gptscore-noref-1-req 0.206465 0.026235 0.206465 0.026235 \n",
1483
+ " gptscore-ref-1-req 0.130419 -0.055218 0.130419 -0.055218 \n",
1484
+ " meteor 0.253380 0.403564 0.253380 0.403564 \n",
1485
+ " rouge1 0.155926 0.136971 0.155926 0.136971 \n",
1486
+ " rouge2 0.218822 0.281944 0.218822 0.281944 \n",
1487
+ " rougeL 0.071344 0.091196 0.071344 0.091196 \n",
1488
+ " ter 0.305601 0.062616 0.305601 0.062616 \n",
1489
+ "\n",
1490
+ " +s2e +e2s \\\n",
1491
+ " spearman pearson spearman pearson \n",
1492
+ "relative independent \n",
1493
+ "editdist bertscore -0.308494 -0.113525 -0.181393 -0.165924 \n",
1494
+ " bleu 0.512841 0.502827 0.109831 0.068138 \n",
1495
+ " chrF -0.238124 -0.064922 -0.233123 -0.201726 \n",
1496
+ " editdist 0.950494 0.935064 0.861930 0.878118 \n",
1497
+ " gptscore-noref-1-req 0.067857 0.047215 -0.029048 -0.013128 \n",
1498
+ " gptscore-ref-1-req -0.015178 -0.036001 0.071345 0.087584 \n",
1499
+ " meteor 0.203616 0.425775 0.372598 0.360051 \n",
1500
+ " rouge1 -0.139874 -0.065543 -0.082093 -0.035603 \n",
1501
+ " rouge2 0.523559 0.537560 0.323911 0.282872 \n",
1502
+ " rougeL -0.022288 -0.004664 0.012409 0.016372 \n",
1503
+ " ter 0.774086 0.462554 0.529338 0.388592 \n",
1504
+ "edittime bertscore NaN NaN NaN NaN \n",
1505
+ " bleu NaN NaN NaN NaN \n",
1506
+ " chrF NaN NaN NaN NaN \n",
1507
+ " editdist NaN NaN NaN NaN \n",
1508
+ " gptscore-noref-1-req NaN NaN NaN NaN \n",
1509
+ " gptscore-ref-1-req NaN NaN NaN NaN \n",
1510
+ " meteor NaN NaN NaN NaN \n",
1511
+ " rouge1 NaN NaN NaN NaN \n",
1512
+ " rouge2 NaN NaN NaN NaN \n",
1513
+ " rougeL NaN NaN NaN NaN \n",
1514
+ " ter NaN NaN NaN NaN \n",
1515
+ "\n",
1516
+ " +e2s+s2e \n",
1517
+ " spearman pearson \n",
1518
+ "relative independent \n",
1519
+ "editdist bertscore -0.135421 -0.091748 \n",
1520
+ " bleu 0.229712 0.145062 \n",
1521
+ " chrF -0.156914 -0.093376 \n",
1522
+ " editdist 0.939318 0.962305 \n",
1523
+ " gptscore-noref-1-req 0.012102 0.066882 \n",
1524
+ " gptscore-ref-1-req 0.013012 0.033618 \n",
1525
+ " meteor 0.392262 0.401802 \n",
1526
+ " rouge1 -0.054034 -0.030799 \n",
1527
+ " rouge2 0.433859 0.324538 \n",
1528
+ " rougeL 0.021983 -0.010644 \n",
1529
+ " ter 0.591684 0.354459 \n",
1530
+ "edittime bertscore NaN NaN \n",
1531
+ " bleu NaN NaN \n",
1532
+ " chrF NaN NaN \n",
1533
+ " editdist NaN NaN \n",
1534
+ " gptscore-noref-1-req NaN NaN \n",
1535
+ " gptscore-ref-1-req NaN NaN \n",
1536
+ " meteor NaN NaN \n",
1537
+ " rouge1 NaN NaN \n",
1538
+ " rouge2 NaN NaN \n",
1539
+ " rougeL NaN NaN \n",
1540
+ " ter NaN NaN "
1541
  ]
1542
  },
1543
  "execution_count": 50,
 
1545
  "output_type": "execute_result"
1546
  }
1547
  ],
1548
+ "source": [
1549
+ "from analysis_util import get_correlations_for_groups\n",
1550
+ "\n",
1551
+ "get_correlations_for_groups(df, right_side=\"ind\")"
1552
+ ]
1553
+ },
1554
+ {
1555
+ "cell_type": "code",
1556
+ "execution_count": null,
1557
+ "id": "d5dc33a4251baf9a",
1558
+ "metadata": {},
1559
+ "outputs": [],
1560
+ "source": [
1561
+ "get_correlations_for_groups(df, right_side=\"aggr\")"
1562
+ ]
1563
+ },
1564
+ {
1565
+ "metadata": {
1566
+ "ExecuteTime": {
1567
+ "end_time": "2024-05-01T15:25:18.226195Z",
1568
+ "start_time": "2024-05-01T15:25:17.464762Z"
1569
+ }
1570
+ },
1571
+ "cell_type": "code",
1572
+ "source": [
1573
+ "from matplotlib import pyplot as plt\n",
1574
+ "\n",
1575
+ "plt.scatter(x=df['edittime_related'], y=df['editdist_related'])"
1576
+ ],
1577
+ "id": "5df60ac60034b274",
1578
+ "outputs": [
1579
+ {
1580
+ "data": {
1581
+ "text/plain": [
1582
+ "<matplotlib.collections.PathCollection at 0x17c179da970>"
1583
+ ]
1584
+ },
1585
+ "execution_count": 11,
1586
+ "metadata": {},
1587
+ "output_type": "execute_result"
1588
+ },
1589
+ {
1590
+ "data": {
1591
+ "text/plain": [
1592
+ "<Figure size 640x480 with 1 Axes>"
1593
+ ],
1594
+ "image/png": ""
1595
+ },
1596
+ "metadata": {},
1597
+ "output_type": "display_data"
1598
+ }
1599
+ ],
1600
+ "execution_count": 11
1601
  }
1602
  ],
1603
  "metadata": {
1604
  "kernelspec": {
1605
+ "display_name": "Python 3 (ipykernel)",
1606
  "language": "python",
1607
  "name": "python3"
1608
  },
1609
  "language_info": {
1610
  "codemirror_mode": {
1611
  "name": "ipython",
1612
+ "version": 3
1613
  },
1614
  "file_extension": ".py",
1615
  "mimetype": "text/x-python",
1616
  "name": "python",
1617
  "nbconvert_exporter": "python",
1618
+ "pygments_lexer": "ipython3",
1619
+ "version": "3.9.5"
1620
  }
1621
  },
1622
  "nbformat": 4,
analysis_util.py CHANGED
@@ -1,6 +1,31 @@
 
 
 
1
  import pandas as pd
2
 
3
- from generation_steps.metrics_analysis import correlations_for_group
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  def split_metrics_string(s):
@@ -8,10 +33,10 @@ def split_metrics_string(s):
8
  return tokens[1], tokens[3]
9
 
10
 
11
- def get_ref_only_correlations_df(df):
12
  correlations_raw = correlations_for_group(df)
13
 
14
- idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index))
15
 
16
  data = []
17
  for metrics in idx:
@@ -29,8 +54,8 @@ def get_ref_only_correlations_df(df):
29
  return result
30
 
31
 
32
- def get_ref_only_correlations_for_groups(df):
33
- noref_correlations = {"all": get_ref_only_correlations_df(df)}
34
 
35
  for e2s in (False, True):
36
  for s2e in (False, True):
@@ -43,7 +68,7 @@ def get_ref_only_correlations_for_groups(df):
43
  suffix = "golden"
44
 
45
  subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
46
- subdf_noref_corr = get_ref_only_correlations_df(subdf)
47
  noref_correlations[suffix] = subdf_noref_corr
48
 
49
  noref_correlations = pd.concat(noref_correlations, axis=1)
 
1
+ import functools
2
+ import operator
3
+
4
  import pandas as pd
5
 
6
+
7
+ def correlations_for_group(group):
8
+ REL_METRICS = [col.split("_")[0] for col in group.colmns if col.endswith("_related")]
9
+ IND_METRICS = [col.split("_")[0] for col in group.colmns if col.endswith("_independent")]
10
+ AGGR_METRICS = [col.split("_")[0] for col in group.colmns if col.endswith("_aggr")]
11
+
12
+ correlations = []
13
+ for rel_metric in REL_METRICS:
14
+ for ind_metric in IND_METRICS:
15
+ correlations.append({
16
+ f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
17
+ group[f"{ind_metric}_independent"], method="pearson"),
18
+ f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
19
+ group[f"{ind_metric}_independent"], method="spearman"),
20
+ })
21
+ for aggr_metric in AGGR_METRICS:
22
+ correlations.append({
23
+ f"rel_{rel_metric}_aggr_{aggr_metric}_pearson": group[f"{rel_metric}_related"].corr(
24
+ group[f"{aggr_metric}_aggr"], method="pearson"),
25
+ f"rel_{rel_metric}_aggr_{aggr_metric}_spearman": group[f"{rel_metric}_related"].corr(
26
+ group[f"{aggr_metric}_aggr"], method="spearman"),
27
+ })
28
+ return pd.Series(functools.reduce(operator.ior, correlations, {}))
29
 
30
 
31
  def split_metrics_string(s):
 
33
  return tokens[1], tokens[3]
34
 
35
 
36
+ def get_correlations_df(df, right_side):
37
  correlations_raw = correlations_for_group(df)
38
 
39
+ idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index if right_side in col))
40
 
41
  data = []
42
  for metrics in idx:
 
54
  return result
55
 
56
 
57
+ def get_correlations_for_groups(df, right_side):
58
+ noref_correlations = {"all": get_correlations_df(df, right_side=right_side)}
59
 
60
  for e2s in (False, True):
61
  for s2e in (False, True):
 
68
  suffix = "golden"
69
 
70
  subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
71
+ subdf_noref_corr = get_correlations_for_groups(subdf, right_side=right_side)
72
  noref_correlations[suffix] = subdf_noref_corr
73
 
74
  noref_correlations = pd.concat(noref_correlations, axis=1)
change_visualizer.py CHANGED
@@ -108,7 +108,10 @@ if __name__ == '__main__':
108
  layout_for_statistics("synthetic")
109
 
110
  gr.Markdown(f"### Reference-only correlations")
111
- gr.Markdown(value=analysis_util.get_ref_only_correlations_for_groups(df_synthetic).to_markdown())
 
 
 
112
 
113
  application.load(update_dataset_view_manual, inputs=slider_manual,
114
  outputs=view_manual)
 
108
  layout_for_statistics("synthetic")
109
 
110
  gr.Markdown(f"### Reference-only correlations")
111
+ gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
112
+
113
+ gr.Markdown(f"### Aggregated correlations")
114
+ gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
115
 
116
  application.load(update_dataset_view_manual, inputs=slider_manual,
117
  outputs=view_manual)
generation_steps/metrics_analysis.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
7
  from tqdm import tqdm
8
 
9
  import config
 
10
  from api_wrappers import hf_data_loader
11
  from custom_metrics import gpt_eval
12
 
@@ -110,6 +111,10 @@ IND_METRICS = {
110
  "ter": ter_fn,
111
  }
112
 
 
 
 
 
113
  REL_METRICS = {
114
  "editdist": edit_distance_fn,
115
  "edittime": edit_time_fn,
@@ -128,6 +133,22 @@ def compute_metrics(df):
128
  def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
129
  return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  for metric in REL_METRICS:
132
  print(f"Computing {metric} for the related pairs")
133
  metric_fn = REL_METRICS[metric]
@@ -158,24 +179,15 @@ def compute_metrics(df):
158
  df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
159
  df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
160
 
161
- return df
 
 
162
 
 
 
 
 
163
 
164
- def correlations_for_group(group):
165
- correlations = []
166
- for rel_metric in REL_METRICS:
167
- # correlations.append({
168
- # f"{metric}_pearson": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="pearson"),
169
- # f"{metric}_spearman": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="spearman")
170
- # })
171
- for ind_metric in IND_METRICS:
172
- correlations.append({
173
- f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
174
- group[f"{ind_metric}_independent"], method="pearson"),
175
- f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
176
- group[f"{ind_metric}_independent"], method="spearman"),
177
- })
178
- return pd.Series(functools.reduce(operator.ior, correlations, {}))
179
 
180
 
181
  def compute_correlations(df: pd.DataFrame):
 
7
  from tqdm import tqdm
8
 
9
  import config
10
+ from analysis_util import correlations_for_group
11
  from api_wrappers import hf_data_loader
12
  from custom_metrics import gpt_eval
13
 
 
111
  "ter": ter_fn,
112
  }
113
 
114
+ AGGR_METRICS = IND_METRICS.copy()
115
+ del AGGR_METRICS["gptscore-ref-1-req"]
116
+ del AGGR_METRICS["gptscore-noref-1-req"]
117
+
118
  REL_METRICS = {
119
  "editdist": edit_distance_fn,
120
  "edittime": edit_time_fn,
 
133
  def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
134
  return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
135
 
136
+ for metric in AGGR_METRICS:
137
+ print(f"Computing {metric} for the aggregated independent pairs")
138
+ values = []
139
+ for i, row in tqdm(df.iterrows(), total=len(df)):
140
+ others = df[(df["hash"] == row["hash"]) & (df["repo"] == row["repo"]) & (
141
+ df["commit_msg_start"] != row["commit_msg_start"])]['commit_msg_end'].to_list()
142
+ others.append(row["reference"])
143
+ others = list(set(others))
144
+ metric_fn = AGGR_METRICS[metric]
145
+ values.append(
146
+ metric_fn(
147
+ row['commit_msg_start'], None, refs=others, edittime=row['edit_time'], diff=str(row['mods'])
148
+ )
149
+ )
150
+ df[f"{metric}_aggr"] = values
151
+
152
  for metric in REL_METRICS:
153
  print(f"Computing {metric} for the related pairs")
154
  metric_fn = REL_METRICS[metric]
 
179
  df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
180
  df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
181
 
182
+ for aggr_metric in AGGR_METRICS:
183
+ df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
184
+ df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
185
 
186
+ df[f"rel_{rel_metric}_ind_{aggr_metric}_spearman"] = (
187
+ df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
188
+
189
+ return df
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
 
193
  def compute_correlations(df: pd.DataFrame):
requirements.txt CHANGED
@@ -63,7 +63,6 @@ jsonpointer==2.4
63
  jsonschema==4.21.1
64
  jsonschema-specifications==2023.12.1
65
  kiwisolver==1.4.5
66
- Levenshtein==0.25.1
67
  lxml==5.2.1
68
  markdown-it-py==3.0.0
69
  MarkupSafe==2.1.5
 
63
  jsonschema==4.21.1
64
  jsonschema-specifications==2023.12.1
65
  kiwisolver==1.4.5
 
66
  lxml==5.2.1
67
  markdown-it-py==3.0.0
68
  MarkupSafe==2.1.5