giulio98 commited on
Commit
b7f853e
1 Parent(s): aff5728

Upload 7 files

Browse files
Files changed (6) hide show
  1. bleu.py +584 -128
  2. calc_code_bleu.py +72 -0
  3. dataflow_match.py +9 -1274
  4. readme.txt +1 -0
  5. syntax_match.py +9 -1274
  6. weighted_ngram_match.py +4 -102
bleu.py CHANGED
@@ -1,134 +1,590 @@
1
- # Copyright 2017 Google Inc. All Rights Reserved.
 
2
  #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # ==============================================================================
15
-
16
- """Python implementation of BLEU and smooth-BLEU.
17
-
18
- This module provides a Python implementation of BLEU and smooth-BLEU.
19
- Smooth BLEU is computed following the method outlined in the paper:
20
- Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
- evaluation metrics for machine translation. COLING 2004.
22
- """
23
-
24
- import collections
25
  import math
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- def _get_ngrams(segment, max_order):
29
- """Extracts all n-grams upto a given maximum order from an input segment.
30
-
31
- Args:
32
- segment: text segment from which n-grams will be extracted.
33
- max_order: maximum length in tokens of the n-grams returned by this
34
- methods.
35
-
36
- Returns:
37
- The Counter containing all n-grams upto max_order in segment
38
- with a count of how many times each n-gram occurred.
39
- """
40
- ngram_counts = collections.Counter()
41
- for order in range(1, max_order + 1):
42
- for i in range(0, len(segment) - order + 1):
43
- ngram = tuple(segment[i:i+order])
44
- ngram_counts[ngram] += 1
45
- return ngram_counts
46
-
47
-
48
- def compute_bleu(reference_corpus, translation_corpus, max_order=4,
49
- smooth=False):
50
- """Computes BLEU score of translated segments against one or more references.
51
-
52
- Args:
53
- reference_corpus: list of lists of references for each translation. Each
54
- reference should be tokenized into a list of tokens.
55
- translation_corpus: list of translations to score. Each translation
56
- should be tokenized into a list of tokens.
57
- max_order: Maximum n-gram order to use when computing BLEU score.
58
- smooth: Whether or not to apply Lin et al. 2004 smoothing.
59
-
60
- Returns:
61
- 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
62
- precisions and brevity penalty.
63
- """
64
- matches_by_order = [0] * max_order
65
- possible_matches_by_order = [0] * max_order
66
- reference_length = 0
67
- translation_length = 0
68
- for (references, translation) in zip(reference_corpus,
69
- translation_corpus):
70
- reference_length += min(len(r) for r in references)
71
- translation_length += len(translation)
72
-
73
- merged_ref_ngram_counts = collections.Counter()
74
  for reference in references:
75
- merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
76
- translation_ngram_counts = _get_ngrams(translation, max_order)
77
- overlap = translation_ngram_counts & merged_ref_ngram_counts
78
- for ngram in overlap:
79
- matches_by_order[len(ngram)-1] += overlap[ngram]
80
- for order in range(1, max_order+1):
81
- possible_matches = len(translation) - order + 1
82
- if possible_matches > 0:
83
- possible_matches_by_order[order-1] += possible_matches
84
-
85
- precisions = [0] * max_order
86
- for i in range(0, max_order):
87
- if smooth:
88
- precisions[i] = ((matches_by_order[i] + 1.) /
89
- (possible_matches_by_order[i] + 1.))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  else:
91
- if possible_matches_by_order[i] > 0:
92
- precisions[i] = (float(matches_by_order[i]) /
93
- possible_matches_by_order[i])
94
- else:
95
- precisions[i] = 0.0
96
-
97
- if min(precisions) > 0:
98
- p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
99
- geo_mean = math.exp(p_log_sum)
100
- else:
101
- geo_mean = 0
102
-
103
- ratio = float(translation_length) / reference_length
104
-
105
- if ratio > 1.0:
106
- bp = 1.
107
- else:
108
- bp = math.exp(1 - 1. / ratio)
109
-
110
- bleu = geo_mean * bp
111
-
112
- return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
-
114
-
115
- def _bleu(ref_file, trans_file, subword_option=None):
116
- max_order = 4
117
- smooth = True
118
- ref_files = [ref_file]
119
- reference_text = []
120
- for reference_filename in ref_files:
121
- with open(reference_filename) as fh:
122
- reference_text.append(fh.readlines())
123
- per_segment_references = []
124
- for references in zip(*reference_text):
125
- reference_list = []
126
- for reference in references:
127
- reference_list.append(reference.strip().split())
128
- per_segment_references.append(reference_list)
129
- translations = []
130
- with open(trans_file) as fh:
131
- for line in fh:
132
- translations.append(line.strip().split())
133
- bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
134
- return round(100 * bleu_score,2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Natural Language Toolkit: BLEU Score
3
  #
4
+ # Copyright (C) 2001-2020 NLTK Project
5
+ # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
6
+ # Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
7
+ # URL: <http://nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """BLEU score implementation."""
11
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  import math
13
+ import sys
14
+ from fractions import Fraction
15
+ import warnings
16
+ from collections import Counter
17
+
18
+ from .utils import ngrams
19
+ import pdb
20
+
21
+
22
+ def sentence_bleu(
23
+ references,
24
+ hypothesis,
25
+ weights=(0.25, 0.25, 0.25, 0.25),
26
+ smoothing_function=None,
27
+ auto_reweigh=False,
28
+ ):
29
+ """
30
+ Calculate BLEU score (Bilingual Evaluation Understudy) from
31
+ Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
32
+ "BLEU: a method for automatic evaluation of machine translation."
33
+ In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
34
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
35
+ ... 'ensures', 'that', 'the', 'military', 'always',
36
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
37
+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
38
+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
39
+ ... 'that', 'party', 'direct']
40
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
41
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
42
+ ... 'heed', 'Party', 'commands']
43
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
44
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
45
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
46
+ ... 'Party']
47
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
48
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
49
+ ... 'of', 'the', 'party']
50
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
51
+ 0.5045...
52
+ If there is no ngrams overlap for any order of n-grams, BLEU returns the
53
+ value 0. This is because the precision for the order of n-grams without
54
+ overlap is 0, and the geometric mean in the final BLEU score computation
55
+ multiplies the 0 with the precision of other n-grams. This results in 0
56
+ (independently of the precision of the other n-gram orders). The following
57
+ example has zero 3-gram and 4-gram overlaps:
58
+ >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
59
+ 0.0
60
+ To avoid this harsh behaviour when no ngram overlaps are found a smoothing
61
+ function can be used.
62
+ >>> chencherry = SmoothingFunction()
63
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
64
+ ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
65
+ 0.0370...
66
+ The default BLEU calculates a score for up to 4-grams using uniform
67
+ weights (this is called BLEU-4). To evaluate your translations with
68
+ higher/lower order ngrams, use customized weights. E.g. when accounting
69
+ for up to 5-grams with uniform weights (this is called BLEU-5) use:
70
+ >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
71
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
72
+ 0.3920...
73
+ :param references: reference sentences
74
+ :type references: list(list(str))
75
+ :param hypothesis: a hypothesis sentence
76
+ :type hypothesis: list(str)
77
+ :param weights: weights for unigrams, bigrams, trigrams and so on
78
+ :type weights: list(float)
79
+ :param smoothing_function:
80
+ :type smoothing_function: SmoothingFunction
81
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
82
+ :type auto_reweigh: bool
83
+ :return: The sentence-level BLEU score.
84
+ :rtype: float
85
+ """
86
+ return corpus_bleu(
87
+ [references], [hypothesis], weights, smoothing_function, auto_reweigh
88
+ )
89
+
90
+
91
+ def corpus_bleu(
92
+ list_of_references,
93
+ hypotheses,
94
+ weights=(0.25, 0.25, 0.25, 0.25),
95
+ smoothing_function=None,
96
+ auto_reweigh=False,
97
+ ):
98
+ """
99
+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
100
+ the hypotheses and their respective references.
101
+ Instead of averaging the sentence level BLEU scores (i.e. macro-average
102
+ precision), the original BLEU metric (Papineni et al. 2002) accounts for
103
+ the micro-average precision (i.e. summing the numerators and denominators
104
+ for each hypothesis-reference(s) pairs before the division).
105
+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
106
+ ... 'ensures', 'that', 'the', 'military', 'always',
107
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
108
+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
109
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
110
+ ... 'heed', 'Party', 'commands']
111
+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
112
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
113
+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
114
+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
115
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
116
+ ... 'of', 'the', 'party']
117
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
118
+ ... 'interested', 'in', 'world', 'history']
119
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
120
+ ... 'because', 'he', 'read', 'the', 'book']
121
+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
122
+ >>> hypotheses = [hyp1, hyp2]
123
+ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
124
+ 0.5920...
125
+ The example below show that corpus_bleu() is different from averaging
126
+ sentence_bleu() for hypotheses
127
+ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
128
+ >>> score2 = sentence_bleu([ref2a], hyp2)
129
+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
130
+ 0.6223...
131
+ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
132
+ :type list_of_references: list(list(list(str)))
133
+ :param hypotheses: a list of hypothesis sentences
134
+ :type hypotheses: list(list(str))
135
+ :param weights: weights for unigrams, bigrams, trigrams and so on
136
+ :type weights: list(float)
137
+ :param smoothing_function:
138
+ :type smoothing_function: SmoothingFunction
139
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
140
+ :type auto_reweigh: bool
141
+ :return: The corpus-level BLEU score.
142
+ :rtype: float
143
+ """
144
+ # Before proceeding to compute BLEU, perform sanity checks.
145
+
146
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
147
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
148
+ hyp_lengths, ref_lengths = 0, 0
149
+
150
+ assert len(list_of_references) == len(hypotheses), (
151
+ "The number of hypotheses and their reference(s) should be the " "same "
152
+ )
153
+
154
+ # Iterate through each hypothesis and their corresponding references.
155
+ for references, hypothesis in zip(list_of_references, hypotheses):
156
+ # For each order of ngram, calculate the numerator and
157
+ # denominator for the corpus-level modified precision.
158
+ for i, _ in enumerate(weights, start=1):
159
+ p_i = modified_precision(references, hypothesis, i)
160
+ p_numerators[i] += p_i.numerator
161
+ p_denominators[i] += p_i.denominator
162
+
163
+ # Calculate the hypothesis length and the closest reference length.
164
+ # Adds them to the corpus-level hypothesis and reference counts.
165
+ hyp_len = len(hypothesis)
166
+ hyp_lengths += hyp_len
167
+ ref_lengths += closest_ref_length(references, hyp_len)
168
+
169
+ # Calculate corpus-level brevity penalty.
170
+ bp = brevity_penalty(ref_lengths, hyp_lengths)
171
+
172
+ # Uniformly re-weighting based on maximum hypothesis lengths if largest
173
+ # order of n-grams < 4 and weights is set at default.
174
+ if auto_reweigh:
175
+ if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
176
+ weights = (1 / hyp_lengths,) * hyp_lengths
177
+
178
+ # Collects the various precision values for the different ngram orders.
179
+ p_n = [
180
+ Fraction(p_numerators[i], p_denominators[i], _normalize=False)
181
+ for i, _ in enumerate(weights, start=1)
182
+ ]
183
+
184
+ # Returns 0 if there's no matching n-grams
185
+ # We only need to check for p_numerators[1] == 0, since if there's
186
+ # no unigrams, there won't be any higher order ngrams.
187
+ if p_numerators[1] == 0:
188
+ return 0
189
+
190
+ # If there's no smoothing, use method1 from the SmoothingFunction class.
191
+ if not smoothing_function:
192
+ smoothing_function = SmoothingFunction().method1
193
+ # Smoothen the modified precision.
194
+ # Note: smoothing_function() may convert values into floats;
195
+ # it tries to retain the Fraction object as much as the
196
+ # smoothing method allows.
197
+ p_n = smoothing_function(
198
+ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
199
+ )
200
+ s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
201
+ s = bp * math.exp(math.fsum(s))
202
+ return s
203
+
204
 
205
+ def modified_precision(references, hypothesis, n):
206
+ """
207
+ Calculate modified ngram precision.
208
+ The normal precision method may lead to some wrong translations with
209
+ high-precision, e.g., the translation, in which a word of reference
210
+ repeats several times, has very high precision.
211
+ This function only returns the Fraction object that contains the numerator
212
+ and denominator necessary to calculate the corpus-level precision.
213
+ To calculate the modified precision for a single pair of hypothesis and
214
+ references, cast the Fraction object into a float.
215
+ The famous "the the the ... " example shows that you can get BLEU precision
216
+ by duplicating high frequency words.
217
+ >>> reference1 = 'the cat is on the mat'.split()
218
+ >>> reference2 = 'there is a cat on the mat'.split()
219
+ >>> hypothesis1 = 'the the the the the the the'.split()
220
+ >>> references = [reference1, reference2]
221
+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
222
+ 0.2857...
223
+ In the modified n-gram precision, a reference word will be considered
224
+ exhausted after a matching hypothesis word is identified, e.g.
225
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
226
+ ... 'ensures', 'that', 'the', 'military', 'will',
227
+ ... 'forever', 'heed', 'Party', 'commands']
228
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
229
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
230
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
231
+ ... 'Party']
232
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
233
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
234
+ ... 'of', 'the', 'party']
235
+ >>> hypothesis = 'of the'.split()
236
+ >>> references = [reference1, reference2, reference3]
237
+ >>> float(modified_precision(references, hypothesis, n=1))
238
+ 1.0
239
+ >>> float(modified_precision(references, hypothesis, n=2))
240
+ 1.0
241
+ An example of a normal machine translation hypothesis:
242
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
243
+ ... 'ensures', 'that', 'the', 'military', 'always',
244
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
245
+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
246
+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
247
+ ... 'that', 'party', 'direct']
248
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
249
+ ... 'ensures', 'that', 'the', 'military', 'will',
250
+ ... 'forever', 'heed', 'Party', 'commands']
251
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
252
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
253
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
254
+ ... 'Party']
255
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
256
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
257
+ ... 'of', 'the', 'party']
258
+ >>> references = [reference1, reference2, reference3]
259
+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
260
+ 0.9444...
261
+ >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
262
+ 0.5714...
263
+ >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
264
+ 0.5882352941176471
265
+ >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
266
+ 0.07692...
267
+ :param references: A list of reference translations.
268
+ :type references: list(list(str))
269
+ :param hypothesis: A hypothesis translation.
270
+ :type hypothesis: list(str)
271
+ :param n: The ngram order.
272
+ :type n: int
273
+ :return: BLEU's modified precision for the nth order ngram.
274
+ :rtype: Fraction
275
+ """
276
+ # Extracts all ngrams in hypothesis
277
+ # Set an empty Counter if hypothesis is empty.
278
 
279
+ counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
280
+ # Extract a union of references' counts.
281
+ # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
282
+ max_counts = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  for reference in references:
284
+ reference_counts = (
285
+ Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
286
+ )
287
+ for ngram in counts:
288
+ max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
289
+
290
+ # Assigns the intersection between hypothesis and references' counts.
291
+ clipped_counts = {
292
+ ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
293
+ }
294
+
295
+ numerator = sum(clipped_counts.values())
296
+ # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
297
+ # Usually this happens when the ngram order is > len(reference).
298
+ denominator = max(1, sum(counts.values()))
299
+
300
+ return Fraction(numerator, denominator, _normalize=False)
301
+
302
+
303
+ def closest_ref_length(references, hyp_len):
304
+ """
305
+ This function finds the reference that is the closest length to the
306
+ hypothesis. The closest reference length is referred to as *r* variable
307
+ from the brevity penalty formula in Papineni et. al. (2002)
308
+ :param references: A list of reference translations.
309
+ :type references: list(list(str))
310
+ :param hyp_len: The length of the hypothesis.
311
+ :type hyp_len: int
312
+ :return: The length of the reference that's closest to the hypothesis.
313
+ :rtype: int
314
+ """
315
+ ref_lens = (len(reference) for reference in references)
316
+ closest_ref_len = min(
317
+ ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
318
+ )
319
+ return closest_ref_len
320
+
321
+
322
+ def brevity_penalty(closest_ref_len, hyp_len):
323
+ """
324
+ Calculate brevity penalty.
325
+ As the modified n-gram precision still has the problem from the short
326
+ length sentence, brevity penalty is used to modify the overall BLEU
327
+ score according to length.
328
+ An example from the paper. There are three references with length 12, 15
329
+ and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.
330
+ >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
331
+ >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
332
+ >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
333
+ >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
334
+ >>> references = [reference1, reference2, reference3]
335
+ >>> hyp_len = len(hypothesis)
336
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
337
+ >>> brevity_penalty(closest_ref_len, hyp_len)
338
+ 1.0
339
+ In case a hypothesis translation is shorter than the references, penalty is
340
+ applied.
341
+ >>> references = [['a'] * 28, ['a'] * 28]
342
+ >>> hypothesis = ['a'] * 12
343
+ >>> hyp_len = len(hypothesis)
344
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
345
+ >>> brevity_penalty(closest_ref_len, hyp_len)
346
+ 0.2635971381157267
347
+ The length of the closest reference is used to compute the penalty. If the
348
+ length of a hypothesis is 12, and the reference lengths are 13 and 2, the
349
+ penalty is applied because the hypothesis length (12) is less than the
350
+ closest reference length (13).
351
+ >>> references = [['a'] * 13, ['a'] * 2]
352
+ >>> hypothesis = ['a'] * 12
353
+ >>> hyp_len = len(hypothesis)
354
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
355
+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
356
+ 0.9200...
357
+ The brevity penalty doesn't depend on reference order. More importantly,
358
+ when two reference sentences are at the same distance, the shortest
359
+ reference sentence length is used.
360
+ >>> references = [['a'] * 13, ['a'] * 11]
361
+ >>> hypothesis = ['a'] * 12
362
+ >>> hyp_len = len(hypothesis)
363
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
364
+ >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
365
+ >>> hyp_len = len(hypothesis)
366
+ >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
367
+ >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
368
+ >>> bp1 == bp2 == 1
369
+ True
370
+ A test example from mteval-v13a.pl (starting from the line 705):
371
+ >>> references = [['a'] * 11, ['a'] * 8]
372
+ >>> hypothesis = ['a'] * 7
373
+ >>> hyp_len = len(hypothesis)
374
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
375
+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
376
+ 0.8668...
377
+ >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
378
+ >>> hypothesis = ['a'] * 7
379
+ >>> hyp_len = len(hypothesis)
380
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
381
+ >>> brevity_penalty(closest_ref_len, hyp_len)
382
+ 1.0
383
+ :param hyp_len: The length of the hypothesis for a single sentence OR the
384
+ sum of all the hypotheses' lengths for a corpus
385
+ :type hyp_len: int
386
+ :param closest_ref_len: The length of the closest reference for a single
387
+ hypothesis OR the sum of all the closest references for every hypotheses.
388
+ :type closest_ref_len: int
389
+ :return: BLEU's brevity penalty.
390
+ :rtype: float
391
+ """
392
+ if hyp_len > closest_ref_len:
393
+ return 1
394
+ # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
395
+ elif hyp_len == 0:
396
+ return 0
397
  else:
398
+ return math.exp(1 - closest_ref_len / hyp_len)
399
+
400
+
401
+ class SmoothingFunction:
402
+ """
403
+ This is an implementation of the smoothing techniques
404
+ for segment-level BLEU scores that was presented in
405
+ Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
406
+ Smoothing Techniques for Sentence-Level BLEU. In WMT14.
407
+ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
408
+ """
409
+
410
+ def __init__(self, epsilon=0.1, alpha=5, k=5):
411
+ """
412
+ This will initialize the parameters required for the various smoothing
413
+ techniques, the default values are set to the numbers used in the
414
+ experiments from Chen and Cherry (2014).
415
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
416
+ ... 'that', 'the', 'military', 'always', 'obeys', 'the',
417
+ ... 'commands', 'of', 'the', 'party']
418
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
419
+ ... 'that', 'the', 'military', 'will', 'forever', 'heed',
420
+ ... 'Party', 'commands']
421
+ >>> chencherry = SmoothingFunction()
422
+ >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
423
+ 0.4118...
424
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
425
+ 0.4118...
426
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
427
+ 0.4118...
428
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
429
+ 0.4489...
430
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
431
+ 0.4118...
432
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
433
+ 0.4118...
434
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
435
+ 0.4905...
436
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
437
+ 0.4135...
438
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
439
+ 0.4905...
440
+ :param epsilon: the epsilon value use in method 1
441
+ :type epsilon: float
442
+ :param alpha: the alpha value use in method 6
443
+ :type alpha: int
444
+ :param k: the k value use in method 4
445
+ :type k: int
446
+ """
447
+ self.epsilon = epsilon
448
+ self.alpha = alpha
449
+ self.k = k
450
+
451
+ def method0(self, p_n, *args, **kwargs):
452
+ """
453
+ No smoothing.
454
+ """
455
+ p_n_new = []
456
+ for i, p_i in enumerate(p_n):
457
+ if p_i.numerator != 0:
458
+ p_n_new.append(p_i)
459
+ else:
460
+ _msg = str(
461
+ "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
462
+ "Therefore the BLEU score evaluates to 0, independently of\n"
463
+ "how many N-gram overlaps of lower order it contains.\n"
464
+ "Consider using lower n-gram order or use "
465
+ "SmoothingFunction()"
466
+ ).format(i + 1)
467
+ warnings.warn(_msg)
468
+ # When numerator==0 where denominator==0 or !=0, the result
469
+ # for the precision score should be equal to 0 or undefined.
470
+ # Due to BLEU geometric mean computation in logarithm space,
471
+ # we need to return sys.float_info.min such that
472
+ # math.log(sys.float_info.min) returns a 0 precision score.
473
+ p_n_new.append(sys.float_info.min)
474
+ return p_n_new
475
+
476
+ def method1(self, p_n, *args, **kwargs):
477
+ """
478
+ Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
479
+ """
480
+ return [
481
+ (p_i.numerator + self.epsilon) / p_i.denominator
482
+ if p_i.numerator == 0
483
+ else p_i
484
+ for p_i in p_n
485
+ ]
486
+
487
+ def method2(self, p_n, *args, **kwargs):
488
+ """
489
+ Smoothing method 2: Add 1 to both numerator and denominator from
490
+ Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
491
+ machine translation quality using longest common subsequence and
492
+ skip-bigram statistics. In ACL04.
493
+ """
494
+ return [
495
+ Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
496
+ for p_i in p_n
497
+ ]
498
+
499
+ def method3(self, p_n, *args, **kwargs):
500
+ """
501
+ Smoothing method 3: NIST geometric sequence smoothing
502
+ The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
503
+ precision score whose matching n-gram count is null.
504
+ k is 1 for the first 'n' value for which the n-gram match count is null/
505
+ For example, if the text contains:
506
+ - one 2-gram match
507
+ - and (consequently) two 1-gram matches
508
+ the n-gram count for each individual precision score would be:
509
+ - n=1 => prec_count = 2 (two unigrams)
510
+ - n=2 => prec_count = 1 (one bigram)
511
+ - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
512
+ - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
513
+ """
514
+ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
515
+ for i, p_i in enumerate(p_n):
516
+ if p_i.numerator == 0:
517
+ p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
518
+ incvnt += 1
519
+ return p_n
520
+
521
+ def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
522
+ """
523
+ Smoothing method 4:
524
+ Shorter translations may have inflated precision values due to having
525
+ smaller denominators; therefore, we give them proportionally
526
+ smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
527
+ suggests dividing by 1/ln(len(T)), where T is the length of the translation.
528
+ """
529
+ hyp_len = hyp_len if hyp_len else len(hypothesis)
530
+ for i, p_i in enumerate(p_n):
531
+ if p_i.numerator == 0 and hyp_len != 0:
532
+ incvnt = i + 1 * self.k / math.log(
533
+ hyp_len
534
+ ) # Note that this K is different from the K from NIST.
535
+ p_n[i] = incvnt / p_i.denominator
536
+ return p_n
537
+
538
+ def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
539
+ """
540
+ Smoothing method 5:
541
+ The matched counts for similar values of n should be similar. To
542
+ calculate the n-gram matched count, it averages the n−1, n and n+1 gram
543
+ matched counts.
544
+ """
545
+ hyp_len = hyp_len if hyp_len else len(hypothesis)
546
+ m = {}
547
+ # Requires a precision value for an additional ngram order.
548
+ p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
549
+ m[-1] = p_n[0] + 1
550
+ for i, p_i in enumerate(p_n):
551
+ p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
552
+ m[i] = p_n[i]
553
+ return p_n
554
+
555
+ def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
556
+ """
557
+ Smoothing method 6:
558
+ Interpolates the maximum likelihood estimate of the precision *p_n* with
559
+ a prior estimate *pi0*. The prior is estimated by assuming that the ratio
560
+ between pn and pn−1 will be the same as that between pn−1 and pn−2; from
561
+ Gao and He (2013) Training MRF-Based Phrase Translation Models using
562
+ Gradient Ascent. In NAACL.
563
+ """
564
+ hyp_len = hyp_len if hyp_len else len(hypothesis)
565
+ # This smoothing only works when p_1 and p_2 are non-zero.
566
+ # Raise an error with an appropriate message when the input is too short
567
+ # to use this smoothing technique.
568
+ assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
569
+ for i, p_i in enumerate(p_n):
570
+ if i in [0, 1]: # Skips the first 2 orders of ngrams.
571
+ continue
572
+ else:
573
+ pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
574
+ # No. of ngrams in translation that matches the reference.
575
+ m = p_i.numerator
576
+ # No. of ngrams in translation.
577
+ l = sum(1 for _ in ngrams(hypothesis, i + 1))
578
+ # Calculates the interpolated precision.
579
+ p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
580
+ return p_n
581
+
582
+ def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
583
+ """
584
+ Smoothing method 7:
585
+ Interpolates methods 4 and 5.
586
+ """
587
+ hyp_len = hyp_len if hyp_len else len(hypothesis)
588
+ p_n = self.method4(p_n, references, hypothesis, hyp_len)
589
+ p_n = self.method5(p_n, references, hypothesis, hyp_len)
590
+ return p_n
calc_code_bleu.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ # -*- coding:utf-8 -*-
5
+ import argparse
6
+ from .bleu import corpus_bleu
7
+ from .weighted_ngram_match import corpus_weighted_ngram_match
8
+ from .syntax_match import corpus_syntax_match
9
+ from .dataflow_match import corpus_dataflow_match
10
+ import os
11
+
12
def calculate(predictions, references, language="python", alpha=0.25, beta=0.25, gamma=0.25, theta=0.25):
    """Compute the CodeBLEU score of *predictions* against *references*.

    Args:
        predictions: list of hypothesis code strings, one per example.
        references: list of reference sets; ``references[j][i]`` is the j-th
            reference for the i-th example (each set must align 1:1 with
            ``predictions``).
        language: key selecting the keyword list (``keywords/<language>.txt``)
            and the tree-sitter parsers used by the syntax/dataflow matchers.
        alpha, beta, gamma, theta: weights of the four component scores.

    Returns:
        dict with ``ngram_match_score``, ``weighted_ngram_match_score``,
        ``syntax_match_score``, ``dataflow_match_score`` and the weighted
        ``code_bleu_score``.
    """
    # Normalise surrounding whitespace on every input string.
    pre_references = [[s.strip() for s in ref_list] for ref_list in references]
    hypothesis = [s.strip() for s in predictions]

    # Every reference set must have one entry per hypothesis.
    for ref_list in pre_references:
        assert len(hypothesis) == len(ref_list)

    # Transpose: gather all references belonging to each hypothesis.
    references = []
    for i in range(len(hypothesis)):
        references.append([pre_references[j][i] for j in range(len(pre_references))])
    # BUG FIX: the original asserted
    #   len(references) == len(pre_references) * len(hypothesis)
    # which is only true for a single reference set and raises as soon as
    # more than one set is supplied.  The correct invariant is one grouped
    # reference list per hypothesis.
    assert len(references) == len(hypothesis)

    # 1) Plain ngram match (BLEU) on whitespace-tokenised code.
    tokenized_hyps = [x.split() for x in hypothesis]
    tokenized_refs = [[x.split() for x in reference] for reference in references]
    ngram_match_score = corpus_bleu(tokenized_refs, tokenized_hyps)

    # 2) Keyword-weighted ngram match: language keywords weigh 1, all other
    # tokens 0.2.  Use a set for O(1) membership and close the file handle
    # (the original leaked it via a bare open()).
    curr_path = os.path.dirname(os.path.abspath(__file__))
    keyword_path = os.path.join(curr_path, "keywords", language + ".txt")
    with open(keyword_path, "r", encoding="utf-8") as fh:
        keywords = {line.strip() for line in fh}

    def make_weights(reference_tokens, key_word_list):
        # One weight per distinct token of the reference.
        return {token: 1 if token in key_word_list else 0.2
                for token in reference_tokens}

    tokenized_refs_with_weights = [
        [[reference_tokens, make_weights(reference_tokens, keywords)]
         for reference_tokens in reference]
        for reference in tokenized_refs
    ]
    weighted_ngram_match_score = corpus_weighted_ngram_match(tokenized_refs_with_weights, tokenized_hyps)

    # 3) AST subtree (syntax) match.
    syntax_match_score = corpus_syntax_match(references, hypothesis, language)

    # 4) Dataflow-graph match.
    dataflow_match_score = corpus_dataflow_match(references, hypothesis, language)

    # Weighted combination of the four components.
    code_bleu_score = (alpha * ngram_match_score
                       + beta * weighted_ngram_match_score
                       + gamma * syntax_match_score
                       + theta * dataflow_match_score)

    return {
        "ngram_match_score": ngram_match_score,
        "weighted_ngram_match_score": weighted_ngram_match_score,
        "syntax_match_score": syntax_match_score,
        "dataflow_match_score": dataflow_match_score,
        "code_bleu_score": code_bleu_score
    }
69
+
70
+
71
+
72
+
dataflow_match.py CHANGED
@@ -1,1280 +1,14 @@
1
  # Copyright (c) Microsoft Corporation.
2
  # Licensed under the MIT license.
3
 
4
-
 
 
 
 
5
  from tree_sitter import Language, Parser
6
  import pdb
7
-
8
- import re
9
- from io import StringIO
10
- import tokenize
11
def remove_comments_and_docstrings(source,lang):
    # Strip comments (and, for Python, docstrings) from `source`.
    # `lang` selects the strategy: a tokenize-based pass for Python,
    # identity for Ruby, and a regex pass for C-style languages.
    # Blank lines left behind by the stripping are removed in every branch.
    if lang in ['python']:
        """
        Returns 'source' minus comments and docstrings.
        """
        io_obj = StringIO(source)
        out = ""
        # Track the previous token type plus the last emitted line/column so
        # the original inter-token spacing can be reconstructed.
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(io_obj.readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            ltext = tok[4]
            # A new physical line resets the column cursor.
            if start_line > last_lineno:
                last_col = 0
            # Re-insert the horizontal gap between the previous token and this one.
            if start_col > last_col:
                out += (" " * (start_col - last_col))
            # Remove comments:
            if token_type == tokenize.COMMENT:
                pass
            # This series of conditionals removes docstrings:
            elif token_type == tokenize.STRING:
                if prev_toktype != tokenize.INDENT:
                    # This is likely a docstring; double-check we're not inside an operator:
                    if prev_toktype != tokenize.NEWLINE:
                        if start_col > 0:
                            # String appears mid-expression: keep it, it is data.
                            out += token_string
            else:
                out += token_string
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        # Drop lines that became empty after stripping.
        temp=[]
        for x in out.split('\n'):
            if x.strip()!="":
                temp.append(x)
        return '\n'.join(temp)
    elif lang in ['ruby']:
        # No comment stripping implemented for Ruby; return the source unchanged.
        return source
    else:
        # C-style languages: a single regex matches either a comment
        # (// ... or /* ... */) or a string literal; only comment matches
        # are replaced, so comment markers inside strings are preserved.
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " " # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        # Drop lines that became empty after stripping.
        temp=[]
        for x in re.sub(pattern, replacer, source).split('\n'):
            if x.strip()!="":
                temp.append(x)
        return '\n'.join(temp)
69
-
70
def tree_to_token_index(root_node):
    """Return the (start_point, end_point) spans of all leaf tokens under
    *root_node*, depth-first.  String literals are treated as single tokens
    and comment nodes are skipped entirely."""
    is_leaf = len(root_node.children) == 0
    is_string = root_node.type in ['string_literal', 'string', 'character_literal']
    if (is_leaf or is_string) and root_node.type != 'comment':
        return [(root_node.start_point, root_node.end_point)]
    spans = []
    for child in root_node.children:
        spans.extend(tree_to_token_index(child))
    return spans
78
-
79
def tree_to_variable_index(root_node, index_to_code):
    """Return spans of leaf tokens that are candidate variables.

    A leaf whose source text equals its node type (keywords, punctuation)
    is excluded; comments are skipped.  *index_to_code* maps a span to a
    (position_index, token_text) pair."""
    is_terminal = (
        len(root_node.children) == 0
        or root_node.type in ['string_literal', 'string', 'character_literal']
    )
    if is_terminal and root_node.type != 'comment':
        span = (root_node.start_point, root_node.end_point)
        _, token_text = index_to_code[span]
        return [] if root_node.type == token_text else [span]
    spans = []
    for child in root_node.children:
        spans += tree_to_variable_index(child, index_to_code)
    return spans
92
-
93
def index_to_code_token(index, code):
    """Extract the source text of span *index* = (start_point, end_point).

    Points are (line, column) pairs and *code* is the source split into
    lines; multi-line spans concatenate the partial first line, all full
    middle lines, and the partial last line."""
    (start_line, start_col), (end_line, end_col) = index
    if start_line == end_line:
        return code[start_line][start_col:end_col]
    pieces = [code[start_line][start_col:]]
    pieces.extend(code[start_line + 1:end_line])
    pieces.append(code[end_line][:end_col])
    return "".join(pieces)
105
-
106
-
107
def DFG_python(root_node,index_to_code,states):
    # Extract the data-flow graph of a Python tree-sitter parse tree.
    # Returns (DFG, states) where DFG is a list of edges
    #   (token, idx, 'comesFrom'|'computedFrom', [src_tokens], [src_idxs])
    # sorted by token index, and `states` maps a variable name to the
    # indices of its most recent definitions.
    # Node-type groups that select the branch below.
    assignment=['assignment','augmented_assignment','for_in_clause']
    if_statement=['if_statement']
    for_statement=['for_statement']
    while_statement=['while_statement']
    do_first_statement=['for_in_clause']
    def_statement=['default_parameter']
    # Copy so callers' state dicts are never mutated in place.
    states=states.copy()
    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
        # Leaf token (strings count as one token; comments are ignored).
        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
        if root_node.type==code:
            # Keyword/punctuation (type equals text): no data flow.
            return [],states
        elif code in states:
            # Known variable: link the use to its latest definitions.
            return [(code,idx,'comesFrom',[code],states[code].copy())],states
        else:
            # First sighting: identifiers become their own definition.
            if root_node.type=='identifier':
                states[code]=[idx]
            return [(code,idx,'comesFrom',[],[])],states
    elif root_node.type in def_statement:
        # Default parameter: `name=value` in a function signature.
        name=root_node.child_by_field_name('name')
        value=root_node.child_by_field_name('value')
        DFG=[]
        if value is None:
            # Parameter without default: define each variable token.
            indexs=tree_to_variable_index(name,index_to_code)
            for index in indexs:
                idx,code=index_to_code[index]
                DFG.append((code,idx,'comesFrom',[],[]))
                states[code]=[idx]
            return sorted(DFG,key=lambda x:x[1]),states
        else:
            # Parameter flows from every variable token of its default value.
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            temp,states=DFG_python(value,index_to_code,states)
            DFG+=temp
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
                states[code1]=[idx1]
            return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in assignment:
        # Pair up left-hand targets with right-hand values.
        if root_node.type=='for_in_clause':
            right_nodes=[root_node.children[-1]]
            left_nodes=[root_node.child_by_field_name('left')]
        else:
            if root_node.child_by_field_name('right') is None:
                return [],states
            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
            # Fall back to whole-side nodes when tuple arity does not match.
            if len(right_nodes)!=len(left_nodes):
                left_nodes=[root_node.child_by_field_name('left')]
                right_nodes=[root_node.child_by_field_name('right')]
            if len(left_nodes)==0:
                left_nodes=[root_node.child_by_field_name('left')]
            if len(right_nodes)==0:
                right_nodes=[root_node.child_by_field_name('right')]
        DFG=[]
        # Evaluate right-hand sides first (their uses refer to old states).
        for node in right_nodes:
            temp,states=DFG_python(node,index_to_code,states)
            DFG+=temp
        # Then each target is computedFrom every token of its value.
        for left_node,right_node in zip(left_nodes,right_nodes):
            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
            temp=[]
            for token1_index in left_tokens_index:
                idx1,code1=index_to_code[token1_index]
                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                             [index_to_code[x][0] for x in right_tokens_index]))
                states[code1]=[idx1]
            DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in if_statement:
        # Branches see independent copies of the state; afterwards all
        # possible end states are merged (union of definition sites).
        DFG=[]
        current_states=states.copy()
        others_states=[]
        tag=False
        if 'else' in root_node.type:
            tag=True
        for child in root_node.children:
            if 'else' in child.type:
                tag=True
            if child.type not in ['elif_clause','else_clause']:
                temp,current_states=DFG_python(child,index_to_code,current_states)
                DFG+=temp
            else:
                # elif/else bodies start from the pre-if state.
                temp,new_states=DFG_python(child,index_to_code,states)
                DFG+=temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No else branch: the if may be skipped entirely.
            others_states.append(states)
        new_states={}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key]=dic[key].copy()
                else:
                    new_states[key]+=dic[key]
        for key in new_states:
            new_states[key]=sorted(list(set(new_states[key])))
        return sorted(DFG,key=lambda x:x[1]),new_states
    elif root_node.type in for_statement:
        # Two passes approximate the loop's data-flow fixed point.
        DFG=[]
        for i in range(2):
            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
            if len(right_nodes)!=len(left_nodes):
                left_nodes=[root_node.child_by_field_name('left')]
                right_nodes=[root_node.child_by_field_name('right')]
            if len(left_nodes)==0:
                left_nodes=[root_node.child_by_field_name('left')]
            if len(right_nodes)==0:
                right_nodes=[root_node.child_by_field_name('right')]
            for node in right_nodes:
                temp,states=DFG_python(node,index_to_code,states)
                DFG+=temp
            for left_node,right_node in zip(left_nodes,right_nodes):
                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
                temp=[]
                for token1_index in left_tokens_index:
                    idx1,code1=index_to_code[token1_index]
                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                                 [index_to_code[x][0] for x in right_tokens_index]))
                    states[code1]=[idx1]
                DFG+=temp
            if root_node.children[-1].type=="block":
                temp,states=DFG_python(root_node.children[-1],index_to_code,states)
                DFG+=temp
        # Merge duplicate edges produced by the two passes.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in while_statement:
        # Same two-pass strategy as for loops, over all children.
        DFG=[]
        for i in range(2):
            for child in root_node.children:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    else:
        # Generic node: visit do-first children (for_in_clause) before the rest.
        DFG=[]
        for child in root_node.children:
            if child.type in do_first_statement:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
274
-
275
-
276
def DFG_java(root_node,index_to_code,states):
    # Extract the data-flow graph of a Java tree-sitter parse tree.
    # Same edge format and state semantics as DFG_python; branch selection
    # is driven by Java grammar node types.
    assignment=['assignment_expression']
    def_statement=['variable_declarator']
    increment_statement=['update_expression']
    if_statement=['if_statement','else']
    for_statement=['for_statement']
    enhanced_for_statement=['enhanced_for_statement']
    while_statement=['while_statement']
    do_first_statement=[]
    # Copy so callers' state dicts are never mutated in place.
    states=states.copy()
    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
        # Leaf token: link uses to existing definitions, register identifiers.
        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
        if root_node.type==code:
            return [],states
        elif code in states:
            return [(code,idx,'comesFrom',[code],states[code].copy())],states
        else:
            if root_node.type=='identifier':
                states[code]=[idx]
            return [(code,idx,'comesFrom',[],[])],states
    elif root_node.type in def_statement:
        # Variable declarator: `name` optionally initialised by `value`.
        name=root_node.child_by_field_name('name')
        value=root_node.child_by_field_name('value')
        DFG=[]
        if value is None:
            indexs=tree_to_variable_index(name,index_to_code)
            for index in indexs:
                idx,code=index_to_code[index]
                DFG.append((code,idx,'comesFrom',[],[]))
                states[code]=[idx]
            return sorted(DFG,key=lambda x:x[1]),states
        else:
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            temp,states=DFG_java(value,index_to_code,states)
            DFG+=temp
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
                states[code1]=[idx1]
            return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in assignment:
        # a = b: each left token is computedFrom every right token.
        left_nodes=root_node.child_by_field_name('left')
        right_nodes=root_node.child_by_field_name('right')
        DFG=[]
        temp,states=DFG_java(right_nodes,index_to_code,states)
        DFG+=temp
        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
        for index1 in name_indexs:
            idx1,code1=index_to_code[index1]
            for index2 in value_indexs:
                idx2,code2=index_to_code[index2]
                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
            states[code1]=[idx1]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in increment_statement:
        # i++ / ++i: the variable is computed from itself.
        DFG=[]
        indexs=tree_to_variable_index(root_node,index_to_code)
        for index1 in indexs:
            idx1,code1=index_to_code[index1]
            for index2 in indexs:
                idx2,code2=index_to_code[index2]
                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
            states[code1]=[idx1]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in if_statement:
        # Branches evaluated on separate state copies, then merged.
        DFG=[]
        current_states=states.copy()
        others_states=[]
        flag=False
        tag=False
        if 'else' in root_node.type:
            tag=True
        for child in root_node.children:
            if 'else' in child.type:
                tag=True
            if child.type not in if_statement and flag is False:
                temp,current_states=DFG_java(child,index_to_code,current_states)
                DFG+=temp
            else:
                # Nested if/else parts start from the pre-if state.
                flag=True
                temp,new_states=DFG_java(child,index_to_code,states)
                DFG+=temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No else: the whole statement may be skipped.
            others_states.append(states)
        new_states={}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key]=dic[key].copy()
                else:
                    new_states[key]+=dic[key]
        for key in new_states:
            new_states[key]=sorted(list(set(new_states[key])))
        return sorted(DFG,key=lambda x:x[1]),new_states
    elif root_node.type in for_statement:
        # Classic for loop: one full pass, then re-run the part after the
        # init declaration to approximate loop-carried flow.
        DFG=[]
        for child in root_node.children:
            temp,states=DFG_java(child,index_to_code,states)
            DFG+=temp
        flag=False
        for child in root_node.children:
            if flag:
                temp,states=DFG_java(child,index_to_code,states)
                DFG+=temp
            elif child.type=="local_variable_declaration":
                flag=True
        # Merge duplicate edges from the repeated pass.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in enhanced_for_statement:
        # for (T name : value) body — two passes for loop-carried flow.
        name=root_node.child_by_field_name('name')
        value=root_node.child_by_field_name('value')
        body=root_node.child_by_field_name('body')
        DFG=[]
        for i in range(2):
            temp,states=DFG_java(value,index_to_code,states)
            DFG+=temp
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
                states[code1]=[idx1]
            temp,states=DFG_java(body,index_to_code,states)
            DFG+=temp
        # Merge duplicate edges from the two passes.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in while_statement:
        # Two passes over all children approximate the loop fixed point.
        DFG=[]
        for i in range(2):
            for child in root_node.children:
                temp,states=DFG_java(child,index_to_code,states)
                DFG+=temp
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    else:
        # Generic node: visit do-first children (none for Java), then the rest.
        DFG=[]
        for child in root_node.children:
            if child.type in do_first_statement:
                temp,states=DFG_java(child,index_to_code,states)
                DFG+=temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp,states=DFG_java(child,index_to_code,states)
                DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
451
-
452
def DFG_csharp(root_node,index_to_code,states):
    # Extract the data-flow graph of a C# tree-sitter parse tree.
    # Mirrors DFG_java; only the grammar node-type names differ.
    assignment=['assignment_expression']
    def_statement=['variable_declarator']
    increment_statement=['postfix_unary_expression']
    if_statement=['if_statement','else']
    for_statement=['for_statement']
    enhanced_for_statement=['for_each_statement']
    while_statement=['while_statement']
    do_first_statement=[]
    # Copy so callers' state dicts are never mutated in place.
    states=states.copy()
    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
        # Leaf token: link uses to existing definitions, register identifiers.
        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
        if root_node.type==code:
            return [],states
        elif code in states:
            return [(code,idx,'comesFrom',[code],states[code].copy())],states
        else:
            if root_node.type=='identifier':
                states[code]=[idx]
            return [(code,idx,'comesFrom',[],[])],states
    elif root_node.type in def_statement:
        # Variable declarator: C# grammar exposes no name/value fields here,
        # so positional children are used (2 children => name + initializer).
        if len(root_node.children)==2:
            name=root_node.children[0]
            value=root_node.children[1]
        else:
            name=root_node.children[0]
            value=None
        DFG=[]
        if value is None:
            indexs=tree_to_variable_index(name,index_to_code)
            for index in indexs:
                idx,code=index_to_code[index]
                DFG.append((code,idx,'comesFrom',[],[]))
                states[code]=[idx]
            return sorted(DFG,key=lambda x:x[1]),states
        else:
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            temp,states=DFG_csharp(value,index_to_code,states)
            DFG+=temp
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
                states[code1]=[idx1]
            return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in assignment:
        # a = b: each left token is computedFrom every right token.
        left_nodes=root_node.child_by_field_name('left')
        right_nodes=root_node.child_by_field_name('right')
        DFG=[]
        temp,states=DFG_csharp(right_nodes,index_to_code,states)
        DFG+=temp
        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
        for index1 in name_indexs:
            idx1,code1=index_to_code[index1]
            for index2 in value_indexs:
                idx2,code2=index_to_code[index2]
                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
            states[code1]=[idx1]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in increment_statement:
        # i++ / i--: the variable is computed from itself.
        DFG=[]
        indexs=tree_to_variable_index(root_node,index_to_code)
        for index1 in indexs:
            idx1,code1=index_to_code[index1]
            for index2 in indexs:
                idx2,code2=index_to_code[index2]
                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
            states[code1]=[idx1]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in if_statement:
        # Branches evaluated on separate state copies, then merged.
        DFG=[]
        current_states=states.copy()
        others_states=[]
        flag=False
        tag=False
        if 'else' in root_node.type:
            tag=True
        for child in root_node.children:
            if 'else' in child.type:
                tag=True
            if child.type not in if_statement and flag is False:
                temp,current_states=DFG_csharp(child,index_to_code,current_states)
                DFG+=temp
            else:
                # Nested if/else parts start from the pre-if state.
                flag=True
                temp,new_states=DFG_csharp(child,index_to_code,states)
                DFG+=temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No else: the whole statement may be skipped.
            others_states.append(states)
        new_states={}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key]=dic[key].copy()
                else:
                    new_states[key]+=dic[key]
        for key in new_states:
            new_states[key]=sorted(list(set(new_states[key])))
        return sorted(DFG,key=lambda x:x[1]),new_states
    elif root_node.type in for_statement:
        # Classic for loop: one full pass, then re-run the part after the
        # init declaration to approximate loop-carried flow.
        DFG=[]
        for child in root_node.children:
            temp,states=DFG_csharp(child,index_to_code,states)
            DFG+=temp
        flag=False
        for child in root_node.children:
            if flag:
                temp,states=DFG_csharp(child,index_to_code,states)
                DFG+=temp
            elif child.type=="local_variable_declaration":
                flag=True
        # Merge duplicate edges from the repeated pass.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in enhanced_for_statement:
        # foreach (var left in right) body — two passes for loop-carried flow.
        name=root_node.child_by_field_name('left')
        value=root_node.child_by_field_name('right')
        body=root_node.child_by_field_name('body')
        DFG=[]
        for i in range(2):
            temp,states=DFG_csharp(value,index_to_code,states)
            DFG+=temp
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
                states[code1]=[idx1]
            temp,states=DFG_csharp(body,index_to_code,states)
            DFG+=temp
        # Merge duplicate edges from the two passes.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in while_statement:
        # Two passes over all children approximate the loop fixed point.
        DFG=[]
        for i in range(2):
            for child in root_node.children:
                temp,states=DFG_csharp(child,index_to_code,states)
                DFG+=temp
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    else:
        # Generic node: visit do-first children (none for C#), then the rest.
        DFG=[]
        for child in root_node.children:
            if child.type in do_first_statement:
                temp,states=DFG_csharp(child,index_to_code,states)
                DFG+=temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp,states=DFG_csharp(child,index_to_code,states)
                DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
631
-
632
-
633
-
634
-
635
def DFG_ruby(root_node,index_to_code,states):
    # Extract the data-flow graph of a Ruby tree-sitter parse tree.
    # Same edge format as the other DFG_* extractors.
    # NOTE(review): unlike the other extractors, `states` is only copied in
    # the leaf branch here, so non-leaf branches mutate the caller's dict —
    # preserved as-is to keep behavior identical.
    assignment=['assignment','operator_assignment']
    if_statement=['if','elsif','else','unless','when']
    for_statement=['for']
    while_statement=['while_modifier','until']
    do_first_statement=[]
    def_statement=['keyword_parameter']
    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
        # Leaf token: link uses to existing definitions, register identifiers.
        states=states.copy()
        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
        if root_node.type==code:
            return [],states
        elif code in states:
            return [(code,idx,'comesFrom',[code],states[code].copy())],states
        else:
            if root_node.type=='identifier':
                states[code]=[idx]
            return [(code,idx,'comesFrom',[],[])],states
    elif root_node.type in def_statement:
        # Keyword parameter: `name: value` in a method signature.
        name=root_node.child_by_field_name('name')
        value=root_node.child_by_field_name('value')
        DFG=[]
        if value is None:
            indexs=tree_to_variable_index(name,index_to_code)
            for index in indexs:
                idx,code=index_to_code[index]
                DFG.append((code,idx,'comesFrom',[],[]))
                states[code]=[idx]
            return sorted(DFG,key=lambda x:x[1]),states
        else:
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            temp,states=DFG_ruby(value,index_to_code,states)
            DFG+=temp
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
                states[code1]=[idx1]
            return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in assignment:
        # Pair left targets with right values; fall back to whole sides
        # when tuple arity does not match.
        left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
        right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
        if len(right_nodes)!=len(left_nodes):
            left_nodes=[root_node.child_by_field_name('left')]
            right_nodes=[root_node.child_by_field_name('right')]
        if len(left_nodes)==0:
            left_nodes=[root_node.child_by_field_name('left')]
        if len(right_nodes)==0:
            right_nodes=[root_node.child_by_field_name('right')]
        if root_node.type=="operator_assignment":
            # a += b and friends: positional children, no left/right fields.
            left_nodes=[root_node.children[0]]
            right_nodes=[root_node.children[-1]]

        DFG=[]
        # Evaluate right-hand sides first (their uses refer to old states).
        for node in right_nodes:
            temp,states=DFG_ruby(node,index_to_code,states)
            DFG+=temp

        # Each target is computedFrom every token of its value.
        for left_node,right_node in zip(left_nodes,right_nodes):
            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
            temp=[]
            for token1_index in left_tokens_index:
                idx1,code1=index_to_code[token1_index]
                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                             [index_to_code[x][0] for x in right_tokens_index]))
                states[code1]=[idx1]
            DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in if_statement:
        # Branches evaluated on separate state copies, then merged.
        DFG=[]
        current_states=states.copy()
        others_states=[]
        tag=False
        if 'else' in root_node.type:
            tag=True
        for child in root_node.children:
            if 'else' in child.type:
                tag=True
            if child.type not in if_statement:
                temp,current_states=DFG_ruby(child,index_to_code,current_states)
                DFG+=temp
            else:
                # elsif/else parts start from the pre-if state.
                temp,new_states=DFG_ruby(child,index_to_code,states)
                DFG+=temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No else: the whole statement may be skipped.
            others_states.append(states)
        new_states={}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key]=dic[key].copy()
                else:
                    new_states[key]+=dic[key]
        for key in new_states:
            new_states[key]=sorted(list(set(new_states[key])))
        return sorted(DFG,key=lambda x:x[1]),new_states
    elif root_node.type in for_statement:
        # for pattern in value ... end — two passes for loop-carried flow.
        DFG=[]
        for i in range(2):
            left_nodes=[root_node.child_by_field_name('pattern')]
            right_nodes=[root_node.child_by_field_name('value')]
            assert len(right_nodes)==len(left_nodes)
            for node in right_nodes:
                temp,states=DFG_ruby(node,index_to_code,states)
                DFG+=temp
            for left_node,right_node in zip(left_nodes,right_nodes):
                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
                temp=[]
                for token1_index in left_tokens_index:
                    idx1,code1=index_to_code[token1_index]
                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                                 [index_to_code[x][0] for x in right_tokens_index]))
                    states[code1]=[idx1]
                DFG+=temp
            temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states)
            DFG+=temp
        # Merge duplicate edges from the two passes.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in while_statement:
        # Two passes over all children approximate the loop fixed point.
        DFG=[]
        for i in range(2):
            for child in root_node.children:
                temp,states=DFG_ruby(child,index_to_code,states)
                DFG+=temp
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    else:
        # Generic node: visit do-first children (none for Ruby), then the rest.
        DFG=[]
        for child in root_node.children:
            if child.type in do_first_statement:
                temp,states=DFG_ruby(child,index_to_code,states)
                DFG+=temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp,states=DFG_ruby(child,index_to_code,states)
                DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
793
-
794
- def DFG_go(root_node,index_to_code,states):
795
- assignment=['assignment_statement',]
796
- def_statement=['var_spec']
797
- increment_statement=['inc_statement']
798
- if_statement=['if_statement','else']
799
- for_statement=['for_statement']
800
- enhanced_for_statement=[]
801
- while_statement=[]
802
- do_first_statement=[]
803
- states=states.copy()
804
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
805
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
806
- if root_node.type==code:
807
- return [],states
808
- elif code in states:
809
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
810
- else:
811
- if root_node.type=='identifier':
812
- states[code]=[idx]
813
- return [(code,idx,'comesFrom',[],[])],states
814
- elif root_node.type in def_statement:
815
- name=root_node.child_by_field_name('name')
816
- value=root_node.child_by_field_name('value')
817
- DFG=[]
818
- if value is None:
819
- indexs=tree_to_variable_index(name,index_to_code)
820
- for index in indexs:
821
- idx,code=index_to_code[index]
822
- DFG.append((code,idx,'comesFrom',[],[]))
823
- states[code]=[idx]
824
- return sorted(DFG,key=lambda x:x[1]),states
825
- else:
826
- name_indexs=tree_to_variable_index(name,index_to_code)
827
- value_indexs=tree_to_variable_index(value,index_to_code)
828
- temp,states=DFG_go(value,index_to_code,states)
829
- DFG+=temp
830
- for index1 in name_indexs:
831
- idx1,code1=index_to_code[index1]
832
- for index2 in value_indexs:
833
- idx2,code2=index_to_code[index2]
834
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
835
- states[code1]=[idx1]
836
- return sorted(DFG,key=lambda x:x[1]),states
837
- elif root_node.type in assignment:
838
- left_nodes=root_node.child_by_field_name('left')
839
- right_nodes=root_node.child_by_field_name('right')
840
- DFG=[]
841
- temp,states=DFG_go(right_nodes,index_to_code,states)
842
- DFG+=temp
843
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
844
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
845
- for index1 in name_indexs:
846
- idx1,code1=index_to_code[index1]
847
- for index2 in value_indexs:
848
- idx2,code2=index_to_code[index2]
849
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
850
- states[code1]=[idx1]
851
- return sorted(DFG,key=lambda x:x[1]),states
852
- elif root_node.type in increment_statement:
853
- DFG=[]
854
- indexs=tree_to_variable_index(root_node,index_to_code)
855
- for index1 in indexs:
856
- idx1,code1=index_to_code[index1]
857
- for index2 in indexs:
858
- idx2,code2=index_to_code[index2]
859
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
860
- states[code1]=[idx1]
861
- return sorted(DFG,key=lambda x:x[1]),states
862
- elif root_node.type in if_statement:
863
- DFG=[]
864
- current_states=states.copy()
865
- others_states=[]
866
- flag=False
867
- tag=False
868
- if 'else' in root_node.type:
869
- tag=True
870
- for child in root_node.children:
871
- if 'else' in child.type:
872
- tag=True
873
- if child.type not in if_statement and flag is False:
874
- temp,current_states=DFG_go(child,index_to_code,current_states)
875
- DFG+=temp
876
- else:
877
- flag=True
878
- temp,new_states=DFG_go(child,index_to_code,states)
879
- DFG+=temp
880
- others_states.append(new_states)
881
- others_states.append(current_states)
882
- if tag is False:
883
- others_states.append(states)
884
- new_states={}
885
- for dic in others_states:
886
- for key in dic:
887
- if key not in new_states:
888
- new_states[key]=dic[key].copy()
889
- else:
890
- new_states[key]+=dic[key]
891
- for key in states:
892
- if key not in new_states:
893
- new_states[key]=states[key]
894
- else:
895
- new_states[key]+=states[key]
896
- for key in new_states:
897
- new_states[key]=sorted(list(set(new_states[key])))
898
- return sorted(DFG,key=lambda x:x[1]),new_states
899
- elif root_node.type in for_statement:
900
- DFG=[]
901
- for child in root_node.children:
902
- temp,states=DFG_go(child,index_to_code,states)
903
- DFG+=temp
904
- flag=False
905
- for child in root_node.children:
906
- if flag:
907
- temp,states=DFG_go(child,index_to_code,states)
908
- DFG+=temp
909
- elif child.type=="for_clause":
910
- if child.child_by_field_name('update') is not None:
911
- temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states)
912
- DFG+=temp
913
- flag=True
914
- dic={}
915
- for x in DFG:
916
- if (x[0],x[1],x[2]) not in dic:
917
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
918
- else:
919
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
920
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
921
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
922
- return sorted(DFG,key=lambda x:x[1]),states
923
- else:
924
- DFG=[]
925
- for child in root_node.children:
926
- if child.type in do_first_statement:
927
- temp,states=DFG_go(child,index_to_code,states)
928
- DFG+=temp
929
- for child in root_node.children:
930
- if child.type not in do_first_statement:
931
- temp,states=DFG_go(child,index_to_code,states)
932
- DFG+=temp
933
-
934
- return sorted(DFG,key=lambda x:x[1]),states
935
-
936
-
937
-
938
-
939
- def DFG_php(root_node,index_to_code,states):
940
- assignment=['assignment_expression','augmented_assignment_expression']
941
- def_statement=['simple_parameter']
942
- increment_statement=['update_expression']
943
- if_statement=['if_statement','else_clause']
944
- for_statement=['for_statement']
945
- enhanced_for_statement=['foreach_statement']
946
- while_statement=['while_statement']
947
- do_first_statement=[]
948
- states=states.copy()
949
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
950
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
951
- if root_node.type==code:
952
- return [],states
953
- elif code in states:
954
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
955
- else:
956
- if root_node.type=='identifier':
957
- states[code]=[idx]
958
- return [(code,idx,'comesFrom',[],[])],states
959
- elif root_node.type in def_statement:
960
- name=root_node.child_by_field_name('name')
961
- value=root_node.child_by_field_name('default_value')
962
- DFG=[]
963
- if value is None:
964
- indexs=tree_to_variable_index(name,index_to_code)
965
- for index in indexs:
966
- idx,code=index_to_code[index]
967
- DFG.append((code,idx,'comesFrom',[],[]))
968
- states[code]=[idx]
969
- return sorted(DFG,key=lambda x:x[1]),states
970
- else:
971
- name_indexs=tree_to_variable_index(name,index_to_code)
972
- value_indexs=tree_to_variable_index(value,index_to_code)
973
- temp,states=DFG_php(value,index_to_code,states)
974
- DFG+=temp
975
- for index1 in name_indexs:
976
- idx1,code1=index_to_code[index1]
977
- for index2 in value_indexs:
978
- idx2,code2=index_to_code[index2]
979
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
980
- states[code1]=[idx1]
981
- return sorted(DFG,key=lambda x:x[1]),states
982
- elif root_node.type in assignment:
983
- left_nodes=root_node.child_by_field_name('left')
984
- right_nodes=root_node.child_by_field_name('right')
985
- DFG=[]
986
- temp,states=DFG_php(right_nodes,index_to_code,states)
987
- DFG+=temp
988
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
989
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
990
- for index1 in name_indexs:
991
- idx1,code1=index_to_code[index1]
992
- for index2 in value_indexs:
993
- idx2,code2=index_to_code[index2]
994
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
995
- states[code1]=[idx1]
996
- return sorted(DFG,key=lambda x:x[1]),states
997
- elif root_node.type in increment_statement:
998
- DFG=[]
999
- indexs=tree_to_variable_index(root_node,index_to_code)
1000
- for index1 in indexs:
1001
- idx1,code1=index_to_code[index1]
1002
- for index2 in indexs:
1003
- idx2,code2=index_to_code[index2]
1004
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1005
- states[code1]=[idx1]
1006
- return sorted(DFG,key=lambda x:x[1]),states
1007
- elif root_node.type in if_statement:
1008
- DFG=[]
1009
- current_states=states.copy()
1010
- others_states=[]
1011
- flag=False
1012
- tag=False
1013
- if 'else' in root_node.type:
1014
- tag=True
1015
- for child in root_node.children:
1016
- if 'else' in child.type:
1017
- tag=True
1018
- if child.type not in if_statement and flag is False:
1019
- temp,current_states=DFG_php(child,index_to_code,current_states)
1020
- DFG+=temp
1021
- else:
1022
- flag=True
1023
- temp,new_states=DFG_php(child,index_to_code,states)
1024
- DFG+=temp
1025
- others_states.append(new_states)
1026
- others_states.append(current_states)
1027
- new_states={}
1028
- for dic in others_states:
1029
- for key in dic:
1030
- if key not in new_states:
1031
- new_states[key]=dic[key].copy()
1032
- else:
1033
- new_states[key]+=dic[key]
1034
- for key in states:
1035
- if key not in new_states:
1036
- new_states[key]=states[key]
1037
- else:
1038
- new_states[key]+=states[key]
1039
- for key in new_states:
1040
- new_states[key]=sorted(list(set(new_states[key])))
1041
- return sorted(DFG,key=lambda x:x[1]),new_states
1042
- elif root_node.type in for_statement:
1043
- DFG=[]
1044
- for child in root_node.children:
1045
- temp,states=DFG_php(child,index_to_code,states)
1046
- DFG+=temp
1047
- flag=False
1048
- for child in root_node.children:
1049
- if flag:
1050
- temp,states=DFG_php(child,index_to_code,states)
1051
- DFG+=temp
1052
- elif child.type=="assignment_expression":
1053
- flag=True
1054
- dic={}
1055
- for x in DFG:
1056
- if (x[0],x[1],x[2]) not in dic:
1057
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1058
- else:
1059
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1060
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1061
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1062
- return sorted(DFG,key=lambda x:x[1]),states
1063
- elif root_node.type in enhanced_for_statement:
1064
- name=None
1065
- value=None
1066
- for child in root_node.children:
1067
- if child.type=='variable_name' and value is None:
1068
- value=child
1069
- elif child.type=='variable_name' and name is None:
1070
- name=child
1071
- break
1072
- body=root_node.child_by_field_name('body')
1073
- DFG=[]
1074
- for i in range(2):
1075
- temp,states=DFG_php(value,index_to_code,states)
1076
- DFG+=temp
1077
- name_indexs=tree_to_variable_index(name,index_to_code)
1078
- value_indexs=tree_to_variable_index(value,index_to_code)
1079
- for index1 in name_indexs:
1080
- idx1,code1=index_to_code[index1]
1081
- for index2 in value_indexs:
1082
- idx2,code2=index_to_code[index2]
1083
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1084
- states[code1]=[idx1]
1085
- temp,states=DFG_php(body,index_to_code,states)
1086
- DFG+=temp
1087
- dic={}
1088
- for x in DFG:
1089
- if (x[0],x[1],x[2]) not in dic:
1090
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1091
- else:
1092
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1093
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1094
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1095
- return sorted(DFG,key=lambda x:x[1]),states
1096
- elif root_node.type in while_statement:
1097
- DFG=[]
1098
- for i in range(2):
1099
- for child in root_node.children:
1100
- temp,states=DFG_php(child,index_to_code,states)
1101
- DFG+=temp
1102
- dic={}
1103
- for x in DFG:
1104
- if (x[0],x[1],x[2]) not in dic:
1105
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1106
- else:
1107
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1108
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1109
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1110
- return sorted(DFG,key=lambda x:x[1]),states
1111
- else:
1112
- DFG=[]
1113
- for child in root_node.children:
1114
- if child.type in do_first_statement:
1115
- temp,states=DFG_php(child,index_to_code,states)
1116
- DFG+=temp
1117
- for child in root_node.children:
1118
- if child.type not in do_first_statement:
1119
- temp,states=DFG_php(child,index_to_code,states)
1120
- DFG+=temp
1121
-
1122
- return sorted(DFG,key=lambda x:x[1]),states
1123
-
1124
-
1125
- def DFG_javascript(root_node,index_to_code,states):
1126
- assignment=['assignment_pattern','augmented_assignment_expression']
1127
- def_statement=['variable_declarator']
1128
- increment_statement=['update_expression']
1129
- if_statement=['if_statement','else']
1130
- for_statement=['for_statement']
1131
- enhanced_for_statement=[]
1132
- while_statement=['while_statement']
1133
- do_first_statement=[]
1134
- states=states.copy()
1135
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
1136
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
1137
- if root_node.type==code:
1138
- return [],states
1139
- elif code in states:
1140
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
1141
- else:
1142
- if root_node.type=='identifier':
1143
- states[code]=[idx]
1144
- return [(code,idx,'comesFrom',[],[])],states
1145
- elif root_node.type in def_statement:
1146
- name=root_node.child_by_field_name('name')
1147
- value=root_node.child_by_field_name('value')
1148
- DFG=[]
1149
- if value is None:
1150
- indexs=tree_to_variable_index(name,index_to_code)
1151
- for index in indexs:
1152
- idx,code=index_to_code[index]
1153
- DFG.append((code,idx,'comesFrom',[],[]))
1154
- states[code]=[idx]
1155
- return sorted(DFG,key=lambda x:x[1]),states
1156
- else:
1157
- name_indexs=tree_to_variable_index(name,index_to_code)
1158
- value_indexs=tree_to_variable_index(value,index_to_code)
1159
- temp,states=DFG_javascript(value,index_to_code,states)
1160
- DFG+=temp
1161
- for index1 in name_indexs:
1162
- idx1,code1=index_to_code[index1]
1163
- for index2 in value_indexs:
1164
- idx2,code2=index_to_code[index2]
1165
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
1166
- states[code1]=[idx1]
1167
- return sorted(DFG,key=lambda x:x[1]),states
1168
- elif root_node.type in assignment:
1169
- left_nodes=root_node.child_by_field_name('left')
1170
- right_nodes=root_node.child_by_field_name('right')
1171
- DFG=[]
1172
- temp,states=DFG_javascript(right_nodes,index_to_code,states)
1173
- DFG+=temp
1174
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
1175
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
1176
- for index1 in name_indexs:
1177
- idx1,code1=index_to_code[index1]
1178
- for index2 in value_indexs:
1179
- idx2,code2=index_to_code[index2]
1180
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1181
- states[code1]=[idx1]
1182
- return sorted(DFG,key=lambda x:x[1]),states
1183
- elif root_node.type in increment_statement:
1184
- DFG=[]
1185
- indexs=tree_to_variable_index(root_node,index_to_code)
1186
- for index1 in indexs:
1187
- idx1,code1=index_to_code[index1]
1188
- for index2 in indexs:
1189
- idx2,code2=index_to_code[index2]
1190
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1191
- states[code1]=[idx1]
1192
- return sorted(DFG,key=lambda x:x[1]),states
1193
- elif root_node.type in if_statement:
1194
- DFG=[]
1195
- current_states=states.copy()
1196
- others_states=[]
1197
- flag=False
1198
- tag=False
1199
- if 'else' in root_node.type:
1200
- tag=True
1201
- for child in root_node.children:
1202
- if 'else' in child.type:
1203
- tag=True
1204
- if child.type not in if_statement and flag is False:
1205
- temp,current_states=DFG_javascript(child,index_to_code,current_states)
1206
- DFG+=temp
1207
- else:
1208
- flag=True
1209
- temp,new_states=DFG_javascript(child,index_to_code,states)
1210
- DFG+=temp
1211
- others_states.append(new_states)
1212
- others_states.append(current_states)
1213
- if tag is False:
1214
- others_states.append(states)
1215
- new_states={}
1216
- for dic in others_states:
1217
- for key in dic:
1218
- if key not in new_states:
1219
- new_states[key]=dic[key].copy()
1220
- else:
1221
- new_states[key]+=dic[key]
1222
- for key in states:
1223
- if key not in new_states:
1224
- new_states[key]=states[key]
1225
- else:
1226
- new_states[key]+=states[key]
1227
- for key in new_states:
1228
- new_states[key]=sorted(list(set(new_states[key])))
1229
- return sorted(DFG,key=lambda x:x[1]),new_states
1230
- elif root_node.type in for_statement:
1231
- DFG=[]
1232
- for child in root_node.children:
1233
- temp,states=DFG_javascript(child,index_to_code,states)
1234
- DFG+=temp
1235
- flag=False
1236
- for child in root_node.children:
1237
- if flag:
1238
- temp,states=DFG_javascript(child,index_to_code,states)
1239
- DFG+=temp
1240
- elif child.type=="variable_declaration":
1241
- flag=True
1242
- dic={}
1243
- for x in DFG:
1244
- if (x[0],x[1],x[2]) not in dic:
1245
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1246
- else:
1247
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1248
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1249
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1250
- return sorted(DFG,key=lambda x:x[1]),states
1251
- elif root_node.type in while_statement:
1252
- DFG=[]
1253
- for i in range(2):
1254
- for child in root_node.children:
1255
- temp,states=DFG_javascript(child,index_to_code,states)
1256
- DFG+=temp
1257
- dic={}
1258
- for x in DFG:
1259
- if (x[0],x[1],x[2]) not in dic:
1260
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1261
- else:
1262
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1263
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1264
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1265
- return sorted(DFG,key=lambda x:x[1]),states
1266
- else:
1267
- DFG=[]
1268
- for child in root_node.children:
1269
- if child.type in do_first_statement:
1270
- temp,states=DFG_javascript(child,index_to_code,states)
1271
- DFG+=temp
1272
- for child in root_node.children:
1273
- if child.type not in do_first_statement:
1274
- temp,states=DFG_javascript(child,index_to_code,states)
1275
- DFG+=temp
1276
-
1277
- return sorted(DFG,key=lambda x:x[1]),states
1278
 
1279
  dfg_function={
1280
  'python':DFG_python,
@@ -1291,8 +25,9 @@ dfg_function={
1291
  def calc_dataflow_match(references, candidate, lang):
1292
  return corpus_dataflow_match([references], [candidate], lang)
1293
 
1294
- def corpus_dataflow_match(references, candidates, lang):
1295
- LANGUAGE = Language('parser/my-languages.so', lang)
 
1296
  parser = Parser()
1297
  parser.set_language(LANGUAGE)
1298
  parser = [parser,dfg_function[lang]]
 
1
  # Copyright (c) Microsoft Corporation.
2
  # Licensed under the MIT license.
3
 
4
+ from .parsercode.DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
5
+ from .parsercode.utils import (remove_comments_and_docstrings,
6
+ tree_to_token_index,
7
+ index_to_code_token,
8
+ tree_to_variable_index)
9
  from tree_sitter import Language, Parser
10
  import pdb
11
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  dfg_function={
14
  'python':DFG_python,
 
25
  def calc_dataflow_match(references, candidate, lang):
26
  return corpus_dataflow_match([references], [candidate], lang)
27
 
28
+ def corpus_dataflow_match(references, candidates, lang):
29
+ curr_path = os.path.dirname(os.path.abspath(__file__))
30
+ LANGUAGE = Language(curr_path + '/parsercode/my-languages.so', lang)
31
  parser = Parser()
32
  parser.set_language(LANGUAGE)
33
  parser = [parser,dfg_function[lang]]
readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python calc_code_bleu.py --refs reference_files --hyp candidate_file --language java ( or c_sharp) --params 0.25,0.25,0.25,0.25(default)
syntax_match.py CHANGED
@@ -1,1279 +1,13 @@
1
  # Copyright (c) Microsoft Corporation.
2
  # Licensed under the MIT license.
3
 
4
-
 
 
 
 
5
  from tree_sitter import Language, Parser
6
-
7
- import re
8
- from io import StringIO
9
- import tokenize
10
- def remove_comments_and_docstrings(source,lang):
11
- if lang in ['python']:
12
- """
13
- Returns 'source' minus comments and docstrings.
14
- """
15
- io_obj = StringIO(source)
16
- out = ""
17
- prev_toktype = tokenize.INDENT
18
- last_lineno = -1
19
- last_col = 0
20
- for tok in tokenize.generate_tokens(io_obj.readline):
21
- token_type = tok[0]
22
- token_string = tok[1]
23
- start_line, start_col = tok[2]
24
- end_line, end_col = tok[3]
25
- ltext = tok[4]
26
- if start_line > last_lineno:
27
- last_col = 0
28
- if start_col > last_col:
29
- out += (" " * (start_col - last_col))
30
- # Remove comments:
31
- if token_type == tokenize.COMMENT:
32
- pass
33
- # This series of conditionals removes docstrings:
34
- elif token_type == tokenize.STRING:
35
- if prev_toktype != tokenize.INDENT:
36
- # This is likely a docstring; double-check we're not inside an operator:
37
- if prev_toktype != tokenize.NEWLINE:
38
- if start_col > 0:
39
- out += token_string
40
- else:
41
- out += token_string
42
- prev_toktype = token_type
43
- last_col = end_col
44
- last_lineno = end_line
45
- temp=[]
46
- for x in out.split('\n'):
47
- if x.strip()!="":
48
- temp.append(x)
49
- return '\n'.join(temp)
50
- elif lang in ['ruby']:
51
- return source
52
- else:
53
- def replacer(match):
54
- s = match.group(0)
55
- if s.startswith('/'):
56
- return " " # note: a space and not an empty string
57
- else:
58
- return s
59
- pattern = re.compile(
60
- r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
61
- re.DOTALL | re.MULTILINE
62
- )
63
- temp=[]
64
- for x in re.sub(pattern, replacer, source).split('\n'):
65
- if x.strip()!="":
66
- temp.append(x)
67
- return '\n'.join(temp)
68
-
69
- def tree_to_token_index(root_node):
70
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
71
- return [(root_node.start_point,root_node.end_point)]
72
- else:
73
- code_tokens=[]
74
- for child in root_node.children:
75
- code_tokens+=tree_to_token_index(child)
76
- return code_tokens
77
-
78
- def tree_to_variable_index(root_node,index_to_code):
79
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
80
- index=(root_node.start_point,root_node.end_point)
81
- _,code=index_to_code[index]
82
- if root_node.type!=code:
83
- return [(root_node.start_point,root_node.end_point)]
84
- else:
85
- return []
86
- else:
87
- code_tokens=[]
88
- for child in root_node.children:
89
- code_tokens+=tree_to_variable_index(child,index_to_code)
90
- return code_tokens
91
-
92
- def index_to_code_token(index,code):
93
- start_point=index[0]
94
- end_point=index[1]
95
- if start_point[0]==end_point[0]:
96
- s=code[start_point[0]][start_point[1]:end_point[1]]
97
- else:
98
- s=""
99
- s+=code[start_point[0]][start_point[1]:]
100
- for i in range(start_point[0]+1,end_point[0]):
101
- s+=code[i]
102
- s+=code[end_point[0]][:end_point[1]]
103
- return s
104
-
105
-
106
- def DFG_python(root_node,index_to_code,states):
107
- assignment=['assignment','augmented_assignment','for_in_clause']
108
- if_statement=['if_statement']
109
- for_statement=['for_statement']
110
- while_statement=['while_statement']
111
- do_first_statement=['for_in_clause']
112
- def_statement=['default_parameter']
113
- states=states.copy()
114
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
115
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
116
- if root_node.type==code:
117
- return [],states
118
- elif code in states:
119
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
120
- else:
121
- if root_node.type=='identifier':
122
- states[code]=[idx]
123
- return [(code,idx,'comesFrom',[],[])],states
124
- elif root_node.type in def_statement:
125
- name=root_node.child_by_field_name('name')
126
- value=root_node.child_by_field_name('value')
127
- DFG=[]
128
- if value is None:
129
- indexs=tree_to_variable_index(name,index_to_code)
130
- for index in indexs:
131
- idx,code=index_to_code[index]
132
- DFG.append((code,idx,'comesFrom',[],[]))
133
- states[code]=[idx]
134
- return sorted(DFG,key=lambda x:x[1]),states
135
- else:
136
- name_indexs=tree_to_variable_index(name,index_to_code)
137
- value_indexs=tree_to_variable_index(value,index_to_code)
138
- temp,states=DFG_python(value,index_to_code,states)
139
- DFG+=temp
140
- for index1 in name_indexs:
141
- idx1,code1=index_to_code[index1]
142
- for index2 in value_indexs:
143
- idx2,code2=index_to_code[index2]
144
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
145
- states[code1]=[idx1]
146
- return sorted(DFG,key=lambda x:x[1]),states
147
- elif root_node.type in assignment:
148
- if root_node.type=='for_in_clause':
149
- right_nodes=[root_node.children[-1]]
150
- left_nodes=[root_node.child_by_field_name('left')]
151
- else:
152
- if root_node.child_by_field_name('right') is None:
153
- return [],states
154
- left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
155
- right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
156
- if len(right_nodes)!=len(left_nodes):
157
- left_nodes=[root_node.child_by_field_name('left')]
158
- right_nodes=[root_node.child_by_field_name('right')]
159
- if len(left_nodes)==0:
160
- left_nodes=[root_node.child_by_field_name('left')]
161
- if len(right_nodes)==0:
162
- right_nodes=[root_node.child_by_field_name('right')]
163
- DFG=[]
164
- for node in right_nodes:
165
- temp,states=DFG_python(node,index_to_code,states)
166
- DFG+=temp
167
-
168
- for left_node,right_node in zip(left_nodes,right_nodes):
169
- left_tokens_index=tree_to_variable_index(left_node,index_to_code)
170
- right_tokens_index=tree_to_variable_index(right_node,index_to_code)
171
- temp=[]
172
- for token1_index in left_tokens_index:
173
- idx1,code1=index_to_code[token1_index]
174
- temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
175
- [index_to_code[x][0] for x in right_tokens_index]))
176
- states[code1]=[idx1]
177
- DFG+=temp
178
- return sorted(DFG,key=lambda x:x[1]),states
179
- elif root_node.type in if_statement:
180
- DFG=[]
181
- current_states=states.copy()
182
- others_states=[]
183
- tag=False
184
- if 'else' in root_node.type:
185
- tag=True
186
- for child in root_node.children:
187
- if 'else' in child.type:
188
- tag=True
189
- if child.type not in ['elif_clause','else_clause']:
190
- temp,current_states=DFG_python(child,index_to_code,current_states)
191
- DFG+=temp
192
- else:
193
- temp,new_states=DFG_python(child,index_to_code,states)
194
- DFG+=temp
195
- others_states.append(new_states)
196
- others_states.append(current_states)
197
- if tag is False:
198
- others_states.append(states)
199
- new_states={}
200
- for dic in others_states:
201
- for key in dic:
202
- if key not in new_states:
203
- new_states[key]=dic[key].copy()
204
- else:
205
- new_states[key]+=dic[key]
206
- for key in new_states:
207
- new_states[key]=sorted(list(set(new_states[key])))
208
- return sorted(DFG,key=lambda x:x[1]),new_states
209
- elif root_node.type in for_statement:
210
- DFG=[]
211
- for i in range(2):
212
- right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
213
- left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
214
- if len(right_nodes)!=len(left_nodes):
215
- left_nodes=[root_node.child_by_field_name('left')]
216
- right_nodes=[root_node.child_by_field_name('right')]
217
- if len(left_nodes)==0:
218
- left_nodes=[root_node.child_by_field_name('left')]
219
- if len(right_nodes)==0:
220
- right_nodes=[root_node.child_by_field_name('right')]
221
- for node in right_nodes:
222
- temp,states=DFG_python(node,index_to_code,states)
223
- DFG+=temp
224
- for left_node,right_node in zip(left_nodes,right_nodes):
225
- left_tokens_index=tree_to_variable_index(left_node,index_to_code)
226
- right_tokens_index=tree_to_variable_index(right_node,index_to_code)
227
- temp=[]
228
- for token1_index in left_tokens_index:
229
- idx1,code1=index_to_code[token1_index]
230
- temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
231
- [index_to_code[x][0] for x in right_tokens_index]))
232
- states[code1]=[idx1]
233
- DFG+=temp
234
- if root_node.children[-1].type=="block":
235
- temp,states=DFG_python(root_node.children[-1],index_to_code,states)
236
- DFG+=temp
237
- dic={}
238
- for x in DFG:
239
- if (x[0],x[1],x[2]) not in dic:
240
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
241
- else:
242
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
243
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
244
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
245
- return sorted(DFG,key=lambda x:x[1]),states
246
- elif root_node.type in while_statement:
247
- DFG=[]
248
- for i in range(2):
249
- for child in root_node.children:
250
- temp,states=DFG_python(child,index_to_code,states)
251
- DFG+=temp
252
- dic={}
253
- for x in DFG:
254
- if (x[0],x[1],x[2]) not in dic:
255
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
256
- else:
257
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
258
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
259
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
260
- return sorted(DFG,key=lambda x:x[1]),states
261
- else:
262
- DFG=[]
263
- for child in root_node.children:
264
- if child.type in do_first_statement:
265
- temp,states=DFG_python(child,index_to_code,states)
266
- DFG+=temp
267
- for child in root_node.children:
268
- if child.type not in do_first_statement:
269
- temp,states=DFG_python(child,index_to_code,states)
270
- DFG+=temp
271
-
272
- return sorted(DFG,key=lambda x:x[1]),states
273
-
274
-
275
- def DFG_java(root_node,index_to_code,states):
276
- assignment=['assignment_expression']
277
- def_statement=['variable_declarator']
278
- increment_statement=['update_expression']
279
- if_statement=['if_statement','else']
280
- for_statement=['for_statement']
281
- enhanced_for_statement=['enhanced_for_statement']
282
- while_statement=['while_statement']
283
- do_first_statement=[]
284
- states=states.copy()
285
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
286
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
287
- if root_node.type==code:
288
- return [],states
289
- elif code in states:
290
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
291
- else:
292
- if root_node.type=='identifier':
293
- states[code]=[idx]
294
- return [(code,idx,'comesFrom',[],[])],states
295
- elif root_node.type in def_statement:
296
- name=root_node.child_by_field_name('name')
297
- value=root_node.child_by_field_name('value')
298
- DFG=[]
299
- if value is None:
300
- indexs=tree_to_variable_index(name,index_to_code)
301
- for index in indexs:
302
- idx,code=index_to_code[index]
303
- DFG.append((code,idx,'comesFrom',[],[]))
304
- states[code]=[idx]
305
- return sorted(DFG,key=lambda x:x[1]),states
306
- else:
307
- name_indexs=tree_to_variable_index(name,index_to_code)
308
- value_indexs=tree_to_variable_index(value,index_to_code)
309
- temp,states=DFG_java(value,index_to_code,states)
310
- DFG+=temp
311
- for index1 in name_indexs:
312
- idx1,code1=index_to_code[index1]
313
- for index2 in value_indexs:
314
- idx2,code2=index_to_code[index2]
315
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
316
- states[code1]=[idx1]
317
- return sorted(DFG,key=lambda x:x[1]),states
318
- elif root_node.type in assignment:
319
- left_nodes=root_node.child_by_field_name('left')
320
- right_nodes=root_node.child_by_field_name('right')
321
- DFG=[]
322
- temp,states=DFG_java(right_nodes,index_to_code,states)
323
- DFG+=temp
324
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
325
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
326
- for index1 in name_indexs:
327
- idx1,code1=index_to_code[index1]
328
- for index2 in value_indexs:
329
- idx2,code2=index_to_code[index2]
330
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
331
- states[code1]=[idx1]
332
- return sorted(DFG,key=lambda x:x[1]),states
333
- elif root_node.type in increment_statement:
334
- DFG=[]
335
- indexs=tree_to_variable_index(root_node,index_to_code)
336
- for index1 in indexs:
337
- idx1,code1=index_to_code[index1]
338
- for index2 in indexs:
339
- idx2,code2=index_to_code[index2]
340
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
341
- states[code1]=[idx1]
342
- return sorted(DFG,key=lambda x:x[1]),states
343
- elif root_node.type in if_statement:
344
- DFG=[]
345
- current_states=states.copy()
346
- others_states=[]
347
- flag=False
348
- tag=False
349
- if 'else' in root_node.type:
350
- tag=True
351
- for child in root_node.children:
352
- if 'else' in child.type:
353
- tag=True
354
- if child.type not in if_statement and flag is False:
355
- temp,current_states=DFG_java(child,index_to_code,current_states)
356
- DFG+=temp
357
- else:
358
- flag=True
359
- temp,new_states=DFG_java(child,index_to_code,states)
360
- DFG+=temp
361
- others_states.append(new_states)
362
- others_states.append(current_states)
363
- if tag is False:
364
- others_states.append(states)
365
- new_states={}
366
- for dic in others_states:
367
- for key in dic:
368
- if key not in new_states:
369
- new_states[key]=dic[key].copy()
370
- else:
371
- new_states[key]+=dic[key]
372
- for key in new_states:
373
- new_states[key]=sorted(list(set(new_states[key])))
374
- return sorted(DFG,key=lambda x:x[1]),new_states
375
- elif root_node.type in for_statement:
376
- DFG=[]
377
- for child in root_node.children:
378
- temp,states=DFG_java(child,index_to_code,states)
379
- DFG+=temp
380
- flag=False
381
- for child in root_node.children:
382
- if flag:
383
- temp,states=DFG_java(child,index_to_code,states)
384
- DFG+=temp
385
- elif child.type=="local_variable_declaration":
386
- flag=True
387
- dic={}
388
- for x in DFG:
389
- if (x[0],x[1],x[2]) not in dic:
390
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
391
- else:
392
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
393
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
394
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
395
- return sorted(DFG,key=lambda x:x[1]),states
396
- elif root_node.type in enhanced_for_statement:
397
- name=root_node.child_by_field_name('name')
398
- value=root_node.child_by_field_name('value')
399
- body=root_node.child_by_field_name('body')
400
- DFG=[]
401
- for i in range(2):
402
- temp,states=DFG_java(value,index_to_code,states)
403
- DFG+=temp
404
- name_indexs=tree_to_variable_index(name,index_to_code)
405
- value_indexs=tree_to_variable_index(value,index_to_code)
406
- for index1 in name_indexs:
407
- idx1,code1=index_to_code[index1]
408
- for index2 in value_indexs:
409
- idx2,code2=index_to_code[index2]
410
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
411
- states[code1]=[idx1]
412
- temp,states=DFG_java(body,index_to_code,states)
413
- DFG+=temp
414
- dic={}
415
- for x in DFG:
416
- if (x[0],x[1],x[2]) not in dic:
417
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
418
- else:
419
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
420
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
421
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
422
- return sorted(DFG,key=lambda x:x[1]),states
423
- elif root_node.type in while_statement:
424
- DFG=[]
425
- for i in range(2):
426
- for child in root_node.children:
427
- temp,states=DFG_java(child,index_to_code,states)
428
- DFG+=temp
429
- dic={}
430
- for x in DFG:
431
- if (x[0],x[1],x[2]) not in dic:
432
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
433
- else:
434
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
435
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
436
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
437
- return sorted(DFG,key=lambda x:x[1]),states
438
- else:
439
- DFG=[]
440
- for child in root_node.children:
441
- if child.type in do_first_statement:
442
- temp,states=DFG_java(child,index_to_code,states)
443
- DFG+=temp
444
- for child in root_node.children:
445
- if child.type not in do_first_statement:
446
- temp,states=DFG_java(child,index_to_code,states)
447
- DFG+=temp
448
-
449
- return sorted(DFG,key=lambda x:x[1]),states
450
-
451
- def DFG_csharp(root_node,index_to_code,states):
452
- assignment=['assignment_expression']
453
- def_statement=['variable_declarator']
454
- increment_statement=['postfix_unary_expression']
455
- if_statement=['if_statement','else']
456
- for_statement=['for_statement']
457
- enhanced_for_statement=['for_each_statement']
458
- while_statement=['while_statement']
459
- do_first_statement=[]
460
- states=states.copy()
461
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
462
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
463
- if root_node.type==code:
464
- return [],states
465
- elif code in states:
466
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
467
- else:
468
- if root_node.type=='identifier':
469
- states[code]=[idx]
470
- return [(code,idx,'comesFrom',[],[])],states
471
- elif root_node.type in def_statement:
472
- if len(root_node.children)==2:
473
- name=root_node.children[0]
474
- value=root_node.children[1]
475
- else:
476
- name=root_node.children[0]
477
- value=None
478
- DFG=[]
479
- if value is None:
480
- indexs=tree_to_variable_index(name,index_to_code)
481
- for index in indexs:
482
- idx,code=index_to_code[index]
483
- DFG.append((code,idx,'comesFrom',[],[]))
484
- states[code]=[idx]
485
- return sorted(DFG,key=lambda x:x[1]),states
486
- else:
487
- name_indexs=tree_to_variable_index(name,index_to_code)
488
- value_indexs=tree_to_variable_index(value,index_to_code)
489
- temp,states=DFG_csharp(value,index_to_code,states)
490
- DFG+=temp
491
- for index1 in name_indexs:
492
- idx1,code1=index_to_code[index1]
493
- for index2 in value_indexs:
494
- idx2,code2=index_to_code[index2]
495
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
496
- states[code1]=[idx1]
497
- return sorted(DFG,key=lambda x:x[1]),states
498
- elif root_node.type in assignment:
499
- left_nodes=root_node.child_by_field_name('left')
500
- right_nodes=root_node.child_by_field_name('right')
501
- DFG=[]
502
- temp,states=DFG_csharp(right_nodes,index_to_code,states)
503
- DFG+=temp
504
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
505
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
506
- for index1 in name_indexs:
507
- idx1,code1=index_to_code[index1]
508
- for index2 in value_indexs:
509
- idx2,code2=index_to_code[index2]
510
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
511
- states[code1]=[idx1]
512
- return sorted(DFG,key=lambda x:x[1]),states
513
- elif root_node.type in increment_statement:
514
- DFG=[]
515
- indexs=tree_to_variable_index(root_node,index_to_code)
516
- for index1 in indexs:
517
- idx1,code1=index_to_code[index1]
518
- for index2 in indexs:
519
- idx2,code2=index_to_code[index2]
520
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
521
- states[code1]=[idx1]
522
- return sorted(DFG,key=lambda x:x[1]),states
523
- elif root_node.type in if_statement:
524
- DFG=[]
525
- current_states=states.copy()
526
- others_states=[]
527
- flag=False
528
- tag=False
529
- if 'else' in root_node.type:
530
- tag=True
531
- for child in root_node.children:
532
- if 'else' in child.type:
533
- tag=True
534
- if child.type not in if_statement and flag is False:
535
- temp,current_states=DFG_csharp(child,index_to_code,current_states)
536
- DFG+=temp
537
- else:
538
- flag=True
539
- temp,new_states=DFG_csharp(child,index_to_code,states)
540
- DFG+=temp
541
- others_states.append(new_states)
542
- others_states.append(current_states)
543
- if tag is False:
544
- others_states.append(states)
545
- new_states={}
546
- for dic in others_states:
547
- for key in dic:
548
- if key not in new_states:
549
- new_states[key]=dic[key].copy()
550
- else:
551
- new_states[key]+=dic[key]
552
- for key in new_states:
553
- new_states[key]=sorted(list(set(new_states[key])))
554
- return sorted(DFG,key=lambda x:x[1]),new_states
555
- elif root_node.type in for_statement:
556
- DFG=[]
557
- for child in root_node.children:
558
- temp,states=DFG_csharp(child,index_to_code,states)
559
- DFG+=temp
560
- flag=False
561
- for child in root_node.children:
562
- if flag:
563
- temp,states=DFG_csharp(child,index_to_code,states)
564
- DFG+=temp
565
- elif child.type=="local_variable_declaration":
566
- flag=True
567
- dic={}
568
- for x in DFG:
569
- if (x[0],x[1],x[2]) not in dic:
570
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
571
- else:
572
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
573
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
574
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
575
- return sorted(DFG,key=lambda x:x[1]),states
576
- elif root_node.type in enhanced_for_statement:
577
- name=root_node.child_by_field_name('left')
578
- value=root_node.child_by_field_name('right')
579
- body=root_node.child_by_field_name('body')
580
- DFG=[]
581
- for i in range(2):
582
- temp,states=DFG_csharp(value,index_to_code,states)
583
- DFG+=temp
584
- name_indexs=tree_to_variable_index(name,index_to_code)
585
- value_indexs=tree_to_variable_index(value,index_to_code)
586
- for index1 in name_indexs:
587
- idx1,code1=index_to_code[index1]
588
- for index2 in value_indexs:
589
- idx2,code2=index_to_code[index2]
590
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
591
- states[code1]=[idx1]
592
- temp,states=DFG_csharp(body,index_to_code,states)
593
- DFG+=temp
594
- dic={}
595
- for x in DFG:
596
- if (x[0],x[1],x[2]) not in dic:
597
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
598
- else:
599
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
600
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
601
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
602
- return sorted(DFG,key=lambda x:x[1]),states
603
- elif root_node.type in while_statement:
604
- DFG=[]
605
- for i in range(2):
606
- for child in root_node.children:
607
- temp,states=DFG_csharp(child,index_to_code,states)
608
- DFG+=temp
609
- dic={}
610
- for x in DFG:
611
- if (x[0],x[1],x[2]) not in dic:
612
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
613
- else:
614
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
615
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
616
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
617
- return sorted(DFG,key=lambda x:x[1]),states
618
- else:
619
- DFG=[]
620
- for child in root_node.children:
621
- if child.type in do_first_statement:
622
- temp,states=DFG_csharp(child,index_to_code,states)
623
- DFG+=temp
624
- for child in root_node.children:
625
- if child.type not in do_first_statement:
626
- temp,states=DFG_csharp(child,index_to_code,states)
627
- DFG+=temp
628
-
629
- return sorted(DFG,key=lambda x:x[1]),states
630
-
631
-
632
-
633
-
634
- def DFG_ruby(root_node,index_to_code,states):
635
- assignment=['assignment','operator_assignment']
636
- if_statement=['if','elsif','else','unless','when']
637
- for_statement=['for']
638
- while_statement=['while_modifier','until']
639
- do_first_statement=[]
640
- def_statement=['keyword_parameter']
641
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
642
- states=states.copy()
643
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
644
- if root_node.type==code:
645
- return [],states
646
- elif code in states:
647
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
648
- else:
649
- if root_node.type=='identifier':
650
- states[code]=[idx]
651
- return [(code,idx,'comesFrom',[],[])],states
652
- elif root_node.type in def_statement:
653
- name=root_node.child_by_field_name('name')
654
- value=root_node.child_by_field_name('value')
655
- DFG=[]
656
- if value is None:
657
- indexs=tree_to_variable_index(name,index_to_code)
658
- for index in indexs:
659
- idx,code=index_to_code[index]
660
- DFG.append((code,idx,'comesFrom',[],[]))
661
- states[code]=[idx]
662
- return sorted(DFG,key=lambda x:x[1]),states
663
- else:
664
- name_indexs=tree_to_variable_index(name,index_to_code)
665
- value_indexs=tree_to_variable_index(value,index_to_code)
666
- temp,states=DFG_ruby(value,index_to_code,states)
667
- DFG+=temp
668
- for index1 in name_indexs:
669
- idx1,code1=index_to_code[index1]
670
- for index2 in value_indexs:
671
- idx2,code2=index_to_code[index2]
672
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
673
- states[code1]=[idx1]
674
- return sorted(DFG,key=lambda x:x[1]),states
675
- elif root_node.type in assignment:
676
- left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
677
- right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
678
- if len(right_nodes)!=len(left_nodes):
679
- left_nodes=[root_node.child_by_field_name('left')]
680
- right_nodes=[root_node.child_by_field_name('right')]
681
- if len(left_nodes)==0:
682
- left_nodes=[root_node.child_by_field_name('left')]
683
- if len(right_nodes)==0:
684
- right_nodes=[root_node.child_by_field_name('right')]
685
- if root_node.type=="operator_assignment":
686
- left_nodes=[root_node.children[0]]
687
- right_nodes=[root_node.children[-1]]
688
-
689
- DFG=[]
690
- for node in right_nodes:
691
- temp,states=DFG_ruby(node,index_to_code,states)
692
- DFG+=temp
693
-
694
- for left_node,right_node in zip(left_nodes,right_nodes):
695
- left_tokens_index=tree_to_variable_index(left_node,index_to_code)
696
- right_tokens_index=tree_to_variable_index(right_node,index_to_code)
697
- temp=[]
698
- for token1_index in left_tokens_index:
699
- idx1,code1=index_to_code[token1_index]
700
- temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
701
- [index_to_code[x][0] for x in right_tokens_index]))
702
- states[code1]=[idx1]
703
- DFG+=temp
704
- return sorted(DFG,key=lambda x:x[1]),states
705
- elif root_node.type in if_statement:
706
- DFG=[]
707
- current_states=states.copy()
708
- others_states=[]
709
- tag=False
710
- if 'else' in root_node.type:
711
- tag=True
712
- for child in root_node.children:
713
- if 'else' in child.type:
714
- tag=True
715
- if child.type not in if_statement:
716
- temp,current_states=DFG_ruby(child,index_to_code,current_states)
717
- DFG+=temp
718
- else:
719
- temp,new_states=DFG_ruby(child,index_to_code,states)
720
- DFG+=temp
721
- others_states.append(new_states)
722
- others_states.append(current_states)
723
- if tag is False:
724
- others_states.append(states)
725
- new_states={}
726
- for dic in others_states:
727
- for key in dic:
728
- if key not in new_states:
729
- new_states[key]=dic[key].copy()
730
- else:
731
- new_states[key]+=dic[key]
732
- for key in new_states:
733
- new_states[key]=sorted(list(set(new_states[key])))
734
- return sorted(DFG,key=lambda x:x[1]),new_states
735
- elif root_node.type in for_statement:
736
- DFG=[]
737
- for i in range(2):
738
- left_nodes=[root_node.child_by_field_name('pattern')]
739
- right_nodes=[root_node.child_by_field_name('value')]
740
- assert len(right_nodes)==len(left_nodes)
741
- for node in right_nodes:
742
- temp,states=DFG_ruby(node,index_to_code,states)
743
- DFG+=temp
744
- for left_node,right_node in zip(left_nodes,right_nodes):
745
- left_tokens_index=tree_to_variable_index(left_node,index_to_code)
746
- right_tokens_index=tree_to_variable_index(right_node,index_to_code)
747
- temp=[]
748
- for token1_index in left_tokens_index:
749
- idx1,code1=index_to_code[token1_index]
750
- temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
751
- [index_to_code[x][0] for x in right_tokens_index]))
752
- states[code1]=[idx1]
753
- DFG+=temp
754
- temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states)
755
- DFG+=temp
756
- dic={}
757
- for x in DFG:
758
- if (x[0],x[1],x[2]) not in dic:
759
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
760
- else:
761
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
762
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
763
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
764
- return sorted(DFG,key=lambda x:x[1]),states
765
- elif root_node.type in while_statement:
766
- DFG=[]
767
- for i in range(2):
768
- for child in root_node.children:
769
- temp,states=DFG_ruby(child,index_to_code,states)
770
- DFG+=temp
771
- dic={}
772
- for x in DFG:
773
- if (x[0],x[1],x[2]) not in dic:
774
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
775
- else:
776
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
777
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
778
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
779
- return sorted(DFG,key=lambda x:x[1]),states
780
- else:
781
- DFG=[]
782
- for child in root_node.children:
783
- if child.type in do_first_statement:
784
- temp,states=DFG_ruby(child,index_to_code,states)
785
- DFG+=temp
786
- for child in root_node.children:
787
- if child.type not in do_first_statement:
788
- temp,states=DFG_ruby(child,index_to_code,states)
789
- DFG+=temp
790
-
791
- return sorted(DFG,key=lambda x:x[1]),states
792
-
793
- def DFG_go(root_node,index_to_code,states):
794
- assignment=['assignment_statement',]
795
- def_statement=['var_spec']
796
- increment_statement=['inc_statement']
797
- if_statement=['if_statement','else']
798
- for_statement=['for_statement']
799
- enhanced_for_statement=[]
800
- while_statement=[]
801
- do_first_statement=[]
802
- states=states.copy()
803
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
804
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
805
- if root_node.type==code:
806
- return [],states
807
- elif code in states:
808
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
809
- else:
810
- if root_node.type=='identifier':
811
- states[code]=[idx]
812
- return [(code,idx,'comesFrom',[],[])],states
813
- elif root_node.type in def_statement:
814
- name=root_node.child_by_field_name('name')
815
- value=root_node.child_by_field_name('value')
816
- DFG=[]
817
- if value is None:
818
- indexs=tree_to_variable_index(name,index_to_code)
819
- for index in indexs:
820
- idx,code=index_to_code[index]
821
- DFG.append((code,idx,'comesFrom',[],[]))
822
- states[code]=[idx]
823
- return sorted(DFG,key=lambda x:x[1]),states
824
- else:
825
- name_indexs=tree_to_variable_index(name,index_to_code)
826
- value_indexs=tree_to_variable_index(value,index_to_code)
827
- temp,states=DFG_go(value,index_to_code,states)
828
- DFG+=temp
829
- for index1 in name_indexs:
830
- idx1,code1=index_to_code[index1]
831
- for index2 in value_indexs:
832
- idx2,code2=index_to_code[index2]
833
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
834
- states[code1]=[idx1]
835
- return sorted(DFG,key=lambda x:x[1]),states
836
- elif root_node.type in assignment:
837
- left_nodes=root_node.child_by_field_name('left')
838
- right_nodes=root_node.child_by_field_name('right')
839
- DFG=[]
840
- temp,states=DFG_go(right_nodes,index_to_code,states)
841
- DFG+=temp
842
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
843
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
844
- for index1 in name_indexs:
845
- idx1,code1=index_to_code[index1]
846
- for index2 in value_indexs:
847
- idx2,code2=index_to_code[index2]
848
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
849
- states[code1]=[idx1]
850
- return sorted(DFG,key=lambda x:x[1]),states
851
- elif root_node.type in increment_statement:
852
- DFG=[]
853
- indexs=tree_to_variable_index(root_node,index_to_code)
854
- for index1 in indexs:
855
- idx1,code1=index_to_code[index1]
856
- for index2 in indexs:
857
- idx2,code2=index_to_code[index2]
858
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
859
- states[code1]=[idx1]
860
- return sorted(DFG,key=lambda x:x[1]),states
861
- elif root_node.type in if_statement:
862
- DFG=[]
863
- current_states=states.copy()
864
- others_states=[]
865
- flag=False
866
- tag=False
867
- if 'else' in root_node.type:
868
- tag=True
869
- for child in root_node.children:
870
- if 'else' in child.type:
871
- tag=True
872
- if child.type not in if_statement and flag is False:
873
- temp,current_states=DFG_go(child,index_to_code,current_states)
874
- DFG+=temp
875
- else:
876
- flag=True
877
- temp,new_states=DFG_go(child,index_to_code,states)
878
- DFG+=temp
879
- others_states.append(new_states)
880
- others_states.append(current_states)
881
- if tag is False:
882
- others_states.append(states)
883
- new_states={}
884
- for dic in others_states:
885
- for key in dic:
886
- if key not in new_states:
887
- new_states[key]=dic[key].copy()
888
- else:
889
- new_states[key]+=dic[key]
890
- for key in states:
891
- if key not in new_states:
892
- new_states[key]=states[key]
893
- else:
894
- new_states[key]+=states[key]
895
- for key in new_states:
896
- new_states[key]=sorted(list(set(new_states[key])))
897
- return sorted(DFG,key=lambda x:x[1]),new_states
898
- elif root_node.type in for_statement:
899
- DFG=[]
900
- for child in root_node.children:
901
- temp,states=DFG_go(child,index_to_code,states)
902
- DFG+=temp
903
- flag=False
904
- for child in root_node.children:
905
- if flag:
906
- temp,states=DFG_go(child,index_to_code,states)
907
- DFG+=temp
908
- elif child.type=="for_clause":
909
- if child.child_by_field_name('update') is not None:
910
- temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states)
911
- DFG+=temp
912
- flag=True
913
- dic={}
914
- for x in DFG:
915
- if (x[0],x[1],x[2]) not in dic:
916
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
917
- else:
918
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
919
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
920
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
921
- return sorted(DFG,key=lambda x:x[1]),states
922
- else:
923
- DFG=[]
924
- for child in root_node.children:
925
- if child.type in do_first_statement:
926
- temp,states=DFG_go(child,index_to_code,states)
927
- DFG+=temp
928
- for child in root_node.children:
929
- if child.type not in do_first_statement:
930
- temp,states=DFG_go(child,index_to_code,states)
931
- DFG+=temp
932
-
933
- return sorted(DFG,key=lambda x:x[1]),states
934
-
935
-
936
-
937
-
938
- def DFG_php(root_node,index_to_code,states):
939
- assignment=['assignment_expression','augmented_assignment_expression']
940
- def_statement=['simple_parameter']
941
- increment_statement=['update_expression']
942
- if_statement=['if_statement','else_clause']
943
- for_statement=['for_statement']
944
- enhanced_for_statement=['foreach_statement']
945
- while_statement=['while_statement']
946
- do_first_statement=[]
947
- states=states.copy()
948
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
949
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
950
- if root_node.type==code:
951
- return [],states
952
- elif code in states:
953
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
954
- else:
955
- if root_node.type=='identifier':
956
- states[code]=[idx]
957
- return [(code,idx,'comesFrom',[],[])],states
958
- elif root_node.type in def_statement:
959
- name=root_node.child_by_field_name('name')
960
- value=root_node.child_by_field_name('default_value')
961
- DFG=[]
962
- if value is None:
963
- indexs=tree_to_variable_index(name,index_to_code)
964
- for index in indexs:
965
- idx,code=index_to_code[index]
966
- DFG.append((code,idx,'comesFrom',[],[]))
967
- states[code]=[idx]
968
- return sorted(DFG,key=lambda x:x[1]),states
969
- else:
970
- name_indexs=tree_to_variable_index(name,index_to_code)
971
- value_indexs=tree_to_variable_index(value,index_to_code)
972
- temp,states=DFG_php(value,index_to_code,states)
973
- DFG+=temp
974
- for index1 in name_indexs:
975
- idx1,code1=index_to_code[index1]
976
- for index2 in value_indexs:
977
- idx2,code2=index_to_code[index2]
978
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
979
- states[code1]=[idx1]
980
- return sorted(DFG,key=lambda x:x[1]),states
981
- elif root_node.type in assignment:
982
- left_nodes=root_node.child_by_field_name('left')
983
- right_nodes=root_node.child_by_field_name('right')
984
- DFG=[]
985
- temp,states=DFG_php(right_nodes,index_to_code,states)
986
- DFG+=temp
987
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
988
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
989
- for index1 in name_indexs:
990
- idx1,code1=index_to_code[index1]
991
- for index2 in value_indexs:
992
- idx2,code2=index_to_code[index2]
993
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
994
- states[code1]=[idx1]
995
- return sorted(DFG,key=lambda x:x[1]),states
996
- elif root_node.type in increment_statement:
997
- DFG=[]
998
- indexs=tree_to_variable_index(root_node,index_to_code)
999
- for index1 in indexs:
1000
- idx1,code1=index_to_code[index1]
1001
- for index2 in indexs:
1002
- idx2,code2=index_to_code[index2]
1003
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1004
- states[code1]=[idx1]
1005
- return sorted(DFG,key=lambda x:x[1]),states
1006
- elif root_node.type in if_statement:
1007
- DFG=[]
1008
- current_states=states.copy()
1009
- others_states=[]
1010
- flag=False
1011
- tag=False
1012
- if 'else' in root_node.type:
1013
- tag=True
1014
- for child in root_node.children:
1015
- if 'else' in child.type:
1016
- tag=True
1017
- if child.type not in if_statement and flag is False:
1018
- temp,current_states=DFG_php(child,index_to_code,current_states)
1019
- DFG+=temp
1020
- else:
1021
- flag=True
1022
- temp,new_states=DFG_php(child,index_to_code,states)
1023
- DFG+=temp
1024
- others_states.append(new_states)
1025
- others_states.append(current_states)
1026
- new_states={}
1027
- for dic in others_states:
1028
- for key in dic:
1029
- if key not in new_states:
1030
- new_states[key]=dic[key].copy()
1031
- else:
1032
- new_states[key]+=dic[key]
1033
- for key in states:
1034
- if key not in new_states:
1035
- new_states[key]=states[key]
1036
- else:
1037
- new_states[key]+=states[key]
1038
- for key in new_states:
1039
- new_states[key]=sorted(list(set(new_states[key])))
1040
- return sorted(DFG,key=lambda x:x[1]),new_states
1041
- elif root_node.type in for_statement:
1042
- DFG=[]
1043
- for child in root_node.children:
1044
- temp,states=DFG_php(child,index_to_code,states)
1045
- DFG+=temp
1046
- flag=False
1047
- for child in root_node.children:
1048
- if flag:
1049
- temp,states=DFG_php(child,index_to_code,states)
1050
- DFG+=temp
1051
- elif child.type=="assignment_expression":
1052
- flag=True
1053
- dic={}
1054
- for x in DFG:
1055
- if (x[0],x[1],x[2]) not in dic:
1056
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1057
- else:
1058
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1059
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1060
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1061
- return sorted(DFG,key=lambda x:x[1]),states
1062
- elif root_node.type in enhanced_for_statement:
1063
- name=None
1064
- value=None
1065
- for child in root_node.children:
1066
- if child.type=='variable_name' and value is None:
1067
- value=child
1068
- elif child.type=='variable_name' and name is None:
1069
- name=child
1070
- break
1071
- body=root_node.child_by_field_name('body')
1072
- DFG=[]
1073
- for i in range(2):
1074
- temp,states=DFG_php(value,index_to_code,states)
1075
- DFG+=temp
1076
- name_indexs=tree_to_variable_index(name,index_to_code)
1077
- value_indexs=tree_to_variable_index(value,index_to_code)
1078
- for index1 in name_indexs:
1079
- idx1,code1=index_to_code[index1]
1080
- for index2 in value_indexs:
1081
- idx2,code2=index_to_code[index2]
1082
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1083
- states[code1]=[idx1]
1084
- temp,states=DFG_php(body,index_to_code,states)
1085
- DFG+=temp
1086
- dic={}
1087
- for x in DFG:
1088
- if (x[0],x[1],x[2]) not in dic:
1089
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1090
- else:
1091
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1092
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1093
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1094
- return sorted(DFG,key=lambda x:x[1]),states
1095
- elif root_node.type in while_statement:
1096
- DFG=[]
1097
- for i in range(2):
1098
- for child in root_node.children:
1099
- temp,states=DFG_php(child,index_to_code,states)
1100
- DFG+=temp
1101
- dic={}
1102
- for x in DFG:
1103
- if (x[0],x[1],x[2]) not in dic:
1104
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1105
- else:
1106
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1107
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1108
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1109
- return sorted(DFG,key=lambda x:x[1]),states
1110
- else:
1111
- DFG=[]
1112
- for child in root_node.children:
1113
- if child.type in do_first_statement:
1114
- temp,states=DFG_php(child,index_to_code,states)
1115
- DFG+=temp
1116
- for child in root_node.children:
1117
- if child.type not in do_first_statement:
1118
- temp,states=DFG_php(child,index_to_code,states)
1119
- DFG+=temp
1120
-
1121
- return sorted(DFG,key=lambda x:x[1]),states
1122
-
1123
-
1124
- def DFG_javascript(root_node,index_to_code,states):
1125
- assignment=['assignment_pattern','augmented_assignment_expression']
1126
- def_statement=['variable_declarator']
1127
- increment_statement=['update_expression']
1128
- if_statement=['if_statement','else']
1129
- for_statement=['for_statement']
1130
- enhanced_for_statement=[]
1131
- while_statement=['while_statement']
1132
- do_first_statement=[]
1133
- states=states.copy()
1134
- if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
1135
- idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
1136
- if root_node.type==code:
1137
- return [],states
1138
- elif code in states:
1139
- return [(code,idx,'comesFrom',[code],states[code].copy())],states
1140
- else:
1141
- if root_node.type=='identifier':
1142
- states[code]=[idx]
1143
- return [(code,idx,'comesFrom',[],[])],states
1144
- elif root_node.type in def_statement:
1145
- name=root_node.child_by_field_name('name')
1146
- value=root_node.child_by_field_name('value')
1147
- DFG=[]
1148
- if value is None:
1149
- indexs=tree_to_variable_index(name,index_to_code)
1150
- for index in indexs:
1151
- idx,code=index_to_code[index]
1152
- DFG.append((code,idx,'comesFrom',[],[]))
1153
- states[code]=[idx]
1154
- return sorted(DFG,key=lambda x:x[1]),states
1155
- else:
1156
- name_indexs=tree_to_variable_index(name,index_to_code)
1157
- value_indexs=tree_to_variable_index(value,index_to_code)
1158
- temp,states=DFG_javascript(value,index_to_code,states)
1159
- DFG+=temp
1160
- for index1 in name_indexs:
1161
- idx1,code1=index_to_code[index1]
1162
- for index2 in value_indexs:
1163
- idx2,code2=index_to_code[index2]
1164
- DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
1165
- states[code1]=[idx1]
1166
- return sorted(DFG,key=lambda x:x[1]),states
1167
- elif root_node.type in assignment:
1168
- left_nodes=root_node.child_by_field_name('left')
1169
- right_nodes=root_node.child_by_field_name('right')
1170
- DFG=[]
1171
- temp,states=DFG_javascript(right_nodes,index_to_code,states)
1172
- DFG+=temp
1173
- name_indexs=tree_to_variable_index(left_nodes,index_to_code)
1174
- value_indexs=tree_to_variable_index(right_nodes,index_to_code)
1175
- for index1 in name_indexs:
1176
- idx1,code1=index_to_code[index1]
1177
- for index2 in value_indexs:
1178
- idx2,code2=index_to_code[index2]
1179
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1180
- states[code1]=[idx1]
1181
- return sorted(DFG,key=lambda x:x[1]),states
1182
- elif root_node.type in increment_statement:
1183
- DFG=[]
1184
- indexs=tree_to_variable_index(root_node,index_to_code)
1185
- for index1 in indexs:
1186
- idx1,code1=index_to_code[index1]
1187
- for index2 in indexs:
1188
- idx2,code2=index_to_code[index2]
1189
- DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
1190
- states[code1]=[idx1]
1191
- return sorted(DFG,key=lambda x:x[1]),states
1192
- elif root_node.type in if_statement:
1193
- DFG=[]
1194
- current_states=states.copy()
1195
- others_states=[]
1196
- flag=False
1197
- tag=False
1198
- if 'else' in root_node.type:
1199
- tag=True
1200
- for child in root_node.children:
1201
- if 'else' in child.type:
1202
- tag=True
1203
- if child.type not in if_statement and flag is False:
1204
- temp,current_states=DFG_javascript(child,index_to_code,current_states)
1205
- DFG+=temp
1206
- else:
1207
- flag=True
1208
- temp,new_states=DFG_javascript(child,index_to_code,states)
1209
- DFG+=temp
1210
- others_states.append(new_states)
1211
- others_states.append(current_states)
1212
- if tag is False:
1213
- others_states.append(states)
1214
- new_states={}
1215
- for dic in others_states:
1216
- for key in dic:
1217
- if key not in new_states:
1218
- new_states[key]=dic[key].copy()
1219
- else:
1220
- new_states[key]+=dic[key]
1221
- for key in states:
1222
- if key not in new_states:
1223
- new_states[key]=states[key]
1224
- else:
1225
- new_states[key]+=states[key]
1226
- for key in new_states:
1227
- new_states[key]=sorted(list(set(new_states[key])))
1228
- return sorted(DFG,key=lambda x:x[1]),new_states
1229
- elif root_node.type in for_statement:
1230
- DFG=[]
1231
- for child in root_node.children:
1232
- temp,states=DFG_javascript(child,index_to_code,states)
1233
- DFG+=temp
1234
- flag=False
1235
- for child in root_node.children:
1236
- if flag:
1237
- temp,states=DFG_javascript(child,index_to_code,states)
1238
- DFG+=temp
1239
- elif child.type=="variable_declaration":
1240
- flag=True
1241
- dic={}
1242
- for x in DFG:
1243
- if (x[0],x[1],x[2]) not in dic:
1244
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1245
- else:
1246
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1247
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1248
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1249
- return sorted(DFG,key=lambda x:x[1]),states
1250
- elif root_node.type in while_statement:
1251
- DFG=[]
1252
- for i in range(2):
1253
- for child in root_node.children:
1254
- temp,states=DFG_javascript(child,index_to_code,states)
1255
- DFG+=temp
1256
- dic={}
1257
- for x in DFG:
1258
- if (x[0],x[1],x[2]) not in dic:
1259
- dic[(x[0],x[1],x[2])]=[x[3],x[4]]
1260
- else:
1261
- dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
1262
- dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
1263
- DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
1264
- return sorted(DFG,key=lambda x:x[1]),states
1265
- else:
1266
- DFG=[]
1267
- for child in root_node.children:
1268
- if child.type in do_first_statement:
1269
- temp,states=DFG_javascript(child,index_to_code,states)
1270
- DFG+=temp
1271
- for child in root_node.children:
1272
- if child.type not in do_first_statement:
1273
- temp,states=DFG_javascript(child,index_to_code,states)
1274
- DFG+=temp
1275
-
1276
- return sorted(DFG,key=lambda x:x[1]),states
1277
 
1278
  dfg_function={
1279
  'python':DFG_python,
@@ -1288,8 +22,9 @@ dfg_function={
1288
  def calc_syntax_match(references, candidate, lang):
1289
  return corpus_syntax_match([references], [candidate], lang)
1290
 
1291
- def corpus_syntax_match(references, candidates, lang):
1292
- JAVA_LANGUAGE = Language('parser/my-languages.so', lang)
 
1293
  parser = Parser()
1294
  parser.set_language(JAVA_LANGUAGE)
1295
  match_count = 0
 
1
  # Copyright (c) Microsoft Corporation.
2
  # Licensed under the MIT license.
3
 
4
+ from .parsercode.DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
5
+ from .parsercode.utils import (remove_comments_and_docstrings,
6
+ tree_to_token_index,
7
+ index_to_code_token,
8
+ tree_to_variable_index)
9
  from tree_sitter import Language, Parser
10
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  dfg_function={
13
  'python':DFG_python,
 
22
  def calc_syntax_match(references, candidate, lang):
23
  return corpus_syntax_match([references], [candidate], lang)
24
 
25
+ def corpus_syntax_match(references, candidates, lang):
26
+ curr_path = os.path.dirname(os.path.abspath(__file__))
27
+ JAVA_LANGUAGE = Language(curr_path + '/parsercode/my-languages.so', lang)
28
  parser = Parser()
29
  parser.set_language(JAVA_LANGUAGE)
30
  match_count = 0
weighted_ngram_match.py CHANGED
@@ -17,107 +17,9 @@ import sys
17
  from fractions import Fraction
18
  import warnings
19
  from collections import Counter
20
- import pdb
21
-
22
- from itertools import chain
23
-
24
- def pad_sequence(
25
- sequence,
26
- n,
27
- pad_left=False,
28
- pad_right=False,
29
- left_pad_symbol=None,
30
- right_pad_symbol=None,
31
- ):
32
- """
33
- Returns a padded sequence of items before ngram extraction.
34
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
35
- ['<s>', 1, 2, 3, 4, 5, '</s>']
36
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
37
- ['<s>', 1, 2, 3, 4, 5]
38
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
39
- [1, 2, 3, 4, 5, '</s>']
40
- :param sequence: the source data to be padded
41
- :type sequence: sequence or iter
42
- :param n: the degree of the ngrams
43
- :type n: int
44
- :param pad_left: whether the ngrams should be left-padded
45
- :type pad_left: bool
46
- :param pad_right: whether the ngrams should be right-padded
47
- :type pad_right: bool
48
- :param left_pad_symbol: the symbol to use for left padding (default is None)
49
- :type left_pad_symbol: any
50
- :param right_pad_symbol: the symbol to use for right padding (default is None)
51
- :type right_pad_symbol: any
52
- :rtype: sequence or iter
53
- """
54
- sequence = iter(sequence)
55
- if pad_left:
56
- sequence = chain((left_pad_symbol,) * (n - 1), sequence)
57
- if pad_right:
58
- sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
59
- return sequence
60
-
61
-
62
- # add a flag to pad the sequence so we get peripheral ngrams?
63
 
64
-
65
- def ngrams(
66
- sequence,
67
- n,
68
- pad_left=False,
69
- pad_right=False,
70
- left_pad_symbol=None,
71
- right_pad_symbol=None,
72
- ):
73
- """
74
- Return the ngrams generated from a sequence of items, as an iterator.
75
- For example:
76
- >>> from nltk.util import ngrams
77
- >>> list(ngrams([1,2,3,4,5], 3))
78
- [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
79
- Wrap with list for a list version of this function. Set pad_left
80
- or pad_right to true in order to get additional ngrams:
81
- >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
82
- [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
83
- >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
84
- [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
85
- >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
86
- [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
87
- >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
88
- [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
89
- :param sequence: the source data to be converted into ngrams
90
- :type sequence: sequence or iter
91
- :param n: the degree of the ngrams
92
- :type n: int
93
- :param pad_left: whether the ngrams should be left-padded
94
- :type pad_left: bool
95
- :param pad_right: whether the ngrams should be right-padded
96
- :type pad_right: bool
97
- :param left_pad_symbol: the symbol to use for left padding (default is None)
98
- :type left_pad_symbol: any
99
- :param right_pad_symbol: the symbol to use for right padding (default is None)
100
- :type right_pad_symbol: any
101
- :rtype: sequence or iter
102
- """
103
- sequence = pad_sequence(
104
- sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
105
- )
106
-
107
- history = []
108
- while n > 1:
109
- # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
110
- try:
111
- next_item = next(sequence)
112
- except StopIteration:
113
- # no more data, terminate the generator
114
- return
115
- history.append(next_item)
116
- n -= 1
117
- for item in sequence:
118
- history.append(item)
119
- yield tuple(history)
120
- del history[0]
121
 
122
 
123
  def sentence_bleu(
@@ -184,12 +86,12 @@ def sentence_bleu(
184
  :return: The sentence-level BLEU score.
185
  :rtype: float
186
  """
187
- return corpus_bleu(
188
  [references], [hypothesis], weights, smoothing_function, auto_reweigh
189
  )
190
 
191
 
192
- def corpus_bleu(
193
  list_of_references,
194
  hypotheses,
195
  weights=(0.25, 0.25, 0.25, 0.25),
 
17
  from fractions import Fraction
18
  import warnings
19
  from collections import Counter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ from .utils import ngrams
22
+ import pdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def sentence_bleu(
 
86
  :return: The sentence-level BLEU score.
87
  :rtype: float
88
  """
89
+ return corpus_weighted_ngram_match(
90
  [references], [hypothesis], weights, smoothing_function, auto_reweigh
91
  )
92
 
93
 
94
+ def corpus_weighted_ngram_match(
95
  list_of_references,
96
  hypotheses,
97
  weights=(0.25, 0.25, 0.25, 0.25),