# Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from tree_sitter import Language, Parser import re from io import StringIO import tokenize def remove_comments_and_docstrings(source,lang): if lang in ['python']: """ Returns 'source' minus comments and docstrings. """ io_obj = StringIO(source) out = "" prev_toktype = tokenize.INDENT last_lineno = -1 last_col = 0 for tok in tokenize.generate_tokens(io_obj.readline): token_type = tok[0] token_string = tok[1] start_line, start_col = tok[2] end_line, end_col = tok[3] ltext = tok[4] if start_line > last_lineno: last_col = 0 if start_col > last_col: out += (" " * (start_col - last_col)) # Remove comments: if token_type == tokenize.COMMENT: pass # This series of conditionals removes docstrings: elif token_type == tokenize.STRING: if prev_toktype != tokenize.INDENT: # This is likely a docstring; double-check we're not inside an operator: if prev_toktype != tokenize.NEWLINE: if start_col > 0: out += token_string else: out += token_string prev_toktype = token_type last_col = end_col last_lineno = end_line temp=[] for x in out.split('\n'): if x.strip()!="": temp.append(x) return '\n'.join(temp) elif lang in ['ruby']: return source else: def replacer(match): s = match.group(0) if s.startswith('/'): return " " # note: a space and not an empty string else: return s pattern = re.compile( r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE ) temp=[] for x in re.sub(pattern, replacer, source).split('\n'): if x.strip()!="": temp.append(x) return '\n'.join(temp) def tree_to_token_index(root_node): if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': return [(root_node.start_point,root_node.end_point)] else: code_tokens=[] for child in root_node.children: code_tokens+=tree_to_token_index(child) return code_tokens def tree_to_variable_index(root_node,index_to_code): if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': index=(root_node.start_point,root_node.end_point) _,code=index_to_code[index] if root_node.type!=code: return [(root_node.start_point,root_node.end_point)] else: return [] else: code_tokens=[] for child in root_node.children: code_tokens+=tree_to_variable_index(child,index_to_code) return code_tokens def index_to_code_token(index,code): start_point=index[0] end_point=index[1] if start_point[0]==end_point[0]: s=code[start_point[0]][start_point[1]:end_point[1]] else: s="" s+=code[start_point[0]][start_point[1]:] for i in range(start_point[0]+1,end_point[0]): s+=code[i] s+=code[end_point[0]][:end_point[1]] return s def DFG_python(root_node,index_to_code,states): assignment=['assignment','augmented_assignment','for_in_clause'] if_statement=['if_statement'] for_statement=['for_statement'] while_statement=['while_statement'] do_first_statement=['for_in_clause'] def_statement=['default_parameter'] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_python(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: if root_node.type=='for_in_clause': right_nodes=[root_node.children[-1]] left_nodes=[root_node.child_by_field_name('left')] else: if root_node.child_by_field_name('right') is None: return [],states left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] if len(right_nodes)!=len(left_nodes): left_nodes=[root_node.child_by_field_name('left')] right_nodes=[root_node.child_by_field_name('right')] if len(left_nodes)==0: left_nodes=[root_node.child_by_field_name('left')] if len(right_nodes)==0: right_nodes=[root_node.child_by_field_name('right')] DFG=[] for node in right_nodes: temp,states=DFG_python(node,index_to_code,states) DFG+=temp for left_node,right_node in zip(left_nodes,right_nodes): left_tokens_index=tree_to_variable_index(left_node,index_to_code) right_tokens_index=tree_to_variable_index(right_node,index_to_code) temp=[] for token1_index in left_tokens_index: idx1,code1=index_to_code[token1_index] temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1]=[idx1] DFG+=temp return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in ['elif_clause','else_clause']: temp,current_states=DFG_python(child,index_to_code,current_states) DFG+=temp else: temp,new_states=DFG_python(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for i in range(2): right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] if len(right_nodes)!=len(left_nodes): left_nodes=[root_node.child_by_field_name('left')] right_nodes=[root_node.child_by_field_name('right')] if len(left_nodes)==0: left_nodes=[root_node.child_by_field_name('left')] if len(right_nodes)==0: right_nodes=[root_node.child_by_field_name('right')] for node in right_nodes: temp,states=DFG_python(node,index_to_code,states) DFG+=temp for left_node,right_node in zip(left_nodes,right_nodes): left_tokens_index=tree_to_variable_index(left_node,index_to_code) right_tokens_index=tree_to_variable_index(right_node,index_to_code) temp=[] for token1_index in left_tokens_index: idx1,code1=index_to_code[token1_index] temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1]=[idx1] DFG+=temp if root_node.children[-1].type=="block": temp,states=DFG_python(root_node.children[-1],index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_python(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_python(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_python(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_java(root_node,index_to_code,states): assignment=['assignment_expression'] def_statement=['variable_declarator'] increment_statement=['update_expression'] if_statement=['if_statement','else'] for_statement=['for_statement'] enhanced_for_statement=['enhanced_for_statement'] while_statement=['while_statement'] do_first_statement=[] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_java(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=root_node.child_by_field_name('left') right_nodes=root_node.child_by_field_name('right') DFG=[] temp,states=DFG_java(right_nodes,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(left_nodes,index_to_code) value_indexs=tree_to_variable_index(right_nodes,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in increment_statement: DFG=[] indexs=tree_to_variable_index(root_node,index_to_code) for index1 in indexs: idx1,code1=index_to_code[index1] for index2 in indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] flag=False tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement and flag is False: temp,current_states=DFG_java(child,index_to_code,current_states) DFG+=temp else: flag=True temp,new_states=DFG_java(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for child in root_node.children: temp,states=DFG_java(child,index_to_code,states) DFG+=temp flag=False for child in root_node.children: if flag: temp,states=DFG_java(child,index_to_code,states) DFG+=temp elif child.type=="local_variable_declaration": flag=True dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in enhanced_for_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') body=root_node.child_by_field_name('body') DFG=[] for i in range(2): temp,states=DFG_java(value,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] temp,states=DFG_java(body,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_java(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_java(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_java(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_csharp(root_node,index_to_code,states): assignment=['assignment_expression'] def_statement=['variable_declarator'] increment_statement=['postfix_unary_expression'] if_statement=['if_statement','else'] for_statement=['for_statement'] enhanced_for_statement=['for_each_statement'] while_statement=['while_statement'] do_first_statement=[] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: if len(root_node.children)==2: name=root_node.children[0] value=root_node.children[1] else: name=root_node.children[0] value=None DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_csharp(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=root_node.child_by_field_name('left') right_nodes=root_node.child_by_field_name('right') DFG=[] temp,states=DFG_csharp(right_nodes,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(left_nodes,index_to_code) value_indexs=tree_to_variable_index(right_nodes,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in increment_statement: DFG=[] indexs=tree_to_variable_index(root_node,index_to_code) for index1 in indexs: idx1,code1=index_to_code[index1] for index2 in indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] flag=False tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement and flag is False: temp,current_states=DFG_csharp(child,index_to_code,current_states) DFG+=temp else: flag=True temp,new_states=DFG_csharp(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for child in root_node.children: temp,states=DFG_csharp(child,index_to_code,states) DFG+=temp flag=False for child in root_node.children: if flag: temp,states=DFG_csharp(child,index_to_code,states) DFG+=temp elif child.type=="local_variable_declaration": flag=True dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in enhanced_for_statement: name=root_node.child_by_field_name('left') value=root_node.child_by_field_name('right') body=root_node.child_by_field_name('body') DFG=[] for i in range(2): temp,states=DFG_csharp(value,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] temp,states=DFG_csharp(body,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_csharp(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_csharp(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_csharp(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_ruby(root_node,index_to_code,states): assignment=['assignment','operator_assignment'] if_statement=['if','elsif','else','unless','when'] for_statement=['for'] while_statement=['while_modifier','until'] do_first_statement=[] def_statement=['keyword_parameter'] if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': states=states.copy() idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_ruby(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] if len(right_nodes)!=len(left_nodes): left_nodes=[root_node.child_by_field_name('left')] right_nodes=[root_node.child_by_field_name('right')] if len(left_nodes)==0: left_nodes=[root_node.child_by_field_name('left')] if len(right_nodes)==0: right_nodes=[root_node.child_by_field_name('right')] if root_node.type=="operator_assignment": left_nodes=[root_node.children[0]] right_nodes=[root_node.children[-1]] DFG=[] for node in right_nodes: temp,states=DFG_ruby(node,index_to_code,states) DFG+=temp for left_node,right_node in zip(left_nodes,right_nodes): left_tokens_index=tree_to_variable_index(left_node,index_to_code) right_tokens_index=tree_to_variable_index(right_node,index_to_code) temp=[] for token1_index in left_tokens_index: idx1,code1=index_to_code[token1_index] temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1]=[idx1] DFG+=temp return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement: temp,current_states=DFG_ruby(child,index_to_code,current_states) DFG+=temp else: temp,new_states=DFG_ruby(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for i in range(2): left_nodes=[root_node.child_by_field_name('pattern')] right_nodes=[root_node.child_by_field_name('value')] assert len(right_nodes)==len(left_nodes) for node in right_nodes: temp,states=DFG_ruby(node,index_to_code,states) DFG+=temp for left_node,right_node in zip(left_nodes,right_nodes): left_tokens_index=tree_to_variable_index(left_node,index_to_code) right_tokens_index=tree_to_variable_index(right_node,index_to_code) temp=[] for token1_index in left_tokens_index: idx1,code1=index_to_code[token1_index] temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1]=[idx1] DFG+=temp temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_ruby(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_ruby(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_ruby(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_go(root_node,index_to_code,states): assignment=['assignment_statement',] def_statement=['var_spec'] increment_statement=['inc_statement'] if_statement=['if_statement','else'] for_statement=['for_statement'] enhanced_for_statement=[] while_statement=[] do_first_statement=[] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_go(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=root_node.child_by_field_name('left') right_nodes=root_node.child_by_field_name('right') DFG=[] temp,states=DFG_go(right_nodes,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(left_nodes,index_to_code) value_indexs=tree_to_variable_index(right_nodes,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in increment_statement: DFG=[] indexs=tree_to_variable_index(root_node,index_to_code) for index1 in indexs: idx1,code1=index_to_code[index1] for index2 in indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] flag=False tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement and flag is False: temp,current_states=DFG_go(child,index_to_code,current_states) DFG+=temp else: flag=True temp,new_states=DFG_go(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in states: if key not in new_states: new_states[key]=states[key] else: new_states[key]+=states[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for child in root_node.children: temp,states=DFG_go(child,index_to_code,states) DFG+=temp flag=False for child in root_node.children: if flag: temp,states=DFG_go(child,index_to_code,states) DFG+=temp elif child.type=="for_clause": if child.child_by_field_name('update') is not None: temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states) DFG+=temp flag=True dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_go(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_go(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_php(root_node,index_to_code,states): assignment=['assignment_expression','augmented_assignment_expression'] def_statement=['simple_parameter'] increment_statement=['update_expression'] if_statement=['if_statement','else_clause'] for_statement=['for_statement'] enhanced_for_statement=['foreach_statement'] while_statement=['while_statement'] do_first_statement=[] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('default_value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_php(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=root_node.child_by_field_name('left') right_nodes=root_node.child_by_field_name('right') DFG=[] temp,states=DFG_php(right_nodes,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(left_nodes,index_to_code) value_indexs=tree_to_variable_index(right_nodes,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in increment_statement: DFG=[] indexs=tree_to_variable_index(root_node,index_to_code) for index1 in indexs: idx1,code1=index_to_code[index1] for index2 in indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] flag=False tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement and flag is False: temp,current_states=DFG_php(child,index_to_code,current_states) DFG+=temp else: flag=True temp,new_states=DFG_php(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in states: if key not in new_states: new_states[key]=states[key] else: new_states[key]+=states[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for child in root_node.children: temp,states=DFG_php(child,index_to_code,states) DFG+=temp flag=False for child in root_node.children: if flag: temp,states=DFG_php(child,index_to_code,states) DFG+=temp elif child.type=="assignment_expression": flag=True dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in enhanced_for_statement: name=None value=None for child in root_node.children: if child.type=='variable_name' and value is None: value=child elif child.type=='variable_name' and name is None: name=child break body=root_node.child_by_field_name('body') DFG=[] for i in range(2): temp,states=DFG_php(value,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] temp,states=DFG_php(body,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_php(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_php(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_php(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states def DFG_javascript(root_node,index_to_code,states): assignment=['assignment_pattern','augmented_assignment_expression'] def_statement=['variable_declarator'] increment_statement=['update_expression'] if_statement=['if_statement','else'] for_statement=['for_statement'] enhanced_for_statement=[] while_statement=['while_statement'] do_first_statement=[] states=states.copy() if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': idx,code=index_to_code[(root_node.start_point,root_node.end_point)] if root_node.type==code: return [],states elif code in states: return [(code,idx,'comesFrom',[code],states[code].copy())],states else: if root_node.type=='identifier': states[code]=[idx] return [(code,idx,'comesFrom',[],[])],states elif root_node.type in def_statement: name=root_node.child_by_field_name('name') value=root_node.child_by_field_name('value') DFG=[] if value is None: indexs=tree_to_variable_index(name,index_to_code) for index in indexs: idx,code=index_to_code[index] DFG.append((code,idx,'comesFrom',[],[])) states[code]=[idx] return sorted(DFG,key=lambda x:x[1]),states else: name_indexs=tree_to_variable_index(name,index_to_code) value_indexs=tree_to_variable_index(value,index_to_code) temp,states=DFG_javascript(value,index_to_code,states) DFG+=temp for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in assignment: left_nodes=root_node.child_by_field_name('left') right_nodes=root_node.child_by_field_name('right') DFG=[] temp,states=DFG_javascript(right_nodes,index_to_code,states) DFG+=temp name_indexs=tree_to_variable_index(left_nodes,index_to_code) value_indexs=tree_to_variable_index(right_nodes,index_to_code) for index1 in name_indexs: idx1,code1=index_to_code[index1] for index2 in value_indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in increment_statement: DFG=[] indexs=tree_to_variable_index(root_node,index_to_code) for index1 in indexs: idx1,code1=index_to_code[index1] for index2 in indexs: idx2,code2=index_to_code[index2] DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) states[code1]=[idx1] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in if_statement: DFG=[] current_states=states.copy() others_states=[] flag=False tag=False if 'else' in root_node.type: tag=True for child in root_node.children: if 'else' in child.type: tag=True if child.type not in if_statement and flag is False: temp,current_states=DFG_javascript(child,index_to_code,current_states) DFG+=temp else: flag=True temp,new_states=DFG_javascript(child,index_to_code,states) DFG+=temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states={} for dic in others_states: for key in dic: if key not in new_states: new_states[key]=dic[key].copy() else: new_states[key]+=dic[key] for key in states: if key not in new_states: new_states[key]=states[key] else: new_states[key]+=states[key] for key in new_states: new_states[key]=sorted(list(set(new_states[key]))) return sorted(DFG,key=lambda x:x[1]),new_states elif root_node.type in for_statement: DFG=[] for child in root_node.children: temp,states=DFG_javascript(child,index_to_code,states) DFG+=temp flag=False for child in root_node.children: if flag: temp,states=DFG_javascript(child,index_to_code,states) DFG+=temp elif child.type=="variable_declaration": flag=True dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states elif root_node.type in while_statement: DFG=[] for i in range(2): for child in root_node.children: temp,states=DFG_javascript(child,index_to_code,states) DFG+=temp dic={} for x in DFG: if (x[0],x[1],x[2]) not in dic: dic[(x[0],x[1],x[2])]=[x[3],x[4]] else: dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] return sorted(DFG,key=lambda x:x[1]),states else: DFG=[] for child in root_node.children: if child.type in do_first_statement: temp,states=DFG_javascript(child,index_to_code,states) DFG+=temp for child in root_node.children: if child.type not in do_first_statement: temp,states=DFG_javascript(child,index_to_code,states) DFG+=temp return sorted(DFG,key=lambda x:x[1]),states dfg_function={ 'python':DFG_python, 'java':DFG_java, 'ruby':DFG_ruby, 'go':DFG_go, 'php':DFG_php, 'javascript':DFG_javascript, 'c_sharp':DFG_csharp, } def calc_syntax_match(references, candidate, lang): return corpus_syntax_match([references], [candidate], lang) def corpus_syntax_match(references, candidates, lang): JAVA_LANGUAGE = Language('parser/my-languages.so', lang) parser = Parser() parser.set_language(JAVA_LANGUAGE) match_count = 0 total_count = 0 for i in range(len(candidates)): references_sample = references[i] candidate = candidates[i] for reference in references_sample: try: candidate=remove_comments_and_docstrings(candidate,'java') except: pass try: reference=remove_comments_and_docstrings(reference,'java') except: pass candidate_tree = parser.parse(bytes(candidate,'utf8')).root_node reference_tree = parser.parse(bytes(reference,'utf8')).root_node def get_all_sub_trees(root_node): node_stack = [] sub_tree_sexp_list = [] depth = 1 node_stack.append([root_node, depth]) while len(node_stack) != 0: cur_node, cur_depth = node_stack.pop() sub_tree_sexp_list.append([cur_node.sexp(), cur_depth]) for child_node in cur_node.children: if len(child_node.children) != 0: depth = cur_depth + 1 node_stack.append([child_node, depth]) return sub_tree_sexp_list cand_sexps = [x[0] for x in get_all_sub_trees(candidate_tree)] ref_sexps = get_all_sub_trees(reference_tree) # print(cand_sexps) # print(ref_sexps) for sub_tree, depth in ref_sexps: if sub_tree in cand_sexps: match_count += 1 total_count += len(ref_sexps) score = match_count / total_count return score