Lazyhope committed
Commit d0e1e46
1 Parent(s): f820bce

Add pipeline for clone detection

Files changed (3)
  1. clone_detection_pipeline.py +180 -0
  2. config.json +19 -1
  3. tokenizer_config.json +1 -1
clone_detection_pipeline.py ADDED
@@ -0,0 +1,180 @@
+ """
+ Original work:
+ https://github.com/sangHa0411/CloneDetection/blob/main/utils/preprocessor.py
+
+ Copyright (c) 2022 Sangha Park(sangha110495), Young Jin Ahn(snoop2head)
+
+ All credits to the original authors.
+ """
+ import re
+
+ import torch
+ from transformers import Pipeline
+
+
+ class FunctionPreprocessor:
+     """Removes helper functions that are defined but never called."""
+
+     def get_function(self, code):
+         results = []
+         fn_list = re.findall(r"\ndef [a-zA-Z0-9_]+\(", code)
+
+         for fn in fn_list:
+             results.append(fn[4:-1].strip())
+         return results
+
+     def determine_function(self, code, function_name):
+         # A name that occurs at most once appears only in its own definition
+         num = len(re.findall("[^a-zA-Z]" + function_name + "[^a-zA-Z]", code))
+         return num > 1
+
+     def delete_function(self, code, name):
+         start_id, _ = re.search("def " + name, code).span()
+         ptr = start_id
+
+         # Advance to the next line that starts a new top-level block
+         while ptr < len(code) - 1:
+             if code[ptr] == "\n" and re.search("[a-zA-Z]", code[ptr + 1]) is not None:
+                 break
+             ptr += 1
+
+         if ptr != len(code) - 1:
+             end_id = ptr
+             code = code[:start_id] + code[end_id:]
+
+         return code
+
+     def preprocess(self, code):
+         code = "\n" + code
+         fn_list = self.get_function(code)
+         if len(fn_list) == 0:
+             return code
+
+         for fn in fn_list:
+             if not self.determine_function(code, fn):
+                 code = self.delete_function(code, fn)
+
+         return code
+
+
+ class AnnotationPreprocessor:
+     """Strips docstrings, comments, and import statements."""
+
+     def search(self, sen_list, string):
+         for i, sen in enumerate(sen_list):
+             if string in sen:
+                 return i
+         return -1
+
+     def delete_annotation_block(self, code, string):
+         sens = code.split("\n")
+
+         start_id = self.search(sens, string)
+         end_id = self.search(sens[start_id + 1 :], string)
+         if end_id != -1:
+             end_id += start_id + 1
+             code = sens[:start_id] + sens[end_id + 1 :]
+         else:
+             code = sens[:start_id] + sens[start_id + 1 :]
+
+         return "\n".join(code)
+
+     def delete_block(self, code, string):
+         while string in code:
+             code = self.delete_annotation_block(code, string)
+         return code
+
+     def delete_annotation(self, code):
+         sens_processed = []
+         for sen in code.split("\n"):
+             if "#" in sen:
+                 sen = sen[: sen.index("#")]
+             sens_processed.append(sen)
+
+         return "\n".join(sens_processed)
+
+     def delete_import(self, code):
+         sens_processed = [sen for sen in code.split("\n") if "import" not in sen]
+         return "\n".join(sens_processed)
+
+     def preprocess(self, code):
+         code = self.delete_block(code, '"""')
+         code = self.delete_block(code, "'''")
+         code = self.delete_annotation(code)
+         code = self.delete_import(code)
+         code = re.sub(r"\s+", " ", code).strip()
+         return code
+
+
+ def preprocessor(code, instance):
+     # Fall back to the raw code if preprocessing strips everything away
+     processed_code = instance.preprocess(code)
+     return processed_code if processed_code.strip() else code
+
+
+ def token_to_inputs(feature):
+     # Turn the tokenizer's lists into batched tensors of shape (1, seq_len)
+     return {k: torch.tensor(v).unsqueeze(0) for k, v in feature.items()}
+
+
+ class CloneDetectionPipeline(Pipeline):
+     fn_preprocessor = FunctionPreprocessor()
+     an_preprocessor = AnnotationPreprocessor()
+
+     def _sanitize_parameters(self, **kwargs):
+         return {}, {}, {}
+
+     def preprocess(self, inputs):
+         code1 = inputs[0]
+         code2 = inputs[1]
+         if code1.strip() == "" or code2.strip() == "":
+             # Empty snippets are clones only if both are empty
+             true_prob = float(code1.strip() == code2.strip())
+             return {"skip": True, "output": {False: 1 - true_prob, True: true_prob}}
+
+         code1 = preprocessor(
+             preprocessor(code1, self.fn_preprocessor), self.an_preprocessor
+         )
+         code2 = preprocessor(
+             preprocessor(code2, self.fn_preprocessor), self.an_preprocessor
+         )
+
+         # Encode both orderings so the prediction is symmetric in its inputs
+         feature1 = self.tokenizer(
+             code1, code2, max_length=512, return_token_type_ids=False, truncation=True
+         )
+         feature2 = self.tokenizer(
+             code2, code1, max_length=512, return_token_type_ids=False, truncation=True
+         )
+
+         return {
+             "inputs1": token_to_inputs(feature1),
+             "inputs2": token_to_inputs(feature2),
+         }
+
+     def _forward(self, model_inputs):
+         if model_inputs.get("skip", False):
+             return model_inputs
+
+         # Average logits over the two input orderings
+         logits1 = self.model(**model_inputs["inputs1"]).logits[0]
+         logits2 = self.model(**model_inputs["inputs2"]).logits[0]
+         logits = (logits1 + logits2) / 2
+
+         return {"logits": logits}
+
+     def postprocess(self, model_outputs):
+         if model_outputs.get("skip", False):
+             return model_outputs["output"]
+
+         probs = model_outputs["logits"].softmax(-1).tolist()
+         return {False: probs[0], True: probs[1]}
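
With the custom pipeline registered in config.json (see the next file), the model can be used end to end from the Hub. A minimal usage sketch, assuming the standard transformers custom-pipeline loading path; the two input snippets are made-up examples:

from transformers import pipeline

# trust_remote_code is required because CloneDetectionPipeline lives in this repo
pipe = pipeline(
    "python-clone-detection",
    model="Lazyhope/python-clone-detection",
    trust_remote_code=True,
)

code1 = "def add(a, b):\n    return a + b"
code2 = "def sum_two(x, y):\n    return x + y"

# The pipeline takes a pair of code strings and maps False/True to probabilities
result = pipe((code1, code2))
print(result)  # e.g. {False: 0.03, True: 0.97}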
config.json CHANGED
@@ -1,5 +1,5 @@
  {
-   "_name_or_path": "microsoft/graphcodebert-base",
+   "_name_or_path": "Lazyhope/python-clone-detection",
    "architectures": [
      "CloneDetectionModel"
    ],
@@ -9,6 +9,24 @@
    },
    "bos_token_id": 0,
    "classifier_dropout": null,
+   "custom_pipelines": {
+     "python-clone-detection": {
+       "default": {
+         "model": {
+           "pt": [
+             "Lazyhope/python-clone-detection",
+             "main"
+           ]
+         }
+       },
+       "impl": "clone_detection_pipeline.CloneDetectionPipeline",
+       "pt": [
+         "AutoModel"
+       ],
+       "tf": [],
+       "type": "text"
+     }
+   },
    "dropout_rate": 0.1,
    "eos_token_id": 2,
    "gradient_checkpointing": false,
tokenizer_config.json CHANGED
@@ -34,7 +34,7 @@
      "single_word": false
    },
    "model_max_length": 512,
-   "name_or_path": "microsoft/graphcodebert-base",
+   "name_or_path": "Lazyhope/python-clone-detection",
    "pad_token": {
      "__type": "AddedToken",
      "content": "<pad>",