gautam-shetty committed

Commit a5fb347
1 Parent(s): 60330fa

Initial commit

.gitignore ADDED
@@ -0,0 +1,23 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # PyBuilder
+ target/
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ .idea
+ *_venv/
+ data/
+ output*/
+
+ *.zip
Database.py ADDED
@@ -0,0 +1,44 @@
+ from dotenv import dotenv_values
+ from pymongo import MongoClient
+ from bson.objectid import ObjectId
+
+ class Database:
+
+     def __init__(self, collection_name) -> None:
+         env_values = dotenv_values(".env")
+         self.url = env_values['MONGO_CLIENT']
+         self.db_name = env_values['DB_NAME']
+         self.collection_name = collection_name
+         self.__connect_db()
+
+     def __connect_db(self):
+         client = MongoClient(self.url)
+         self.db = client[self.db_name]
+
+     def __fetch_collection(self, collection_name: str):
+         collection = self.db.get_collection(collection_name)
+         return collection
+
+     def insert_docs(self, doc_list):
+         collection = self.__fetch_collection(self.collection_name)
+         collection.insert_many(doc_list)
+
+     def find_docs(self, query, projection={}):
+         collection = self.__fetch_collection(self.collection_name)
+         return collection.find(query, projection)
+
+     def estimated_doc_count(self):
+         collection = self.__fetch_collection(self.collection_name)
+         return collection.estimated_document_count()
+
+     def update_by_id(self, doc_id, col_name: str, col_val):
+         collection = self.__fetch_collection(self.collection_name)
+         collection.update_one(
+             {"_id": ObjectId(doc_id)},
+             {"$set": {col_name: col_val}}
+         )
+
+     def update_by_field(self, match, replacement):
+         collection = self.__fetch_collection(self.collection_name)
+         # collection.update_one(match, {"$set": replacement})
+         collection.update_many(match, {"$set": replacement})
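
For orientation, here is a minimal usage sketch of the `Database` wrapper added above. It assumes a `.env` file defining `MONGO_CLIENT` and `DB_NAME` (as read in `__init__`); the collection name and sample document are hypothetical.

```python
# Minimal sketch of using the Database wrapper; assumes a .env file with
# MONGO_CLIENT and DB_NAME. The collection name and document are hypothetical.
from Database import Database

db = Database("refactoring_details")  # hypothetical collection name
db.insert_docs([{"commit": "abc123", "refactoring_type": "Extract Method"}])
for doc in db.find_docs({"refactoring_type": "Extract Method"}, {"commit": 1}):
    print(doc["_id"], doc.get("commit"))
```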
README.md CHANGED
@@ -1,13 +1,43 @@
- ---
- title: JRefactoring
- emoji: 📉
- colorFrom: red
- colorTo: blue
- sdk: streamlit
- sdk_version: 1.17.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # jRefactoring
+
+
+ # Project Overview
+
+
+
+ ## Dependencies
+
+ - keras==2.12.0
+ - matplotlib==3.7.1
+ - numpy==1.23.5
+ - pandas==1.5.3
+ - PyDriller==2.4.1
+ - pymongo==4.3.3
+ - python-dotenv==1.0.0
+ - scikit_learn==1.2.2
+ - torch==2.0.0
+ - transformers==4.27.2
+
+ ## Dataset
+
+ The dataset used in this project was collected with the SEART tool, using the following search parameters:
+
+ ![SEART search parameters](helpers/SEART.png)
+
+ This search returned 146 projects, which served as our dataset.
+
+ ## Models
+
+ ## Results
+
+ ## Instructions for Replication
+
+ ## Code Structure
+
+ ## Acknowledgments
+
+ ## Contact Information
+
+
+
+
app.py ADDED
@@ -0,0 +1,18 @@
+
+ from flask import Flask, request, render_template
+ from predict import Predict
+
+ app = Flask(__name__)
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     feature_list = request.form.to_dict()
+     result = Predict().predict(feature_list['code'])
+     return render_template('index.html', prediction_text=result)
+
+ if __name__ == "__main__":
+     app.run(debug=True, use_reloader=False, port=8080)
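
With the Flask app above running locally on port 8080, the `/predict` endpoint can be exercised outside the browser. A hedged sketch using only the standard library; the form field name `code` comes from `app.py`, and the Java snippet is arbitrary.

```python
# Sketch of posting a snippet to the /predict endpoint of the app above;
# assumes the server is already running on http://127.0.0.1:8080.
from urllib.parse import urlencode
from urllib.request import urlopen

data = urlencode({"code": "public void sleep(){ int s1 = 1; int s2 = 2; }"}).encode()
with urlopen("http://127.0.0.1:8080/predict", data=data) as resp:
    html = resp.read().decode()

# The prediction text is embedded in the rendered index.html response.
print("Is a candidate for refactoring" in html)
```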
archive/codeBert.py ADDED
@@ -0,0 +1,15 @@
+ from refactor_analysis import RefactorAnalysis
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ model_name = "huggingface/CodeBERTa-small-v1"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModel.from_pretrained(model_name)
+ tokenized_inputs = [tokenizer(file_content, return_tensors="pt") for file_content in RefactorAnalysis()._parent_child_commit_map()]
+
+
+ with torch.no_grad():
+     outputs = [model(**tokenized_input) for tokenized_input in tokenized_inputs]
+     embeddings = [output.last_hidden_state.mean(dim=1).squeeze() for output in outputs]
+ # print(RefactorAnalysis()._parent_child_commit_map())
+
+ print(embeddings[0].shape)
autoencoder.py ADDED
@@ -0,0 +1,164 @@
+ from keras.layers import Input, Dense, Flatten
+ from keras.models import Model
+ from Database import Database
+ import numpy as np, json
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error
+ from dotenv import dotenv_values
+ import pandas as pd
+ # from tensorflow.python.ops.confusion_matrix import confusion_matrix
+ from sklearn.metrics import precision_recall_fscore_support
+
+ class Autoencoder:
+
+     def __get_autoencoder(self, input_dim) -> Model:
+         input_shape = (input_dim,)
+         input_layer = Input(shape=input_shape)
+
+         # Encoder layers
+         encoder = Flatten()(input_layer)
+         encoder = Dense(128, activation='relu')(encoder)
+         encoder = Dense(64, activation='relu')(encoder)
+         # encoder = Dense(32, activation='relu')(encoder)
+
+         # Decoder layers
+         # decoder = Dense(64, activation='relu')(encoder)
+         decoder = Dense(128, activation='relu')(encoder)  # decoder
+         decoder = Dense(input_dim, activation='sigmoid')(decoder)
+
+         # Autoencoder model
+         autoencoder = Model(inputs=input_layer, outputs=decoder)
+         # autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
+         autoencoder.compile(optimizer='adam', loss='mse')
+
+         return autoencoder
+
+     def __print_summary(self, model: Model):
+         print(model.summary())
+         return
+
+     def __fit_autoencoder(self, epochs, batch_size, model: Model, train_var, valid_var=None):
+         history = model.fit(train_var, train_var,
+                             # validation_data=(valid_var, valid_var),
+                             epochs=epochs, batch_size=batch_size)
+         return history, model
+
+     def __split_train_test_val(self, data):
+         train_array, test_array = train_test_split(data, test_size=0.2, random_state=42)
+         train_array, valid_array = train_test_split(train_array, test_size=0.1, random_state=42)
+         return train_array, valid_array, test_array
+
+     @staticmethod
+     def __compute_metrics(conf_matrix):
+         precision = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[0][1])
+
+         if precision == 1:
+             print(conf_matrix)
+
+         recall = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[1][0])
+         f1 = (2 * precision * recall) / (precision + recall)
+         # print("precision: " + str(precision) + ", recall: " + str(recall) + ", f1: " + str(f1))
+         return precision, recall, f1
+
+     def __find_optimal_modified(self, error_df: pd.DataFrame, steps=50):
+         min_error, max_error = error_df["Reconstruction_error"].min(), error_df["Reconstruction_error"].max()
+         optimal_threshold = (min_error + max_error) / 2
+         y_pred = [0 if e > optimal_threshold else 1 for e in error_df.Reconstruction_error.values]
+         precision, recall, f1, _ = precision_recall_fscore_support(error_df.True_class, y_pred, average='macro')
+
+         return optimal_threshold, precision, recall, f1
+
+     def __find_optimal(self, error_df: pd.DataFrame, steps=50):
+         min_error, max_error = error_df["Reconstruction_error"].min(), error_df["Reconstruction_error"].max()
+         optimal_threshold = min_error
+         max_f1 = 0
+         max_pr = 0
+         max_re = 0
+         # step_value = (max_error - min_error) / (steps - 1)
+         for threshold in np.arange(min_error, max_error, 0.005):
+             # print("Threshold: " + str(threshold))
+             # y_pred = [1 if e > threshold else 0 for e in error_df.Reconstruction_error.values]
+             y_pred = [0 if e > threshold else 1 for e in error_df.Reconstruction_error.values]
+             # conf_matrix = confusion_matrix(error_df.True_class, y_pred)
+             # precision, recall, f1 = self.__compute_metrics(conf_matrix)
+             # precision, recall, f1, _ = precision_recall_fscore_support(error_df.True_class, y_pred, average='macro')
+             # precision, recall, f1, _ = precision_recall_fscore_support(error_df.True_class, y_pred, average='micro')
+             # precision, recall, f1, _ = precision_recall_fscore_support(error_df.True_class, y_pred, average='weighted')
+             precision, recall, f1, _ = precision_recall_fscore_support(error_df.True_class, y_pred, average='binary')
+
+             if f1 > max_f1:
+                 max_f1 = f1
+                 optimal_threshold = threshold
+                 max_pr = precision
+                 max_re = recall
+         print(f"Result optimal_threshold={optimal_threshold}, max_precision={max_pr}, max_recall={max_re}, max_f1={max_f1}")
+         # return optimal_threshold, max_pr.numpy(), max_re.numpy(), max_f1.numpy()
+         return optimal_threshold, max_pr, max_re, max_f1
+
+     @staticmethod
+     def __split_by_percent(data, percent):
+         return train_test_split(data, test_size=percent, random_state=42)
+
+
+
+     def train_autoencoder(self):
+         # GraphCodeBERT
+
+         autoencoder = self.__get_autoencoder(768)
+         self.__print_summary(autoencoder)
+
+         # Create Dataset df
+         df = pd.DataFrame(columns=['Embedding', 'True_class'])
+
+
+         # DB
+         db = Database(dotenv_values(".env")['COLLECTION_NAME'])
+         # embeddings_list = [emb["embedding"] for emb in list(db.find_docs({"refactoring_type": "Extract Method"}))]
+         pos_emb_list, neg_emb_list = [], []
+         for doc in list(db.find_docs({"refactoring_type": "Extract Method"})):
+             pos_emb_list.append(doc['embedding_pos'])
+             neg_emb_list.append(doc['embedding_neg'])
+
+         pos_emb_list_train, pos_emb_list_test = self.__split_by_percent(pos_emb_list, 0.3)
+         _, neg_emb_list_test = self.__split_by_percent(neg_emb_list, 0.3)
+
+         x_train = np.array(pos_emb_list_train)
+         x_test = np.array(pos_emb_list_test + neg_emb_list_test)
+         y_test = np.array([1 for i in range(0, len(pos_emb_list_test))] + [0 for i in range(0, len(neg_emb_list_test))])
+         # print(np.array(pos_emb_list_train).shape)
+
+         epoch = 25
+         history, trained_model = self.__fit_autoencoder(epoch, 32, autoencoder, x_train)
+         trained_model.save('./results/autoencoder_' + str(epoch) + '.hdf5')
+
+         # Test
+         test_predict = trained_model.predict(x_test)
+
+         mse = np.mean(np.power(x_test - test_predict, 2), axis=1)
+
+
+         error_df = pd.DataFrame({'Reconstruction_error': mse,
+                                  'True_class': y_test})
+
+         print("Max: ", error_df["Reconstruction_error"].max())
+         print("Min: ", error_df["Reconstruction_error"].min())
+
+         # optimal_threshold, precision, recall, f1 = self.__find_optimal(error_df, 100)
+         optimal_threshold, precision, recall, f1 = self.__find_optimal_modified(error_df, 100)
+         print(f"Result optimal_threshold={optimal_threshold}, max_precision={precision}, max_recall={recall}, max_f1={f1}")
+         metrics = {
+             "Threshold": optimal_threshold,
+             "Precision": precision,
+             "Recall": recall,
+             "F1": f1
+         }
+         with open('./results/metrics.json', 'w') as fp:
+             json.dump(metrics, fp)
+
+         plt.plot(history.history['loss'])
+
+         plt.savefig("./results/training_graph.png")
+
+ if __name__ == "__main__":
+     Autoencoder().train_autoencoder()
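
The decision rule implied by `__find_optimal_modified` (and reused in `predict.py`) is: a snippet whose reconstruction error is at or below the threshold is treated as a refactoring candidate (class 1), and one above it as a non-candidate (class 0). A small illustrative sketch; the error values are made up, and the threshold is only of the same order as the one stored in `results/metrics.json`.

```python
# Illustrative sketch of the thresholding rule; values are made up.
import numpy as np

reconstruction_errors = np.array([0.02, 0.08, 0.15, 0.30])
threshold = 0.114  # same order of magnitude as results/metrics.json
predictions = [0 if e > threshold else 1 for e in reconstruction_errors]
print(predictions)  # [1, 1, 0, 0]
```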
graphCodeBert.py ADDED
@@ -0,0 +1,56 @@
+ from transformers import AutoTokenizer, AutoModel
+
+ from Database import Database
+
+ class GraphCodeBert:
+
+     def __init__(self) -> None:
+         model_name = "microsoft/graphcodebert-base"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+
+
+     def generate_embeddings(self):
+         database = Database("refactoring_details_neg")
+         # database.connect_db()
+         # collection = database.fetch_collection("refactoring_information")
+         # collection_len = collection.estimated_document_count()
+         collection_len = database.estimated_doc_count()
+
+
+         doc_count = 1
+         for doc in database.find_docs({}, {"_id": 1, "method_refactored": 1, "meth_rf_neg": 1}):
+             doc_id = doc["_id"]
+             code_snippet = doc["method_refactored"]
+             code_snippet_neg = doc["meth_rf_neg"]
+             print(f'Generating embedding for doc_id:{doc_id} | Count-{doc_count}...')
+
+             # Compute embeddings
+             tokenized_input_pos = self.tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True)
+             output = self.model(**tokenized_input_pos)
+             embedding_pos = output.last_hidden_state.mean(dim=1).squeeze().tolist()
+
+             # Neg Embedding
+             tokenized_input_neg = self.tokenizer(code_snippet_neg, return_tensors="pt", padding=True, truncation=True)
+             output = self.model(**tokenized_input_neg)
+             embedding_neg = output.last_hidden_state.mean(dim=1).squeeze().tolist()
+
+             # Update document in MongoDB with embedding
+             database.update_by_id(doc_id, "embedding_pos", embedding_pos)
+             database.update_by_id(doc_id, "embedding_neg", embedding_neg)
+
+             collection_len -= 1
+             doc_count += 1
+             print(f'Embedding added for doc_id:{doc_id} | Remaining: {collection_len}.')
+
+     def generate_individual_embedding(self, code_snippet):
+         tokenized_input_pos = self.tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True)
+         output = self.model(**tokenized_input_pos)
+         embedding = output.last_hidden_state.mean(dim=1).squeeze().tolist()
+
+         return embedding
+
+
+
+ if __name__ == "__main__":
+     GraphCodeBert().generate_embeddings()
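
A small usage sketch of `generate_individual_embedding` above; the Java snippet is arbitrary, and the `microsoft/graphcodebert-base` weights are downloaded on first use.

```python
# Sketch of embedding a single snippet with the class above.
from graphCodeBert import GraphCodeBert

embedding = GraphCodeBert().generate_individual_embedding(
    "public int add(int a, int b) { return a + b; }"
)
print(len(embedding))  # 768-dimensional mean-pooled hidden state
```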
helpers/SEART.png ADDED
mongodb-playground/queries.mongodb ADDED
@@ -0,0 +1,4 @@
+ use('jrefactoring');
+
+ db.refactoring_information.count({})
+ // db.refactoring_information.drop()
predict.py ADDED
@@ -0,0 +1,45 @@
+ from graphCodeBert import GraphCodeBert
+ from keras.models import load_model, Model
+ import numpy as np, json
+
+ class Predict:
+
+     def __generate_code_embedding(self, code_snippet):
+         embedding = np.array(GraphCodeBert().generate_individual_embedding(code_snippet)).reshape((1, 768))
+         return embedding
+     def __calculate_loss(self, code_embedding, model_name):
+         model: Model = load_model(f'results/{model_name}.hdf5')
+         return model.evaluate(code_embedding, code_embedding)
+
+
+     def predict(self, code_snippet):
+         model_name = "autoencoder_25"
+
+         code_embedding = self.__generate_code_embedding(code_snippet)
+         print("Input code snippet shape: ", code_embedding.shape)
+         loss = self.__calculate_loss(code_embedding, model_name)
+         print("Reconstruction Loss: ", loss)
+
+         with open('./results/metrics.json', "r") as fp:
+             metric_json = json.loads(fp.read())
+
+         threshold = metric_json["Threshold"]
+
+         return "Not a candidate for refactoring" if loss > threshold else "Is a candidate for refactoring"
+
+
+
+
+ if __name__ == "__main__":
+     Predict().predict(""" public void sleep(){
+     int s1 = 1;
+     int s2 = 2;
+     int s3 = 3;
+     int s4 = 4;
+     int s5 = 5;
+     int s6 = 6;
+     int s7 = 7;
+     int s8 = 8;
+
+     }""")
+
refactor_analysis.py ADDED
@@ -0,0 +1,142 @@
+ import os, subprocess, pydriller, json, pandas as pd
+ import sys
+ from dotenv import dotenv_values
+
+ from Database import Database
+
+ class RefactorAnalysis:
+
+     def __init__(self, input_path="", output_path=""):
+         if input_path == "":
+             self.input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "refactoring-toy-example")
+         else:
+             self.input_path = input_path
+         if output_path == "":
+             self.output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "output_ref", "output.json")
+         else:
+             self.output_path = output_path
+
+
+     def generate_refactor_details(self):
+         # ref_miner_bin = os.path.join(os.path.dirname(os.path.abspath(__file__)), "executable", "RefactoringMiner", "bin")
+         ref_miner_bin = os.path.abspath("executable/RefactoringMiner/bin")
+         # command = ["cd", ref_miner_bin, "&&", "sh", "RefactoringMiner", "-a", self.input_path, "-json", self.output_path]
+         command = ["sh", "RefactoringMiner", "-a", self.input_path, "-json", self.output_path]
+         try:
+             os.chdir(ref_miner_bin)
+             shell_result = subprocess.run(command, capture_output=True, text=True)
+             shell_result.check_returncode()
+             # if shell_result != 0:
+             #     raise Exception("Couldn't analyze repository - " + self.input_path + " with RefactorMiner")
+             # return 0
+         except subprocess.CalledProcessError as error:
+             print(error)
+             sys.exit()
+
+         except Exception as e:
+             print(e)
+             return 1
+
+     def parse_json_output(self):
+         # TODO
+         # Filter for Method Refs
+         with open(self.output_path) as f:
+             json_output = json.load(f)
+
+
+         dict_output = {}
+         for obj in json_output["commits"]:
+             if len(obj["refactorings"]) == 0:
+                 continue
+             changes = []
+             se_lines = []
+             for ref in obj["refactorings"]:
+                 if "Method" not in ref["type"]:
+                     continue
+                 for parent_refs in ref["leftSideLocations"]:
+
+                     changes.append(parent_refs["filePath"])
+                     se_lines.append((parent_refs["startLine"], parent_refs["endLine"]))
+             # list_output.append(dict_output)
+             dict_output[obj["sha1"]] = {
+                 "paths": changes,
+                 "ref_start_end": se_lines,
+                 "ref_type": ref["type"]
+             }
+
+         return dict_output
+
+     def create_project_dataframe(self):
+
+         df = pd.DataFrame(columns=['commit', 'refactoring_type', 'filename', 'meth_rf_neg', 'method_refactored'])
+
+         parse_output_dict = self.parse_json_output()
+         commits_to_analyze = list(parse_output_dict.keys())
+         for commit in pydriller.Repository(self.input_path, only_commits=commits_to_analyze).traverse_commits():
+             ref_list = parse_output_dict.get(commit.hash)
+             ref_path_name = list(map(lambda x: str(x).split("/")[len(str(x).split("/")) - 1], ref_list["paths"]))
+             for cf in commit.modified_files:
+                 try:
+                     index_ref = ref_path_name.index(cf.filename)
+                 except ValueError:
+                     continue
+                 if len(cf.changed_methods) == 0:
+                     continue
+                 # Diff changed_methods against methods_before - iterating over changed_methods keeps the loop small; otherwise we would have to loop over all methods
+                 for cm in cf.changed_methods:
+
+                     if cm.start_line <= ref_list["ref_start_end"][index_ref][0] and cm.end_line >= ref_list["ref_start_end"][index_ref][1]:
+                         method_source_code = self.__split_and_extract_methods(cf.source_code_before, cm.start_line, cm.end_line)
+                         method_source_code_neg = self.__split_and_extract_methods(cf.source_code, cm.start_line, cm.end_line)
+                         class_source_code = cf.source_code_before
+
+                         # df_row = {"commit": commit.hash, "refactoring_type": ref_list["ref_type"], "filename": cf.filename, "meth_rf_neg": class_source_code, "method_refactored": method_source_code}
+                         df_row = {"commit": commit.hash, "refactoring_type": ref_list["ref_type"], "filename": cf.filename, "meth_rf_neg": method_source_code_neg, "method_refactored": method_source_code}
+                         df.loc[len(df)] = df_row
+         return df
+
+
+     def __split_and_extract_methods(self, source_code, start_line, end_line):
+         source_code_lines = str(source_code).splitlines()
+         return "\n".join(source_code_lines[start_line - 1:end_line])
+
+ def main():
+     if not os.path.exists("data/repos/"):
+         try:
+             print("Starting repo download")
+             repo_script = subprocess.run(["python", "repo_download.py"], capture_output=True, text=True)
+             repo_script.check_returncode()
+         except subprocess.CalledProcessError as err:
+             print(err)
+             sys.exit(1)
+         print("Repo Download Completed")
+     lst_repos = next(os.walk("data/repos/"))[1]
+     print(len(lst_repos))
+
+     cwd = os.path.dirname(os.path.abspath(__file__))
+     final_df = pd.DataFrame(columns=['commit', 'refactoring_type', 'filename', 'meth_rf_neg', 'method_refactored'])
+     database = Database(dotenv_values(".env")['COLLECTION_NAME'])
+     # database.connect_db()
+     count = 1
+     batch_size = 5
+     for idx, repo in enumerate(lst_repos):
+         os.chdir(cwd)
+         try:
+             ref_obj = RefactorAnalysis(os.path.abspath(os.path.join("data/repos", repo)), os.path.abspath(os.path.join("output_ref", repo + ".json")))
+             # ref_miner = ref_obj.generate_refactor_details()  # Modify
+             df = ref_obj.create_project_dataframe()
+         except Exception as e:
+             print(e)
+             continue
+
+         final_df = pd.concat([final_df, df], ignore_index=True)
+         if count == batch_size or idx == len(lst_repos) - 1:
+             print("Inserting into DB", idx)
+             database.insert_docs(final_df.to_dict(orient="records"))
+             final_df = pd.DataFrame(columns=['commit', 'refactoring_type', 'filename', 'meth_rf_neg', 'method_refactored'])
+             count = 1
+         else:
+             count += 1
+
+ if __name__ == "__main__":
+     main()
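
The mining pipeline implied by `main()` above runs in three stages: clone the SEART repositories (`repo_download.py`), mine each clone with RefactoringMiner into `output_ref/<repo>.json`, and convert the JSON reports plus PyDriller diffs into documents inserted into MongoDB in batches of five. A hedged single-repository sketch; the repository folder name is hypothetical, and it assumes the RefactoringMiner executable sits under `executable/RefactoringMiner/bin` as the code expects.

```python
# Hedged sketch of mining one repository with the class above; the repo
# folder is hypothetical and executable/RefactoringMiner/bin must exist.
import os
from refactor_analysis import RefactorAnalysis

repo_path = os.path.abspath("data/repos/some_user_some_repo")       # hypothetical clone
out_path = os.path.abspath("output_ref/some_user_some_repo.json")

analysis = RefactorAnalysis(repo_path, out_path)
analysis.generate_refactor_details()      # runs RefactoringMiner, writes the JSON report
df = analysis.create_project_dataframe()  # pairs each refactored method with its post-commit version
print(df[["commit", "refactoring_type", "filename"]].head())
```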
repo_download.py ADDED
@@ -0,0 +1,33 @@
+ import os, subprocess, csv
+
+ def download_repo(repo_name, repos_base_path):
+     repo_fullname = repo_name.strip('\n')
+     if not repo_fullname == "":
+         project_url = "https://github.com/" + repo_fullname + ".git"
+         folder_name = repo_fullname.replace("/", "_")
+         folder_path_new = os.path.join(repos_base_path, folder_name)
+
+         if not os.path.exists(folder_path_new):
+             _download_with_url(project_url, folder_path_new)
+         else:
+             print(folder_name + " already exists. skipping ...")
+
+ def _download_with_url(project_url, folder_path):
+     if not os.path.exists(folder_path):
+         os.makedirs(folder_path)
+     print("cloning... " + project_url)
+     try:
+         # full clone (no --depth flag): the later commit traversal needs the full history
+         subprocess.call(["git", "clone", project_url, folder_path])
+     except Exception as ex:
+         print("Exception occurred!! " + str(ex))
+         return
+     print("cloning done.")
+
+ if __name__ == '__main__':
+     print("Starting repo download")
+     with open("data/repos.csv") as repo_file:
+         reader = csv.reader(repo_file, delimiter=',')
+         next(reader)
+         for line in reader:
+             download_repo(line[0], 'data/repos')
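
The script above expects `data/repos.csv` to contain a header row followed by one repository full name per row in the first column. A hypothetical sketch that writes such a file:

```python
# Hypothetical example of the data/repos.csv layout read by the script above:
# a header row (skipped by next(reader)), then one "owner/repo" name per row.
import csv, os

os.makedirs("data", exist_ok=True)
with open("data/repos.csv", "w", newline="") as fp:
    writer = csv.writer(fp)
    writer.writerow(["name"])                 # header row
    writer.writerow(["apache/commons-lang"])  # hypothetical repository
```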
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ Flask==2.2.3
+ keras==2.12.0
+ matplotlib==3.7.1
+ numpy==1.23.5
+ pandas==1.5.3
+ PyDriller==2.4.1
+ pymongo==4.3.3
+ python-dotenv==1.0.0
+ scikit_learn==1.2.2
+ torch==2.0.0
+ transformers==4.27.2
results/metrics.json ADDED
@@ -0,0 +1 @@
+ {"Threshold": 0.11413681637927062, "Precision": 0.7785714285714286, "Recall": 0.6025641025641025, "F1": 0.5280109310950615}
results/training_graph.png ADDED
templates/css/main.css ADDED
@@ -0,0 +1,76 @@
+ body {
+     margin: 0;
+     padding: 0;
+     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+     color: #444;
+ }
+ /*
+  * Formatting the header area
+  */
+ header {
+     background-color: #DFB887;
+     height: 35px;
+     width: 100%;
+     opacity: .9;
+     margin-bottom: 10px;
+ }
+ header h1.logo {
+     margin: 0;
+     font-size: 1.7em;
+     color: #fff;
+     text-transform: uppercase;
+     float: left;
+ }
+ header h1.logo:hover {
+     color: #fff;
+     text-decoration: none;
+ }
+ /*
+  * Centering the body content
+  */
+ .container {
+     width: 1200px;
+     margin: 0 auto;
+ }
+ div.home {
+     padding: 10px 0 30px 0;
+     background-color: #E6E6FA;
+     -webkit-border-radius: 6px;
+     -moz-border-radius: 6px;
+     border-radius: 6px;
+ }
+ div.about {
+     padding: 10px 0 30px 0;
+     background-color: #E6E6FA;
+     -webkit-border-radius: 6px;
+     -moz-border-radius: 6px;
+     border-radius: 6px;
+ }
+ h2 {
+     font-size: 3em;
+     margin-top: 40px;
+     text-align: center;
+     letter-spacing: -2px;
+ }
+ h3 {
+     font-size: 1.7em;
+     font-weight: 100;
+     margin-top: 30px;
+     text-align: center;
+     letter-spacing: -1px;
+     color: #999;
+ }
+ .menu {
+     float: right;
+     margin-top: 8px;
+ }
+ .menu li {
+     display: inline;
+ }
+ .menu li + li {
+     margin-left: 35px;
+ }
+ .menu li a {
+     color: #444;
+     text-decoration: none;
+ }
templates/index.html ADDED
@@ -0,0 +1,29 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset="UTF-8">
+     <title>jRefactoring</title>
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <link rel="stylesheet" href="{{ url_for('static', filename='css/main.css') }}">
+     <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
+     <script src="https://cdn.jsdelivr.net/npm/popper.js@1.14.7/dist/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
+     <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.3.1/dist/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
+     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.3.1/dist/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
+ </head>
+
+ <body>
+     <div class="container">
+         <div class="prediction">
+             <h1><a href="https://git.cs.dal.ca/gshetty/jrefactoring.git">jRefactoring</a></h1>
+             <form action="{{ url_for('predict') }}" method="post">
+                 <p>Source Code:</p>
+                 <p>Enter your code snippet (Java)</p>
+                 <code><textarea type="textbox" id="code" name="code" required="required" rows="10" cols="100" autofocus autocomplete="off"></textarea></code><br>
+                 <button type="submit" class="btn btn-primary btn-large">Predict</button>
+             </form>
+             <br>
+             <b>{{ prediction_text }}</b>
+         </div>
+     </div>
+ </body>
+ </html>
train.png ADDED