Spaces:

MickyMike
/

AIBugHunter

Sleeping

App Files Files Community

MickyMike committed on Jul 24, 2023

Commit

f513a95

•

1 Parent(s): ee0fa10

Upload 14 files

Browse files

Files changed (12) hide show

app.py +89 -0
data/process.py +39 -0
data/test.csv +0 -0
models/statement_t5_model.bin +3 -0
requirements.txt +6 -0
statement_t5.py +78 -0
statement_t5_tokenizer/merges.txt +0 -0
statement_t5_tokenizer/special_tokens_map.json +753 -0
statement_t5_tokenizer/tokenizer_config.json +64 -0
statement_t5_tokenizer/vocab.json +0 -0
t5_config.json +68 -0
utils.py +192 -0

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import streamlit as st
+import os
+import pandas as pd
+from utils import *
+PATH = os.getcwd()
+# Upper bound on how many statements of one function are scanned and reported.
+MAX_NUM_STATEMENTS = 155
+def _scan_and_report(input_code):
+    """Scan one C/C++ function and render the verdict plus any flagged lines.
+    Output is written to whichever Streamlit container is active at the call
+    site, so call this inside the form that should display the results.
+    Parameters
+    ----------
+    input_code : str
+        Source text of a single function, statements separated by newlines.
+    """
+    with st.spinner("Scanning security issues..."):
+        # do inference
+        out = predict_vul_lines([input_code])
+        func_pred = out["batch_func_pred"][0]
+        line_pred = out["batch_statement_pred"][0]
+    # inference complete
+    st.snow()
+    st.markdown("### Scanning Results:")
+    if func_pred == 0:
+        st.write("<span style='color:green'>" + "No vulnerabilities detected"+ "</span>", unsafe_allow_html=True)
+        st.markdown("### Non-Vulnerable Function:")
+    else:
+        # statement_tokenization (utils.py) drops empty lines before scoring, so
+        # the k-th prediction belongs to the k-th non-empty line, not to source
+        # line k. Walk the source and advance the prediction cursor only on
+        # non-empty lines so the reported line numbers match the code listing.
+        num_scored = min(MAX_NUM_STATEMENTS, len(line_pred))
+        scored = 0
+        for line_no, line in enumerate(input_code.split("\n"), start=1):
+            if scored >= num_scored:
+                break
+            if line == "":
+                continue
+            if line_pred[scored] == 1:
+                st.write(f"<span style='color:red'> Vulnerable Line {line_no} </span>", unsafe_allow_html=True)
+                st.code(line)
+            scored += 1
+        st.markdown("### Vulnerable Function:")
+    st.code(input_code, language="cpp", line_numbers=True)
+if __name__ == "__main__":
+    st.set_page_config(page_title="AIBugHunter")
+    # sidebar
+    st.sidebar.title("AIBugHunter Web App")
+    behavior = st.sidebar.selectbox(label="NAVIGATOR IS HERE:",
+                                    options=["DEMO", "Analyze my own"])
+    if behavior == "DEMO":
+        # function title
+        st.title("C/C++ Vulnerability Dataset Viewer")
+        dataset_path = PATH + "/data/test.csv"
+        # Read the dataset once; it backs both the table and the selection below.
+        df = pd.read_csv(dataset_path)
+        st.dataframe(df)
+        with st.form("input_form_a"):
+            # Offer at most 100 indices, and never one past the end of the dataset.
+            idx = st.selectbox('Select an index', [str(i) for i in range(min(100, len(df)))])
+            sub = st.form_submit_button("Select")
+            if sub:
+                input_code = df["function"][int(idx)]
+                # Keep only the leading lines so the listing matches what is scanned.
+                input_code = "\n".join(input_code.split("\n")[:MAX_NUM_STATEMENTS])
+                _scan_and_report(input_code)
+    elif behavior == "Analyze my own":
+        # user input of project title
+        ## todo- limit the input to 150 lines
+        with st.form("input_form_b"):
+            input_code = st.text_area("Input a C/C++ function:", height=275)
+            submitted = st.form_submit_button("Analyze")
+            if submitted:
+                _scan_and_report(input_code)

data/process.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import pandas as pd
+# Build ./test.csv: a small demo set that interleaves vulnerable and
+# non-vulnerable functions taken from ./processed_test.csv.
+df = pd.read_csv("./processed_test.csv")
+func_lab = []
+stat_lab = []
+cwe_id = []
+func = []
+# Take up to the first 50 rows of each class; reset_index makes positions 0..n-1
+# valid labels for the lookups below.
+df_vul = df[df["function_label"]==1][:50]
+df_vul = df_vul.reset_index()
+df_non_vul = df[df["function_label"]==0][:50]
+df_non_vul = df_non_vul.reset_index()
+# Rows are emitted in (vulnerable, non-vulnerable) pairs, so only as many pairs
+# as the smaller class provides can be written; indexing past the end of the
+# shorter frame would raise KeyError.
+pair_count = min(len(df_vul), len(df_non_vul))
+for i in range(pair_count):
+    for subset in (df_vul, df_non_vul):
+        label = subset["function_label"][i]
+        func_lab.append(label)
+        stat_lab.append(subset["statement_label"][i])
+        # A CWE id is only kept for functions labelled vulnerable.
+        cwe_id.append(None if label == 0 else subset["cwe_id"][i])
+        func.append(subset["func_before"][i])
+pd.DataFrame({"function": func, "function_label": func_lab, "cwe_id": cwe_id, "statement_label": stat_lab}).to_csv("./test.csv", index=False)

data/test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

models/statement_t5_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19747f298f181dc8488dcf128991acdbf1df75e140df2ca4ecd92922cb9f16d6
+size 471562706

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+transformers
+torch
+pickle
+numpy
+onnxruntime
+pandas

statement_t5.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import torch
+import torch.nn as nn
+class ClassificationHead(nn.Module):
+    """Joint head producing per-position logits and a 2-way sequence-level logit pair.
+    Parameters
+    ----------
+    hidden_dim : int
+        Size of the last dimension of the hidden states passed to ``forward``.
+        All layers, including the pooling GRU, are sized from this value.
+    """
+    def __init__(self, hidden_dim):
+        super().__init__()
+        self.dense = nn.Linear(hidden_dim, hidden_dim)
+        self.Dropout = nn.Dropout(0.1)
+        self.out_proj = nn.Linear(hidden_dim, 1)
+        # Sized from hidden_dim (previously fixed at 768) so the head works for
+        # any encoder width; with hidden_dim=768 the parameter shapes are unchanged.
+        self.rnn_pool = nn.GRU(input_size=hidden_dim,
+                                hidden_size=hidden_dim,
+                                num_layers=1,
+                                batch_first=True)
+        self.func_dense = nn.Linear(hidden_dim, hidden_dim)
+        self.func_out_proj = nn.Linear(hidden_dim, 2)
+    def forward(self, hidden):
+        """Return ``(position_logits, sequence_logits)`` for ``hidden``.
+        ``hidden`` is fed to a batch-first GRU, i.e. (batch, sequence, hidden_dim).
+        ``position_logits`` has the trailing size-1 dimension squeezed away;
+        ``sequence_logits`` has 2 values per batch element.
+        """
+        # Per-position branch: dropout -> dense -> tanh -> dropout -> 1 logit.
+        x = self.Dropout(hidden)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.Dropout(x)
+        x = self.out_proj(x)
+        # Sequence-level branch: the GRU's final hidden state summarizes the sequence.
+        _, func_x = self.rnn_pool(hidden)
+        # Drop the num_layers dimension (always 1 here).
+        func_x = func_x.squeeze(0)
+        func_x = self.Dropout(func_x)
+        func_x = self.func_dense(func_x)
+        func_x = torch.tanh(func_x)
+        func_x = self.Dropout(func_x)
+        func_x = self.func_out_proj(func_x)
+        return x.squeeze(-1), func_x
+class StatementT5(nn.Module):
+    """Statement-level and function-level classifier built on a T5 encoder.
+    Each statement's token embeddings are reduced to one vector by a GRU, the
+    resulting sequence of statement vectors is passed through ``t5``, and a
+    ``ClassificationHead`` turns the encoder output into logits.
+    Parameters
+    ----------
+    t5 : module exposing ``.shared`` (token embedding) and callable with
+        ``inputs_embeds`` / ``attention_mask``, returning ``.last_hidden_state``.
+    tokenizer : stored on the instance; not used by ``forward``.
+    device : stored on the instance; not used by ``forward``.
+    hidden_dim : int
+        Embedding width; sizes the statement GRU and the classification head.
+    """
+    def __init__(self, t5, tokenizer, device, hidden_dim=768):
+        super(StatementT5, self).__init__()
+        self.max_num_statement = 155
+        self.word_embedding = t5.shared
+        # Sized from hidden_dim (previously fixed at 768); the default keeps the
+        # parameter shapes of existing checkpoints.
+        self.rnn_statement_embedding = nn.GRU(input_size=hidden_dim,
+                                              hidden_size=hidden_dim,
+                                              num_layers=1,
+                                              batch_first=True)
+        self.t5 = t5
+        self.tokenizer = tokenizer
+        self.device = device
+        # CLS head
+        self.classifier = ClassificationHead(hidden_dim=hidden_dim)
+    def _encode(self, input_ids, statement_mask):
+        """Return ``(statement_logits, func_logits)``; shared by training and inference."""
+        statement_mask = statement_mask[:, :self.max_num_statement]
+        # input_ids is (batch, statements, tokens); embedding appends the hidden dimension.
+        embed = self.word_embedding(input_ids)
+        # For each function, run the GRU over every statement's tokens and keep
+        # the final hidden state as that statement's vector. Stacking the
+        # per-function results gives (batch, statements, hidden).
+        inputs_embeds = torch.stack(
+            [self.rnn_statement_embedding(statement_of_tokens)[1].squeeze(0)
+             for statement_of_tokens in embed])
+        inputs_embeds = inputs_embeds[:, :self.max_num_statement, :]
+        rep = self.t5(inputs_embeds=inputs_embeds, attention_mask=statement_mask).last_hidden_state
+        return self.classifier(rep)
+    def forward(self, input_ids, statement_mask, labels=None, func_labels=None):
+        """Return losses in training mode, probabilities otherwise.
+        Training (``self.training`` is True): returns ``(statement_loss, func_loss)``,
+        both computed with ``nn.CrossEntropyLoss`` against ``labels`` and
+        ``func_labels``, which must then be provided.
+        Evaluation: returns ``(probs, func_probs)`` where ``probs`` is the sigmoid
+        of the statement logits and ``func_probs`` the softmax over the two
+        function-level logits.
+        """
+        logits, func_logits = self._encode(input_ids, statement_mask)
+        if self.training:
+            statement_loss = nn.CrossEntropyLoss()(logits, labels)
+            func_loss = nn.CrossEntropyLoss()(func_logits, func_labels)
+            return statement_loss, func_loss
+        probs = torch.sigmoid(logits)
+        func_probs = torch.softmax(func_logits, dim=-1)
+        return probs, func_probs

statement_t5_tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

statement_t5_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,753 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<extra_id_99>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_98>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_97>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_96>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_95>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_94>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_93>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_92>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_91>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_90>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_89>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_88>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_87>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_86>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_85>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_84>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_83>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_82>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_81>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_80>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_79>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_78>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_77>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_76>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_75>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_74>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_73>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_72>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_71>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_70>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_69>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_68>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_67>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_66>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_65>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_64>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_63>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_62>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_61>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_60>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_59>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_58>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_57>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_56>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_55>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_54>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_53>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_52>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_51>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_50>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_49>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_48>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_47>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_46>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_45>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_44>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_43>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_42>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_41>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_40>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_39>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_38>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_37>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_36>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_35>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_34>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_33>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_32>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_31>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_30>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_29>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_28>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_27>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_26>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_25>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_24>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_23>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_22>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_21>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_20>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_19>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_18>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_17>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_16>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_15>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_14>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_13>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_12>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_11>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_10>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_9>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_8>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_7>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_6>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_5>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_4>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_3>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_2>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_1>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<extra_id_0>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

statement_t5_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "Salesforce/codet5-base",
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "special_tokens_map_file": "/home/michael/.cache/huggingface/transformers/5941df5e4315c5ab63b7b2ac791fb0bf0f209744a055c06b43b5274849137cdd.b9905d0575bde443a20834122b6e2d48e853b2e36444ce98ddeb43c38097eb3f",
+  "tokenizer_class": "RobertaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

statement_t5_tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

t5_config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "_name_or_path": "Salesforce/codet5-base",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "bos_token_id": 1,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 768,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 2,
+  "feed_forward_proj": "relu",
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 12,
+  "num_heads": 12,
+  "num_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 200,
+      "min_length": 30,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4,
+      "prefix": "summarize: "
+    },
+    "translation_en_to_de": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to German: "
+    },
+    "translation_en_to_fr": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to French: "
+    },
+    "translation_en_to_ro": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to Romanian: "
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.3",
+  "use_cache": true,
+  "vocab_size": 32100
+}

utils.py ADDED Viewed

	@@ -0,0 +1,192 @@

+from transformers import RobertaTokenizer, T5Config, T5EncoderModel
+from statement_t5 import StatementT5
+import torch
+import pickle
+import numpy as np
+import onnxruntime
+def to_numpy(tensor):
+    """Return ``tensor`` as a NumPy array on the CPU, for use as ONNX Runtime input."""
+    # A tensor that tracks gradients has to be detached before conversion.
+    if tensor.requires_grad:
+        tensor = tensor.detach()
+    return tensor.cpu().numpy()
+def predict_vul_lines(code: list, gpu: bool = False) -> dict:
+    """Generate statement-level and function-level vulnerability predictions.
+    The tokenizer, T5 config and model weights are loaded from paths relative
+    to the current working directory on every call.
+    Parameters
+    ----------
+    code : :obj:`list`
+        A list of String functions.
+    gpu : bool
+        Defines if CUDA inference is enabled
+    Returns
+    -------
+    :obj:`dict`
+        A dictionary with four keys; every value is a tensor whose first dimension indexes the functions in ``code``
+        "batch_func_pred" stores the function-level vulnerability prediction per function where 0 means non-vulnerable and 1 means vulnerable
+        "batch_func_pred_prob" stores the function-level softmax probabilities (one value per class) from which "batch_func_pred" is taken by argmax
+        "batch_statement_pred" stores the statement-level vulnerability prediction per statement slot where 0 means non-vulnerable and 1 means vulnerable
+        "batch_statement_pred_prob" stores the statement-level probabilities; "batch_statement_pred" is 1 where the probability exceeds 0.5
+    """
+    MAX_STATEMENTS = 155
+    MAX_STATEMENT_LENGTH = 20
+    DEVICE = 'cuda' if gpu else 'cpu'
+    # load tokenizer
+    tokenizer = RobertaTokenizer.from_pretrained("./statement_t5_tokenizer")
+    # load model
+    config = T5Config.from_pretrained("./t5_config.json")
+    model = T5EncoderModel(config=config)
+    model = StatementT5(model, tokenizer, device=DEVICE)
+    output_dir = "./models/statement_t5_model.bin"
+    model.load_state_dict(torch.load(output_dir, map_location=DEVICE))
+    model.to(DEVICE)
+    model.eval()
+    input_ids, statement_mask = statement_tokenization(code, MAX_STATEMENTS, MAX_STATEMENT_LENGTH, tokenizer)
+    # The tokenized inputs are created on the CPU; move them next to the model
+    # so that gpu=True does not fail with a device mismatch.
+    input_ids = input_ids.to(DEVICE)
+    statement_mask = statement_mask.to(DEVICE)
+    with torch.no_grad():
+        statement_probs, func_probs = model(input_ids=input_ids, statement_mask=statement_mask)
+    func_preds = torch.argmax(func_probs, dim=-1)
+    statement_preds = torch.where(statement_probs>0.5, 1, 0)
+    return {"batch_func_pred": func_preds, "batch_func_pred_prob": func_probs,
+            "batch_statement_pred": statement_preds, "batch_statement_pred_prob": statement_probs}
+def statement_tokenization(code: list, max_statements: int, max_statement_length: int, tokenizer):
+    batch_input_ids = []
+    batch_statement_mask = []
+    for c in code:
+        source = c.split("\n")
+        source = [statement for statement in source if statement != ""]
+        source = source[:max_statements]
+        padding_statement = [tokenizer.pad_token_id for _ in range(20)]
+        input_ids = []
+        for stat in source:
+            ids_ = tokenizer.encode(str(stat),
+                                    truncation=True,
+                                    max_length=max_statement_length,
+                                    padding='max_length',
+                                    add_special_tokens=False)
+            input_ids.append(ids_)
+        if len(input_ids) < max_statements:
+            for _ in range(max_statements-len(input_ids)):
+                input_ids.append(padding_statement)
+        statement_mask = []
+        for statement in input_ids:
+            if statement == padding_statement:
+                statement_mask.append(0)
+            else:
+                statement_mask.append(1)
+        batch_input_ids.append(input_ids)
+        batch_statement_mask.append(statement_mask)
+    return torch.tensor(batch_input_ids), torch.tensor(batch_statement_mask)
+def predict_cweid(code: list, gpu: bool = False) -> dict:
+    """Generate CWE-IDs and CWE Abstract Types Predictions.
+    Parameters
+    ----------
+    code : :obj:`list`
+        A list of String functions.
+    gpu : bool
+        Defines if CUDA inference is enabled
+    Returns
+    -------
+    :obj:`dict`
+        A dictionary with four keys, "cwe_id", "cwe_id_prob", "cwe_type", "cwe_type_prob"
+        "cwe_id" stores a list of CWE-ID predictions: [CWE-787, CWE-119, ...]
+        "cwe_id_prob" stores a list of confidence scores of CWE-ID predictions [0.9, 0.7, ...]
+        "cwe_type" stores a list of CWE abstract types predictions: ["Base", "Class", ...]
+        "cwe_type_prob" stores a list of confidence scores of CWE abstract types predictions [0.9, 0.7, ...]
+    """
+    provider = ["CUDAExecutionProvider", "CPUExecutionProvider"] if gpu else ["CPUExecutionProvider"]
+    with open("./inference-common/label_map.pkl", "rb") as f:
+        cwe_id_map, cwe_type_map = pickle.load(f)
+    # load tokenizer
+    tokenizer = RobertaTokenizer.from_pretrained("./inference-common/tokenizer")
+    tokenizer.add_tokens(["<cls_type>"])
+    tokenizer.cls_type_token = "<cls_type>"
+    model_input = []
+    for c in code:
+        code_tokens = tokenizer.tokenize(str(c))[:512 - 3]
+        source_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.cls_type_token] + [tokenizer.sep_token]
+        input_ids = tokenizer.convert_tokens_to_ids(source_tokens)
+        padding_length = 512 - len(input_ids)
+        input_ids += [tokenizer.pad_token_id] * padding_length
+        model_input.append(input_ids)
+    device = "cuda" if gpu else "cpu"
+    model_input = torch.tensor(model_input, device=device)
+    # onnx runtime session
+    ort_session = onnxruntime.InferenceSession("./models/cwe_model.onnx", providers=provider)
+    # compute ONNX Runtime output prediction
+    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(model_input)}
+    cwe_id_prob, cwe_type_prob = ort_session.run(None, ort_inputs)
+    # batch_cwe_id_pred (1D list with shape of [batch size]): [pred_1, pred_2, ..., pred_n]
+    batch_cwe_id = np.argmax(cwe_id_prob, axis=-1).tolist()
+    # map predicted idx back to CWE-ID
+    batch_cwe_id_pred = [cwe_id_map[str(idx)] for idx in batch_cwe_id]
+    # batch_cwe_id_pred_prob (1D list with shape of [batch_size]): [prob_1, prob_2, ..., prob_n]
+    batch_cwe_id_pred_prob = []
+    for i in range(len(cwe_id_prob)):
+        batch_cwe_id_pred_prob.append(cwe_id_prob[i][batch_cwe_id[i]].item())
+    # batch_cwe_type_pred (1D list with shape of [batch size]): [pred_1, pred_2, ..., pred_n]
+    batch_cwe_type = np.argmax(cwe_type_prob, axis=-1).tolist()
+    # map predicted idx back to CWE-Type
+    batch_cwe_type_pred = [cwe_type_map[str(idx)] for idx in batch_cwe_type]
+    # batch_cwe_type_pred_prob (1D list with shape of [batch_size]): [prob_1, prob_2, ..., prob_n]
+    batch_cwe_type_pred_prob = []
+    for i in range(len(cwe_type_prob)):
+        batch_cwe_type_pred_prob.append(cwe_type_prob[i][batch_cwe_type[i]].item())
+    return {"cwe_id": batch_cwe_id_pred,
+            "cwe_id_prob": batch_cwe_id_pred_prob,
+            "cwe_type": batch_cwe_type_pred,
+            "cwe_type_prob": batch_cwe_type_pred_prob}
+def predict_sev(code: list, gpu: bool = False) -> dict:
+    """Generate CVSS severity score predictions.
+    Parameters
+    ----------
+    code : :obj:`list`
+        A list of String functions.
+    gpu : bool
+        Defines if CUDA inference is enabled
+    Returns
+    -------
+    :obj:`dict`
+        A dictionary with two keys, "batch_sev_score", "batch_sev_class"
+        "batch_sev_score" stores a list of severity score prediction: [1.0, 5.0, 9.0 ...]
+        "batch_sev_class" stores a list of severity class based on predicted severity score ["Medium", "Critical"...]
+    """
+    provider = ["CUDAExecutionProvider", "CPUExecutionProvider"] if gpu else ["CPUExecutionProvider"]
+    # load tokenizer
+    tokenizer = RobertaTokenizer.from_pretrained("./inference-common/tokenizer")
+    model_input = tokenizer(code, truncation=True, max_length=512, padding='max_length',
+                            return_tensors="pt").input_ids
+    # onnx runtime session
+    ort_session = onnxruntime.InferenceSession("./models/sev_model.onnx", providers=provider)
+    # compute ONNX Runtime output prediction
+    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(model_input)}
+    cvss_score = ort_session.run(None, ort_inputs)
+    batch_sev_score = list(cvss_score[0].flatten().tolist())
+    batch_sev_class = []
+    for i in range(len(batch_sev_score)):
+        if batch_sev_score[i] == 0:
+            batch_sev_class.append("None")
+        elif batch_sev_score[i] < 4:
+            batch_sev_class.append("Low")
+        elif batch_sev_score[i] < 7:
+            batch_sev_class.append("Medium")
+        elif batch_sev_score[i] < 9:
+            batch_sev_class.append("High")
+        else:
+            batch_sev_class.append("Critical")
+    return {"batch_sev_score": batch_sev_score, "batch_sev_class": batch_sev_class}
+def predict(code: list):
+    vul_preds = predict_vul_lines(code)
+    cwe_preds = predict_cweid(code)
+    sev_preds = predict_sev(code)
+if __name__ == "__main__":
+    import pandas as pd
+    df = pd.read_csv("./data/processed_test.csv")
+    funcs = df["func_before"].tolist()
+    for code in funcs:
+        out = predict_vul_lines([code])
+        print(out["batch_func_pred"][0])