# Part-of-speech tag vocabulary. List order is preserved exactly because
# callers may rely on positions as label ids.
# NOTE(review): the values look like the Universal Dependencies UPOS tag
# set -- confirm against the tagger that produces them.
list_of_pos_tags = [
    "ADJ",
    "ADP",
    "ADV",
    "AUX",
    "CCONJ",
    "DET",
    "INTJ",
    "NOUN",
    "NUM",
    "PART",
    "PRON",
    "PROPN",
    "PUNCT",
    "SCONJ",
    "SYM",
    "VERB",
    "X",
]

# Realis label vocabulary ("O" first).
realis_list = ["O", "Generic", "Other", "Actual"]

# Event-argument tag vocabulary ("O" plus B-/I- prefixed argument types).
# The order is intentionally left untouched (it is not grouped by type).
event_args_list = [
    "O",
    "B-System",
    "I-System",
    "B-Organization",
    "B-Money",
    "I-Money",
    "B-Device",
    "B-Person",
    "I-Person",
    "B-Vulnerability",
    "I-Vulnerability",
    "B-Capabilities",
    "I-Capabilities",
    "I-Organization",
    "B-PaymentMethod",
    "I-PaymentMethod",
    "B-Data",
    "I-Data",
    "B-Number",
    "I-Number",
    "B-Malware",
    "I-Malware",
    "B-PII",
    "I-PII",
    "B-CVE",
    "I-CVE",
    "B-Purpose",
    "I-Purpose",
    "B-File",
    "I-File",
    "I-Device",
    "B-Time",
    "I-Time",
    "B-Software",
    "I-Software",
    "B-Patch",
    "I-Patch",
    "B-Version",
    "I-Version",
    "B-Website",
    "I-Website",
    "B-GPE",
    "I-GPE",
]

# Event-nugget tag vocabulary ("O" plus B-/I- prefixed event subtypes).
event_nugget_list = [
    "O",
    "B-Ransom",
    "I-Ransom",
    "B-DiscoverVulnerability",
    "I-DiscoverVulnerability",
    "B-PatchVulnerability",
    "I-PatchVulnerability",
    "B-Databreach",
    "I-Databreach",
    "B-Phishing",
    "I-Phishing",
]

# Maps an argument type name to the list of role names associated with it.
arg_2_role = {
    "File": ["Tool", "Trusted-Entity"],
    "Person": [
        "Victim",
        "Attacker",
        "Discoverer",
        "Releaser",
        "Trusted-Entity",
        "Vulnerable_System_Owner",
    ],
    "Capabilities": ["Attack-Pattern", "Capabilities", "Issues-Addressed"],
    "Purpose": ["Purpose"],
    "Time": ["Time"],
    "PII": ["Compromised-Data", "Trusted-Entity"],
    "Data": ["Compromised-Data", "Trusted-Entity"],
    "Organization": [
        "Victim",
        "Releaser",
        "Discoverer",
        "Attacker",
        "Vulnerable_System_Owner",
        "Trusted-Entity",
    ],
    "Patch": ["Patch"],
    "Software": [
        "Vulnerable_System",
        "Victim",
        "Trusted-Entity",
        "Supported_Platform",
    ],
    "Vulnerability": ["Vulnerability"],
    "Version": ["Patch-Number", "Vulnerable_System_Version"],
    "Device": ["Vulnerable_System", "Victim", "Supported_Platform"],
    "CVE": ["CVE"],
    "Number": ["Number-of-Data", "Number-of-Victim"],
    "System": [
        "Victim",
        "Supported_Platform",
        "Vulnerable_System",
        "Trusted-Entity",
    ],
    "Malware": ["Tool"],
    "Money": ["Price", "Damage-Amount"],
    "PaymentMethod": ["Payment-Method"],
    "GPE": ["Place"],
    "Website": [
        "Trusted-Entity",
        "Tool",
        "Vulnerable_System",
        "Victim",
        "Supported_Platform",
    ],
}


def get_content(data):
    """Return the value stored under ``data["content"]``."""
    return data["content"]


def get_event_nugget(data):
    """Collect a flat list of event-nugget records from *data*.

    Walks every event of every hopper under ``data["cyberevent"]["hopper"]``
    and returns one dict per event holding only its ``"nugget"``, ``"type"``,
    ``"subtype"`` and ``"realis"`` entries.

    Raises:
        KeyError: if any of the keys read above is missing.
    """
    return [
        {
            "nugget": event["nugget"],
            "type": event["type"],
            "subtype": event["subtype"],
            "realis": event["realis"],
        }
        for hopper in data["cyberevent"]["hopper"]
        for event in hopper["events"]
    ]


def get_event_args(data):
    """Collect the arguments of all events in *data* into one flat list.

    Events that have no ``"argument"`` key contribute nothing.
    """
    args = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            if "argument" in event:
                args.extend(event["argument"])
    return args


def get_idxs_from_text(text, text_tokenized):
    """Locate each token of *text_tokenized* inside *text*, left to right.

    Each token is searched for starting at the end of the previously located
    token, so repeated tokens map to successive occurrences.

    Args:
        text: the string the tokens were taken from.
        text_tokenized: iterable of token strings, in reading order.

    Returns:
        A list with one dict per token, in input order, with keys ``"word"``,
        ``"start_idx"`` (inclusive) and ``"end_idx"`` (exclusive), such that
        ``text[start_idx:end_idx] == word``. A token that cannot be found
        after the current position gets ``start_idx == end_idx == -1`` and
        does not move the search position, so the tokens that follow it are
        still located correctly.
    """
    cursor = 0  # absolute offset in `text` where the next search begins
    spans = []
    for substring in text_tokenized:
        # Searching with a start offset avoids re-slicing the remaining text
        # for every token.
        start = text.find(substring, cursor)
        if start == -1:
            # The -1 from str.find must not be folded into the running
            # offset: doing so would corrupt this and every later span.
            spans.append({"word": substring, "start_idx": -1, "end_idx": -1})
            continue
        end = start + len(substring)
        spans.append({"word": substring, "start_idx": start, "end_idx": end})
        cursor = end
    return spans


def _bio_prefix(start_idx, end_idx, span_start, span_end):
    """Return ``"B-"``, ``"I-"`` or ``None`` for a token against one span.

    ``"B-"`` is returned when the token starts exactly at the span start, or
    starts before it and ends inside the span (after the span start, at or
    before the span end) or exactly at the span end.
    ``"I-"`` is returned when the token starts after the span start and
    either ends at or before the span end or starts before the span end.
    Otherwise ``None`` is returned.
    """
    if start_idx == span_start:
        return "B-"
    if start_idx < span_start:
        # NOTE(review): a token that starts before the span and ends after
        # it (strictly containing the span) gets no tag here; this mirrors
        # the pre-existing rule -- confirm it is intended.
        if end_idx == span_end or span_start < end_idx <= span_end:
            return "B-"
        return None
    # From here on start_idx > span_start.
    if end_idx <= span_end or start_idx < span_end:
        return "I-"
    return None


def get_entity_from_idx(start_idx, end_idx, event_nuggets):
    """Return the event-nugget tag for the token ``[start_idx, end_idx)``.

    Args:
        start_idx: token start offset.
        end_idx: token end offset.
        event_nuggets: list of dicts with ``"nugget"`` (holding
            ``"startOffset"`` / ``"endOffset"``) and ``"subtype"``.

    Returns:
        ``"B-<subtype>"`` or ``"I-<subtype>"`` for the first nugget (in list
        order) the token matches, or ``"O"`` when it matches none.
    """
    for nugget in event_nuggets:
        offsets = nugget["nugget"]
        prefix = _bio_prefix(
            start_idx, end_idx, offsets["startOffset"], offsets["endOffset"]
        )
        if prefix is not None:
            return prefix + nugget["subtype"]
    return "O"


def get_entity_and_realis_from_idx(start_idx, end_idx, event_nuggets):
    """Return ``(subtype_tag, realis_tag)`` for the token span.

    Works like :func:`get_entity_from_idx` but additionally tags the realis
    value of the matched nugget with the same ``B-`` / ``I-`` prefix.

    Returns:
        A 2-tuple such as ``("B-Ransom", "B-Actual")``, or ``("O", "O")``
        when the token matches no nugget.
    """
    for nugget in event_nuggets:
        offsets = nugget["nugget"]
        prefix = _bio_prefix(
            start_idx, end_idx, offsets["startOffset"], offsets["endOffset"]
        )
        if prefix is not None:
            return prefix + nugget["subtype"], prefix + nugget["realis"]
    return "O", "O"


def get_args_entity_from_idx(start_idx, end_idx, event_args):
    """Return the event-argument tag for the token ``[start_idx, end_idx)``.

    Args:
        start_idx: token start offset.
        end_idx: token end offset.
        event_args: list of dicts with ``"startOffset"``, ``"endOffset"``
            and ``"type"`` at the top level.

    Returns:
        ``"B-<type>"`` or ``"I-<type>"`` for the first argument (in list
        order) the token matches, or ``"O"`` when it matches none.
    """
    for arg in event_args:
        prefix = _bio_prefix(
            start_idx, end_idx, arg["startOffset"], arg["endOffset"]
        )
        if prefix is not None:
            return prefix + arg["type"]
    return "O"


def split_with_character(string, char):
    """Split *string* on *char*, keeping each delimiter as its own item.

    Empty pieces are dropped, e.g. ``split_with_character("a-b", "-")``
    gives ``["a", "-", "b"]``. Characters are compared one at a time, so a
    *char* longer than one character never matches.
    """
    result = []
    start = 0
    for i, c in enumerate(string):
        if c == char:
            result.append(string[start:i])
            result.append(char)
            start = i + 1
    result.append(string[start:])
    return [piece for piece in result if piece != ""]


def extend_list_with_character(content_list, character):
    """Split every word of *content_list* that contains *character*.

    Words containing *character* are replaced by the pieces produced by
    :func:`split_with_character`; all other words are kept unchanged.
    """
    content_as_words = []
    for word in content_list:
        if character in word:
            content_as_words.extend(split_with_character(word, character))
        else:
            content_as_words.append(word)
    return content_as_words


def find_dict_by_overlap(list_of_dicts, key_value_pairs):
    """Return the first dict whose range overlaps that of *key_value_pairs*.

    Both the candidates and *key_value_pairs* must provide ``"start"`` and
    ``"end"``. The bounds of each range are normalised with ``min`` / ``max``
    so reversed ranges are handled, and ranges that merely touch at an
    endpoint count as overlapping.

    Returns:
        The first overlapping dict from *list_of_dicts*, or ``None``.
    """
    target_low = min(key_value_pairs["start"], key_value_pairs["end"])
    target_high = max(key_value_pairs["start"], key_value_pairs["end"])
    for dictionary in list_of_dicts:
        low = min(dictionary["start"], dictionary["end"])
        high = max(dictionary["start"], dictionary["end"])
        if high >= target_low and target_high >= low:
            return dictionary
    return None