File size: 4,182 Bytes
10058cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re
import json
from collections import Counter


def load_texttag_file(texttag_filename):
    """Read a tab-separated token/tag file and group it into sentences.

    The file is expected to hold one ``token<TAB>tag`` pair per line, with
    sentences separated by an empty line (or a line holding a single tab).
    Tokens matching the noise pattern below are dropped together with their
    tag; kept tokens are lower-cased.

    Args:
        texttag_filename: Path of the file to read.

    Returns:
        A tuple ``(texts_selected, tags_selected, tags_all)`` where
        ``texts_selected`` is a list of token lists (one per sentence),
        ``tags_selected`` is the parallel list of tag lists, and
        ``tags_all`` is the flat list of every kept tag in file order.
        If the file does not exist, a message is printed and three empty
        lists are returned.

    Raises:
        ValueError: If a non-empty line does not consist of exactly two
            tab-separated fields.
    """
    # A token is dropped when it contains any of the characters
    # @ | ? ! + : ( ) \ / or "http"/"www" followed by more non-space
    # characters. Compiled once instead of on every token.
    noise_pattern = re.compile(
        r"[@|?|!+?|:|(|)]|\\|\.*?\|-|/|/|/.*?/|http\S+|www\S+"
    )

    # Created before the file is opened so that the missing-file path can
    # still return well-defined (empty) results.
    tags_all = []
    texts_selected = []
    tags_selected = []

    try:
        with open(texttag_filename, "r") as data_file:
            data_all = data_file.read()
    except FileNotFoundError as error:
        print("Sorry, the file " + str(texttag_filename) + " does not exist.")
        print("error: " + str(error))
        return texts_selected, tags_selected, tags_all

    for sentence in re.split(r'\n\t?\n', data_all):
        if not sentence:
            continue

        texts_line = []
        tags_line = []
        for item in sentence.split("\n"):
            if not item:
                continue
            text, tag = item.split("\t")
            if noise_pattern.search(text) is None:
                texts_line.append(text.lower())
                tags_line.append(tag)
                tags_all.append(tag)

        # A sentence is recorded even when all of its tokens were filtered
        # out, so the per-sentence lists may contain empty entries.
        texts_selected.append(texts_line)
        tags_selected.append(tags_line)

    return texts_selected, tags_selected, tags_all


def tag_ids_map(tags_all, tags2ids_name, ids2tags_name):
    """Build tag<->id lookup tables and write both to JSON files.

    The distinct tags are sorted and numbered consecutively from 0.

    Args:
        tags_all: Iterable of tag strings; duplicates are allowed.
        tags2ids_name: Path the tag -> id mapping is written to.
        ids2tags_name: Path the id -> tag mapping is written to.

    Returns:
        A tuple ``(tags2ids, ids2tags)`` of dictionaries.
    """
    ordered_tags = sorted(set(tags_all))

    tags2ids = {tag: index for index, tag in enumerate(ordered_tags)}
    ids2tags = dict(enumerate(ordered_tags))

    # Persist each mapping to its own file.
    for path, mapping in ((tags2ids_name, tags2ids), (ids2tags_name, ids2tags)):
        with open(path, "w") as handle:
            json.dump(mapping, handle)

    return tags2ids, ids2tags


def add_tagids(tags_selected, tags2ids, ids2tags):
    """Translate every tag in every sentence to its numeric id.

    Args:
        tags_selected: List of tag lists, one inner list per sentence.
        tags2ids: Mapping from tag to id used for the lookup.
        ids2tags: Not used by this function.

    Returns:
        A list of id lists with the same nesting as ``tags_selected``.

    Raises:
        KeyError: If a tag is missing from ``tags2ids``.
    """
    return [
        [tags2ids[tag] for tag in tags_line]
        for tags_line in tags_selected
    ]


def add_text_tagid(tags_selected, tags2ids, ids2tags):
    """Pick one sentence-level tag (and its id) for every sentence.

    For each sentence the most frequent tag is chosen. When the sentence
    carries more than one distinct tag, the ``"O"`` tag is ignored so that
    it cannot outvote the other tags. Ties are resolved in favour of the
    lexicographically largest tag.

    Args:
        tags_selected: List of tag lists, one inner list per sentence.
        tags2ids: Mapping from tag to id used for the lookup.
        ids2tags: Not used by this function.

    Returns:
        A tuple ``(tags_chunk, tagids_chunk)``. Each is a list with one
        single-element list per sentence (the chosen tag / its id). A
        sentence without any tags yields an empty list in both outputs so
        that the results stay aligned with ``tags_selected``.

    Raises:
        KeyError: If the chosen tag is missing from ``tags2ids``.
    """
    tags_chunk = []
    tagids_chunk = []
    for tags_line in tags_selected:
        tag_line_count = Counter(tags_line)
        if len(tag_line_count) > 1:
            # Counter deletion is a no-op when "O" is absent.
            del tag_line_count["O"]

        if tag_line_count:
            # Rank by frequency first; the tag itself only breaks ties.
            # max() over the Counter alone would compare tag names and
            # ignore the counts entirely.
            chosen = max(
                tag_line_count,
                key=lambda tag: (tag_line_count[tag], tag),
            )
            tag_line_chunk = [chosen]
            tagid_line_chunk = [tags2ids[chosen]]
        else:
            # No tags at all for this sentence: nothing to choose from.
            tag_line_chunk = []
            tagid_line_chunk = []

        tags_chunk.append(tag_line_chunk)
        tagids_chunk.append(tagid_line_chunk)

    return tags_chunk, tagids_chunk

def save_json(json_filename, texts_selected, tags_selected, tagids_selected, tags_chunk, tagids_chunk, max_records=None):
    """Write the per-sentence data as a JSON list of records.

    Each record is a dictionary with the keys ``"text"``, ``"word_tag"``,
    ``"word_tag_id"``, ``"text_tag"`` and ``"text_tag_id"``, filled from the
    entries found at the same position in the five input lists.

    Args:
        json_filename: Path of the JSON file to write.
        texts_selected: Per-sentence token lists.
        tags_selected: Per-sentence tag lists.
        tagids_selected: Per-sentence tag-id lists.
        tags_chunk: Per-sentence sentence-level tag lists.
        tagids_chunk: Per-sentence sentence-level tag-id lists.
        max_records: Optional non-negative cap on the number of records
            written. ``None`` (the default) writes every record.

    Raises:
        ValueError: If the five input lists differ in length.
    """
    columns = (texts_selected, tags_selected, tagids_selected, tags_chunk, tagids_chunk)
    lengths = [len(column) for column in columns]
    if len(set(lengths)) > 1:
        raise ValueError(
            "save_json expects lists of equal length, got lengths %s" % lengths
        )

    # The record count follows the data; it is not a fixed number, so short
    # inputs no longer fail and long inputs are no longer cut off.
    save_datalist = [
        {
            "text": text,
            "word_tag": word_tag,
            "word_tag_id": word_tag_id,
            "text_tag": text_tag,
            "text_tag_id": text_tag_id,
        }
        for text, word_tag, word_tag_id, text_tag, text_tag_id in zip(*columns)
    ]
    if max_records is not None:
        save_datalist = save_datalist[:max_records]

    with open(json_filename, 'w') as file:
        json.dump(save_datalist, file)

    return

def main(data_filename, json_filename, tags2ids_name, ids2tags_name):
    """Run the full conversion from a token/tag file to JSON outputs.

    Loads the data with ``load_texttag_file``, builds and saves the tag/id
    mappings with ``tag_ids_map``, derives word-level ids (``add_tagids``)
    and sentence-level tags (``add_text_tagid``), and writes everything with
    ``save_json``.

    Args:
        data_filename: Path of the input file passed to the loader.
        json_filename: Path of the JSON file passed to ``save_json``.
        tags2ids_name: Output path for the tag -> id mapping.
        ids2tags_name: Output path for the id -> tag mapping.
    """
    texts, word_tags, every_tag = load_texttag_file(data_filename)
    tag_to_id, id_to_tag = tag_ids_map(every_tag, tags2ids_name, ids2tags_name)

    word_tag_ids = add_tagids(word_tags, tag_to_id, id_to_tag)
    text_tags, text_tag_ids = add_text_tagid(word_tags, tag_to_id, id_to_tag)

    save_json(json_filename, texts, word_tags, word_tag_ids, text_tags, text_tag_ids)


if __name__ == "__main__":
    # Script entry point: convert dev.txt and write the JSON data file plus
    # the two tag/id mapping files next to it.
    main(
        data_filename="../data/raw_EDT/Event_detection/dev.txt",
        json_filename='../data/raw_EDT/Event_detection/dev.json',
        tags2ids_name="../data/raw_EDT/Event_detection/tags2ids.json",
        ids2tags_name="../data/raw_EDT/Event_detection/ids2tags.json",
    )