File size: 3,575 Bytes
0806367
5348cff
0806367
5348cff
0806367
 
5348cff
 
 
0806367
5348cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0806367
5348cff
 
 
 
 
 
 
 
 
 
 
 
 
0806367
5348cff
 
0806367
5348cff
 
 
 
 
 
 
 
 
 
0806367
5348cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2d1297
 
5348cff
 
0806367
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import json
import numpy as np
import pandas as pd
from pathlib import Path


class Comprehend2NERFormat:
    """Convert one AWS Comprehend/Textract annotation JSON file into
    parallel token / BIO ner-tag lists suitable for NER model training.

    The JSON is expected to contain a ``Blocks`` list (WORD and LINE
    blocks, Textract-style) and an ``Entities`` list whose
    ``BlockReferences`` point at WORD block ids.
    """

    def __init__(self, letterfilepath):
        # Expected to be a pathlib.Path: __call__ reads `.name` for the
        # output record. e.g. Path("data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json")
        self.letterfilepath = letterfilepath

    def load_data(self):
        """Read and parse the annotation JSON file.

        Returns:
            dict: the parsed JSON document.
        """
        # Explicit encoding: annotation files may contain non-ASCII text and
        # the platform default encoding is not reliable.
        with open(self.letterfilepath, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Return a DataFrame ['blockid', 'token'] with one row per WORD block."""
        data_token = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(data_token, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Return a DataFrame ['lineid', 'childid'] mapping each child WORD
        block id to the id of its parent LINE block.
        """
        # Collect rows first and build the frame once: pd.concat inside the
        # loop is O(n^2) in the number of lines.
        rows = []
        for block in jsondata['Blocks']:
            if block["BlockType"] == 'LINE':
                # LINE blocks carry their WORD children in the first (CHILD)
                # relationship entry.
                for childid in block['Relationships'][0]['Ids']:
                    rows.append({'lineid': block['Id'], 'childid': childid})
        return pd.DataFrame(rows, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Return a DataFrame ['blockid', 'ner_tag'] with BIO tags: the first
        child block of each BlockReference gets ``B-<Type>``, the rest ``I-<Type>``.
        """
        data_nertags = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for subref in entity['BlockReferences']:
                for i, child in enumerate(subref['ChildBlocks']):
                    prefix = 'B' if i == 0 else 'I'
                    data_nertags.append({'blockid': child['ChildBlockId'],
                                         'ner_tag': f"{prefix}-{ner_tag}"})
        return pd.DataFrame(data_nertags, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Append a synthetic ``'\\n'`` token (tagged 'O') after the last word
        of every line, then restore global word order via 'linewordrank'.

        NOTE(review): rows whose 'lineid' is NaN (words not attached to any
        LINE block) are dropped by groupby — original behavior, preserved.
        """
        pieces = []
        for _, group in df_prev.groupby('lineid'):
            # Rank the newline just after the line's last word so the final
            # sort keeps it at the end of its line.
            newline_row = pd.DataFrame(
                {'blockid': 'newline', 'token': '\n', 'ner_tag': 'O',
                 'lineid': 'newline',
                 'linewordrank': group['linewordrank'].iloc[-1] + 0.1},
                index=[0])
            pieces.append(group)
            pieces.append(newline_row)
        if not pieces:
            # No lines at all: nothing to insert (pd.concat([]) would raise).
            return df_prev.copy()
        return pd.concat(pieces, axis=0).sort_values(by='linewordrank')

    def __call__(self):
        """Run the full conversion.

        Returns:
            dict with keys 'tokens' (list[str]), 'ner_tags' (list[str], same
            length) and 'filename' (str, basename of the input file).
        """
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df1 = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Words without an entity reference are outside any span -> 'O'.
        # (fillna replaces the original chained assignment, which raises
        # SettingWithCopyWarning and silently does nothing under pandas
        # copy-on-write.)
        df1['ner_tag'] = df1['ner_tag'].fillna('O')
        df2 = pd.merge(df1, df_line, left_on='blockid', right_on='childid',
                       how='left').drop(columns=['childid'])
        # Global word position, used to restore order after the per-line
        # newline insertion.
        df2['linewordrank'] = np.arange(df2.shape[0])

        df3 = self.insert_newline_char(df_prev=df2)
        return {"tokens": df3['token'].tolist(),
                "ner_tags": df3['ner_tag'].tolist(),
                "filename": self.letterfilepath.name
        }


if __name__ == '__main__':
    # Convert every annotation JSON under the raw-data folder and bundle the
    # results into a single NER dataset file.
    dataset_lst = []
    # Glob only *.json and skip directories: the original '**/*' pattern also
    # yielded subdirectories and non-JSON files, crashing json.load.
    # sorted() makes the dataset order deterministic across platforms.
    for file in sorted(Path(r'data/raw_data/annotations/').glob('**/*.json')):
        if not file.is_file():
            continue
        comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
        dataset_lst.append(comprehend2NERFormat())

    # Guard: dataset_lst[0] would raise IndexError when no files matched.
    if dataset_lst:
        print(" ".join(dataset_lst[0]['tokens']))

    out_path = Path('data/ner_input_data/ner_dataset.json')
    out_path.parent.mkdir(parents=True, exist_ok=True)  # ensure target dir exists
    with open(out_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII letter text readable in the output.
        json.dump(dataset_lst, f, ensure_ascii=False)