legal-entity-ner-transformers / source /services /ner /awscomprehend_2_ner_format.py
aimlnerd's picture
add train
a2d1297
raw
history blame
3.58 kB
import json
import numpy as np
import pandas as pd
from pathlib import Path
class Comprehend2NERFormat:
    """Turn one annotation JSON file into parallel token / NER-tag lists.

    The JSON document is expected to contain a ``Blocks`` list (entries whose
    ``BlockType`` is ``WORD`` or ``LINE``) and an ``Entities`` list whose
    items reference word blocks by id. Judging by the class name this is the
    AWS Comprehend annotation layout -- confirm against the data producer.

    Calling an instance returns a dict with the keys ``tokens``,
    ``ner_tags`` and ``filename``.
    """

    def __init__(self, letterfilepath):
        """Remember which annotation file to convert.

        Args:
            letterfilepath: ``str`` or ``pathlib.Path`` of the JSON file,
                e.g. ``data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json``.
        """
        self.letterfilepath = letterfilepath

    def load_data(self):
        """Read and parse the JSON file at ``self.letterfilepath``.

        Returns:
            The decoded JSON document.
        """
        # Binary mode lets ``json`` detect UTF-8/16/32 on its own instead of
        # relying on the platform's default text encoding.
        with open(self.letterfilepath, "rb") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Collect every ``WORD`` block as one row.

        Args:
            jsondata: Parsed document with a ``Blocks`` list.

        Returns:
            DataFrame with columns ``blockid`` and ``token``, in the order
            the word blocks appear in ``Blocks``.
        """
        records = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(records, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Map every ``LINE`` block to the ids listed in its first relationship.

        Args:
            jsondata: Parsed document with a ``Blocks`` list.

        Returns:
            DataFrame with columns ``lineid`` and ``childid``, one row per
            (line, child) pair. ``LINE`` blocks without any relationship
            contribute no rows.
        """
        records = []
        for block in jsondata['Blocks']:
            if block["BlockType"] != 'LINE':
                continue
            relationships = block.get('Relationships') or []
            if not relationships:
                # A line with no children has nothing to map.
                continue
            line_id = block['Id']
            records.extend(
                {'lineid': line_id, 'childid': child_id}
                for child_id in relationships[0]['Ids']
            )
        # Build the frame once; concatenating inside the loop is quadratic.
        return pd.DataFrame(records, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Derive a B-/I- prefixed tag for every block referenced by an entity.

        Within each item of an entity's ``BlockReferences`` the first child
        block is tagged ``B-<Type>`` and the remaining ones ``I-<Type>``.

        Args:
            jsondata: Parsed document with an ``Entities`` list.

        Returns:
            DataFrame with columns ``blockid`` and ``ner_tag``.
        """
        records = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for reference in entity['BlockReferences']:
                for position, child in enumerate(reference['ChildBlocks']):
                    prefix = "B" if position == 0 else "I"
                    records.append({
                        'blockid': child['ChildBlockId'],
                        'ner_tag': f"{prefix}-{ner_tag}",
                    })
        return pd.DataFrame(records, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Append a newline token after the last word of every line.

        Args:
            df_prev: DataFrame with at least the columns ``blockid``,
                ``token``, ``ner_tag``, ``lineid`` and ``linewordrank``.

        Returns:
            DataFrame sorted by ``linewordrank`` holding the rows of every
            line plus one extra row per line (newline token, tag ``O``).
            Rows whose ``lineid`` is missing are not part of any group and
            therefore do not appear in the result. If there are no groups
            at all, an empty frame with the columns of ``df_prev`` is
            returned.
        """
        pieces = []
        for _, line_rows in df_prev.groupby('lineid'):
            newline_row = pd.DataFrame(
                {
                    'blockid': 'newline',
                    'token': '\n',
                    'ner_tag': 'O',
                    'lineid': 'newline',
                    # Fractional offset places the row right after the
                    # line's last word once everything is sorted by rank.
                    'linewordrank': line_rows['linewordrank'].iloc[-1] + 0.1,
                },
                index=[0],
            )
            pieces.extend((line_rows, newline_row))
        if not pieces:
            return df_prev.iloc[0:0].copy()
        return pd.concat(pieces, axis=0).sort_values(by='linewordrank')

    def __call__(self):
        """Run the conversion for the configured file.

        Returns:
            Dict with ``tokens`` (list of token strings including the
            inserted newline tokens), ``ner_tags`` (list of tags aligned
            with ``tokens``) and ``filename`` (final path component of
            ``letterfilepath``).
        """
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df_tagged = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Words not referenced by any entity get the outside tag. Assign the
        # column as a whole: chained indexing assignment is unreliable and
        # does nothing under pandas copy-on-write.
        df_tagged['ner_tag'] = df_tagged['ner_tag'].fillna('O')

        df_lined = pd.merge(
            df_tagged, df_line, left_on='blockid', right_on='childid', how='left'
        ).drop(columns=['childid'])
        # Freeze the current row order so it survives the per-line grouping.
        df_lined['linewordrank'] = np.arange(df_lined.shape[0])

        df_final = self.insert_newline_char(df_prev=df_lined)
        return {
            "tokens": df_final['token'].tolist(),
            "ner_tags": df_final['ner_tag'].tolist(),
            # Path() makes plain string paths work as well as Path objects.
            "filename": Path(self.letterfilepath).name,
        }
if __name__ == '__main__':
    # Convert every annotation file below the raw-data folder and dump the
    # combined dataset as a single JSON list.
    # The recursive glob also yields directories, which cannot be opened as
    # files, so they are skipped. Sorting makes the dataset order
    # reproducible, since glob order is not guaranteed.
    annotation_files = sorted(
        path
        for path in Path(r'data/raw_data/annotations/').glob('**/*')
        if path.is_file()
    )
    dataset_lst = [
        Comprehend2NERFormat(letterfilepath=path)() for path in annotation_files
    ]
    # Preview the first document, if any file was found at all.
    if dataset_lst:
        print(" ".join(dataset_lst[0]['tokens']))
    output_path = Path('data/ner_input_data/ner_dataset.json')
    # Make sure the target folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(dataset_lst, f)