Spaces:
Runtime error
Runtime error
import json | |
import numpy as np | |
import pandas as pd | |
from pathlib import Path | |
class Comprehend2NERFormat:
    """Convert one annotation JSON file into token / NER-tag lists.

    The input JSON is expected to contain a ``Blocks`` list (entries with
    ``BlockType`` of ``WORD`` or ``LINE``) and an ``Entities`` list whose
    items carry a ``Type`` and ``BlockReferences`` -> ``ChildBlocks`` ->
    ``ChildBlockId``.  Calling the instance returns a dict with the keys
    ``tokens``, ``ner_tags`` and ``filename``.

    Example path (from the original author's note):
    ``data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json``
    """

    def __init__(self, letterfilepath):
        """Store the path of the annotation file to convert.

        Args:
            letterfilepath: ``pathlib.Path`` (or path string) of the JSON file.
        """
        self.letterfilepath = letterfilepath

    def load_data(self):
        """Read and return the parsed JSON content of ``letterfilepath``."""
        # JSON interchange text is UTF-8; do not depend on the platform default.
        with open(self.letterfilepath, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Return a DataFrame (``blockid``, ``token``) with one row per WORD block."""
        data_token = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(data_token, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Return a DataFrame (``lineid``, ``childid``) mapping LINE blocks to children.

        Only the first entry of a LINE block's ``Relationships`` is read.
        LINE blocks without any relationship contribute no rows.
        """
        rows = []
        for block in jsondata['Blocks']:
            if block["BlockType"] != 'LINE':
                continue
            relationships = block.get('Relationships') or []
            if not relationships:
                continue
            line_id = block['Id']
            rows.extend(
                {'lineid': line_id, 'childid': child_id}
                for child_id in relationships[0]['Ids']
            )
        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Return a DataFrame (``blockid``, ``ner_tag``) with B-/I- prefixed tags.

        Within each block reference the first child block is tagged
        ``B-<Type>`` and every following child ``I-<Type>``.
        """
        # NOTE(review): the B- tag restarts for every block reference, so an
        # entity with several block references gets several B- tags. Confirm
        # this is intended.
        data_nertags = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for subref in entity['BlockReferences']:
                for position, child in enumerate(subref['ChildBlocks']):
                    prefix = 'B' if position == 0 else 'I'
                    data_nertags.append(
                        {'blockid': child['ChildBlockId'], 'ner_tag': f"{prefix}-{ner_tag}"}
                    )
        return pd.DataFrame(data_nertags, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Append a newline token row (tag ``O``) after the words of every line.

        Args:
            df_prev: DataFrame with at least the columns ``lineid`` and
                ``linewordrank`` (plus ``blockid``, ``token``, ``ner_tag``).

        Returns:
            DataFrame sorted by ``linewordrank`` in which each ``lineid`` group
            is followed by one newline row.  Rows whose ``lineid`` is missing
            form no group (pandas groupby default) and are therefore absent
            from the result.
        """
        pieces = []
        for _, line_rows in df_prev.groupby('lineid'):
            newline_row = pd.DataFrame(
                {
                    'blockid': 'newline',
                    'token': '\n',
                    'ner_tag': 'O',
                    'lineid': 'newline',
                    # +0.1 places the newline directly after the last word of
                    # the line and before the first word of the next line.
                    'linewordrank': line_rows['linewordrank'].iloc[-1] + 0.1,
                },
                index=[0],
            )
            pieces.extend((line_rows, newline_row))
        if not pieces:
            # No line groups at all: keep the column layout, return no rows.
            return df_prev.iloc[0:0].copy()
        return pd.concat(pieces, axis=0).sort_values(by='linewordrank')

    def __call__(self):
        """Run the conversion for ``letterfilepath``.

        Returns:
            dict with ``tokens`` (list of str, including newline tokens),
            ``ner_tags`` (list of str, same length) and ``filename`` (the
            final path component of ``letterfilepath``).
        """
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df1 = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Words that belong to no entity get the outside tag.
        df1['ner_tag'] = df1['ner_tag'].fillna('O')

        df2 = pd.merge(
            df1, df_line, left_on='blockid', right_on='childid', how='left'
        ).drop(columns=['childid'])
        # Remember the original word order so it survives the per-line grouping.
        df2['linewordrank'] = np.arange(df2.shape[0])

        df3 = self.insert_newline_char(df_prev=df2)
        return {
            "tokens": df3['token'].tolist(),
            "ner_tags": df3['ner_tag'].tolist(),
            # Path(...) so that plain string paths work as well.
            "filename": Path(self.letterfilepath).name,
        }
if __name__ == '__main__':
    # Convert every annotation file below the raw-data folder and write the
    # combined result as one JSON list.
    dataset_lst = []
    # The recursive glob also yields directories; only regular files can be
    # opened. Sorting makes the output order deterministic.
    for file in sorted(Path(r'data/raw_data/annotations/').glob('**/*')):
        if not file.is_file():
            continue
        comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
        dataset_lst.append(comprehend2NERFormat())

    # Preview the first converted document, if there is one.
    if dataset_lst:
        print(" ".join(dataset_lst[0]['tokens']))

    output_path = Path('data/ner_input_data/ner_dataset.json')
    # Make sure the target folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(dataset_lst, f)