aimlnerd committed on
Commit
0806367
1 Parent(s): 6eb192a
source/services/ner/awscomprehend_2_ner_format.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Flatten an annotation JSON into a token table with BIO-style NER tags.

The annotation document is read as a dict with two top-level lists:

* ``Blocks``   -- entries carrying ``BlockType``, ``Id`` and ``Text``.
* ``Entities`` -- entries carrying ``Type`` and ``BlockReferences``; each
  reference lists ``ChildBlocks`` whose ``ChildBlockId`` is matched against
  a block ``Id``.

When run as a script, every WORD block becomes one row of ``df`` and is
left-joined with the tag records, so words that no entity references end
up with a missing ``ner_tag``.
"""
import json

import pandas as pd

# Annotation file processed when this module is executed as a script.
ANNOTATION_PATH = r"data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json"


def extract_tokens(annotation):
    """Return one ``{'blockid', 'token'}`` record per WORD block.

    Args:
        annotation: Parsed annotation dict with a ``Blocks`` list.

    Returns:
        A list of dicts in document order; blocks whose ``BlockType`` is
        not ``'WORD'`` are skipped.
    """
    return [
        {'blockid': block['Id'], 'token': block['Text']}
        for block in annotation['Blocks']
        if block["BlockType"] == 'WORD'
    ]


def extract_ner_tags(annotation):
    """Return one ``{'blockid', 'ner_tag'}`` record per referenced child block.

    Within each block reference the first child is tagged ``B-<Type>`` and
    every following child ``I-<Type>``.

    Args:
        annotation: Parsed annotation dict with an ``Entities`` list.

    Returns:
        A list of dicts, in the order the entities reference their children.
    """
    records = []
    for entity in annotation['Entities']:
        ner_tag = entity['Type']
        for block_ref in entity['BlockReferences']:
            # NOTE(review): the B- prefix restarts for every block reference,
            # so an entity with several references yields several B- tags.
            # Confirm this is the intended tagging for multi-reference entities.
            for position, child in enumerate(block_ref['ChildBlocks']):
                prefix = "B" if position == 0 else "I"
                records.append({
                    'blockid': child['ChildBlockId'],
                    'ner_tag': f"{prefix}-{ner_tag}",
                })
    return records


if __name__ == "__main__":
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(ANNOTATION_PATH, "r", encoding="utf-8") as file:
        json_letter = json.load(file)

    data_token = extract_tokens(json_letter)
    df_token = pd.DataFrame(data_token, columns=['blockid', 'token'])

    data_nertags = extract_ner_tags(json_letter)
    df_nertags = pd.DataFrame(data_nertags, columns=['blockid', 'ner_tag'])

    # Left join keeps every token; untagged tokens get a missing ner_tag.
    df = pd.merge(df_token, df_nertags, on='blockid', how='left')

    # Emits a single blank line, exactly as before.
    # NOTE(review): looks like a debugger anchor -- confirm whether the
    # merged frame should be printed or saved instead.
    print()