File size: 11,923 Bytes
f73dc21
 
 
 
 
94c41db
f73dc21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4067c90
 
 
 
 
 
 
 
 
 
 
5e0e037
 
 
 
4067c90
 
 
 
 
 
 
 
 
 
f73dc21
94c41db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f73dc21
6a4a8e0
 
 
 
 
 
f73dc21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a4a8e0
f73dc21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4067c90
 
 
 
 
 
f73dc21
 
 
 
 
 
 
4067c90
f73dc21
 
 
 
 
6a4a8e0
 
 
 
 
 
 
 
 
 
 
 
 
 
4067c90
6a4a8e0
 
4067c90
 
6a4a8e0
 
 
4067c90
6a4a8e0
 
4067c90
 
 
 
 
 
 
 
 
6a4a8e0
 
f73dc21
4067c90
f73dc21
 
 
 
4067c90
6a4a8e0
f73dc21
 
6a4a8e0
4067c90
6a4a8e0
f73dc21
 
 
 
 
 
 
 
 
4067c90
6a4a8e0
f73dc21
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import urllib.request, urllib.error, urllib.parse
import json
import pandas as pd
import ssl
import torch
import re
from pprint import pprint
from captum.attr import visualization

# Base URL of the REST service queried by get_json(), which appends the
# "/annotator" path and query string to it.
REST_URL = "http://data.bioontology.org"
# Token sent by get_json() in the "Authorization: apikey token=..." header.
# NOTE(review): this credential is committed in source; consider loading it
# from the environment or a secrets store instead.
API_KEY = "604a90bc-ef14-4c26-a347-f4928fa086ea"
# NOTE(review): replaces the ssl module's default HTTPS context factory with
# the unverified one for the whole process, so HTTPS requests made through
# the default context skip certificate verification. REST_URL above is plain
# http, so confirm whether this override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

class PyTMinMaxScalerVectorized(object):
    """
    From https://discuss.pytorch.org/t/using-scikit-learns-scalers-for-torchvision/53455
    Min-max scaler that rescales a tensor to the range [0, 1] along dim 0.
    """
    def __call__(self, tensor):
        """
        Rescale ``tensor`` in place along ``dim=0`` and return it.

        Args:
            tensor: floating-point tensor to normalise. It is modified in
                place; the returned object is the same tensor.

        Returns:
            The input tensor, with ``(x - min) / (max - min)`` applied along
            dimension 0. Slices whose values are all equal (zero range) are
            mapped to 0 instead of NaN/inf.
        """
        lowest = tensor.min(dim=0, keepdim=True)[0]
        value_range = tensor.max(dim=0, keepdim=True)[0] - lowest
        # A constant slice has zero range; dividing by it would fill the
        # result with NaN/inf. Divide by 1 there so the slice becomes 0.
        value_range = torch.where(
            value_range == 0, torch.ones_like(value_range), value_range
        )
        tensor.sub_(lowest).div_(value_range)
        return tensor
    
def get_diseases(text, pipe):
    """
    Extract disease mentions from ``text`` using a token-classification pipe.

    Args:
        text: the string to analyse.
        pipe: callable invoked as ``pipe(text)``; it must return an iterable
            of dicts carrying an ``'entity'`` tag plus ``'start'`` and
            ``'end'`` character offsets into ``text``.

    Returns:
        List of substrings of ``text``, one per disease span, in order of
        appearance. A span opens on ``B-DISEASE``, is extended by following
        ``I-DISEASE`` tags and is closed by any other tag, by the next
        ``B-DISEASE`` or by the end of the results. Mentions of two
        characters or fewer are dropped.
    """
    spans = []
    current = None
    for result in pipe(text):
        ent = result['entity']
        if ent == 'B-DISEASE':
            # A new entity starts: keep the one that was still open instead
            # of silently overwriting it.
            if current is not None:
                spans.append(current)
            current = [result['start'], result['end']]
        elif ent == 'I-DISEASE':
            # A continuation without an opening B- tag is ignored.
            if current is not None:
                current[1] = result['end']
        else:
            if current is not None:
                spans.append(current)
            current = None
    if current is not None:
        spans.append(current)

    diseases = []
    for start, end in spans:
        disease = text[start:end]
        # Same minimum-length rule for every span, including the last one.
        if len(disease) > 2:
            diseases.append(disease)
    return diseases

def find_end(text):
    """
    Locate where the body of a report ends.

    Searches ``text`` for a fixed set of sign-off / trailer markers and
    returns the smallest start offset among the markers that occur, or
    ``len(text)`` when none of them is present.
    """
    markers = (
        (r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        (r'\n {3,}DR.', re.I),
        (r'[ ]{1,}RADLINE ', re.I),
        (r'.*electronically signed on', re.I),
        (r'M\[0KM\[0KM', 0),
    )
    cutoff = len(text)
    for regex, flags in markers:
        hit = re.search(regex, text, flags)
        if hit is not None and hit.start() < cutoff:
            cutoff = hit.start()
    return cutoff

def pattern_repl(matchobj):
    """
    Build the blank replacement for a regex match.

    Returns a run of spaces as long as the matched text (and never shorter
    than one space), so substituting it keeps character offsets unchanged.
    """
    width = len(matchobj.group(0))
    return ' ' * max(width, 1)

def clean_text(text):
    """
    Blank out placeholders and the report trailer without changing length.

    ``[**...**]`` placeholders and underscores are replaced by spaces, then
    everything from the offset returned by ``find_end`` onwards is replaced
    by spaces. The result has the same length as the input, so character
    offsets into the original text remain valid.
    """
    # Mask [**...**] placeholders with an equally long run of spaces.
    masked = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Underscores become single spaces.
    masked = masked.replace('_', ' ')

    # Keep the report body and pad back to the original length.
    cut = find_end(masked)
    return masked[:cut].ljust(len(masked))

def get_drg_link(drg_code):
    """
    Return the findacode.com lookup URL for a DRG code.

    One- and two-character codes are left-padded with zeros to three
    characters; any other length is used as is.
    """
    code = str(drg_code)
    zero_prefix = {1: '00', 2: '0'}.get(len(code), '')
    return f'https://www.findacode.com/code.php?set=DRG&c={zero_prefix}{code}'

def prettify(dict_list, k):
    """
    Join the ``k`` entry of every dict in ``dict_list``, one per line.

    The looked-up values must be strings; an empty list yields ``""``.
    """
    values = [entry[k] for entry in dict_list]
    return "\n".join(values)

def get_json(text_to_annotate):
    """
    Query the annotator endpoint under ``REST_URL`` for ``text_to_annotate``.

    The text is URL-quoted and sent as the ``text`` query parameter together
    with the fixed options below (ICD9CM ontology, whole words only, ...).
    The request is authorised with ``API_KEY``.

    Returns:
        The decoded JSON response, or ``[]`` if the request or the decoding
        fails for any reason (best-effort lookup).
    """
    query = (
        "/annotator?text=" + urllib.parse.quote(text_to_annotate)
        + "&ontologies=ICD9CM"
        + "&longest_only=false"
        + "&exclude_numbers=false"
        + "&whole_word_only=true"
        + "&exclude_synonyms=false"
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    try:
        # Close the HTTP response deterministically once it has been read.
        with opener.open(REST_URL + query) as response:
            return json.loads(response.read())
    except Exception:
        # Deliberately best-effort: network, HTTP and JSON errors all yield
        # an empty result. Unlike a bare ``except``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return []

def parse_results(results):
    """
    Flatten annotator results into a list of annotation dicts.

    Each element of ``results`` is expected to carry an ``'annotations'``
    list and an ``'annotatedClass'`` dict with an ``'@id'`` entry. Every
    annotation becomes one dict with:

    * ``'start'`` / ``'end'``: the annotation's ``'from'`` / ``'to'`` minus 1
    * ``'text'``: the annotated text
    * ``'link'``: the ``'@id'`` of the class the annotation belongs to
    """
    if len(results) == 0:
        return []
    return [
        {
            'start': annotation['from'] - 1,
            'end': annotation['to'] - 1,
            'text': annotation['text'],
            'link': entry['annotatedClass']['@id'],
        }
        for entry in results
        for annotation in entry['annotations']
    ]

def get_icd_annotations(text):
    """
    Annotate ``text`` via ``get_json`` and return the flattened annotation
    list produced by ``parse_results``.
    """
    return parse_results(get_json(text))

def subfinder(mylist, pattern):
    """
    Return the elements of ``mylist`` that also occur in ``pattern``.

    Both arguments must provide ``.tolist()``. Order and duplicates of
    ``mylist`` are preserved in the returned list.
    """
    candidates = mylist.tolist()
    wanted = pattern.tolist()
    return [item for item in candidates if item in wanted]

def tokenize_icds(tokenizer, annotations, token_ids):
    """
    Mark which positions of ``token_ids`` belong to an annotated ICD phrase.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(text, add_special_tokens=False, return_tensors='pt')``
            whose result exposes ``input_ids``.
        annotations: iterable of dicts with a ``'text'`` entry.
        token_ids: tensor of token ids for the document.
            NOTE(review): the index handling below assumes a 1-D tensor;
            confirm against the caller.

    Returns:
        Tensor of zeros with the shape of ``token_ids`` holding 1 at every
        position covered by an annotation:

        * single-token phrases mark every occurrence of that token;
        * multi-token phrases with exactly one candidate start mark the
          window beginning there;
        * multi-token phrases with several candidate starts mark only the
          windows whose ids equal the phrase ids.
    """
    icd_tokens = torch.zeros(token_ids.shape)
    for annotation in annotations:
        icd = annotation['text']
        icd_token_ids = tokenizer(icd, add_special_tokens=False, return_tensors='pt').input_ids[0]
        num_icd_tokens = icd_token_ids.shape[0]
        # Text that tokenizes to nothing cannot be located.
        if num_icd_tokens == 0:
            continue
        icd_token_ids = icd_token_ids.to(token_ids.device)

        # positions where the first token of the phrase occurs
        starting_indices = (token_ids == icd_token_ids[0]).nonzero(as_tuple=False)

        if num_icd_tokens == 1:
            icd_tokens[starting_indices] = 1
            continue

        if starting_indices.shape[0] == 1:
            # only one place the phrase can start
            starting_index = starting_indices.item()
            icd_tokens[starting_index: starting_index + num_icd_tokens] = 1
            continue

        # Several candidates: compare the whole window with the phrase.
        # (The previous code compared a single element with the whole id
        # tensor, which raises on the ambiguous truth value.)
        for starting_index in starting_indices.flatten().tolist():
            window = token_ids[starting_index: starting_index + num_icd_tokens]
            if window.shape == icd_token_ids.shape and bool((window == icd_token_ids).all()):
                icd_tokens[starting_index: starting_index + num_icd_tokens] = 1
    return icd_tokens

def get_attribution(text, tokenizer, model_outputs, inputs, k=7):
    """
    Aggregate sub-token attention into per-word attention for one input.

    Args:
        text: unused; kept for interface compatibility.
        tokenizer: provides ``convert_ids_to_tokens`` and is passed on to
            ``reconstruct_text``.
        model_outputs: sequence whose last element, indexed at 0, is used as
            the attention values.
        inputs: object exposing ``input_ids``; only its first row is read.
        k: unused; kept for interface compatibility.

    Returns:
        The ``(aggregated_attention, words)`` pair returned by
        ``reconstruct_text``.
    """
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
    # An input that fills the whole sequence (or was never padded) has no
    # '[PAD]' token; treat the end of the sequence as the padding boundary
    # instead of raising ValueError.
    try:
        padding_idx = tokens.index('[PAD]')
    except ValueError:
        padding_idx = len(tokens)
    # Drop the first and last unpadded tokens (presumably the [CLS]/[SEP]
    # markers; confirm against the tokenizer configuration).
    tokens = tokens[:padding_idx][1:-1]
    attn = model_outputs[-1][0]
    agg_attn, final_text = reconstruct_text(tokenizer=tokenizer, tokens=tokens, attn=attn)
    return agg_attn, final_text
    
def reconstruct_text(tokenizer, tokens, attn):
    """
    Merge word-piece tokens back into words and average their attention.

    A token starting with ``'##'`` continues the previous word; every other
    token starts a new word. For each word, the attention entries of its
    pieces (``attn`` indexed by token position) are averaged.

    Args:
        tokenizer: unused; kept for interface compatibility.
        tokens: list of token strings.
        attn: tensor indexable by token position.

    Returns:
        ``(aggregated_attn, words)``: a 1-D tensor with one mean attention
        value per word, and the list of reconstructed word strings. Empty
        ``tokens`` yields an empty tensor and an empty list.
    """
    # Group token positions by word. A leading '##' piece with no word
    # before it opens its own group so that the output sizes stay in sync.
    groups = []
    for position, token in enumerate(tokens):
        if token.startswith('##') and groups:
            groups[-1].append(position)
        else:
            groups.append([position])

    aggregated_attn = torch.zeros(len(groups))
    reconstructed_tokens = []
    for word_idx, positions in enumerate(groups):
        # Strip only the '##' continuation prefix, so a literal '#'
        # character inside a token is preserved.
        pieces = [
            tokens[p][2:] if tokens[p].startswith('##') else tokens[p]
            for p in positions
        ]
        reconstructed_tokens.append(''.join(pieces))
        # mean attention over all pieces of the word
        aggregated_attn[word_idx] = torch.mean(attn[positions])
    return aggregated_attn, reconstructed_tokens

def load_rule(path):
    """
    Load a DRG rule table and build the lookup dictionaries derived from it.

    The DRG scheme is inferred from ``path``: a path containing ``'MS'``
    selects the ``'MS-DRG'`` column, otherwise one containing ``'APR'``
    selects ``'APR-DRG'``.

    Args:
        path: location of a CSV file with at least the columns ``DRG_CODE``,
            ``MDC``, ``WEIGHT`` and the scheme column.

    Returns:
        ``(rule_df, drg2idx, i2d, d2mdc, d2w)`` where ``rule_df`` is the
        full table, ``drg2idx`` maps each retained DRG code to its index in
        sorted order, ``i2d`` is the inverse mapping, and ``d2mdc`` /
        ``d2w`` map every DRG code in the table to its MDC / weight.

    Raises:
        ValueError: if ``path`` names neither an MS nor an APR rule file.
    """
    rule_df = pd.read_csv(path)

    # remove MDC 15 - neonate and couple other codes related to postcare
    path_str = str(path)
    if 'MS' in path_str:
        drg_column, excluded = 'MS-DRG', [945, 946, 949, 950, 998, 999]
    elif 'APR' in path_str:
        drg_column, excluded = 'APR-DRG', [860, 863]
    else:
        # Previously this fell through to an UnboundLocalError on `space`.
        raise ValueError(
            f"Cannot infer the DRG scheme from {path_str!r}: "
            "expected 'MS' or 'APR' in the path"
        )
    # NOTE(review): MDC is compared with the string '15'; if the column is
    # parsed as integers this never matches. Confirm the column dtype.
    msk = (rule_df['MDC'] != '15') & (~rule_df[drg_column].isin(excluded))
    space = sorted(rule_df[msk]['DRG_CODE'].unique())

    drg2idx = {d: idx for idx, d in enumerate(space)}
    i2d = {v: k for k, v in drg2idx.items()}

    # These two cover every row of the table, not only the retained codes.
    d2mdc, d2w = {}, {}
    for _, r in rule_df.iterrows():
        drg = r['DRG_CODE']
        d2mdc[drg] = r['MDC']
        d2w[drg] = r['WEIGHT']

    return rule_df, drg2idx, i2d, d2mdc, d2w

def visualize_attn(model_results):
    """
    Render a model result dict as an HTML attention table.

    Reads ``'class_dsc'``, ``'prob'``, ``'attn'``, ``'tokens'``,
    ``'drg_link'``, ``'icd_results'`` and ``'diseases'`` from
    ``model_results``. The attention tensor is min-max scaled with
    ``PyTMinMaxScalerVectorized`` (which rescales it in place) and wrapped
    in a captum ``VisualizationDataRecord`` together with the prediction;
    the record is then handed to ``visualize_text``.

    Returns:
        The HTML string produced by ``visualize_text``.
    """
    predicted_class = model_results['class_dsc']
    probability = model_results['prob']
    raw_attn = model_results['attn']
    word_tokens = model_results['tokens']

    scaled_attn = PyTMinMaxScalerVectorized()(raw_attn)
    # The predicted class doubles as the "true" class; the remaining fields
    # are fixed placeholder values.
    record = visualization.VisualizationDataRecord(
        word_attributions=scaled_attn,
        pred_prob=probability,
        pred_class=predicted_class,
        true_class=predicted_class,
        attr_class=0,
        attr_score=1,
        raw_input_ids=word_tokens,
        convergence_score=1
    )

    drg_link = model_results['drg_link']
    icd_annotations = model_results['icd_results']
    diseases = model_results['diseases']
    return visualize_text(
        record,
        drg_link=drg_link,
        icd_annotations=icd_annotations,
        diseases=diseases
    )


def modify_attn_html(attn_html):
    """
    Wrap every ``<mark ...>...</mark>`` element of ``attn_html`` in a link.

    Each element is enclosed in ``<a href="https://">...</a>`` (the href is
    a placeholder). Markup before the first ``<mark`` and after each
    ``</mark>`` is left outside the anchors, and input without any
    ``<mark`` is returned unchanged.
    """
    head, *fragments = attn_html.split('<mark')
    pieces = [head]
    for fragment in fragments:
        # Close the anchor right after its own </mark> so trailing markup
        # (e.g. a closing </td>) is not swallowed by the link.
        marked, closing, trailing = fragment.partition('</mark>')
        pieces.append(f'<a href="https://"><mark{marked}{closing}</a>{trailing}')
    return "".join(pieces)

def modify_code_html(html, link, icd=False):
    """
    Turn the contents of a ``<td>...</td>`` cell into a hyperlink.

    Args:
        html: markup containing one ``<td>...</td>`` cell; only the text
            between the first ``<td>`` and the following ``</td>`` is kept.
        link: target URL; it is HTML-escaped before being placed in the
            ``href`` attribute.
        icd: when true, return the bare ``<a>`` element without the
            surrounding ``<td>`` tags.

    Returns:
        ``<td><a href="LINK">CONTENT</a></td>``, or just the anchor when
        ``icd`` is true.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import escape

    inner = html.split('<td>')[1].split('</td>')[0]
    safe_link = escape(str(link), quote=True)
    # The start tag is closed with '>' before the content; it used to run
    # straight into the content, producing malformed markup.
    href_html = f'<td><a href="{safe_link}">{inner}</a></td>'
    if icd:
        href_html = href_html.replace('<td>', '').replace('</td>', '')
    return href_html

def modify_drg_html(html, drg_link):
    """
    Link a DRG table cell to ``drg_link``.

    Delegates to ``modify_code_html`` with ``icd=False``, so the ``<td>``
    wrapper is kept.
    """
    linked_cell = modify_code_html(html, drg_link, icd=False)
    return linked_cell

def get_icd_html(icd_list):
    """
    Build the table cell listing the linked ICD annotations.

    ``icd_list`` holds dicts with ``'text'`` and ``'link'`` entries. Each
    distinct text is rendered once, using the link of its first occurrence,
    and all anchors are placed in a single ``<td>``. An empty list yields
    an "N/A" cell.
    """
    if len(icd_list) == 0:
        return '<td><text style="padding-right:2em"><b>N/A</b></text></td>'

    # first link seen for each annotation text, in order of appearance
    first_link_by_text = {}
    for entry in icd_list:
        label, url = entry['text'], entry['link']
        first_link_by_text.setdefault(label, url)

    anchors = [
        modify_code_html(
            html=visualization.format_classname(classname=label),
            link=url,
            icd=True,
        )
        for label, url in first_link_by_text.items()
    ]
    return '<td>' + ''.join(anchors) + '</td>'


def get_disease_html(diseases):
    """
    Build the table cell listing the detected diseases.

    Duplicates are removed while keeping the order of first appearance, and
    the remaining names are joined with ``', '`` and formatted with
    captum's ``format_classname``. An empty list yields an "N/A" cell.

    Returns:
        An HTML ``<td>...</td>`` string.
    """
    if len(diseases) == 0:
        return '<td><text style="padding-right:2em"><b>N/A</b></text></td>'
    # dict.fromkeys de-duplicates deterministically; a set would order the
    # names differently from run to run.
    unique_diseases = list(dict.fromkeys(diseases))
    diseases_str = ', '.join(unique_diseases)
    # format_classname already returns a complete <td>...</td> cell (the
    # same output is split on those tags in modify_code_html), so no extra
    # closing tag is appended.
    return visualization.format_classname(classname=diseases_str)

    

# copied out of captum because we need raw html instead of a jupyter widget
def visualize_text(datarecord, drg_link, icd_annotations, diseases):
    """
    Render one prediction as a two-row HTML table.

    The header row names the columns "Predicted DRG", "Word Importance",
    "Diseases" and "ICD Codes"; the data row holds the matching cells.

    Args:
        datarecord: record providing ``pred_class``, ``raw_input_ids`` and
            ``word_attributions``.
        drg_link: URL the predicted-DRG cell links to.
        icd_annotations: annotation dicts passed to ``get_icd_html``.
        diseases: disease names passed to ``get_disease_html``.

    Returns:
        The table as a raw HTML string.
    """
    header_row = (
        "<tr>"
        "<th style='text-align: left'>Predicted DRG</th>"
        "<th style='text-align: left'>Word Importance</th>"
        "<th style='text-align: left'>Diseases</th>"
        "<th style='text-align: left'>ICD Codes</th>"
        "</tr>"
    )

    pred_class_html = visualization.format_classname(datarecord.pred_class)
    icd_class_html = get_icd_html(icd_annotations)
    disease_html = get_disease_html(diseases)
    pred_class_html = modify_drg_html(html=pred_class_html, drg_link=drg_link)
    word_attn_html = visualization.format_word_importances(
        datarecord.raw_input_ids, datarecord.word_attributions
    )

    # Cell order must match the header order above. The row is closed with
    # </tr>; it used to end with a second opening <tr>.
    data_row = "".join(
        [
            "<tr>",
            pred_class_html,
            word_attn_html,
            disease_html,
            icd_class_html,
            "</tr>",
        ]
    )

    # "width: 100%" is CSS, so it belongs in a style attribute.
    return "".join(["<table style='width: 100%'>", header_row, data_row, "</table>"])