"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
import os
import sys

sys.path.append('../modelos')  # allow imports from the local models folder

import numpy as np
import pandas as pd
from scipy.special import softmax
from transformers import AutoModelForTokenClassification, AutoTokenizer

def map_entities(y_pred, map_dict, return_type='list'):
    """Maps predicted label ids back to their entity names by inverting map_dict."""
    inv_map = {v: k for k, v in map_dict.items()}
    if return_type == 'list':
        return [inv_map[y] for y in y_pred]
    else:
        return np.array([inv_map[y] for y in y_pred])
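
# Deterministic example with the ner_dict defined below: inverting the mapping
# turns model output ids back into BIO labels.
#   >>> map_entities([0, 1, 2, 0], ner_dict)
#   ['O', 'B-ACTIVE_PRINCIPLE', 'I-ACTIVE_PRINCIPLE', 'O']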


def word_ids_method(text, tokenizer):
    """Marks the first token (subword) of each word as that word's representative;
        the label of this token will be the label of the whole word.
    Method built into tokenize_and_align_labels.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments

        text: str or List[str]
            text to tokenize; if it is a string, it is split on whitespace
    """
    if not isinstance(text, list):
        text = text.split()

    tokenized_inputs = tokenizer([text], truncation=True, is_split_into_words=True)
    mask = []
    word_ids = tokenized_inputs.word_ids(batch_index=0)
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            mask.append(0)
        elif word_idx != previous_word_idx:  # Only label the first token of a given word.
            mask.append(1)
        else:
            mask.append(0)
        previous_word_idx = word_idx
    
    return mask
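
# Illustrative sketch: special tokens ([CLS], [SEP]) and continuation subwords
# get 0; the first subword of each word gets 1. The exact splits depend on the
# loaded tokenizer, so the mask below is hypothetical:
#   >>> word_ids_method('ibuprofeno 400', tokenizer)
#   [0, 1, 0, 0, 1, 0]   # [CLS] + 'ibuprofeno' in 3 subwords + '400' + [SEP]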


def eval_text(text, tokenizer, model):
    """
    Takes a text (list of words or string), a HuggingFace tokenizer and model.
    Returns the model output (entity ids), one per word.
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True,
                              is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores, axis=1)  # normalize per token, not over the flattened matrix
    result = np.argmax(scores, axis=1)

    return result[np.array(mask) == 1]  # keep only the first subword of each word
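
# Usage sketch once a tokenizer and model are loaded (see cargar_modelo below).
# Returns one label id per word; the concrete ids depend on the model weights,
# so the output shown is illustrative only:
#   >>> eval_text('paracetamol cada 8 horas', tokenizer, model)
#   array([1, 7, 8, 8])   # hypothetical ids for the 4 words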

ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA': 3,
            'I-FORMA_FARMA': 4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND': 3,
    'I-UND': 4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}


def cargar_modelo(admin=False, verbose=False):
    """Loads the tokenizer and model from a local folder if present; otherwise
    downloads them from the HuggingFace Hub and saves a local copy."""
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"

    n_labels = len(admin_ner_dict) if admin else len(ner_dict)

    local_paths = [d + folder for d in ('../modelos/', 'modelos/') if os.path.isdir(d + folder)]

    if local_paths:
        folder = local_paths[0]
        if verbose:
            print("Loading locally saved model")
        tokenizer = AutoTokenizer.from_pretrained(folder)
        model = AutoModelForTokenClassification.from_pretrained(folder, num_labels=n_labels, ignore_mismatched_sizes=True)
    else:
        if verbose:
            print("Downloading model from the HuggingFace Hub")
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        tokenizer.save_pretrained(folder)
        model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
        model.save_pretrained(folder)

    return tokenizer, model


# Load both models at import time: the base prescription tagger and the ADMIN refiner
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)


def get_admin(entidades: list, texto: str) -> tuple:
    """
    Returns the substring made of the entities tagged as ADMIN, together with the
    word indices where they occur.
    """
    indices = [i for i, ent in enumerate(entidades) if ent == 'B-ADMIN' or ent == 'I-ADMIN']
    return ' '.join([token for i, token in enumerate(texto.split(' ')) if i in indices]), indices
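
# Deterministic example: only the words tagged B-ADMIN/I-ADMIN are kept, along
# with their positions in the original text.
#   >>> get_admin(['B-ACTIVE_PRINCIPLE', 'O', 'B-ADMIN', 'I-ADMIN'],
#   ...           'paracetamol tomar 1 comprimido')
#   ('1 comprimido', [2, 3])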


def render_pandas(ents, text_list):
    """Collects the words of each entity type into a single-row styled DataFrame."""
    data = {'ACTIVE_PRINCIPLE': '', 'FORMA_FARMA': '', 'CANT-ADMIN': '', 'UND-ADMIN': '', 'VIA-ADMIN': '', 'PERIODICITY': '', 'DURATION': ''}

    for i, ent in enumerate(ents):
        if '-ACTIVE_PRINCIPLE' in ent:
            data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
        elif '-FORMA_FARMA' in ent:
            data['FORMA_FARMA'] += ' ' + text_list[i]
        elif '-CANT' in ent:
            data['CANT-ADMIN'] += ' ' + text_list[i]
        elif '-UND' in ent:
            data['UND-ADMIN'] += ' ' + text_list[i]
        elif '-VIA_ADMIN' in ent:
            data['VIA-ADMIN'] += ' ' + text_list[i]
        elif '-PERIODICITY' in ent:
            data['PERIODICITY'] += ' ' + text_list[i]
        elif '-DURATION' in ent:
            data['DURATION'] += ' ' + text_list[i]

    data = {k: v.strip() for k, v in data.items()}  # drop the leading space added above
    df = pd.DataFrame([data])
    return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
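
# Example with hand-made labels; returns a pandas Styler whose single row places
# each word under its entity column:
#   >>> render_pandas(['B-ACTIVE_PRINCIPLE', 'B-CANT', 'B-UND'],
#   ...               ['paracetamol', '1', 'comprimido'])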


def etiquetar(texto, pandas=True):
    """Two-stage tagging: the base model labels the whole prescription, then the
    ADMIN model relabels the tokens tagged as ADMIN with finer-grained entities
    (quantity, unit, route). Returns a styled DataFrame or the list of labels."""
    entidades = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    admin_text, indices = get_admin(entidades, texto)
    entidades_admin = map_entities(eval_text(admin_text, tokenizer_admin, model_admin), admin_ner_dict)
    for i, ent_admin in enumerate(entidades_admin):
        entidades[indices[i]] = ent_admin
    if pandas:
        return render_pandas(entidades, texto.split())
    else:
        return entidades
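

# End-to-end usage sketch. The prescription is made up and the labels are
# hypothetical, since the real output depends on the published model weights:
#   >>> etiquetar('paracetamol 1 comprimido cada 8 horas', pandas=False)
#   ['B-ACTIVE_PRINCIPLE', 'B-CANT', 'B-UND',
#    'B-PERIODICITY', 'I-PERIODICITY', 'I-PERIODICITY']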