Spaces:
Runtime error
Runtime error
Add application file
Browse files- app.py +14 -0
- predicciones.py +159 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit front-end: tags every word of a medical prescription with its entity."""
import streamlit as st

from predicciones import etiquetar

example = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

st.title("Reconocimiento de entidades en prescripciones médicas")

# Free-text input, pre-filled with a sample prescription.
text = st.text_input('Insertar prescripción', example)

# pandas=False -> plain list of labels, one per whitespace-separated word.
entidades = etiquetar(text, pandas=False)

words = text.split()
for i, ent in enumerate(entidades):
    st.write(words[i], ent)
|
predicciones.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import numpy as np
|
5 |
+
from scipy.special import softmax
|
6 |
+
sys.path.append('../modelos')
|
7 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
def map_entities(y_pred, map_dict, return_type='list'):
    """Translate numeric label ids back into their entity-name strings.

    Arguments:
        y_pred: iterable of integer label ids produced by the model.
        map_dict: mapping entity name -> id (e.g. ``ner_dict``).
        return_type: 'list' returns a Python list; any other value returns
            a numpy array.
    """
    id_to_name = {idx: name for name, idx in map_dict.items()}
    nombres = [id_to_name[idx] for idx in y_pred]
    if return_type == 'list':
        return nombres
    return np.array(nombres)
|
16 |
+
|
17 |
+
|
18 |
+
def word_ids_method(text, tokenizer):
    """Build a mask selecting the first sub-token (subword) of every word.

    The first sub-token acts as the representative of the word: its predicted
    label becomes the label of the whole word.  Same idea as the mask built in
    ``tokenize_and_align_labels``.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments:
        text: str or List[str] — text to tokenize; a plain string is split on
            whitespace first.

    Returns:
        List[int]: one entry per token in the encoded sequence — 1 for the
        first sub-token of each word, 0 for special tokens and continuation
        sub-tokens.
    """
    words = text if isinstance(text, list) else text.split()

    tokenized_inputs = tokenizer([words], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids(batch_index=0)

    mask = []
    previous_word_idx = None
    for word_idx in word_ids:
        # Special tokens map to None; a repeated word id means a continuation
        # sub-token of the same word — only the first occurrence is kept.
        mask.append(1 if word_idx is not None and word_idx != previous_word_idx else 0)
        previous_word_idx = word_idx

    return mask
|
46 |
+
|
47 |
+
|
48 |
+
def eval_text(text, tokenizer, model):
    """Run the HuggingFace token-classification model over *text*.

    Arguments:
        text: str or List[str]; a plain string is tokenized directly, a list
            is treated as pre-split words.
        tokenizer: HuggingFace tokenizer matching *model*.
        model: HuggingFace token-classification model.

    Returns:
        np.ndarray of predicted label ids, one per word — predictions for
        special tokens and non-initial sub-tokens are discarded using the
        mask from word_ids_method.
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    # Fix: normalise per token (row-wise). The previous axis-less call applied
    # softmax over the whole (tokens x labels) matrix; the row-wise argmax is
    # unchanged, but the scores are now genuine per-token probabilities.
    scores = softmax(scores, axis=1)
    result = np.argmax(scores, axis=1)

    # Keep only the prediction of each word's first sub-token.
    return result[np.array(mask) == 1]
|
61 |
+
|
62 |
+
# BIO label -> id mapping for the main prescription NER model
# (active principle, pharmaceutical form, administration, periodicity, duration).
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

# BIO label -> id mapping for the second-stage model that refines ADMIN spans
# into quantity (CANT), unit (UND) and administration route (VIA_ADMIN).
admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND':3,
    'I-UND':4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}
|
84 |
+
|
85 |
+
|
86 |
+
def cargar_modelo(admin=False, verbose=False):
    """Load the tokenizer and NER model, preferring a locally saved copy.

    Looks for the model folder under '../modelos/' then 'modelos/'; if neither
    exists, downloads from the HuggingFace hub and caches it in *folder*.

    Arguments:
        admin: False loads the main prescription model, True the second-stage
            ADMIN (quantity / unit / route) model.
        verbose: print where the model is being loaded from.

    Returns:
        (tokenizer, model) tuple.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"

    n_labels = len(admin_ner_dict) if admin else len(ner_dict)

    # Fix: the two local-directory branches were byte-identical except for the
    # path prefix — collapsed into a single loop over candidate locations.
    for prefix in ('../modelos/', 'modelos/'):
        if os.path.isdir(prefix + folder):
            folder = prefix + folder
            if verbose:
                print("Cargando modelo guardado localmente")
            tokenizer = AutoTokenizer.from_pretrained(folder)
            model = AutoModelForTokenClassification.from_pretrained(folder, num_labels=n_labels,ignore_mismatched_sizes=True)
            return tokenizer, model

    if verbose:
        print("Descargando modelo de repositorio HuggingFace")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)
    model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels,ignore_mismatched_sizes=True)
    model.save_pretrained(folder)

    return tokenizer, model
|
113 |
+
|
114 |
+
|
115 |
+
# Load both pipelines once at import time: the main prescription tagger and
# the second-stage ADMIN tagger (quantity / unit / route).
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
|
117 |
+
|
118 |
+
|
119 |
+
def get_admin(entidades: list, texto: str) -> "tuple[str, list]":
    """Extract the ADMIN-labelled portion of a prescription.

    Fix: the return annotation said ``-> str`` but the function has always
    returned a (str, list) pair.

    Arguments:
        entidades: BIO labels, one per whitespace token of *texto*.
        texto: the original prescription text.

    Returns:
        (admin_text, indices): the tokens labelled B-ADMIN / I-ADMIN joined
        by single spaces, and the positions where those labels occur.
    """
    indices = [i for i, ent in enumerate(entidades) if ent == 'B-ADMIN' or ent == 'I-ADMIN']
    return ' '.join([token for i, token in enumerate(texto.split(' ')) if i in indices]), indices
|
125 |
+
|
126 |
+
|
127 |
+
def render_pandas(ents,text_list):
|
128 |
+
data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''}
|
129 |
+
|
130 |
+
for i, ent in enumerate(ents):
|
131 |
+
if '-ACTIVE_PRINCIPLE' in ent:
|
132 |
+
data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
|
133 |
+
elif '-FORMA_FARMA' in ent:
|
134 |
+
data['FORMA_FARMA'] += ' ' + text_list[i]
|
135 |
+
elif '-CANT' in ent:
|
136 |
+
data['CANT-ADMIN'] += ' ' + text_list[i]
|
137 |
+
elif '-UND' in ent:
|
138 |
+
data['UND-ADMIN'] += ' ' + text_list[i]
|
139 |
+
elif '-VIA_ADMIN' in ent:
|
140 |
+
data['VIA-ADMIN'] += ' ' + text_list[i]
|
141 |
+
elif '-PERIODICITY' in ent:
|
142 |
+
data['PERIODICITY'] += ' ' + text_list[i]
|
143 |
+
elif '-DURATION' in ent:
|
144 |
+
data['DURATION'] += ' ' + text_list[i]
|
145 |
+
|
146 |
+
df = pd.DataFrame([data])
|
147 |
+
return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
|
148 |
+
|
149 |
+
|
150 |
+
def etiquetar(texto, pandas=True):
    """Tag a prescription with both NER models.

    First pass: the main model labels every word.  Second pass: the words
    labelled ADMIN are re-tagged by the ADMIN model (quantity / unit / route),
    and those finer labels overwrite the coarse ADMIN ones in place.

    Arguments:
        texto: prescription text.
        pandas: True returns a styled one-row pandas table, False the raw
            list of labels (one per word).
    """
    entidades = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    admin_text, indices = get_admin(entidades, texto)

    entidades_admin = map_entities(eval_text(admin_text, tokenizer_admin, model_admin), admin_ner_dict)
    for posicion, etiqueta_admin in enumerate(entidades_admin):
        entidades[indices[posicion]] = etiqueta_admin

    if pandas:
        return render_pandas(entidades, texto.split())
    return entidades
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
scipy
|
4 |
+
transformers
|
5 |
+
torch
|