ccarvajal committed on
Commit
7c26359
1 Parent(s): ec8d4e9

Add application file

Browse files
Files changed (3) hide show
  1. app.py +14 -0
  2. predicciones.py +159 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end: tag a medical prescription and show one NER label per word."""
import streamlit as st
from predicciones import etiquetar

example = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

st.title("Reconocimiento de entidades en prescripciones médicas")

text = st.text_input('Insertar prescripción', example)

# etiquetar(pandas=False) returns one entity label per whitespace-separated word.
entidades = etiquetar(text, pandas=False)
# Split once instead of re-splitting the text on every loop iteration.
palabras = text.split()
for palabra, ent in zip(palabras, entidades):
    st.write(palabra, ent)
predicciones.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
2
+ import os
3
+ import sys
4
+ import numpy as np
5
+ from scipy.special import softmax
6
+ sys.path.append('../modelos')
7
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
8
+ import pandas as pd
9
+
10
def map_entities(y_pred, map_dict, return_type='list'):
    """Translate predicted label ids back to entity-name strings.

    Arguments:
        y_pred: iterable of integer label ids produced by the model.
        map_dict: mapping entity name -> label id (inverted internally).
        return_type: 'list' returns a plain Python list; any other value
            returns the same names as a numpy array.
    """
    id_to_name = {label_id: name for name, label_id in map_dict.items()}
    names = [id_to_name[label_id] for label_id in y_pred]
    if return_type == 'list':
        return names
    return np.array(names)
16
+
17
+
18
def word_ids_method(text, tokenizer):
    """Build a 0/1 mask marking the first sub-token (subword) of each word.

    The first token of a word acts as the word's representative: its label
    is taken as the label of the whole word. Special tokens (word id None)
    and continuation subwords are masked with 0.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments:
        text: str or List[str] — text to tokenize; a plain string is split
            on whitespace first.
    """
    words = text if isinstance(text, list) else text.split()

    encoding = tokenizer([words], truncation=True, is_split_into_words=True)
    flags = []
    last_idx = None
    for idx in encoding.word_ids(batch_index=0):
        # 1 only on the first subword of a new word; 0 for specials/repeats.
        flags.append(1 if idx is not None and idx != last_idx else 0)
        last_idx = idx

    return flags
46
+
47
+
48
def eval_text(text, tokenizer, model):
    """Run a HuggingFace token-classification model over one text.

    Arguments:
        text: list of words or a plain string.
        tokenizer, model: HuggingFace tokenizer and model pair.

    Returns:
        numpy array of predicted label ids, one per input word (only the
        first sub-token of each word is kept, per word_ids_method).
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    # Normalize per token: without axis=1, scipy's softmax normalizes over the
    # entire (seq_len x n_labels) matrix, producing invalid per-token
    # probabilities (the argmax happens to be unaffected, but the fix keeps
    # the scores meaningful).
    scores = softmax(scores, axis=1)
    result = np.argmax(scores, axis=1)

    # Keep only the positions flagged as the first sub-token of a word.
    return result[np.asarray(mask) == 1]
61
+
62
# Label-name -> id mapping for the main prescription NER model.
# BIO scheme: B- marks the first word of an entity, I- a continuation,
# O means outside any entity.
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA': 3,
            'I-FORMA_FARMA': 4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

# Label-name -> id mapping for the fine-grained ADMIN model, which re-labels
# the span tagged B-/I-ADMIN by the main model into quantity (CANT),
# unit (UND) and administration route (VIA_ADMIN).
admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND': 3,
    'I-UND': 4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}
84
+
85
+
86
def cargar_modelo(admin=False, verbose=False):
    """Load the tokenizer+model pair, preferring a locally saved copy.

    Arguments:
        admin: load the fine-grained ADMIN model instead of the main one.
        verbose: print where the model is loaded from.

    Returns:
        (tokenizer, model) tuple. When downloaded from the HuggingFace hub,
        both are also saved locally under `folder` for the next run.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"

    n_labels = len(admin_ner_dict) if admin else len(ner_dict)

    # Check both known locations for a local copy (the original duplicated
    # the whole load branch for each path).
    local = next((d for d in ('../modelos/' + folder, 'modelos/' + folder) if os.path.isdir(d)), None)

    if local is not None:
        if verbose:
            print("Cargando modelo guardado localmente")
        tokenizer = AutoTokenizer.from_pretrained(local)
        model = AutoModelForTokenClassification.from_pretrained(local, num_labels=n_labels, ignore_mismatched_sizes=True)
    else:
        if verbose:
            print("Descargando modelo de repositorio HuggingFace")
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        tokenizer.save_pretrained(folder)
        model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
        model.save_pretrained(folder)

    return tokenizer, model
113
+
114
+
115
# Load both model variants once at module import time so callers (e.g. the
# Streamlit app) reuse the same instances across requests.
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
117
+
118
+
119
def get_admin(entidades: list, texto: str) -> tuple:
    """Return the substring labelled as ADMIN and the word indices where it occurs.

    Arguments:
        entidades: entity label per word (aligned with whitespace split of texto).
        texto: the original text.

    Returns:
        (admin_substring, indices) — the original annotation said `-> str`,
        but the function has always returned this 2-tuple.
    """
    indices = [i for i, ent in enumerate(entidades) if ent in ('B-ADMIN', 'I-ADMIN')]
    # Use split() (not split(' ')): the labels were produced from a
    # whitespace split, so split(' ') would misalign indices whenever the
    # text contains consecutive spaces.
    palabras = texto.split()
    return ' '.join(palabras[i] for i in indices), indices
125
+
126
+
127
+ def render_pandas(ents,text_list):
128
+ data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''}
129
+
130
+ for i, ent in enumerate(ents):
131
+ if '-ACTIVE_PRINCIPLE' in ent:
132
+ data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
133
+ elif '-FORMA_FARMA' in ent:
134
+ data['FORMA_FARMA'] += ' ' + text_list[i]
135
+ elif '-CANT' in ent:
136
+ data['CANT-ADMIN'] += ' ' + text_list[i]
137
+ elif '-UND' in ent:
138
+ data['UND-ADMIN'] += ' ' + text_list[i]
139
+ elif '-VIA_ADMIN' in ent:
140
+ data['VIA-ADMIN'] += ' ' + text_list[i]
141
+ elif '-PERIODICITY' in ent:
142
+ data['PERIODICITY'] += ' ' + text_list[i]
143
+ elif '-DURATION' in ent:
144
+ data['DURATION'] += ' ' + text_list[i]
145
+
146
+ df = pd.DataFrame([data])
147
+ return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
148
+
149
+
150
def etiquetar(texto, pandas=True):
    """Tag a prescription text with named entities.

    First pass uses the main model; the span labelled ADMIN is then re-run
    through the fine-grained ADMIN model and its labels substituted in place.

    Arguments:
        texto: prescription text (whitespace-tokenized internally).
        pandas: when True return a styled pandas DataFrame, otherwise the
            raw list of labels, one per word.
    """
    entidades = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    # Second pass: refine the coarse ADMIN span into CANT/UND/VIA_ADMIN.
    admin_text, indices = get_admin(entidades, texto)
    refinadas = map_entities(eval_text(admin_text, tokenizer_admin, model_admin), admin_ner_dict)
    for i, etiqueta in enumerate(refinadas):
        entidades[indices[i]] = etiqueta
    if pandas:
        return render_pandas(entidades, texto.split())
    return entidades
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
streamlit
numpy
pandas
scipy
transformers
torch