# entidades-prescripciones / predicciones.py
"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
import os
import sys
import numpy as np
from scipy.special import softmax
sys.path.append('../modelos')
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
def map_entities(y_pred, map_dict, return_type='list'):
    """Translate predicted label ids back to their entity-tag strings.

    Args:
        y_pred: iterable of integer label ids emitted by the model.
        map_dict: mapping of entity tag -> id (e.g. ``ner_dict``); it is
            inverted here to recover the tag for each id.
        return_type: ``'list'`` (default) returns a plain list; any other
            value returns a ``numpy.ndarray`` of the same tags.
    Returns:
        The decoded tags, as a list or numpy array.
    """
    id_to_tag = {idx: tag for tag, idx in map_dict.items()}
    tags = [id_to_tag[idx] for idx in y_pred]
    return tags if return_type == 'list' else np.array(tags)
def word_ids_method(text, tokenizer):
    """Build a 0/1 mask selecting the first sub-token of every word.

    The label of that first sub-token stands in for the whole word's label.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Args:
        text: str or List[str]; a string is split on whitespace first.
        tokenizer: HuggingFace tokenizer used to produce ``word_ids``.
    Returns:
        List[int] with 1 at positions of word-initial sub-tokens, 0 for
        special tokens and word-continuation sub-tokens.
    """
    words = text if isinstance(text, list) else text.split()
    encoded = tokenizer([words], truncation=True, is_split_into_words=True)
    mask = []
    previous = None
    for word_idx in encoded.word_ids(batch_index=0):
        # Special tokens map to None; continuations repeat the previous index.
        is_word_start = word_idx is not None and word_idx != previous
        mask.append(1 if is_word_start else 0)
        previous = word_idx
    return mask
def eval_text(text, tokenizer, model):
    """Run the token-classification model on a text.

    Args:
        text: List[str] (pre-split words) or str; list inputs are passed
            to the tokenizer with ``is_split_into_words=True``.
        tokenizer: HuggingFace tokenizer matching ``model``.
        model: HuggingFace ``AutoModelForTokenClassification``.
    Returns:
        numpy array of predicted label ids, one per word (only the first
        sub-token of each word is kept, via ``word_ids_method``).
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    # FIX: normalize per token (axis=1). The previous softmax(scores) with no
    # axis normalized over the entire matrix; argmax per row was unaffected
    # (exp is monotone), but the values were not per-token probabilities.
    scores = softmax(scores, axis=1)
    result = np.argmax(scores, axis=1)
    # Keep only word-initial sub-tokens (mask == 1).
    return result[np.array(mask) == 1]
# BIO tag -> integer id for the main prescription NER model
# (active principle, pharmaceutical form, admin, periodicity, duration).
# Must match the label order the model was trained with.
ner_dict = {'O': 0,
'B-ACTIVE_PRINCIPLE': 1,
'I-ACTIVE_PRINCIPLE': 2,
'B-FORMA_FARMA':3,
'I-FORMA_FARMA':4,
'B-ADMIN': 5,
'I-ADMIN': 6,
'B-PERIODICITY': 7,
'I-PERIODICITY': 8,
'B-DURATION': 9,
'I-DURATION': 10
}
# BIO tag -> id for the second-stage model that refines ADMIN spans into
# quantity (CANT), unit (UND) and administration route (VIA_ADMIN).
admin_ner_dict = {
'O': 0,
'B-CANT': 1,
'I-CANT': 2,
'B-UND':3,
'I-UND':4,
'B-VIA_ADMIN': 5,
'I-VIA_ADMIN': 6
}
def cargar_modelo(admin=False, verbose=False):
    """Load the tokenizer and model for prescription NER.

    Tries locally saved copies first ('../modelos/<folder>', then
    'modelos/<folder>'); if neither exists, downloads from the HuggingFace
    Hub and saves a local copy for the next run.

    Args:
        admin: if True, load the second-stage ADMIN model
            (labels from ``admin_ner_dict``) instead of the main one.
        verbose: print where the model is being loaded from.
    Returns:
        (tokenizer, model) tuple.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"
    n_labels = len(admin_ner_dict) if admin else len(ner_dict)
    # Probe both candidate local paths with one code path (the original
    # duplicated the identical loading logic in two branches).
    for local_dir in ('../modelos/' + folder, 'modelos/' + folder):
        if os.path.isdir(local_dir):
            if verbose:
                print("Cargando modelo guardado localmente")
            tokenizer = AutoTokenizer.from_pretrained(local_dir)
            model = AutoModelForTokenClassification.from_pretrained(local_dir, num_labels=n_labels, ignore_mismatched_sizes=True)
            return tokenizer, model
    if verbose:
        print("Descargando modelo de repositorio HuggingFace")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)  # cache locally for subsequent runs
    model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
    model.save_pretrained(folder)
    return tokenizer, model
# NOTE: both models are loaded at import time (may hit the network on the
# first run to download from the HuggingFace Hub).
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
def get_admin(entidades: list, texto: str) -> tuple:
    """Extract the ADMIN-tagged portion of a text.

    Args:
        entidades: list of BIO tags, one per whitespace-separated token of
            ``texto``.
        texto: the original text (tokens separated by single spaces).
    Returns:
        A 2-tuple ``(admin_text, indices)``: the space-joined tokens tagged
        B-ADMIN/I-ADMIN, and the token indices where those tags occur.
        (FIX: the annotation previously claimed ``-> str`` although the
        function has always returned a tuple.)
    """
    indices = [i for i, ent in enumerate(entidades) if ent in ('B-ADMIN', 'I-ADMIN')]
    keep = set(indices)  # O(1) membership instead of list scan per token
    admin_text = ' '.join(token for i, token in enumerate(texto.split(' ')) if i in keep)
    return admin_text, indices
def render_pandas(ents,text_list):
data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''}
for i, ent in enumerate(ents):
if '-ACTIVE_PRINCIPLE' in ent:
data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
elif '-FORMA_FARMA' in ent:
data['FORMA_FARMA'] += ' ' + text_list[i]
elif '-CANT' in ent:
data['CANT-ADMIN'] += ' ' + text_list[i]
elif '-UND' in ent:
data['UND-ADMIN'] += ' ' + text_list[i]
elif '-VIA_ADMIN' in ent:
data['VIA-ADMIN'] += ' ' + text_list[i]
elif '-PERIODICITY' in ent:
data['PERIODICITY'] += ' ' + text_list[i]
elif '-DURATION' in ent:
data['DURATION'] += ' ' + text_list[i]
df = pd.DataFrame([data])
return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
def etiquetar(texto, pandas=True):
    """Tag a prescription text with the two-stage NER pipeline.

    First the main model labels every word; then the words tagged ADMIN are
    re-run through the second-stage model, whose finer tags (CANT/UND/
    VIA_ADMIN) overwrite the coarse ADMIN tags in place.

    Args:
        texto: prescription text (string).
        pandas: if True (default) return a styled pandas table via
            ``render_pandas``; otherwise return the raw list of tags.
    """
    etiquetas = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    texto_admin, posiciones = get_admin(etiquetas, texto)
    etiquetas_admin = map_entities(eval_text(texto_admin, tokenizer_admin, model_admin), admin_ner_dict)
    # Replace each coarse ADMIN tag with its refined second-stage tag.
    for j, etiqueta_fina in enumerate(etiquetas_admin):
        etiquetas[posiciones[j]] = etiqueta_fina
    if not pandas:
        return etiquetas
    return render_pandas(etiquetas, texto.split())