# NOTE(review): removed page-scrape artifact lines ("Spaces:", "Runtime error")
# that were not part of the original script.
"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)""" | |
import os | |
import sys | |
import numpy as np | |
from scipy.special import softmax | |
sys.path.append('../modelos') | |
from transformers import AutoModelForTokenClassification, AutoTokenizer | |
import pandas as pd | |
def map_entities(y_pred,map_dict,return_type='list'):
    """Translate predicted label ids back to their entity names.

    Arguments:
        y_pred: iterable of integer label ids.
        map_dict: dict mapping entity name -> label id (inverted here).
        return_type: 'list' for a Python list; anything else yields a
            numpy array instead.
    """
    id_to_entity = {label_id: entity for entity, label_id in map_dict.items()}
    decoded = [id_to_entity[label_id] for label_id in y_pred]
    return decoded if return_type == 'list' else np.array(decoded)
def word_ids_method(text,tokenizer):
    """Build a mask selecting the first sub-token (subword) of every word.

    The first subword of each word acts as the word's representative: its
    predicted label is taken as the label of the whole word.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments:
        text: str or List[str] -- text to tokenize; a plain string is
            whitespace-split into words first.
        tokenizer: HuggingFace fast tokenizer (must support ``word_ids``).

    Returns:
        List[int] -- 1 for the first sub-token of each word, 0 for special
        tokens and continuation sub-tokens.
    """
    words = text if isinstance(text, list) else text.split()
    encoding = tokenizer([words], truncation=True, is_split_into_words=True)
    mask = []
    last_word_idx = None
    for word_idx in encoding.word_ids(batch_index=0):
        # Special tokens have no word id (None); continuation subwords
        # repeat the previous word id — only the first one gets a 1.
        is_first = word_idx is not None and word_idx != last_word_idx
        mask.append(1 if is_first else 0)
        last_word_idx = word_idx
    return mask
def eval_text(text,tokenizer,model):
    """Run token classification over a text and return one label id per word.

    Arguments:
        text: str or List[str] -- text to tag (string or pre-split words).
        tokenizer: HuggingFace fast tokenizer matching ``model``.
        model: HuggingFace token-classification model.

    Returns:
        np.ndarray of integer label ids, one per word (only the first
        sub-token of each word is kept, per ``word_ids_method``).
    """
    first_subtoken_mask = np.asarray(word_ids_method(text, tokenizer), dtype=bool)
    encoded_input = tokenizer(text, return_tensors='pt',
                              is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    logits = output[0][0].detach().numpy()
    # BUGFIX: softmax must be applied per token (axis=1). The original call
    # normalized over the whole flattened matrix, which distorts the
    # probabilities (argmax was unaffected, but the scores were wrong).
    probs = softmax(logits, axis=1)
    predictions = np.argmax(probs, axis=1)
    return predictions[first_subtoken_mask]
# Label map for the main prescription NER model: entity name -> label id,
# BIO scheme (B- = beginning of entity, I- = inside, O = outside).
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }
# Label map for the second-stage model that refines ADMIN spans into
# quantity (CANT), unit (UND) and administration route (VIA_ADMIN).
admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND':3,
    'I-UND':4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}
def cargar_modelo(admin=False,verbose=False):
    """Load tokenizer and model from a local folder if present, else from the
    HuggingFace hub (saving a local copy afterwards).

    Arguments:
        admin: if True load the ADMIN sub-entity model, else the main
            prescription model.
        verbose: print where the model is being loaded from.

    Returns:
        (tokenizer, model) tuple.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"
    n_labels = len(admin_ner_dict) if admin else len(ner_dict)
    # Try local copies first (the original duplicated this branch for the
    # two possible relative locations; collapsed into one loop).
    for local_dir in ('../modelos/' + folder, 'modelos/' + folder):
        if os.path.isdir(local_dir):
            if verbose:
                print("Cargando modelo guardado localmente")
            tokenizer = AutoTokenizer.from_pretrained(local_dir)
            model = AutoModelForTokenClassification.from_pretrained(
                local_dir, num_labels=n_labels, ignore_mismatched_sizes=True)
            return tokenizer, model
    # No local copy: download from the hub and cache it locally.
    if verbose:
        print("Descargando modelo de repositorio HuggingFace")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
    model.save_pretrained(folder)
    return tokenizer, model
# Module-level model instances loaded at import time: the main prescription
# NER model and the ADMIN sub-entity model used by etiquetar().
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
def get_admin(entidades:list,texto:str) -> tuple:
    """Extract the ADMIN-labelled portion of a text.

    BUGFIX: the return annotation said ``str`` but the function has always
    returned a 2-tuple; annotation corrected.

    Arguments:
        entidades: BIO labels, one per whitespace token of ``texto``.
        texto: original text (tokens separated by single spaces).

    Returns:
        (substring, indices): the space-joined tokens labelled B-ADMIN or
        I-ADMIN, and the token positions where they occur.
    """
    indices = [i for i, ent in enumerate(entidades) if ent in ('B-ADMIN', 'I-ADMIN')]
    admin_tokens = [token for i, token in enumerate(texto.split(' ')) if i in indices]
    return ' '.join(admin_tokens), indices
def render_pandas(ents,text_list): | |
data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''} | |
for i, ent in enumerate(ents): | |
if '-ACTIVE_PRINCIPLE' in ent: | |
data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i] | |
elif '-FORMA_FARMA' in ent: | |
data['FORMA_FARMA'] += ' ' + text_list[i] | |
elif '-CANT' in ent: | |
data['CANT-ADMIN'] += ' ' + text_list[i] | |
elif '-UND' in ent: | |
data['UND-ADMIN'] += ' ' + text_list[i] | |
elif '-VIA_ADMIN' in ent: | |
data['VIA-ADMIN'] += ' ' + text_list[i] | |
elif '-PERIODICITY' in ent: | |
data['PERIODICITY'] += ' ' + text_list[i] | |
elif '-DURATION' in ent: | |
data['DURATION'] += ' ' + text_list[i] | |
df = pd.DataFrame([data]) | |
return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index') | |
def etiquetar(texto,pandas=True):
    """Tag a prescription text with the two NER models.

    A first pass labels the main entities; the span labelled ADMIN is then
    re-tagged with the ADMIN-specific model and its finer labels (CANT,
    UND, VIA_ADMIN) are spliced back into the original label sequence.

    Arguments:
        texto: prescription text (string).
        pandas: if True return a styled pandas table, else the label list.
    """
    entidades = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    admin_text, indices = get_admin(entidades, texto)
    etiquetas_admin = map_entities(
        eval_text(admin_text, tokenizer_admin, model_admin), admin_ner_dict)
    # Overwrite the coarse ADMIN labels with the fine-grained ones.
    for i, etiqueta in enumerate(etiquetas_admin):
        entidades[indices[i]] = etiqueta
    return render_pandas(entidades, texto.split()) if pandas else entidades