"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)""" import os import sys import numpy as np from scipy.special import softmax sys.path.append('../modelos') from transformers import AutoModelForTokenClassification, AutoTokenizer import pandas as pd def map_entities(y_pred,map_dict,return_type='list'): inv_map = {v: k for k, v in map_dict.items()} if return_type == 'list': return [inv_map[y] for y in y_pred] else: return np.array([inv_map[y] for y in y_pred]) def word_ids_method(text,tokenizer): """Método que asigna el primer token (subword) de una palabra como representante La etiqueta de este token será la etiqueta de la palabra Método integrado en tokenize_and_align_labels Fuente: https://huggingface.co/docs/transformers/tasks/token_classification Argumentos text: str o List[str] texto a tokenizar, si """ if not isinstance(text,list): text = text.split() tokenized_inputs = tokenizer([text], truncation=True, is_split_into_words=True) mask = [] word_ids = tokenized_inputs.word_ids(batch_index=0) previous_word_idx = None for word_idx in word_ids: if word_idx is None: mask.append(0) elif word_idx != previous_word_idx: # Only label the first token of a given word. mask.append(1) else: mask.append(0) previous_word_idx = word_idx return mask def eval_text(text,tokenizer,model): """ Toma un texto (lista de palabras o string), un tokenizador y modelo de HuggingFace Retorna el output del modelo (ids de entidades) """ mask = word_ids_method(text,tokenizer) encoded_input = tokenizer(text,return_tensors='pt',is_split_into_words=isinstance(text,list)) output = model(**encoded_input) scores = output[0][0].detach().numpy() scores = softmax(scores) result = np.argmax(scores,axis=1) return result[mask==np.array(1)] ner_dict = {'O': 0, 'B-ACTIVE_PRINCIPLE': 1, 'I-ACTIVE_PRINCIPLE': 2, 'B-FORMA_FARMA':3, 'I-FORMA_FARMA':4, 'B-ADMIN': 5, 'I-ADMIN': 6, 'B-PERIODICITY': 7, 'I-PERIODICITY': 8, 'B-DURATION': 9, 'I-DURATION': 10 } admin_ner_dict = { 'O': 0, 'B-CANT': 1, 'I-CANT': 2, 'B-UND':3, 'I-UND':4, 'B-VIA_ADMIN': 5, 'I-VIA_ADMIN': 6 } def cargar_modelo(admin=False,verbose=False): MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas' folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription" n_labels = len(admin_ner_dict) if admin else len(ner_dict) if os.path.isdir('../modelos/' + folder): folder = '../modelos/' + folder if verbose: print("Cargando modelo guardado localmente") tokenizer = AutoTokenizer.from_pretrained(folder) model = AutoModelForTokenClassification.from_pretrained(folder, num_labels=n_labels,ignore_mismatched_sizes=True) elif os.path.isdir('modelos/' + folder): folder = 'modelos/' + folder if verbose: print("Cargando modelo guardado localmente") tokenizer = AutoTokenizer.from_pretrained(folder) model = AutoModelForTokenClassification.from_pretrained(folder, num_labels=n_labels,ignore_mismatched_sizes=True) else: if verbose: print("Descargando modelo de repositorio HuggingFace") tokenizer = AutoTokenizer.from_pretrained(MODEL) tokenizer.save_pretrained(folder) model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels,ignore_mismatched_sizes=True) model.save_pretrained(folder) return tokenizer, model tokenizer, model = cargar_modelo() tokenizer_admin, model_admin = cargar_modelo(admin=True) def get_admin(entidades:list,texto:str) -> str: """ Retorna un substring correspondiente a aquellas entidades etiquetadas con admin y los indices donde esto ocurre """ indices = [i for i, ent in enumerate(entidades) if ent == 'B-ADMIN' or ent == 'I-ADMIN'] return ' '.join([token for i, token in enumerate(texto.split(' ')) if i in indices]), indices def render_pandas(ents,text_list): data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''} for i, ent in enumerate(ents): if '-ACTIVE_PRINCIPLE' in ent: data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i] elif '-FORMA_FARMA' in ent: data['FORMA_FARMA'] += ' ' + text_list[i] elif '-CANT' in ent: data['CANT-ADMIN'] += ' ' + text_list[i] elif '-UND' in ent: data['UND-ADMIN'] += ' ' + text_list[i] elif '-VIA_ADMIN' in ent: data['VIA-ADMIN'] += ' ' + text_list[i] elif '-PERIODICITY' in ent: data['PERIODICITY'] += ' ' + text_list[i] elif '-DURATION' in ent: data['DURATION'] += ' ' + text_list[i] df = pd.DataFrame([data]) return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index') def etiquetar(texto,pandas=True): entidades = map_entities(eval_text(texto,tokenizer,model),ner_dict) admin_text, indices = get_admin(entidades,texto) entidades_admin = map_entities(eval_text(admin_text,tokenizer_admin,model_admin),admin_ner_dict) for i, ent_admin in enumerate(entidades_admin): entidades[indices[i]] = ent_admin if pandas: return render_pandas(entidades,texto.split()) else: return entidades