Spaces:
Runtime error
Runtime error
File size: 5,779 Bytes
7c26359 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
"""Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
import os
import sys
import numpy as np
from scipy.special import softmax
sys.path.append('../modelos')
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
def map_entities(y_pred, map_dict, return_type='list'):
    """Translate predicted label ids back into entity tag names.

    Arguments
        y_pred: iterable of integer label ids produced by the model
        map_dict: dict mapping entity tag -> label id (inverted here)
        return_type: 'list' for a plain list, anything else for a numpy array
    """
    id_to_tag = {label_id: tag for tag, label_id in map_dict.items()}
    decoded = [id_to_tag[label_id] for label_id in y_pred]
    if return_type == 'list':
        return decoded
    return np.array(decoded)
def word_ids_method(text, tokenizer):
    """Build a 0/1 mask selecting the first sub-word token of each word.

    The first token of a word acts as its representative: that token's
    predicted label is taken as the label of the whole word.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments
        text: str or List[str]
            text to tokenize; a plain string is split on whitespace first
        tokenizer: HuggingFace fast tokenizer (must support word_ids)

    Returns
        List[int] with one entry per token: 1 for the first token of each
        word, 0 for continuation sub-words and special tokens.
    """
    words = text if isinstance(text, list) else text.split()
    encoding = tokenizer([words], truncation=True, is_split_into_words=True)
    mask = []
    last_word = None
    for word_idx in encoding.word_ids(batch_index=0):
        # Special tokens have no word id; continuation sub-words repeat the id.
        is_first = word_idx is not None and word_idx != last_word
        mask.append(1 if is_first else 0)
        last_word = word_idx
    return mask
def eval_text(text, tokenizer, model):
    """Run token-classification inference on a single text.

    Arguments
        text: str or List[str] (a list means the text is already split
            into words)
        tokenizer / model: HuggingFace tokenizer and token-classification
            model pair

    Returns
        np.ndarray of predicted label ids, one per word: only the first
        sub-word token of each word is kept (see word_ids_method).
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    # Normalize per token (axis=1). The previous call omitted `axis`, which
    # makes scipy softmax over the flattened array: argmax is unchanged, but
    # the scores were not valid per-token probabilities.
    scores = softmax(scores, axis=1)
    result = np.argmax(scores, axis=1)
    # Keep only the tokens flagged as word representatives.
    return result[np.array(mask) == 1]
# BIO tag -> label id for the main prescription model: active principle,
# pharmaceutical form, administration span, periodicity and duration.
ner_dict = {'O': 0,
        'B-ACTIVE_PRINCIPLE': 1,
        'I-ACTIVE_PRINCIPLE': 2,
        'B-FORMA_FARMA':3,
        'I-FORMA_FARMA':4,
        'B-ADMIN': 5,
        'I-ADMIN': 6,
        'B-PERIODICITY': 7,
        'I-PERIODICITY': 8,
        'B-DURATION': 9,
        'I-DURATION': 10
    }
# BIO tag -> label id for the second-stage ADMIN model, which re-labels the
# administration span into quantity, unit and route of administration.
admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND':3,
    'I-UND':4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}
def cargar_modelo(admin=False, verbose=False):
    """Load a tokenizer/model pair, preferring a local copy over the HF Hub.

    Arguments
        admin: if True, load the second-stage ADMIN model; otherwise the
            main prescription model.
        verbose: if True, print where the model is being loaded from.

    Returns
        (tokenizer, model) tuple. When downloaded from the Hub, both are
        saved into `folder` so later calls can reuse the local copy.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"
    n_labels = len(admin_ner_dict) if admin else len(ner_dict)
    # The two local-path branches were duplicated; check candidates in order.
    for local_dir in ('../modelos/' + folder, 'modelos/' + folder):
        if os.path.isdir(local_dir):
            if verbose:
                print("Cargando modelo guardado localmente")
            tokenizer = AutoTokenizer.from_pretrained(local_dir)
            model = AutoModelForTokenClassification.from_pretrained(local_dir, num_labels=n_labels, ignore_mismatched_sizes=True)
            return tokenizer, model
    if verbose:
        print("Descargando modelo de repositorio HuggingFace")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)
    model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
    model.save_pretrained(folder)
    return tokenizer, model
# Module-level side effect: both pipeline models are loaded (or downloaded)
# at import time so the tagging functions below can use them directly.
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
def get_admin(entidades: list, texto: str) -> tuple:
    """Extract the sub-string of `texto` tagged as an ADMIN span.

    Arguments
        entidades: list of BIO entity tags, aligned with texto.split(' ')
        texto: the original text, words separated by single spaces

    Returns
        (admin_text, indices): the space-joined words tagged B-ADMIN or
        I-ADMIN, and the word positions where those tags occur.
        (The previous `-> str` annotation was wrong: a tuple is returned.)
    """
    indices = [i for i, ent in enumerate(entidades) if ent in ('B-ADMIN', 'I-ADMIN')]
    admin_text = ' '.join(token for i, token in enumerate(texto.split(' ')) if i in indices)
    return admin_text, indices
def render_pandas(ents,text_list):
data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''}
for i, ent in enumerate(ents):
if '-ACTIVE_PRINCIPLE' in ent:
data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
elif '-FORMA_FARMA' in ent:
data['FORMA_FARMA'] += ' ' + text_list[i]
elif '-CANT' in ent:
data['CANT-ADMIN'] += ' ' + text_list[i]
elif '-UND' in ent:
data['UND-ADMIN'] += ' ' + text_list[i]
elif '-VIA_ADMIN' in ent:
data['VIA-ADMIN'] += ' ' + text_list[i]
elif '-PERIODICITY' in ent:
data['PERIODICITY'] += ' ' + text_list[i]
elif '-DURATION' in ent:
data['DURATION'] += ' ' + text_list[i]
df = pd.DataFrame([data])
return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
def etiquetar(texto, pandas=True):
    """Tag a prescription text with the two-stage NER pipeline.

    The main model labels the whole text first; the words it tagged as
    ADMIN are then re-labelled in finer detail (quantity/unit/route) by
    the second-stage model, and those tags replace the coarse ones.

    Arguments
        texto: prescription text, words separated by spaces
        pandas: if True return a styled pandas table, otherwise the raw
            list of entity tags.
    """
    etiquetas = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    texto_admin, posiciones = get_admin(etiquetas, texto)
    detalle = map_entities(eval_text(texto_admin, tokenizer_admin, model_admin), admin_ner_dict)
    # Overwrite the coarse ADMIN tags in place with the fine-grained ones.
    for pos, etiqueta in zip(posiciones, detalle):
        etiquetas[pos] = etiqueta
    return render_pandas(etiquetas, texto.split()) if pandas else etiquetas
|