ccarvajal committed on
Commit
7c26359
1 Parent(s): ec8d4e9

Add application file

Browse files
Files changed (3) hide show
  1. app.py +14 -0
  2. predicciones.py +159 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end: tag a medical prescription and show one NER label per word."""
import streamlit as st
from predicciones import etiquetar

example = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

st.title("Reconocimiento de entidades en prescripciones médicas")

text = st.text_input('Insertar prescripción', example)

# etiquetar(pandas=False) returns one entity label per whitespace-separated word.
entidades = etiquetar(text, pandas=False)
# Split once instead of re-splitting the text on every loop iteration.
palabras = text.split()
for palabra, ent in zip(palabras, entidades):
    st.write(palabra, ent)
predicciones.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Script que ejecuta las predicciones para un modelo (solo HF por el momento)"""
2
+ import os
3
+ import sys
4
+ import numpy as np
5
+ from scipy.special import softmax
6
+ sys.path.append('../modelos')
7
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
8
+ import pandas as pd
9
+
10
def map_entities(y_pred, map_dict, return_type='list'):
    """Translate predicted label ids back to entity-name strings.

    Arguments:
        y_pred: iterable of integer label ids produced by the model.
        map_dict: mapping entity name -> label id (inverted internally).
        return_type: 'list' returns a plain Python list; any other value
            returns the same names as a numpy array.
    """
    id_to_name = {label_id: name for name, label_id in map_dict.items()}
    names = [id_to_name[label_id] for label_id in y_pred]
    if return_type == 'list':
        return names
    return np.array(names)
16
+
17
+
18
def word_ids_method(text, tokenizer):
    """Build a 0/1 mask marking the first sub-token (subword) of each word.

    The first token of a word acts as the word's representative: its label
    is taken as the label of the whole word. Special tokens (word id None)
    and continuation subwords are masked with 0.
    Source: https://huggingface.co/docs/transformers/tasks/token_classification

    Arguments:
        text: str or List[str] — text to tokenize; a plain string is split
            on whitespace first.
    """
    words = text if isinstance(text, list) else text.split()

    encoding = tokenizer([words], truncation=True, is_split_into_words=True)
    flags = []
    last_idx = None
    for idx in encoding.word_ids(batch_index=0):
        # 1 only on the first subword of a new word; 0 for specials/repeats.
        flags.append(1 if idx is not None and idx != last_idx else 0)
        last_idx = idx

    return flags
46
+
47
+
48
def eval_text(text, tokenizer, model):
    """Run a HuggingFace token-classification model over one text.

    Arguments:
        text: list of words or a plain string.
        tokenizer, model: HuggingFace tokenizer and model pair.

    Returns:
        numpy array of predicted label ids, one per input word (only the
        first sub-token of each word is kept, per word_ids_method).
    """
    mask = word_ids_method(text, tokenizer)
    encoded_input = tokenizer(text, return_tensors='pt', is_split_into_words=isinstance(text, list))
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    # Normalize per token: without axis=1, scipy's softmax normalizes over the
    # entire (seq_len x n_labels) matrix, producing invalid per-token
    # probabilities (the argmax happens to be unaffected, but the fix keeps
    # the scores meaningful).
    scores = softmax(scores, axis=1)
    result = np.argmax(scores, axis=1)

    # Keep only the positions flagged as the first sub-token of a word.
    return result[np.asarray(mask) == 1]
61
+
62
# Label-name -> id mapping for the main prescription NER model.
# BIO scheme: B- marks the first word of an entity, I- a continuation,
# O means outside any entity.
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA': 3,
            'I-FORMA_FARMA': 4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

# Label-name -> id mapping for the fine-grained ADMIN model, which re-labels
# the span tagged B-/I-ADMIN by the main model into quantity (CANT),
# unit (UND) and administration route (VIA_ADMIN).
admin_ner_dict = {
    'O': 0,
    'B-CANT': 1,
    'I-CANT': 2,
    'B-UND': 3,
    'I-UND': 4,
    'B-VIA_ADMIN': 5,
    'I-VIA_ADMIN': 6
}
84
+
85
+
86
def cargar_modelo(admin=False, verbose=False):
    """Load the tokenizer+model pair, preferring a locally saved copy.

    Arguments:
        admin: load the fine-grained ADMIN model instead of the main one.
        verbose: print where the model is loaded from.

    Returns:
        (tokenizer, model) tuple. When downloaded from the HuggingFace hub,
        both are also saved locally under `folder` for the next run.
    """
    MODEL = 'ccarvajal/beto-prescripciones-medicas-ADMIN' if admin else 'ccarvajal/beto-prescripciones-medicas'
    folder = "bert-clinical-scratch-wl-es-NER-ADMIN" if admin else "bert-clinical-scratch-wl-es-NER-prescription"

    n_labels = len(admin_ner_dict) if admin else len(ner_dict)

    # Check both known locations for a local copy (the original duplicated
    # the whole load branch for each path).
    local = next((d for d in ('../modelos/' + folder, 'modelos/' + folder) if os.path.isdir(d)), None)

    if local is not None:
        if verbose:
            print("Cargando modelo guardado localmente")
        tokenizer = AutoTokenizer.from_pretrained(local)
        model = AutoModelForTokenClassification.from_pretrained(local, num_labels=n_labels, ignore_mismatched_sizes=True)
    else:
        if verbose:
            print("Descargando modelo de repositorio HuggingFace")
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        tokenizer.save_pretrained(folder)
        model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=n_labels, ignore_mismatched_sizes=True)
        model.save_pretrained(folder)

    return tokenizer, model
113
+
114
+
115
# Load both model variants once at module import time so callers (e.g. the
# Streamlit app) reuse the same instances across requests.
tokenizer, model = cargar_modelo()
tokenizer_admin, model_admin = cargar_modelo(admin=True)
117
+
118
+
119
def get_admin(entidades: list, texto: str) -> tuple:
    """Return the substring labelled as ADMIN and the word indices where it occurs.

    Arguments:
        entidades: entity label per word (aligned with whitespace split of texto).
        texto: the original text.

    Returns:
        (admin_substring, indices) — the original annotation said `-> str`,
        but the function has always returned this 2-tuple.
    """
    indices = [i for i, ent in enumerate(entidades) if ent in ('B-ADMIN', 'I-ADMIN')]
    # Use split() (not split(' ')): the labels were produced from a
    # whitespace split, so split(' ') would misalign indices whenever the
    # text contains consecutive spaces.
    palabras = texto.split()
    return ' '.join(palabras[i] for i in indices), indices
125
+
126
+
127
+ def render_pandas(ents,text_list):
128
+ data = {'ACTIVE_PRINCIPLE':'','FORMA_FARMA':'','CANT-ADMIN':'','UND-ADMIN':'','VIA-ADMIN':'','PERIODICITY':'','DURATION':''}
129
+
130
+ for i, ent in enumerate(ents):
131
+ if '-ACTIVE_PRINCIPLE' in ent:
132
+ data['ACTIVE_PRINCIPLE'] += ' ' + text_list[i]
133
+ elif '-FORMA_FARMA' in ent:
134
+ data['FORMA_FARMA'] += ' ' + text_list[i]
135
+ elif '-CANT' in ent:
136
+ data['CANT-ADMIN'] += ' ' + text_list[i]
137
+ elif '-UND' in ent:
138
+ data['UND-ADMIN'] += ' ' + text_list[i]
139
+ elif '-VIA_ADMIN' in ent:
140
+ data['VIA-ADMIN'] += ' ' + text_list[i]
141
+ elif '-PERIODICITY' in ent:
142
+ data['PERIODICITY'] += ' ' + text_list[i]
143
+ elif '-DURATION' in ent:
144
+ data['DURATION'] += ' ' + text_list[i]
145
+
146
+ df = pd.DataFrame([data])
147
+ return df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).hide(axis='index')
148
+
149
+
150
def etiquetar(texto, pandas=True):
    """Tag a prescription text with named entities.

    First pass uses the main model; the span labelled ADMIN is then re-run
    through the fine-grained ADMIN model and its labels substituted in place.

    Arguments:
        texto: prescription text (whitespace-tokenized internally).
        pandas: when True return a styled pandas DataFrame, otherwise the
            raw list of labels, one per word.
    """
    entidades = map_entities(eval_text(texto, tokenizer, model), ner_dict)
    # Second pass: refine the coarse ADMIN span into CANT/UND/VIA_ADMIN.
    admin_text, indices = get_admin(entidades, texto)
    refinadas = map_entities(eval_text(admin_text, tokenizer_admin, model_admin), admin_ner_dict)
    for i, etiqueta in enumerate(refinadas):
        entidades[indices[i]] = etiqueta
    if pandas:
        return render_pandas(entidades, texto.split())
    return entidades
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
streamlit
numpy
pandas
scipy
transformers
torch