"""Streamlit demo comparing two NER models (MendoBERT vs. the base model).

The page shows a text area plus two example buttons.  When text is present,
both token-classification pipelines are run on it, word pieces are merged
back into words, and the non-"O" entities of each model are shown as a JSON
list and as highlighted markup.
"""
import streamlit as st
from transformers import pipeline
from ipymarkup import format_span_box_markup

TOKENIZER_NAME = "indolem/indobert-base-uncased"

EXAMPLE_AGU = "Aspartylglucosaminuria (AGU) adalah gangguan metabolisme glikoprotein langka."
EXAMPLE_BRCA1 = "Mutasi germ - line dari gen BRCA1 membuat wanita cenderung mengalami kanker payudara dini dengan mengorbankan fungsi presumtif gen sebagai penekan tumor."

# Raw pipeline label prefixes and the tag each one is displayed as.
# Prefixes are tested in this order, mirroring an if/elif chain.
LABEL_TAGS = (("LABEL_0", "O"), ("LABEL_1", "B"), ("LABEL_2", "I"))

# Streamlit re-executes this script on every widget interaction, so the models
# must be cached or they would be reloaded each time.  `st.cache_resource`
# exists from Streamlit 1.18; older releases expose the same behaviour as
# `st.experimental_singleton`.
_cache_resource = getattr(st, "cache_resource", None) or st.experimental_singleton


@_cache_resource
def _load_ner_pipeline(model_path: str):
    """Build (once per process) the NER pipeline for the model at *model_path*."""
    return pipeline("ner", model=model_path, tokenizer=TOKENIZER_NAME)


def _merge_wordpieces(tokens) -> list:
    """Merge "##"-prefixed continuation pieces into the preceding word.

    Each item of *tokens* is a dict with at least the keys "word", "start",
    "entity", "index" and "score".  A merged word keeps the entity, index and
    score of its first piece; only its "end" and "word" are extended.
    """
    words = []
    for token in tokens:
        piece = token["word"].replace("##", "")
        end = token["start"] + len(piece)
        is_continuation = token["word"].startswith("##")
        if is_continuation and words:
            words[-1]["end"] = end
            words[-1]["word"] += piece
            continue
        words.append({
            'start': token["start"],
            'end': end,
            'entity': token["entity"],
            'index': token["index"],
            'score': token["score"],
            # A continuation piece with nothing before it starts a new word
            # (instead of raising IndexError); store it without the marker.
            'word': piece if is_continuation else token["word"],
        })
    return words


def _tag_for_label(label: str) -> str:
    """Map a raw "LABEL_n" pipeline label to "O"/"B"/"I"; pass others through."""
    for prefix, tag in LABEL_TAGS:
        if label.startswith(prefix):
            return tag
    return label


def _analyze(ner_pipeline, text: str) -> tuple:
    """Run *ner_pipeline* on *text*.

    Returns a pair ``(summaries, html)``: one summary string per detected
    entity word, and the span-box markup of *text* joined into one string.
    """
    spans = []
    summaries = []
    for word in _merge_wordpieces(ner_pipeline(text)):
        tag = _tag_for_label(word["entity"])
        if tag.startswith("O"):
            continue  # outside any entity: nothing to report
        start, end, score = word["start"], word["end"], word["score"]
        spans.append((start, end, tag))
        summaries.append(
            f"Entity: {tag}, Start:{start}, End:{end}, word:{text[start:end]}, score:{score}"
        )
    html = ''.join(format_span_box_markup(text, spans))
    return summaries, html


def _show_results(title: str, summaries: list, html: str) -> None:
    """Render one model's section: heading, entity list and highlighted text."""
    st.subheader(title)
    st.json(summaries)
    st.markdown(html, unsafe_allow_html=True)


# Load the pre-trained NER models
model = _load_ner_pipeline("/home/user/app/mendobert/")
basemodel = _load_ner_pipeline("/home/user/app/base-model/")

st.title(':blue[MendoBERT] - Named Entity Recognition :sunglasses:')

# 'options' is the key of the text area below; the example buttons fill the
# text area by writing to this session-state entry.
if 'options' not in st.session_state:
    st.session_state['options'] = ""


def button1_callback():
    """Put the first example sentence into the text area."""
    st.session_state['options'] = EXAMPLE_AGU


def button2_callback():
    """Put the second example sentence into the text area."""
    st.session_state['options'] = EXAMPLE_BRCA1


# Reserve the slot for the text area now so that it is displayed above the
# example buttons even though it is created after them.
placeholder = st.empty()

st.caption('_Examples_')
st.button(EXAMPLE_AGU, use_container_width=True, on_click=button1_callback)
st.button(EXAMPLE_BRCA1, use_container_width=True, on_click=button2_callback)

with placeholder:
    text = st.text_area('Enter some text: ', key='options')

if text:
    # Run both models before rendering anything.
    mendo, htmlMendo = _analyze(model, text)
    base, htmlBase = _analyze(basemodel, text)

    _show_results('MendoBERT', mendo, htmlMendo)
    _show_results('IndoLEM', base, htmlBase)

    st.write("\n")
    st.info("'B' means Beginning of an entity, 'I' means Inside of an entity", icon="ℹ️")

# Trailing spacer at the bottom of the page.
st.write("\n\n")