"""Streamlit app that runs a Hugging Face NER pipeline on user-supplied text.

The user picks a model checkpoint and an aggregation strategy in the sidebar,
selects or types a text, and presses "Run".  The recognised entities are shown
as a table and as a displaCy-style highlighted rendering of the text.
"""

from transformers import (
    pipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DebertaV2Tokenizer,
    DebertaV2Model,
)
import sentencepiece
import streamlit as st
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np

# Sample texts offered in the "Select from Examples" input mode.
example_list = [
    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
    """Hello World""",
]

# Must stay the first Streamlit call of the script.
st.set_page_config(layout="wide", page_title="Vocabulary Categorizer")

st.title("Vocabulary Categorizer")
st.write("This application identifies, highlights and categories nouns.")

# NOTE(review): 'xlm-roberta-large-xnli' looks like an NLI (sequence
# classification) checkpoint; loading it via AutoModelForTokenClassification
# presumably yields an untrained token head -- confirm this is intended.
model_list = [
    'xlm-roberta-large-finetuned-conll03-english',
    'xlm-roberta-large-xnli',
]

# --- Sidebar: model selection ------------------------------------------------
st.sidebar.header("Vocabulary categorizer")
model_checkpoint = st.sidebar.radio("", model_list)
st.sidebar.write(
    "Which model highlights the most vocabulary words? "
    "Which model highlights the most accurately?"
)
st.sidebar.write("")

# --- Sidebar: aggregation strategy ------------------------------------------
xlm_agg_strategy_info = (
    "'aggregation_strategy' can be selected as 'simple' or 'none' for 'xlm-roberta'."
)

st.sidebar.header("Select Aggregation Strategy Type")
# Every entry of `model_list` offers the same two strategies, so a single code
# path is enough and `aggregation` is always bound.
aggregation = st.sidebar.radio("", ('simple', 'none'))
st.sidebar.write(xlm_agg_strategy_info)
st.sidebar.write("")

# --- Main area: text input ---------------------------------------------------
st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text'))
if input_method == 'Select from Examples':
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
else:
    # The radio has only two options, so this is "Write or Paste New Text".
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)


# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
# `st.cache_resource` / `st.cache_data` -- confirm against the pinned version.
@st.cache(allow_output_mutation=True)
def setModel(model_checkpoint, aggregation):
    """Build the token-classification pipeline for the given checkpoint.

    Args:
        model_checkpoint: Model identifier passed to ``from_pretrained``.
        aggregation: Value forwarded as the pipeline's ``aggregation_strategy``.

    Returns:
        A ``transformers`` ``'ner'`` pipeline using the loaded model and
        tokenizer.
    """
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    return pipeline(
        'ner',
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy=aggregation,
    )


@st.cache(allow_output_mutation=True)
def get_html(html: str):
    """Return *html* with newlines replaced by spaces, placed in the wrapper."""
    WRAPPER = """
{}
"""
    html = html.replace("\n", " ")
    return WRAPPER.format(html)


# Labels displaCy is asked to highlight, and the colour used for each.
ENTITY_LABELS = ["PER", "LOC", "ORG", "MISC"]
ENTITY_COLORS = {
    'PER': '#85DCDF',
    'LOC': '#DF85DC',
    'ORG': '#DCDF85',
    'MISC': '#85ABDF',
}


def _strip_iob_prefix(label: str) -> str:
    """Return *label* without a leading ``B-`` / ``I-`` tag, if it has one."""
    if label[:2] in ("B-", "I-"):
        return label[2:]
    return label


def _show_entities(text: str, entities: list, aggregation: str) -> None:
    """Render the pipeline output as a table and as highlighted text.

    Args:
        text: The text that was passed to the pipeline.
        entities: Non-empty list of entity dicts returned by the pipeline.
        aggregation: The aggregation strategy used; it decides which key of
            each entity dict holds the label.
    """
    # Aggregated results carry the label under 'entity_group'; with
    # aggregation 'none' it is stored under 'entity'.
    label_key = "entity" if aggregation == "none" else "entity_group"

    df = pd.DataFrame.from_dict(entities)
    df_final = df[['word', label_key, 'score', 'start', 'end']]

    st.subheader("Recognized Entities")
    st.dataframe(df_final)

    st.subheader("Spacy Style Display")
    # displaCy is restricted to ENTITY_LABELS below.  Per-token labels
    # presumably carry an IOB prefix (e.g. 'I-PER') that would not match any
    # of them, so the prefix is dropped for the rendering only; the table
    # above keeps the raw label.
    spacy_display = {
        "ents": [
            {
                "start": entity["start"],
                "end": entity["end"],
                "label": _strip_iob_prefix(entity[label_key]),
            }
            for entity in entities
        ],
        "text": text,
        "title": None,
    }

    html = spacy.displacy.render(
        spacy_display,
        style="ent",
        minify=True,
        manual=True,
        options={"ents": ENTITY_LABELS, "colors": ENTITY_COLORS},
    )
    style = ""
    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)


run_button = st.button("Run", key=None)
if run_button:
    if not input_text or not input_text.strip():
        # Nothing to analyse; skip loading the model altogether.
        st.warning("Please enter some text before pressing Run.")
    else:
        ner_pipeline = setModel(model_checkpoint, aggregation)
        output = ner_pipeline(input_text)
        if not output:
            # An empty result would otherwise produce a column-less DataFrame
            # and a KeyError when selecting the display columns.
            st.info("No entities were recognized in the given text.")
        else:
            _show_entities(input_text, output, aggregation)