Wootang01's picture
Update app.py
e4016f5
raw
history blame
No virus
3.96 kB
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
import sentencepiece
import streamlit as st
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np
example_list = [
"Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
"""Hello World"""
]
st.set_page_config(layout="wide", page_title="Vocabulary Categorizer")
st.title("Vocabulary Categorizer")
st.write("This application identifies, highlights and categories nouns.")
model_list = ['xlm-roberta-large-finetuned-conll03-english', 'xlm-roberta-large']
st.sidebar.header("Vocabulary categorizer")
model_checkpoint = st.sidebar.radio("", model_list)
st.sidebar.write("Which model highlights the most vocabulary words? Which model highlights the most accurately?")
st.sidebar.write("")
xlm_agg_strategy_info = "'aggregation_strategy' can be selected as 'simple' or 'none' for 'xlm-roberta'."
st.sidebar.header("Select Aggregation Strategy Type")
if model_checkpoint == "xlm-roberta-large-finetuned-conll03-english":
aggregation = st.sidebar.radio("", ('simple', 'none'))
st.sidebar.write(xlm_agg_strategy_info)
st.sidebar.write("")
elif model_checkpoint == "xlm-roberta-large":
aggregation = st.sidebar.radio("", ('simple', 'none'))
st.sidebar.write(xlm_agg_strategy_info)
st.sidebar.write("")
st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text'))
if input_method == 'Select from Examples':
selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
st.subheader("Text to Run")
input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
st.subheader("Text to Run")
input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
@st.cache(allow_output_mutation=True)
def setModel(model_checkpoint, aggregation):
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)
@st.cache(allow_output_mutation=True)
def get_html(html: str):
WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
html = html.replace("\n", " ")
return WRAPPER.format(html)
Run_Button = st.button("Run", key=None)
if Run_Button == True:
ner_pipeline = setModel(model_checkpoint, aggregation)
output = ner_pipeline(input_text)
df = pd.DataFrame.from_dict(output)
if aggregation != "none":
cols_to_keep = ['word','entity_group','score','start','end']
else:
cols_to_keep = ['word','entity','score','start','end']
df_final = df[cols_to_keep]
st.subheader("Recognized Entities")
st.dataframe(df_final)
st.subheader("Spacy Style Display")
spacy_display = {}
spacy_display["ents"] = []
spacy_display["text"] = input_text
spacy_display["title"] = None
for entity in output:
if aggregation != "none":
spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
else:
spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity"]})
entity_list = ["PER", "LOC", "ORG", "MISC"]
colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',}
html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": entity_list, "colors": colors})
style = "<style>mark.entity { display: inline-block }</style>"
st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)