Wootang01's picture
Create app.py
3ec6971
raw
history blame
No virus
3.54 kB
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
import sentencepiece
import streamlit as st
import pandas as pd
import spacy
# Passages offered by the "Select from Examples" drop-down.
# NOTE(review): the list is empty in this revision, so the drop-down has
# nothing to offer — populate it with sample texts.
example_list = [
]

st.set_page_config(layout="wide")
st.title("Vocabulary Categorizer")

# Checkpoints the user can choose between in the sidebar.
# NOTE(review): setModel() loads every entry through transformers' Auto*
# classes; confirm 'spacy/en_core_web_sm' can actually be loaded that way.
model_list = ['spacy/en_core_web_sm',
              'xlm-roberta-large-finetuned-conll03-english']

st.sidebar.header("Select a vocabulary categorizer")
model_checkpoint = st.sidebar.radio("", model_list)
st.sidebar.write("Which model highlights the most vocabulary words? Which model highlights the most accurately?")
st.sidebar.write("")

xlm_agg_strategy_info = "'aggregation_strategy' can be selected as 'simple' or 'none' for 'xlm-roberta'."

st.sidebar.header("Select Aggregation Strategy Type")
if model_checkpoint == "xlm-roberta-large-finetuned-conll03-english":
    aggregation = st.sidebar.radio("", ('simple', 'none'))
    st.sidebar.write(xlm_agg_strategy_info)
else:
    # No selector is shown for the other checkpoints. Without a default,
    # `aggregation` would be undefined when "Run" is pressed and the script
    # would stop with a NameError.
    aggregation = 'simple'
st.sidebar.write("")

st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text'))
if input_method == 'Select from Examples':
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    # Fall back to an empty string if the selectbox yields nothing
    # (e.g. while example_list is empty) so the text area never shows a
    # placeholder such as "None".
    input_text = st.text_area("Selected Text", selected_text or "", height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
@st.cache(allow_output_mutation=True)
def setModel(model_checkpoint, aggregation):
    """Build the 'ner' pipeline for a checkpoint.

    Args:
        model_checkpoint: Checkpoint identifier handed to both
            ``AutoModelForTokenClassification.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.
        aggregation: Value forwarded to the pipeline as
            ``aggregation_strategy``.

    Returns:
        The object returned by ``transformers.pipeline`` for the 'ner' task.
    """
    # Load the classifier first, then its tokenizer, from the same checkpoint.
    token_classifier = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    ner = pipeline(
        'ner',
        model=token_classifier,
        tokenizer=checkpoint_tokenizer,
        aggregation_strategy=aggregation,
    )
    return ner
@st.cache(allow_output_mutation=True)
def get_html(html: str):
    """Wrap an HTML fragment in a bordered, horizontally scrollable <div>.

    Newline characters in ``html`` are replaced by single spaces before the
    fragment is placed inside the wrapper.

    Args:
        html: The HTML fragment to wrap.

    Returns:
        The wrapper markup with the flattened fragment inside it.
    """
    wrapper_template = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    # Splitting on "\n" and re-joining with " " swaps every newline for a space.
    flattened = " ".join(html.split("\n"))
    return wrapper_template.format(flattened)
# Run handler: on click, run the selected NER pipeline over the input text,
# show the recognized entities as a table, then render them inline with
# spaCy's displaCy entity visualizer.
Run_Button = st.button("Run", key=None)
if Run_Button and not (input_text or "").strip():
    # Nothing to analyse; avoid loading and running the model on blank input.
    st.warning("Please provide some text before pressing Run.")
elif Run_Button:
    ner_pipeline = setModel(model_checkpoint, aggregation)
    output = ner_pipeline(input_text)

    # Aggregated results are read via 'entity_group'; un-aggregated ('none')
    # results are read via 'entity'.
    label_key = "entity_group" if aggregation != "none" else "entity"

    st.subheader("Recognized Entities")
    if output:
        df = pd.DataFrame.from_dict(output)
        df_final = df[['word', label_key, 'score', 'start', 'end']]
        st.dataframe(df_final)
    else:
        # An empty result produces a DataFrame without columns, so selecting
        # the display columns would raise KeyError; report it instead.
        st.info("No entities were recognized in the text.")

    st.subheader("Spacy Style Display")
    # Input for displaCy's manual mode: the raw text plus character spans.
    spacy_display = {
        "ents": [
            {"start": entity["start"], "end": entity["end"], "label": entity[label_key]}
            for entity in output
        ],
        "text": input_text,
        "title": None,
    }

    # NOTE(review): with aggregation 'none' the labels may carry B-/I-
    # prefixes, which would not match the names in entity_list — confirm
    # whether such spans should be highlighted too.
    entity_list = ["PER", "LOC", "ORG", "MISC"]
    colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',}
    rendered = spacy.displacy.render(
        spacy_display,
        style="ent",
        minify=True,
        manual=True,
        options={"ents": entity_list, "colors": colors},
    )
    style = "<style>mark.entity { display: inline-block }</style>"
    st.write(f"{style}{get_html(rendered)}", unsafe_allow_html=True)