Spaces:

Wootang01
/

vocabulary_categorizer

Runtime error

App Files Files Community

vocabulary_categorizer / app.py

Wootang01

Update app.py

e4016f5 over 2 years ago

raw

history blame

No virus

3.96 kB

	from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
	import sentencepiece
	import streamlit as st
	import pandas as pd
	import spacy
	from spacy import displacy
	import plotly.express as px
	import numpy as np

	# Sample sentences offered in the "Select from Examples" drop-down.
	example_list = [
	    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
	    "Hello World",
	]

	# Checkpoints the user can pick between in the sidebar.
	model_list = [
	    'xlm-roberta-large-finetuned-conll03-english',
	    'xlm-roberta-large',
	]

	# Page-level configuration and heading; set_page_config is issued before
	# any other Streamlit call in this script.
	st.set_page_config(layout="wide", page_title="Vocabulary Categorizer")
	st.title("Vocabulary Categorizer")
	st.write("This application identifies, highlights and categories nouns.")

	# Sidebar: model picker followed by two guiding questions and a spacer line.
	st.sidebar.header("Vocabulary categorizer")
	model_checkpoint = st.sidebar.radio("", model_list)
	for _sidebar_note in (
	    "Which model highlights the most vocabulary words? Which model highlights the most accurately?",
	    "",
	):
	    st.sidebar.write(_sidebar_note)

	# Help text shown under the aggregation-strategy picker.
	xlm_agg_strategy_info = "'aggregation_strategy' can be selected as 'simple' or 'none' for 'xlm-roberta'."

	st.sidebar.header("Select Aggregation Strategy Type")
	# Every checkpoint offered by this app exposes the same two strategies, so
	# the picker is rendered unconditionally. The former per-checkpoint
	# if/elif had identical bodies and left `aggregation` unbound for any
	# checkpoint that did not have its own branch.
	aggregation = st.sidebar.radio("", ('simple', 'none'))
	st.sidebar.write(xlm_agg_strategy_info)
	st.sidebar.write("")

	# Let the user either pick one of the bundled examples or type their own
	# text; both paths leave the text to analyse in `input_text`.
	st.subheader("Select Text Input Method")
	input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text'))
	if input_method == "Write or Paste New Text":
	    # Free-form entry: start from an empty text area.
	    st.subheader("Text to Run")
	    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
	elif input_method == 'Select from Examples':
	    # Example entry: the chosen example pre-fills an editable text area.
	    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
	    st.subheader("Text to Run")
	    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)

	@st.cache(allow_output_mutation=True)
	def setModel(model_checkpoint, aggregation):
	    """Build the 'ner' pipeline for a checkpoint.

	    Args:
	        model_checkpoint: Checkpoint identifier passed to both
	            ``from_pretrained`` calls.
	        aggregation: Value forwarded as the pipeline's
	            ``aggregation_strategy``.

	    Returns:
	        The object returned by ``transformers.pipeline('ner', ...)``.

	    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
	    """
	    # Dict literals evaluate in order, so the model is loaded first and the
	    # tokenizer second.
	    parts = {
	        "model": AutoModelForTokenClassification.from_pretrained(model_checkpoint),
	        "tokenizer": AutoTokenizer.from_pretrained(model_checkpoint),
	    }
	    return pipeline('ner', aggregation_strategy=aggregation, **parts)

	@st.cache(allow_output_mutation=True)
	def get_html(html: str):
	    """Wrap an HTML fragment in a bordered, horizontally scrollable ``<div>``.

	    Args:
	        html: Markup to embed; every newline in it is replaced by a space.

	    Returns:
	        The wrapper ``<div>`` markup containing the flattened fragment.
	    """
	    container_template = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
	    flattened = " ".join(html.split("\n"))
	    return container_template.format(flattened)

	def _display_label(raw_label):
	    """Return ``raw_label`` without a leading ``B-``/``I-`` marker, if any."""
	    marker, dash, remainder = raw_label.partition("-")
	    if dash and marker in ("B", "I"):
	        return remainder
	    return raw_label


	# Run the selected model on the current text and show the results, both as
	# a table and as displaCy-style highlighted text.
	Run_Button = st.button("Run", key=None)
	if Run_Button:
	    if not input_text.strip():
	        # Nothing to analyse: tell the user instead of invoking the model.
	        st.warning("Please enter some text before pressing Run.")
	    else:
	        ner_pipeline = setModel(model_checkpoint, aggregation)
	        output = ner_pipeline(input_text)

	        # The label lives under 'entity_group' when results are aggregated
	        # and under 'entity' when aggregation is 'none'.
	        label_key = "entity_group" if aggregation != "none" else "entity"

	        if not output:
	            # An empty result would give a DataFrame without columns, and
	            # selecting the expected columns from it raises KeyError.
	            st.info("No entities were recognized in the text.")
	        else:
	            cols_to_keep = ['word', label_key, 'score', 'start', 'end']
	            df = pd.DataFrame.from_dict(output)
	            df_final = df[cols_to_keep]

	            st.subheader("Recognized Entities")
	            st.dataframe(df_final)

	            st.subheader("Spacy Style Display")
	            # Input for displaCy's manual mode: the raw text plus character
	            # spans. Labels are reduced to their bare type (e.g. "I-PER" ->
	            # "PER") so they match the `ents` filter and colour map below;
	            # otherwise un-aggregated tags would not be highlighted.
	            spacy_display = {
	                "ents": [
	                    {
	                        "start": entity["start"],
	                        "end": entity["end"],
	                        "label": _display_label(entity[label_key]),
	                    }
	                    for entity in output
	                ],
	                "text": input_text,
	                "title": None,
	            }

	            entity_list = ["PER", "LOC", "ORG", "MISC"]
	            colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF'}
	            html = spacy.displacy.render(
	                spacy_display,
	                style="ent",
	                minify=True,
	                manual=True,
	                options={"ents": entity_list, "colors": colors},
	            )
	            style = "<style>mark.entity { display: inline-block }</style>"
	            st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)