# Third-party dependencies (import order kept as in the original script).
from transformers import (
    pipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DebertaV2Tokenizer,
    DebertaV2Model,
)
import sentencepiece
import streamlit as st
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np

# Sample passages the user can pick instead of typing their own text.
example_list = [
    """Hong Kong’s two-week flight ban has dashed the hopes of those planning family reunions as well as disrupted plans for incoming domestic helpers, with the Philippines, Britain and the United States among eight countries hit with tightened rules aimed at containing a Covid-19 surge.""",
    """From Friday (Jan 7), all bars and entertainment venues will close for two weeks, and restaurants have to stop dine-in after 6pm, Chief Executive Carrie Lam Cheng Yuet-ngor announced on Wednesday. """,
]

# Page-level configuration is issued before any other Streamlit call.
st.set_page_config(layout="wide", page_title="Vocabulary Categorizer")
st.title("Vocabulary Categorizer")
st.write("This application identifies, highlights and categorizes nouns.")

# Checkpoints offered in the sidebar model picker.
model_list = [
    'xlm-roberta-large-finetuned-conll03-english',
    'xlm-roberta-large',
]

st.sidebar.header("Vocabulary categorizer")
model_checkpoint = st.sidebar.radio("", model_list)
st.sidebar.write(
    "Which model highlights the most vocabulary words? "
    "Which model highlights the most accurately?"
)
st.sidebar.write("")

# Help text shown under the aggregation-strategy selector.
xlm_agg_strategy_info = (
    "'aggregation_strategy' can be selected as 'simple' or 'none' "
    "for 'xlm-roberta'."
)
st.sidebar.header("Select Aggregation Strategy Type") if model_checkpoint == "xlm-roberta-large-finetuned-conll03-english": aggregation = st.sidebar.radio("", ('simple', 'none')) st.sidebar.write(xlm_agg_strategy_info) st.sidebar.write("") elif model_checkpoint == "xlm-roberta-large": aggregation = st.sidebar.radio("", ('simple', 'none')) st.sidebar.write(xlm_agg_strategy_info) st.sidebar.write("") st.subheader("Select Text Input Method") input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text')) if input_method == 'Select from Examples': selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1) st.subheader("Text to Run") input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2) elif input_method == "Write or Paste New Text": st.subheader("Text to Run") input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2) @st.cache(allow_output_mutation=True) def setModel(model_checkpoint, aggregation): model = AutoModelForTokenClassification.from_pretrained(model_checkpoint) tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=aggregation) @st.cache(allow_output_mutation=True) def get_html(html: str): WRAPPER = """