Spaces:
Build error
Build error
File size: 15,536 Bytes
fe11059 d561367 fe11059 d49281d fe11059 2c04d44 20dc861 fe11059 20dc861 2c04d44 fe11059 78e484a fe11059 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 |
import streamlit as st
# Configure the browser tab and page layout for this Streamlit app.
# NOTE(review): the page title says 'Extractive Summarization' while the rest of
# this script is a Named Entity Recognition demo — confirm the intended title.
st.set_page_config(
    layout="centered", # Can be "centered" or "wide". In the future also "dashboard", etc.
    initial_sidebar_state="auto", # Can be "auto", "expanded", "collapsed"
    page_title='Extractive Summarization', # String or None. Strings get appended with "• Streamlit".
    page_icon='./favicon.png', # String, anything supported by st.image, or None.
)
import pandas as pd
import numpy as np
import json
import os
import sys
# Put the absolute path of the current working directory on sys.path before the
# imports below (streamlit_apps_config, streamlit_ner_output) — presumably so
# those project-local modules resolve when the app is launched from its folder.
sys.path.append(os.path.abspath('./'))
import streamlit_apps_config as config
from streamlit_ner_output import show_html2, jsl_display_annotations, get_color
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import functions as F
from sparknlp_display import NerVisualizer
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType
# Start the Spark session through Spark NLP; `spark` is used further down to
# create the DataFrames that are fed to the NER pipelines.
spark= sparknlp.start()
## Marking down NER Style
# Inject the style markup defined in the project config into the page.
st.markdown(config.STYLE_CONFIG, unsafe_allow_html=True)
# NOTE(review): root_path is not referenced in the visible part of this script —
# confirm whether it is still needed.
root_path = config.project_path
########## To Remove the Main Menu Hamburger ########
# CSS snippet that hides the element with id "MainMenu".
hide_menu_style = """
<style>
#MainMenu {visibility: hidden;}
</style>
"""
st.markdown(hide_menu_style, unsafe_allow_html=True)
########## Side Bar ########
## loading logo(newer version with href)
import base64
@st.cache(allow_output_mutation=True)
def get_base64_of_bin_file(bin_file):
    """Return the content of ``bin_file`` as a base64-encoded text string.

    Args:
        bin_file: Path of the file to read; it is opened in binary mode.

    Returns:
        str: The base64 representation of the file's bytes.
    """
    with open(bin_file, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
@st.cache(allow_output_mutation=True)
def get_img_with_href(local_img_path, target_url):
    """Build an HTML snippet showing a local image that links to ``target_url``.

    The image is embedded inline as a base64 ``data:`` URI, so the page does
    not need to serve the image file separately.

    Args:
        local_img_path: Path of the image file on disk. Its extension (without
            the dot) is used as the image sub-type of the data URI.
        target_url: Address the image links to.

    Returns:
        str: An ``<a>`` element wrapping an ``<img>`` element.
    """
    _, extension = os.path.splitext(local_img_path)
    img_format = extension.replace('.', '')
    bin_str = get_base64_of_bin_file(local_img_path)
    # Assembled piece by piece; the result starts with a newline and has one
    # tag per line.
    return (
        '\n'
        f'<a href="{target_url}">\n'
        f'<img height="90%" width="90%" src="data:image/{img_format};base64,{bin_str}" />\n'
        '</a>'
    )
# Render the logo at the top of the sidebar as an image that links to the
# John Snow Labs website.
logo_html = get_img_with_href('./jsl-logo.png', 'https://www.johnsnowlabs.com/')
st.sidebar.markdown(logo_html, unsafe_allow_html=True)
#sidebar info
# Names of the NER models offered in the selectbox; the rest of the script
# branches on the selected one.
model_name= ["nerdl_fewnerd_100d", "ner_conll_elmo", "ner_mit_movie_complex_distilbert_base_cased", "ner_conll_albert_large_uncased", "onto_100"]
st.sidebar.title("Pretrained model to test")
# The selectbox label is empty; the sidebar title above serves as its caption
# (presumably intentional).
selected_model = st.sidebar.selectbox("", model_name)
######## Main Page #########
# Page content per model: title, description and the entity labels the model
# can emit. Keeping this as data means every model is rendered by the same
# code path below instead of one copy of the rendering calls per model.
MODEL_PAGE_INFO = {
    "nerdl_fewnerd_100d": {
        "title": "Detect up to 8 entity types in general domain texts",
        "description": "Named Entity Recognition model aimed to detect up to 8 entity types from general domain texts. This model was trained on the Few-NERD/inter public dataset using Spark NLP, and it is available in Spark NLP Models hub (https://nlp.johnsnowlabs.com/models)",
        "labels": ["PERSON", "ORGANIZATION", "LOCATION", "ART", "BUILDING", "PRODUCT", "EVENT", "OTHER"],
    },
    "ner_conll_elmo": {
        "title": "Detect up to 4 entity types in general domain texts",
        "description": "Named Entity Recognition model aimed to detect up to 4 entity types from general domain texts. This model was trained on the CoNLL 2003 text corpus using Spark NLP, and it is available in Spark NLP Models hub (https://nlp.johnsnowlabs.com/models)",
        "labels": ["PER", "LOC", "ORG", "MISC"],
    },
    "ner_mit_movie_complex_distilbert_base_cased": {
        "title": "Detect up to 12 entity types in movie domain texts",
        "description": "Named Entity Recognition model aimed to detect up to 12 entity types from movie domain texts. This model was trained on the MIT Movie Corpus complex queries dataset to detect movie trivia using Spark NLP, and it is available in Spark NLP Models hub (https://nlp.johnsnowlabs.com/models)",
        "labels": [
            "ACTOR", "AWARD", "CHARACTER_NAME", "DIRECTOR", "GENRE", "OPINION",
            "ORIGIN", "PLOT", "QUOTE", "RELATIONSHIP", "SOUNDTRACK", "YEAR",
        ],
    },
    "ner_conll_albert_large_uncased": {
        "title": "Detect up to 4 entity types in general domain texts",
        "description": "Named Entity Recognition model aimed to detect up to 4 entity types from general domain texts. This model was trained on the CoNLL 2003 text corpus using Spark NLP, and it is available in Spark NLP Models hub (https://nlp.johnsnowlabs.com/models)",
        "labels": ["PER", "LOC", "ORG", "MISC"],
    },
    "onto_100": {
        "title": "Detect up to 18 entity types in general domain texts",
        "description": "Named Entity Recognition model aimed to detect up to 18 entity types from general domain texts. This model was trained with GloVe 100d word embeddings using Spark NLP, so be sure to use same embeddings in the pipeline. It is available in Spark NLP Models hub (https://nlp.johnsnowlabs.com/models)",
        "labels": [
            "CARDINAL", "EVENT", "WORK_OF_ART", "ORG", "DATE", "GPE", "PERSON",
            "PRODUCT", "NORP", "ORDINAL", "MONEY", "LOC", "FAC", "LAW", "TIME",
            "PERCENT", "QUANTITY", "LANGUAGE",
        ],
    },
}

# An unknown model name renders no header, only the spacer below.
page_info = MODEL_PAGE_INFO.get(selected_model)
if page_info is not None:
    app_title = page_info["title"]
    app_description = page_info["description"]
    st.title(app_title)
    st.markdown("<h2>" + app_description + "</h2>", unsafe_allow_html=True)
    # Every label is wrapped as **`LABEL`** with no space before the closing
    # "**": a closing delimiter preceded by whitespace does not end the bold
    # span, so it would show up as literal asterisks on the page.
    labels_markdown = " **,** ".join(f"**`{label}`**" for label in page_info["labels"])
    st.markdown(labels_markdown, unsafe_allow_html=True)
# Empty subheader used as vertical spacing before the input area.
st.subheader("")
#caching the models in the dictionary
@st.cache(allow_output_mutation=True, show_spinner=False)
def load_sparknlp_models():
    """Create the shared pipeline stages and load every pretrained model.

    The function is wrapped in ``st.cache`` so the result is reused instead of
    being rebuilt on every script run.

    Returns:
        dict: Pipeline stages keyed by name. Always contains
        ``'documentAssembler'``, ``'sentenceDetector'``, ``'tokenizer'`` and
        ``'ner_converter'``. For each embeddings/NER pair that loaded
        successfully it also contains one entry under the embeddings name and
        one under the NER model name. A pair that fails to load is logged and
        left out, so the remaining models stay usable.
    """
    # Imported locally so this function does not rely on a new file-level import.
    import logging

    logger = logging.getLogger(__name__)

    # The two lists are parallel: embeddings_list[i] is the embeddings entry
    # used together with ner_models_list[i].
    ner_models_list = ["nerdl_fewnerd_100d", "ner_conll_elmo", "ner_mit_movie_complex_distilbert_base_cased", "ner_conll_albert_large_uncased", "onto_100"]
    embeddings_list = ["glove_100d", "elmo", "distilbert_base_cased", "albert_large_uncased", "glove_100d_for_onto"]

    documentAssembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    sentenceDetector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )
    tokenizer = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )
    ner_converter = (
        NerConverter()
        .setInputCols(["document", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    model_dict = {
        'documentAssembler': documentAssembler,
        'sentenceDetector': sentenceDetector,
        'tokenizer': tokenizer,
        'ner_converter': ner_converter
    }

    # One zero-argument loader per embeddings entry. Nothing is loaded until a
    # loader is called, so a failure stays confined to its own pair.
    embeddings_loaders = {
        "glove_100d": lambda: (
            WordEmbeddingsModel.pretrained("glove_100d", "en")
            .setInputCols(["sentence", "token"])
            .setOutputCol("embeddings")
        ),
        "elmo": lambda: (
            ElmoEmbeddings.pretrained("elmo", "en")
            .setInputCols(["token", "document"])
            .setOutputCol("embeddings")
            .setPoolingLayer("elmo")
        ),
        "distilbert_base_cased": lambda: (
            DistilBertEmbeddings.pretrained("distilbert_base_cased", 'en')
            .setInputCols(["token", "document"])
            .setOutputCol("embeddings")
        ),
        "albert_large_uncased": lambda: (
            AlbertEmbeddings.pretrained("albert_large_uncased", 'en')
            .setInputCols(["document", "token"])
            .setOutputCol("embeddings")
        ),
        # Same pretrained GloVe model as "glove_100d", stored under its own key
        # for the onto_100 pipeline.
        "glove_100d_for_onto": lambda: (
            WordEmbeddingsModel.pretrained("glove_100d", "en")
            .setInputCols(["sentence", "token"])
            .setOutputCol("embeddings")
        ),
    }

    for embeddings_name, ner_model_name in zip(embeddings_list, ner_models_list):
        try:
            model_dict[embeddings_name] = embeddings_loaders[embeddings_name]()
            model_dict[ner_model_name] = (
                NerDLModel.pretrained(ner_model_name, "en")
                .setInputCols(["document", "token", "embeddings"])
                .setOutputCol("ner")
            )
        except Exception:
            # Best effort: keep loading the other models, but record the
            # traceback so a missing model can be diagnosed later.
            logger.exception(
                "Could not load embeddings %r / NER model %r",
                embeddings_name,
                ner_model_name,
            )
    return model_dict
# Show a notice while the models are being loaded, then clear it again.
placeholder = st.empty()
placeholder.info("If you are launching the app for the first time, it may take some time (approximately 1 minute) for SparkNLP models to load...")
nlp_dict = load_sparknlp_models()
placeholder.empty()

# Input label and pre-filled sample text per model. Two of the labels carry a
# trailing space; they are kept exactly as they were.
_INPUT_LABEL = "Type here your text and press enter to run:"
_SAMPLE_INPUTS = {
    "ner_conll_albert_large_uncased": (
        _INPUT_LABEL,
        "Mark Knopfler was born in Glasgow, Scotland. He is a British singer-songwriter, guitarist, and record producer. He became known as the lead guitarist, singer and songwriter of the rock band Dire Straits.",
    ),
    "ner_mit_movie_complex_distilbert_base_cased": (
        _INPUT_LABEL,
        "It's only appropriate that Solaris, Russian filmmaker Andrei Tarkovsky's psychological sci-fi classic from 1972, contains an equally original and mind-bending score. Solaris explores the inadequacies of time and memory on an enigmatic planet below a derelict space station. To reinforce the film's chilling setting, Tarkovsky commissioned composer Eduard Artemiev to construct an electronic soundscape reflecting planet Solaris' amorphous and mysterious surface",
    ),
    "ner_conll_elmo": (
        _INPUT_LABEL + " ",
        "Tottenham Hotspur Football Club, commonly referred to as Tottenham or Spurs, is an English professional football club based in Tottenham, London, that competes in the Premier League, the top flight of English football.",
    ),
    "onto_100": (
        _INPUT_LABEL + " ",
        "William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor, and philanthropist. He is best known as the co-founder of Microsoft Corporation. During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president and chief software architect, while also being the largest individual shareholder until May 2014. He is one of the best-known entrepreneurs and pioneers of the microcomputer revolution of the 1970s and 1980s. Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect.",
    ),
}
# Used for every model without an entry in the table above.
_FALLBACK_INPUT = (
    _INPUT_LABEL,
    "12 Corazones ('12 Hearts') is Spanish-language dating game show produced in the United States for the television network Telemundo since January 2005, based on its namesake Argentine TV show format. The show is filmed in Los Angeles and revolves around the twelve Zodiac signs that identify each contestant. In 2008, Ho filmed a cameo in the Steven Spielberg feature film The Cloverfield Paradox, as a news pundit.",
)

_input_label, _default_text = _SAMPLE_INPUTS.get(selected_model, _FALLBACK_INPUT)
text = st.text_input(_input_label, value=_default_text)
def build_pipeline(text, model_name=selected_model):
    """Run the NER pipeline that belongs to ``model_name`` over ``text``.

    Only the pipeline for the requested model is assembled, so a model that
    failed to load does not prevent the other models from being used.

    Args:
        text: The input string to annotate.
        model_name: Name of the NER model to apply. Defaults to the model
            selected in the sidebar at the time the function is defined.

    Returns:
        pandas.DataFrame: The transformed single-row DataFrame, including the
        ``ner_chunk`` column written by the NER converter stage.

    Raises:
        KeyError: If ``model_name`` is not a supported model, or if the model
            or its embeddings are missing from ``nlp_dict``.
    """
    # Each NER model is paired with the embeddings entry it is run with.
    embeddings_for_model = {
        "nerdl_fewnerd_100d": "glove_100d",
        "ner_conll_elmo": "elmo",
        "ner_mit_movie_complex_distilbert_base_cased": "distilbert_base_cased",
        "ner_conll_albert_large_uncased": "albert_large_uncased",
        "onto_100": "glove_100d_for_onto",
    }
    if model_name not in embeddings_for_model:
        raise KeyError(f"Unsupported NER model: {model_name!r}")
    embeddings_key = embeddings_for_model[model_name]

    # Fail with a readable message when a required stage was never loaded.
    missing = [key for key in (embeddings_key, model_name) if key not in nlp_dict]
    if missing:
        raise KeyError(
            f"Model(s) not loaded for {model_name!r}: {', '.join(missing)}"
        )

    base_pipeline = Pipeline(stages=[
        nlp_dict["documentAssembler"],
        nlp_dict["sentenceDetector"],
        nlp_dict["tokenizer"],
    ])
    ner_pipeline = Pipeline(stages=[
        base_pipeline,
        nlp_dict[embeddings_key],
        nlp_dict[model_name],
        nlp_dict["ner_converter"],
    ])

    # The stages are already pretrained; fitting on an empty frame only turns
    # the Pipeline into a PipelineModel that can transform data.
    empty_df = spark.createDataFrame([[""]]).toDF("text")
    pipeline_model = ner_pipeline.fit(empty_df)

    text_df = spark.createDataFrame(pd.DataFrame({"text": [text]}))
    result = pipeline_model.transform(text_df).toPandas()
    return result
# Show a temporary notice while the pipeline runs, then clear it.
placeholder = st.empty()
placeholder.info("Processing...")
result = build_pipeline(text)
placeholder.empty()

# One row per chunk found in the (single) input text.
df = pd.DataFrame({"ner_chunk": result["ner_chunk"].iloc[0]})

# Distinct entity labels present in the result; the label is read from the
# 'entity' key of element 4 of each chunk annotation.
labels_set = list({chunk[4]['entity'] for chunk in df['ner_chunk'].values})

# Sidebar filter over the detected labels, with all of them selected at first.
labels = st.sidebar.multiselect(
    "NER Labels",
    options=labels_set,
    default=list(labels_set),
)

show_html2(text, df, labels, "Text annotated with identified Named Entities")

# Badge linking to the Colab notebook.
try_link = """<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
st.sidebar.title('')
st.sidebar.markdown('Try it yourself:')
st.sidebar.markdown(try_link, unsafe_allow_html=True)
|