Spaces:

spark-nlp
/

African-NER

Running

App Files Files Community

African-NER / Demo.py

abdullahmubeen10

Upload 58 files

908edf6 verified 11 months ago

raw

history blame contribute delete

6.52 kB

	import html
	import os

	import pandas as pd
	import sparknlp
	import streamlit as st

	from sparknlp.base import *
	from sparknlp.annotator import *
	from pyspark.ml import Pipeline
	from sparknlp.pretrained import PretrainedPipeline
	from annotated_text import annotated_text

	# Page settings: wide layout, sidebar state left to Streamlit's "auto" mode.
	st.set_page_config(layout="wide", initial_sidebar_state="auto")

	# Stylesheet for the centered blue title and the grey rounded "section" cards.
	PAGE_STYLE = """
	<style>
	.main-title {
	font-size: 36px;
	color: #4A90E2;
	font-weight: bold;
	text-align: center;
	}
	.section {
	background-color: #f9f9f9;
	padding: 10px;
	border-radius: 10px;
	margin-top: 10px;
	}
	.section p, .section ul {
	color: #666666;
	}
	</style>
	"""
	# Raw HTML is required here so the <style> element reaches the page.
	st.markdown(PAGE_STYLE, unsafe_allow_html=True)

	@st.cache_resource
	def init_spark():
	    """Start Spark NLP and return whatever ``sparknlp.start()`` hands back.

	    The function is wrapped in ``st.cache_resource``.
	    """
	    spark_session = sparknlp.start()
	    return spark_session

	@st.cache_resource
	def create_pipeline(model):
	    """Assemble the unfitted NER pipeline for the requested token classifier.

	    Stage order: document assembler -> sentence detector -> tokenizer ->
	    token classifier -> NER chunk converter.

	    Args:
	        model: Name of the pretrained token classifier. The XLM-RoBERTa name
	            selects that model; every other value falls back to the
	            DistilBERT model.

	    Returns:
	        A ``pyspark.ml.Pipeline`` made of the five stages (not fitted).
	    """
	    document_stage = DocumentAssembler().setInputCol("text").setOutputCol("document")

	    sentence_stage = (
	        SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
	        .setInputCols(["document"])
	        .setOutputCol("sentence")
	    )

	    token_stage = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

	    chunk_stage = (
	        NerConverter()
	        .setInputCols(["sentence", "token", "ner"])
	        .setOutputCol("ner_chunk")
	    )

	    # Only the annotator class and checkpoint name differ between the two
	    # supported models; the column wiring is shared.
	    if model == 'xlm_roberta_large_token_classifier_masakhaner':
	        classifier_cls = XlmRoBertaForTokenClassification
	        checkpoint = "xlm_roberta_large_token_classifier_masakhaner"
	    else:
	        classifier_cls = DistilBertForTokenClassification
	        checkpoint = "distilbert_base_token_classifier_masakhaner"

	    classifier_stage = (
	        classifier_cls.pretrained(checkpoint, "xx")
	        .setInputCols(["sentence", 'token'])
	        .setOutputCol("ner")
	    )

	    return Pipeline(
	        stages=[document_stage, sentence_stage, token_stage, classifier_stage, chunk_stage]
	    )

	def fit_data(pipeline, data):
	    """Fit ``pipeline`` and fully annotate ``data`` through a LightPipeline.

	    The pipeline is fitted on a one-row DataFrame holding an empty string in
	    its ``text`` column. Uses the module-level ``spark`` session.

	    Args:
	        pipeline: Unfitted pipeline, as returned by ``create_pipeline``.
	        data: Input handed to ``LightPipeline.fullAnnotate``.

	    Returns:
	        The value returned by ``fullAnnotate``.
	    """
	    placeholder_df = spark.createDataFrame([['']]).toDF('text')
	    light_model = LightPipeline(pipeline.fit(placeholder_df))
	    return light_model.fullAnnotate(data)

	def annotate(data):
	    """Render the document with its NER chunks highlighted.

	    The document is consumed left to right: for each chunk, the text that
	    precedes it is emitted as plain text and the chunk as a
	    ``(chunk, label)`` pair. Whatever text remains after the last chunk is
	    emitted as plain text, and the pieces are passed to ``annotated_text``.

	    Args:
	        data: Mapping with the keys ``"Document"`` (the text),
	            ``"NER Chunk"`` (chunk strings) and ``"NER Label"`` (labels,
	            paired with the chunks by position).

	    A chunk that is empty, or that does not occur in the text still to be
	    consumed, is skipped instead of raising.
	    """
	    remaining = data["Document"]
	    annotated_words = []
	    for chunk, label in zip(data["NER Chunk"], data["NER Label"]):
	        # str.partition raises ValueError on an empty separator, and a chunk
	        # that is absent from the remaining text has nothing to highlight.
	        if not chunk or chunk not in remaining:
	            continue
	        before, _, remaining = remaining.partition(chunk)
	        if before:
	            annotated_words.append(before)
	        annotated_words.append((chunk, label))
	    if remaining:
	        annotated_words.append(remaining)
	    annotated_text(*annotated_words)

	# Page header: title banner followed by a one-paragraph description card.
	st.markdown('<div class="main-title">Recognize entities in 10 African languages</div>', unsafe_allow_html=True)
	# NOTE(review): "Nigerian, Pidgin" is listed as two languages here and in the
	# sidebar language options; it may be meant as the single language
	# "Nigerian Pidgin" - confirm before changing either place.
	st.markdown("""
	<div class="section">
	<p>This model carries out Named Entity Recognition on 10 African languages (Amharic, Hausa, Igbo, Kinyarwanda, Luganda, Nigerian, Pidgin, Swahili, Wolof, and Yorùbá).</p>
	</div>
	""", unsafe_allow_html=True)

	# Sidebar content: which pretrained token classifier to run.
	model = st.sidebar.selectbox(
	    "Choose the pretrained model",
	    ["xlm_roberta_large_token_classifier_masakhaner", "distilbert_base_token_classifier_masakhaner"],
	    help="For more info about the models visit: https://sparknlp.org/models",
	)

	# Which language's example sentences to offer. The option strings are used
	# verbatim as folder names under inputs/, so their spelling is left as is.
	language = st.sidebar.selectbox(
	    "Choose the language",
	    ["Amharic", "Hausa", "Igbo", "Kinyarwanda", "Luganda", "Nigerian", "Pidgin", "Swahilu", "Wolof", "Yorùbá"],
	    help="Selects which language's example sentences are offered in the example list.",
	)

	# Sidebar: entity-label picker and an optional legend explaining the labels.
	# NOTE(review): `results` is assigned near the end of this script, so on a
	# top-to-bottom run the lookup below raises NameError and this whole section
	# is skipped. Moving the section below the point where `results` is built
	# would make it render.
	try:
	    detected_labels = results["NER Label"]
	except NameError:
	    detected_labels = None

	if detected_labels is not None:
	    # results["NER Label"] is a plain list with one label per chunk;
	    # de-duplicate it and sort so the widget order is stable.
	    labels_set = sorted(set(detected_labels))

	    labels = st.sidebar.multiselect("Entity labels", options=labels_set, default=list(labels_set))

	    NER_labs = ['PER', 'ORG', 'LOC', 'DATE']
	    NER_exp = ['People, including fictional.', 'Companies, agencies, institutions, etc.', 'Countries, cities, states.', 'Date, Year']

	    NER_dict = dict(zip(NER_labs, NER_exp))

	    show_exp = st.sidebar.checkbox("Explain NER Labels", value=True)
	    if show_exp:
	        # Only labels that have an explanation make it into the legend.
	        t_ner_k = [t_lab for t_lab in labels_set if t_lab in NER_dict]
	        t_ner_v = [NER_dict[t_lab] for t_lab in t_ner_k]
	        tdf = pd.DataFrame({"NER": t_ner_k, "Meaning": t_ner_v})
	        # Index replaced with empty strings, presumably to hide row numbers.
	        tdf.index = [''] * len(t_ner_k)
	        st.sidebar.table(tdf)

	# Sidebar footer: a caption followed by an "Open In Colab" badge that links
	# to the reference notebook.
	st.sidebar.markdown('Reference notebook:')
	link = """
	<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Ner_masakhaner.ipynb">
	<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
	</a>
	"""
	# The badge is raw HTML, so HTML rendering has to be enabled for this call.
	st.sidebar.markdown(link, unsafe_allow_html=True)

	# Load examples: one example per .txt file in inputs/<language>, taken from
	# the file's second line. Files with fewer than two lines are ignored.
	folder_path = f"inputs/{language}"
	examples = []
	# os.listdir returns entries in arbitrary order; sorting keeps the example
	# list in the same order on every rerun.
	for filename in sorted(os.listdir(folder_path)):
	    if not filename.endswith('.txt'):
	        continue
	    # Context manager so each file handle is closed right after reading.
	    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as example_file:
	        lines = example_file.readlines()
	    if len(lines) >= 2:
	        examples.append(lines[1].strip())

	# Input: pick one of the bundled examples or type a sentence.
	selected_text = st.selectbox("Select an example", examples)
	custom_input = st.text_input("Try it with your own Sentence!")

	# A non-empty custom sentence takes precedence over the selected example.
	text_to_analyze = custom_input if custom_input else selected_text

	st.subheader('Full example text')
	HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
	# The text may be typed by the user and is rendered as raw HTML, so escape it
	# first; otherwise any markup in the input would be injected into the page.
	# str() keeps the previous rendering when no text is available (None).
	st.markdown(HTML_WRAPPER.format(html.escape(str(text_to_analyze))), unsafe_allow_html=True)

	# Start Spark, build the pipeline for the chosen model and annotate the text.
	# `spark` must keep this name: fit_data reads it as a module-level variable.
	spark = init_spark()
	pipeline = create_pipeline(model)
	output = fit_data(pipeline, text_to_analyze)

	# Show the annotated sentence.
	st.subheader("Processed output:")

	# Only the first annotation result is displayed.
	first_annotation = output[0]
	document_text = first_annotation['document'][0].result
	ner_chunks = first_annotation['ner_chunk']
	results = {
	    'Document': document_text,
	    'NER Chunk': [chunk.result for chunk in ner_chunks],
	    "NER Label": [chunk.metadata['entity'] for chunk in ner_chunks],
	}

	annotate(results)

	with st.expander("View DataFrame"):
	    df = pd.DataFrame({column: results[column] for column in ('NER Chunk', 'NER Label')})
	    # Shift the default 0-based index so the table rows start at 1.
	    df.index += 1
	    st.dataframe(df)