abdullahmubeen10 committed: Upload 5 files

- .streamlit/config.toml +3 -0
- Demo.py +175 -0
- Dockerfile +70 -0
- pages/Workflow & Model Overview.py +244 -0
- requirements.txt +6 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
[theme]
base="light"
primaryColor="#29B4E8"
Demo.py
ADDED
@@ -0,0 +1,175 @@
import streamlit as st
import sparknlp
import os
import pandas as pd

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text

# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# CSS for styling
st.markdown("""
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
""", unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    # Start (or reuse) the Spark session with Spark NLP
    return sparknlp.start()

@st.cache_resource
def create_pipeline(model):
    # Build the NER pipeline for the selected pretrained model
    documentAssembler = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

    sentenceDetector = SentenceDetector()\
        .setInputCols(["document"])\
        .setOutputCol("sentence")

    tokenizer = Tokenizer()\
        .setInputCols(["sentence"])\
        .setOutputCol("token")

    embeddings = None
    public_ner = None

    if model == 'turkish_ner_840B_300':
        embeddings = WordEmbeddingsModel.pretrained('glove_840B_300', "xx")\
            .setInputCols(["sentence", "token"])\
            .setOutputCol("embeddings")\
            .setCaseSensitive(True)
    elif model == 'turkish_ner_bert':
        embeddings = BertEmbeddings.pretrained('bert_multi_cased', 'xx')\
            .setInputCols(["sentence", "token"])\
            .setOutputCol("embeddings")

    public_ner = NerDLModel.pretrained(model, 'tr')\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setOutputCol("ner")

    ner_converter = NerConverter()\
        .setInputCols(["sentence", "token", "ner"])\
        .setOutputCol("ner_chunk")

    nlp_pipeline = Pipeline(
        stages=[
            documentAssembler,
            sentenceDetector,
            tokenizer,
            embeddings,
            public_ner,
            ner_converter])

    return nlp_pipeline

def fit_data(pipeline, data):
    # Fit on an empty DataFrame, then annotate the input text with a LightPipeline
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    result = model.fullAnnotate(data)
    return result

def annotate(data):
    # Interleave plain text and (chunk, label) tuples for annotated_text rendering
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        parts = document.split(chunk, 1)
        if len(parts) < 2:
            # Chunk not found in the remaining text (e.g. overlapping matches); skip it
            continue
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)

# Set up the page layout
st.markdown('<div class="main-title">Recognize entities in Turkish text</div>', unsafe_allow_html=True)
st.markdown('<div class="section"><p>Recognize Person, Location, Organization, and Misc entities using an out-of-the-box pretrained deep learning model with multilingual BERT word embeddings (bert_multi_cased) or GloVe word embeddings (glove_840B_300).</p></div>', unsafe_allow_html=True)

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["turkish_ner_bert", "turkish_ner_840B_300"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Reference notebook link in sidebar
link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_TR.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
examples = [
    "William Henry Gates III (28 Ekim 1955 doğumlu) Amerikalı bir iş insanı, yazılım geliştiricisi, yatırımcı ve hayırseverdir. En çok Microsoft Corporation'ın kurucu ortağı olarak tanınır. Microsoft'taki kariyerinde Gates, başkan, genel müdür (CEO), başkan ve baş yazılım mimarı gibi görevlerde bulunmuş ve Mayıs 2014'e kadar en büyük bireysel hissedar olarak kalmıştır. 1970'ler ve 1980'ler mikro bilgisayar devriminin en iyi bilinen girişimcilerinden ve öncülerindendir. Seattle, Washington'da doğup büyüyen Gates, 1975 yılında çocukluk arkadaşı Paul Allen ile Microsoft'u kurdu ve şirket, dünyanın en büyük kişisel bilgisayar yazılımı şirketi haline geldi. Gates, şirketi başkan ve CEO olarak yönetti ve Ocak 2000'de CEO olarak görevden ayrıldı, ancak başkan olarak kalmaya devam etti ve baş yazılım mimarı oldu. 1990'ların sonlarında Gates, iş taktikleri nedeniyle eleştirildi; bu görüş, birçok mahkeme kararı tarafından desteklenmiştir. Haziran 2006'da Gates, Microsoft'ta yarı zamanlı bir role geçeceğini ve 2000 yılında kendisi ve eşi Melinda Gates tarafından kurulan Bill & Melinda Gates Vakfı'nda tam zamanlı çalışacağını duyurdu. Görevlerini Ray Ozzie ve Craig Mundie'ye devretti. Şubat 2014'te Microsoft'taki başkanlık görevinden ayrıldı ve yeni atanan CEO Satya Nadella'ya destek olmak için teknoloji danışmanı olarak yeni bir göreve başladı.",
    "Mona Lisa, Leonardo tarafından yaratılmış 16. yüzyıldan kalma bir yağlı boya tablodur. Louvre'da Paris'te sergilenmektedir.",
    "Sebastian Thrun, 2007 yılında Google'da kendi kendine giden arabalar üzerinde çalışmaya başladığında, şirket dışındaki pek çok insan onu ciddiye almadı. “Size çok kıdemli Amerikan otomobil şirketlerinin CEO'larının elimi sıktığını ve konuşmaya değer biri olmadığım için uzaklaştığını söyleyebilirim” dedi Thrun, şimdi online yüksek öğrenim girişimi Udacity'nin kurucu ortağı ve CEO'su, bu hafta Recode ile yaptığı bir röportajda.",
    "Facebook, 4 Şubat 2004'te TheFacebook olarak başlatılan bir sosyal ağ hizmetidir. Mark Zuckerberg tarafından, üniversite arkadaşları ve Harvard Üniversitesi öğrencileri Eduardo Saverin, Andrew McCollum, Dustin Moskovitz ve Chris Hughes ile birlikte kurulmuştur. Web sitesinin üyeliği başlangıçta Harvard öğrencileriyle sınırlıydı, ancak Boston bölgesindeki diğer kolejler, Ivy League ve giderek çoğu üniversiteye genişletilmiştir.",
    "Doğal dil işleme tarihinin genellikle 1950'lerde başladığı kabul edilir, ancak daha önceki dönemlerde yapılan çalışmalar da vardır. 1950'de, Alan Turing 'Computing Machinery and Intelligence' başlıklı bir makale yayımlamış ve günümüzde Turing testi olarak bilinen zekâ kriterini önermiştir.",
    "Geoffrey Everest Hinton, yapay sinir ağları üzerindeki çalışmaları ile en çok tanınan İngiliz Kanadalı bilişsel psikolog ve bilgisayar bilimcisidir. 2013'ten beri zamanını Google ve Toronto Üniversitesi'nde geçirmektedir. 2017'de Toronto'daki Vector Institute'in kurucu ortağı olmuş ve Baş Bilimsel Danışman olarak atanmıştır.",
    "John'a Alaska'ya taşınmak istediğimi söylediğimde, orada bir Starbucks bulmanın zor olacağını bana söyledi.",
    "Steven Paul Jobs, Amerikalı bir iş insanı, endüstriyel tasarımcı, yatırımcı ve medya sahibi olarak bilinir. Apple Inc.'in başkanı, genel müdürü (CEO) ve kurucu ortağı, Pixar'ın başkanı ve çoğunluk hissedarı, The Walt Disney Company'nin Pixar'ı satın almasının ardından yönetim kurulu üyesi ve NeXT'in kurucusu, başkanı ve CEO'suydu. Jobs, Apple kurucu ortağı Steve Wozniak ile birlikte 1970'ler ve 1980'ler kişisel bilgisayar devriminin öncülerinden biri olarak tanınır. San Francisco, California'da doğmuş ve evlatlık verilmiştir. San Francisco Körfez Bölgesi'nde büyütülmüştür. 1972'de Reed College'a gitmiş, aynı yıl üniversiteden ayrılmış ve 1974'te Hindistan'a giderek aydınlanma arayışında bulunmuş ve Zen Budizmi üzerine çalışmıştır.",
    "Titanic, James Cameron tarafından yönetilmiş, yazılmış, ortak yapımcılığı ve ortak kurgusu yapılmış 1997 Amerikan epik romantik ve felaket filmidir. Hem tarihi hem de kurgusal yönler içeren film, RMS Titanic'in batışı hakkında anlatımlara dayanır ve Leonardo DiCaprio ile Kate Winslet'i, geminin talihsiz ilk seferinde farklı sosyal sınıflardan gelen aşıklar olarak canlandırır.",
    "Kuzey'in kralı olmanın dışında, John Snow, İngiliz bir doktor ve anestezi ve tıbbi hijyen gelişiminde lider olarak kabul edilir. 1834'te kolera salgınını veriler kullanarak tedavi eden ilk kişi olarak kabul edilir."
]

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Full example text')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)

# Initialize Spark and create the pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, text_to_analyze)

# Display the annotated output
st.subheader("Processed output:")

results = {
    'Document': output[0]['document'][0].result,
    'NER Chunk': [n.result for n in output[0]['ner_chunk']],
    "NER Label": [n.metadata['entity'] for n in output[0]['ner_chunk']]
}

annotate(results)

with st.expander("View DataFrame"):
    df = pd.DataFrame({'NER Chunk': results['NER Chunk'], 'NER Label': results['NER Label']})
    df.index += 1
    st.dataframe(df)
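For reference, the pipeline above can also be exercised outside Streamlit. The following is a minimal sketch reusing create_pipeline from Demo.py; it assumes the pretrained models download successfully, and the sample sentence and printout are illustrative only:

import sparknlp
from sparknlp.base import LightPipeline

spark = sparknlp.start()
pipeline = create_pipeline("turkish_ner_bert")

# A LightPipeline annotates plain strings without a full DataFrame job
light = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))
result = light.fullAnnotate("Mustafa Kemal Atatürk 1881'de Selanik'te doğdu.")[0]

# Each ner_chunk annotation carries the chunk text and its entity label
for chunk in result["ner_chunk"]:
    print(chunk.result, "->", chunk.metadata["entity"])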
Dockerfile
ADDED
@@ -0,0 +1,70 @@
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

# Install required packages
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f;

# Install Python 3.8 and pip
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean;

# Set up JAVA_HOME
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
RUN mkdir -p ${HOME} \
    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
    && chown -R ${NB_UID}:${NB_UID} ${HOME}

# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user
USER ${NB_USER}

# Set home and path variables for the user
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
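To try the image locally, a build-and-run along the lines of docker build -t turkish-ner-demo . followed by docker run -p 7860:7860 turkish-ner-demo should serve the app at http://localhost:7860; the tag name is illustrative, and on Hugging Face Spaces the image is built and started automatically.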
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,244 @@
import streamlit as st

# Custom CSS for better styling
st.markdown("""
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .sub-title {
            font-size: 24px;
            color: #4A90E2;
            margin-top: 20px;
        }
        .section {
            background-color: #f9f9f9;
            padding: 15px;
            border-radius: 10px;
            margin-top: 20px;
        }
        .section h2 {
            font-size: 22px;
            color: #4A90E2;
        }
        .section p, .section ul {
            color: #666666;
        }
        .link {
            color: #4A90E2;
            text-decoration: none;
        }
    </style>
""", unsafe_allow_html=True)

# Main Title
st.markdown('<div class="main-title">Named Entity Recognition (NER) in Turkish with Spark NLP</div>', unsafe_allow_html=True)

# Introduction
st.markdown("""
<div class="section">
    <p>Named Entity Recognition (NER) is a Natural Language Processing (NLP) task that identifies and classifies key information in text into predefined categories. On this page, we present two different pipelines for performing NER on Turkish text using Spark NLP:</p>
    <ul>
        <li>A pipeline using GloVe embeddings with the <code>turkish_ner_840B_300</code> model.</li>
        <li>A pipeline using BERT embeddings with the <code>turkish_ner_bert</code> model.</li>
    </ul>
</div>
""", unsafe_allow_html=True)

# Pipeline 1: Turkish NER with GloVe Embeddings
st.markdown('<div class="sub-title">Pipeline 1: Turkish NER with GloVe Embeddings</div>', unsafe_allow_html=True)
st.write("")

with st.expander("Turkish NER 840B_300"):
    st.components.v1.html(
        """
        <iframe
            src="https://sparknlp.org/2020/11/10/turkish_ner_840B_300_tr.html"
            width="100%"
            height="600px"
            style="border:none;"
            title="Embedded Website">
        </iframe>
        """,
        height=600
    )

st.markdown("""
<div class="section">
    <p>This pipeline uses GloVe embeddings to perform Named Entity Recognition. The <code>turkish_ner_840B_300</code> model is a pretrained Turkish NER model built on 300-dimensional GloVe embeddings trained on a corpus of 840 billion tokens. The pipeline includes the following stages:</p>
    <ul>
        <li><strong>Document Assembler:</strong> Converts raw text into a format suitable for NLP processing.</li>
        <li><strong>Sentence Detector:</strong> Splits the text into sentences.</li>
        <li><strong>Tokenizer:</strong> Breaks sentences into tokens.</li>
        <li><strong>Word Embeddings:</strong> Uses GloVe embeddings to represent tokens.</li>
        <li><strong>NER Model:</strong> Applies the NER model to identify named entities.</li>
        <li><strong>NER Converter:</strong> Converts the NER output into chunks representing named entities.</li>
    </ul>
    <p>Here is how you can set up and use this pipeline:</p>
</div>
""", unsafe_allow_html=True)
st.code("""
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Document Assembler
documentAssembler = DocumentAssembler()\\
    .setInputCol("text")\\
    .setOutputCol("document")

# Sentence Detector
sentenceDetector = SentenceDetector()\\
    .setInputCols(["document"])\\
    .setOutputCol("sentence")

# Tokenizer
tokenizer = Tokenizer()\\
    .setInputCols(["sentence"])\\
    .setOutputCol("token")

# Word Embeddings
embeddings = WordEmbeddingsModel.pretrained('glove_840B_300', "xx")\\
    .setInputCols(["sentence", "token"])\\
    .setOutputCol("embeddings")\\
    .setCaseSensitive(True)

# NER Model
public_ner = NerDLModel.pretrained('turkish_ner_840B_300', 'tr')\\
    .setInputCols(["sentence", "token", "embeddings"])\\
    .setOutputCol("ner")

# NER Converter
ner_converter = NerConverter()\\
    .setInputCols(["sentence", "token", "ner"])\\
    .setOutputCol("ner_chunk")

# Pipeline
nlp_pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        public_ner,
        ner_converter
    ]
)
""", language="python")

# Pipeline 2: Turkish NER with BERT Embeddings
st.markdown('<div class="sub-title">Pipeline 2: Turkish NER with BERT Embeddings</div>', unsafe_allow_html=True)
st.write("")

with st.expander("Turkish NER Bert"):
    st.components.v1.html(
        """
        <iframe
            src="https://sparknlp.org/2020/11/10/turkish_ner_bert_tr.html"
            width="100%"
            height="600px"
            style="border:none;"
            title="Embedded Website">
        </iframe>
        """,
        height=600
    )

st.markdown("""
<div class="section">
    <p>This pipeline uses BERT embeddings for Named Entity Recognition. The <code>turkish_ner_bert</code> model leverages multilingual BERT embeddings (bert_multi_cased) to achieve state-of-the-art results for NER tasks in Turkish. The pipeline consists of the following stages:</p>
    <ul>
        <li><strong>Document Assembler:</strong> Converts raw text into a format suitable for NLP processing.</li>
        <li><strong>Sentence Detector:</strong> Splits the text into sentences.</li>
        <li><strong>Tokenizer:</strong> Breaks sentences into tokens.</li>
        <li><strong>BERT Embeddings:</strong> Uses BERT embeddings to represent tokens.</li>
        <li><strong>NER Model:</strong> Applies the NER model to identify named entities.</li>
        <li><strong>NER Converter:</strong> Converts the NER output into chunks representing named entities.</li>
    </ul>
    <p>Here is how you can set up and use this pipeline:</p>
</div>
""", unsafe_allow_html=True)
st.code("""
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Document Assembler
documentAssembler = DocumentAssembler()\\
    .setInputCol("text")\\
    .setOutputCol("document")

# Sentence Detector
sentenceDetector = SentenceDetector()\\
    .setInputCols(["document"])\\
    .setOutputCol("sentence")

# Tokenizer
tokenizer = Tokenizer()\\
    .setInputCols(["sentence"])\\
    .setOutputCol("token")

# BERT Embeddings
embeddings = BertEmbeddings.pretrained('bert_multi_cased', 'xx')\\
    .setInputCols(["sentence", "token"])\\
    .setOutputCol("embeddings")

# NER Model
public_ner = NerDLModel.pretrained('turkish_ner_bert', 'tr')\\
    .setInputCols(["sentence", "token", "embeddings"])\\
    .setOutputCol("ner")

# NER Converter
ner_converter = NerConverter()\\
    .setInputCols(["sentence", "token", "ner"])\\
    .setOutputCol("ner_chunk")

# Pipeline
nlp_pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        public_ner,
        ner_converter
    ]
)
""", language="python")

# Summary
st.markdown('<div class="sub-title">Summary</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>We have outlined two pipelines for performing Named Entity Recognition (NER) on Turkish texts using Spark NLP. The first pipeline uses GloVe embeddings, and the second one uses BERT embeddings. Both pipelines include stages for document assembly, sentence detection, tokenization, embedding generation, NER model application, and conversion of NER results into entity chunks.</p>
    <p>These pipelines provide flexible options for leveraging pretrained models in different contexts, allowing for scalable and accurate NER in Turkish.</p>
</div>
""", unsafe_allow_html=True)

# References
st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/word_embeddings_model/index.html" target="_blank" rel="noopener">WordEmbeddingsModel Documentation</a></li>
        <li><a class="link" href="https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/bert_embeddings/index.html" target="_blank" rel="noopener">BertEmbeddings Documentation</a></li>
        <li><a class="link" href="https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/ner_dl_model/index.html" target="_blank" rel="noopener">NerDLModel Documentation</a></li>
        <li><a class="link" href="https://www.johnsnowlabs.com/spark-nlp/" target="_blank" rel="noopener">Spark NLP Official Site</a></li>
    </ul>
</div>
""", unsafe_allow_html=True)

# Community & Support
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub Repository</a>: Report issues or contribute</li>
        <li><a class="link" href="https://forum.johnsnowlabs.com/" target="_blank">Community Forum</a>: Ask questions, share ideas, and get support</li>
    </ul>
</div>
""", unsafe_allow_html=True)
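The snippets on this page construct the pipelines but stop short of applying them. As a brief sketch of usage under the same variable names (assuming a Spark session started via sparknlp.start(); the sample text is illustrative):

# Fit on the input DataFrame, then transform it to extract NER chunks
data = spark.createDataFrame([["Mustafa Kemal Atatürk 1881'de Selanik'te doğdu."]]).toDF("text")
result = nlp_pipeline.fit(data).transform(data)

# Explode the entity chunks and show each chunk alongside its label
result.selectExpr("explode(ner_chunk) AS chunk") \
      .selectExpr("chunk.result AS chunk_text", "chunk.metadata['entity'] AS entity") \
      .show(truncate=False)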
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
st-annotated-text
pandas
numpy
spark-nlp
pyspark
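Note that spark-nlp and pyspark are left unpinned here; the two must be version-compatible, so pinning both to a known-good pair (per the Spark NLP release notes) makes the Docker build more reproducible.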