abdullahmubeen10's picture
Update Demo.py
9fb8775 verified
import html
import os

import pandas as pd
import sparknlp
import streamlit as st
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
# Page configuration: wide layout, browser-tab title, default sidebar state.
PAGE_CONFIG = {
    "layout": "wide",
    "page_title": "Spark NLP Demos App",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**PAGE_CONFIG)
# CSS for styling: the page title (.main-title), the boxed text area
# (.section) and horizontally centred tables (.stTable).
CUSTOM_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 15px;
border-radius: 10px;
margin-top: 20px;
}
.stTable {
margin-left: auto;
margin-right: auto;
}
</style>
"""
# unsafe_allow_html is required so the <style> element is emitted as HTML.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the object produced by ``sparknlp.start()``.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the returned object instead of starting Spark again.
    """
    session = sparknlp.start()
    return session
@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) language-detection pipeline for ``model``.

    The pipeline has three stages, in this order:

    1. ``DocumentAssembler``: reads the ``text`` column, writes ``document``.
    2. ``SentenceDetector``: reads ``document``, writes ``sentence``.
    3. ``LanguageDetectorDL.pretrained(model)``: reads ``sentence``, writes
       ``language``, configured with a threshold of 0.5 and sentence
       coalescing enabled.

    Args:
        model: Name of the pretrained model handed to
            ``LanguageDetectorDL.pretrained``.

    Returns:
        A ``pyspark.ml.Pipeline`` holding the three stages; it is not fitted.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_splitter = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    detector = (
        LanguageDetectorDL.pretrained(model)
        .setInputCols("sentence")
        .setOutputCol("language")
        .setThreshold(0.5)
        .setCoalesceSentences(True)
    )

    stages = [assembler, sentence_splitter, detector]
    return Pipeline(stages=stages)
def fit_data(pipeline, data):
    """Annotate ``data`` with ``pipeline`` and return the first result.

    The pipeline is fitted on a one-row DataFrame whose ``text`` value is an
    empty string, wrapped in a ``LightPipeline``, and then applied to ``data``
    with ``fullAnnotate``.

    Note: this function reads the module-level ``spark`` session, so that
    name must be assigned before the function is called.

    Args:
        pipeline: Unfitted pipeline exposing ``fit``.
        data: Input forwarded unchanged to ``LightPipeline.fullAnnotate``.

    Returns:
        Element ``0`` of the sequence returned by ``fullAnnotate``.
    """
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    annotations = light_model.fullAnnotate(data)
    return annotations[0]
# Page header: title styled through the .main-title CSS class, then a tagline.
PAGE_TITLE_HTML = '<div class="main-title">State-Of-The-Art Language Detection With Spark NLP</div>'
st.markdown(PAGE_TITLE_HTML, unsafe_allow_html=True)
st.subheader('Support for 375 different languages')

# Sidebar: model picker. Only one pretrained model is offered; the chosen
# name is stored in `model` for the rest of the script.
AVAILABLE_MODELS = ["ld_wiki_tatoeba_cnn_375"]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    AVAILABLE_MODELS,
    help="For more info about the models visit: https://sparknlp.org/models",
)
# Collapsible, human-readable list of supported language names.
# The text is assembled from adjacent string literals (implicit
# concatenation) so it forms one valid single-line string; a plain
# double-quoted literal cannot span several physical lines.
with st.expander("View Supported Languages"):
    st.write(
        "Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, Aklanon, "
        "Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, Arabic, Algerian Arabic, "
        "Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, Awadhi, Aymara, Azerbaijani, Bashkir, Baluchi, "
        "Balinese, Bavarian, Central Bikol, Belarusian, Berber, Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, "
        "Bengali, Tibetan, Breton, Bodo, Bosnian, Buryat, Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, "
        "Chechen, Cebuano, Chamorro, Chagatai, Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, "
        "Central Mnong, Corsican, Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, Kashubian, "
        "Chuvash, Welsh, CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, Central Dusun, Dhivehi, "
        "Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, English, Middle English, Esperanto, Spanish, "
        "Estonian, Basque, Evenki, Extremaduran, Persian, Finnish, Fijian, Kven Finnish, Faroese, French, "
        "Middle French, Old French, North Frisian, Pulaar, Friulian, Nigerian Fulfulde, Frisian, Irish, Ga, Gagauz, "
        "Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, Gilbertese, Galician, Guarani, "
        "Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, Gujarati, Manx, Hausa, Hakka Chinese, "
        "Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, Hiligaynon, Hmong Njua (Green), Ho, Croatian, Hunsrik, "
        "Upper Sorbian, Xiang Chinese, Haitian Creole, Hungarian, Armenian, Interlingua, Iban, Indonesian, "
        "Interlingue, Igbo, Nuosu, Inuktitut, Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, "
        "Lojban, Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, Kamba, "
        "Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, Komi-Permyak, Komi-Zyrian, "
        "Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, Cornish, Keningau Murut, Kyrgyz, "
        "Coastal Kadazan, Latin, Southern Subanen, "
        "Ladino, Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, Lombard, "
        "Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, Literary Chinese, Laz, Madurese, "
        "Maithili, North Moluccan Malay, Moksha, Morisyen, Malagasy, Mambae, Marshallese, Meadow Mari, Maori, "
        "Mi'kmaq, Minangkabau, Macedonian, Malayalam, Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, "
        "Maltese, Tagal Murut, Mirandese, Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, Norwegian Bokmål, "
        "Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, Guerrero Nahuatl, Niuean, Dutch, "
        "Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, Old Norse, Novial, Nepali, Naga (Tangshang), Navajo, "
        "Chinyanja, Nyungar, Old Aramaic, Occitan, Ojibwe, Odia (Oriya), Old East Slavic, Ossetian, Old Spanish, "
        "Old Saxon, Ottoman Turkish, Old Turkish, Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, Palauan, "
        "Picard, Pennsylvania German, Palatine German, Phoenician, Pali, Polish, Piedmontese, Punjabi (Western), "
        "Pipil, Old Prussian, Pashto, Portuguese, Quechua, K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, "
        "Kirundi, Romanian, Romani, Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, "
        "Scots, Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, Slovenian, "
        "Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, Southern Sotho, Saterland Frisian, "
        "Sundanese, Sumerian, Swedish, Swahili, Swabian, Swahili, Syriac, Tamil, Telugu, Tetun, Tajik, Thai, "
        "Tahaggart Tamahaq, Tigrinya, Tigre, Turkmen, Tokelauan, Tagalog, Klingon, Talysh, "
        "Jewish Babylonian Aramaic, Temuan, Setswana, Tongan, Tonga (Zambezi), Toki Pona, Tok Pisin, Old Tupi, "
        "Turkish, Tsonga, Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, "
        "Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, Volapük, Võro, Walloon, Waray, Wolof, Shanghainese, "
        "Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, "
        "Cantonese, Chinese, Malay (Vernacular), Malay, Zulu, and Zaza."
    )
# Sidebar footer: a caption followed by an "Open In Colab" badge that links
# to the reference notebook.
st.sidebar.markdown('Reference notebook:')

link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Language_Detector.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
# The badge is raw HTML, so it has to be rendered with unsafe_allow_html.
st.sidebar.markdown(link, unsafe_allow_html=True)
# Load examples
def _read_example(path):
    """Return the stripped second line of the UTF-8 text file at ``path``.

    Returns ``None`` when the file has fewer than two lines, so callers can
    skip files that carry no sample text.
    """
    # The context manager closes the handle as soon as the lines are read;
    # a bare open() inside a comprehension leaves every file open until the
    # garbage collector gets to it.
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    return lines[1].strip() if len(lines) >= 2 else None


# Sample texts live in one folder per model; every ``.txt`` file contributes
# its second line as one selectable example.
folder_path = f"inputs/{model}"
examples = [
    example
    # os.listdir returns names in arbitrary order; sorting keeps the order of
    # the samples (and therefore the default selection) stable between runs.
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith('.txt')
    for example in [_read_example(os.path.join(folder_path, filename))]
    if example is not None
]
# Let the user pick one of the bundled samples or type their own text.
# Non-empty custom input takes precedence over the selected sample.
selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")
if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
# The text is embedded in raw HTML (unsafe_allow_html=True), so it is escaped
# for display: otherwise characters such as "<" or "&" in user input would be
# interpreted as markup. `selected_text` itself stays unescaped because it is
# what gets sent to the model.
display_text = html.escape(selected_text) if selected_text else ""
st.markdown(f"""<div class="section">{display_text}</div>""", unsafe_allow_html=True)
# Initialize Spark and create pipeline
if not selected_text:
    # No sample is available and nothing was typed: there is no text to
    # annotate, so show a hint and end this script run instead of passing
    # None / "" on to the pipeline.
    st.warning("Please select a sample text or enter your own text.")
    st.stop()

# `spark` must be assigned before fit_data() runs: that function reads it as
# a module-level name.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)
# Display output
# Maps the language code returned by the detector to a human-readable name.
# Every code has a real name: entries that used to repeat the code as its own
# name (e.g. 'bat-smg', 'zh-yue') now carry the language name instead.
language_map = {
    'ab': "Abkhaz",
    'ace': "Achinese",
    'acm': "Iraqi Arabic",
    'ady': "Adyghe",
    'af': "Afrikaans",
    'afb': "Gulf Arabic",
    'afh': "Afrihili",
    'aii': "Assyrian Neo-Aramaic",
    'ain': "Ainu",
    'akl': "Aklanon",
    'aln': "Gheg Albanian",
    'als': "Tosk Albanian",
    'am': "Amharic",
    'an': "Aragonese",
    'ang': "Old English",
    'aoz': "Uab Meto",
    'apc': "North Levantine Arabic",
    'ar': "Arabic",
    'arq': "Algerian Arabic",
    'ary': "Moroccan Arabic",
    'arz': "Egyptian Arabic",
    'as': "Assamese",
    'ast': "Asturian",
    'av': "Avaric",
    'avk': "Kotava",
    'awa': "Awadhi",
    'ay': "Aymara",
    'az': "Azerbaijani",
    'azb': "South Azerbaijani",
    'ba': "Bashkir",
    'bal': "Baluchi",
    'ban': "Balinese",
    'bar': "Bavarian",
    'bat-smg': "Samogitian",
    'bcl': "Central Bikol",
    'be': "Belarusian",
    'ber': "Berber",
    'bg': "Bulgarian",
    'bh': "Bihari",
    'bho': "Bhojpuri",
    'bi': "Bislama",
    'bjn': "Banjar",
    'bm': "Bambara",
    'bn': "Bengali",
    'bo': "Tibetan",
    'bpy': "Bishnupriya",
    'br': "Breton",
    'brx': "Bodo",
    'bs': "Bosnian",
    'bua': "Buryat",
    'bvy': "Baybayanon",
    'bxr': "Russia Buriat",
    'bzt': "Brithenig",
    'ca': "Catalan",
    'cay': "Cayuga",
    'cbk': "Chavacano",
    'cbk-zam': "Zamboanga Chavacano",
    'cdo': "Min Dong Chinese",
    'ce': "Chechen",
    'ceb': "Cebuano",
    'ch': "Chamorro",
    'chg': "Chagatai",
    'chn': "Chinook Jargon",
    'cho': "Choctaw",
    'chr': "Cherokee",
    'cjy': "Jin Chinese",
    'ckb': "Central Kurdish (Soranî)",
    'ckt': "Chukchi",
    'cmo': "Central Mnong",
    'co': "Corsican",
    'cpi': "Chinese Pidgin English",
    'crh': "Crimean Tatar",
    'crs': "Seychellois Creole",
    'cs': "Czech",
    'ces': "Czech",
    'csb': "Kashubian",
    'cv': "Chuvash",
    'cy': "Welsh",
    'cycl': "CycL",
    'cyo': "Cuyonon",
    'da': "Danish",
    'de': "German",
    'deu': "German",
    'diq': "Dimli (individual language)",
    'dng': "Dungan",
    'drt': "Drents",
    'dsb': "Lower Sorbian",
    'dtp': "Central Dusun",
    'dty': "Doteli",
    'dv': "Dhivehi",
    'dws': "Dutton World Speedwords",
    'ee': "Ewe",
    'egl': "Emilian",
    'el': "Greek",
    'ell': "Greek",
    'eml': "Emilian-Romagnol",
    'emx': "Erromintxela",
    'en': "English",
    'enm': "Middle English",
    'eo': "Esperanto",
    'es': "Spanish",
    'et': "Estonian",
    'eu': "Basque",
    'evn': "Evenki",
    'ext': "Extremaduran",
    'fa': "Persian",
    'fi': "Finnish",
    'fiu-vro': "Võro",
    'fj': "Fijian",
    'fkv': "Kven Finnish",
    'fo': "Faroese",
    'fr': "French",
    'fra': "French",
    'frm': "Middle French",
    'fro': "Old French",
    'frp': "Arpitan",
    'frr': "North Frisian",
    'fuc': "Pulaar",
    'fur': "Friulian",
    'fuv': "Nigerian Fulfulde",
    'fy': "Frisian",
    'ga': "Irish",
    'gaa': "Ga",
    'gag': "Gagauz",
    'gan': "Gan Chinese",
    'gbm': "Garhwali",
    'gcf': "Guadeloupean Creole French",
    'gd': "Scottish Gaelic",
    'gil': "Gilbertese",
    'gl': "Galician",
    'glk': "Gilaki",
    'gn': "Guarani",
    'gom': "Konkani (Goan)",
    'gos': "Gronings",
    'got': "Gothic",
    'grc': "Ancient Greek",
    'gsw': "Swiss German",
    'gu': "Gujarati",
    'gv': "Manx",
    'ha': "Hausa",
    'hak': "Hakka Chinese",
    'haw': "Hawaiian",
    'hbo': "Ancient Hebrew",
    'he': "Hebrew",
    'hi': "Hindi",
    'hif': "Fiji Hindi",
    'hil': "Hiligaynon",
    'hnj': "Hmong Njua (Green)",
    'hoc': "Ho",
    'hr': "Croatian",
    'hrx': "Hunsrik",
    'hsb': "Upper Sorbian",
    'hsn': "Xiang Chinese",
    'ht': "Haitian Creole",
    'hu': "Hungarian",
    'hy': "Armenian",
    'ia': "Interlingua",
    'iba': "Iban",
    'id': "Indonesian",
    'ie': "Interlingue",
    'ig': "Igbo",
    'ii': "Nuosu",
    'ike': "Inuktitut",
    'ilo': "Ilocano",
    'io': "Ido",
    'is': "Icelandic",
    'it': "Italian",
    'izh': "Ingrian",
    'ja': "Japanese",
    'jam': "Jamaican Patois",
    'jbo': "Lojban",
    'jdt': "Juhuri (Judeo-Tat)",
    'jpa': "Jewish Palestinian Aramaic",
    'jv': "Javanese",
    'ka': "Georgian",
    'kaa': "Karakalpak",
    'kab': "Kabyle",
    'kam': "Kamba",
    'kbd': "Kabardian",
    'kek': "Kekchi (Q'eqchi')",
    'kha': "Khasi",
    'kjh': "Khakas",
    'kk': "Kazakh",
    'kl': "Greenlandic",
    'km': "Khmer",
    'kn': "Kannada",
    'ko': "Korean",
    'koi': "Komi-Permyak",
    'kpv': "Komi-Zyrian",
    'krc': "Karachay-Balkar",
    'krl': "Karelian",
    'ks': "Kashmiri",
    'ksh': "Kölsch",
    'ku': "Kurdish",
    'kum': "Kumyk",
    'kv': "Komi",
    'kw': "Cornish",
    'kxi': "Keningau Murut",
    'ky': "Kyrgyz",
    'kzj': "Coastal Kadazan",
    'la': "Latin",
    'laa': "Southern Subanen",
    'lad': "Ladino",
    'lb': "Luxembourgish",
    'ldn': "Láadan",
    'lez': "Lezghian",
    'lfn': "Lingua Franca Nova",
    'lg': "Luganda",
    'li': "Limburgan",
    'lij': "Ligurian",
    'liv': "Livonian",
    'lkt': "Lakota",
    'lld': "Ladin",
    'lmo': "Lombard",
    'ln': "Lingala",
    'lo': "Lao",
    'lou': "Louisiana Creole",
    'lrc': "Northern Luri",
    'lt': "Lithuanian",
    'ltg': "Latgalian",
    'lv': "Latvian",
    'lvs': "Latvian",
    'lzh': "Literary Chinese",
    'lzz': "Laz",
    'mad': "Madurese",
    'mai': "Maithili",
    'map-bms': "Banyumasan",
    'max': "North Moluccan Malay",
    'mdf': "Moksha",
    'mfe': "Morisyen",
    'mg': "Malagasy",
    'mgm': "Mambae",
    'mh': "Marshallese",
    'mhr': "Meadow Mari",
    'mi': "Maori",
    'mic': "Mi'kmaq",
    'min': "Minangkabau",
    'mk': "Macedonian",
    'ml': "Malayalam",
    'mn': "Mongolian",
    'mnc': "Manchu",
    'mnw': "Mon",
    'moh': "Mohawk",
    'mr': "Marathi",
    'mrj': "Hill Mari",
    'ms': "Malay",
    'mt': "Maltese",
    'mvv': "Tagal Murut",
    'mwl': "Mirandese",
    'mww': "Hmong Daw (White)",
    'my': "Burmese",
    'myv': "Erzya",
    'mzn': "Mazanderani",
    'na': "Nauruan",
    'nah': "Nahuatl",
    'nap': "Neapolitan",
    'nb': "Norwegian Bokmål",
    'nch': "Central Huasteca Nahuatl",
    'nds': "Low German (Low Saxon)",
    'nds-nl': "Dutch Low Saxon",
    'ne': "Nepali",
    'new': "Newari",
    'ngt': "Ngeq",
    'ngu': "Guerrero Nahuatl",
    'niu': "Niuean",
    'nl': "Dutch",
    'nlv': "Orizaba Nahuatl",
    'nn': "Norwegian Nynorsk",
    'no': "Norwegian",
    'nog': "Nogai",
    'non': "Old Norse",
    'nov': "Novial",
    'npi': "Nepali",
    'nrm': "Narom",
    'nso': "Pedi",
    'nst': "Naga (Tangshang)",
    'nv': "Navajo",
    'ny': "Chinyanja",
    'nys': "Nyungar",
    'oar': "Old Aramaic",
    'oc': "Occitan",
    'oj': "Ojibwe",
    'olo': "Livvi",
    'om': "Oromo",
    'or': "Odia (Oriya)",
    'orv': "Old East Slavic",
    'os': "Ossetian",
    'osp': "Old Spanish",
    'osx': "Old Saxon",
    'ota': "Ottoman Turkish",
    'otk': "Old Turkish",
    'pa': "Punjabi (Eastern)",
    'pag': "Pangasinan",
    'pam': "Kapampangan",
    'pap': "Papiamento",
    'pau': "Palauan",
    'pcd': "Picard",
    'pdc': "Pennsylvania German",
    'pfl': "Palatine German",
    'phn': "Phoenician",
    'pi': "Pali",
    'pl': "Polish",
    'pms': "Piedmontese",
    'pnb': "Punjabi (Western)",
    'ppl': "Pipil",
    'prg': "Old Prussian",
    'ps': "Pashto",
    'pt': "Portuguese",
    'qu': "Quechua",
    'quc': "K'iche'",
    'qya': "Quenya",
    'rap': "Rapa Nui",
    'rel': "Rendille",
    'rif': "Tarifit",
    'rm': "Romansh",
    'rn': "Kirundi",
    'ro': "Romanian",
    'ron': "Romanian",
    'roa-rup': "Aromanian",
    'roa-tara': "Tarantino",
    'rom': "Romani",
    'ru': "Russian",
    'rue': "Rusyn",
    'rw': "Kinyarwanda",
    'ryu': "Okinawan",
    'sa': "Sanskrit",
    'sah': "Yakut",
    'sc': "Sardinian",
    'scn': "Sicilian",
    'sco': "Scots",
    'sd': "Sindhi",
    'se': "Northern Sami",
    'sg': "Sango",
    'sgs': "Samogitian",
    'sh': "Serbo-Croatian",
    'shs': "Shuswap",
    'shy': "Tachawit",
    'si': "Sinhala",
    'sjn': "Sindarin",
    'sk': "Slovak",
    'slk': "Slovak",
    'sl': "Slovenian",
    'sm': "Samoan",
    'sma': "Southern Sami",
    'sn': "Shona",
    'so': "Somali",
    'sq': "Albanian",
    'sr': "Serbian",
    'srn': "Sranan Tongo",
    'ss': "Swazi",
    'st': "Southern Sotho",
    'stq': "Saterland Frisian",
    'su': "Sundanese",
    'sux': "Sumerian",
    'sv': "Swedish",
    'sw': "Swahili",
    'swg': "Swabian",
    'swh': "Swahili",
    'syc': "Syriac",
    'szl': "Silesian",
    'ta': "Tamil",
    'tcy': "Tulu",
    'te': "Telugu",
    'tet': "Tetun",
    'tg': "Tajik",
    'th': "Thai",
    'thv': "Tahaggart Tamahaq",
    'ti': "Tigrinya",
    'tig': "Tigre",
    'tk': "Turkmen",
    'tkl': "Tokelauan",
    'tl': "Tagalog",
    'tlh': "Klingon",
    'tly': "Talysh",
    'tmr': "Jewish Babylonian Aramaic",
    'tmw': "Temuan",
    'tn': "Setswana",
    'to': "Tongan",
    'toi': "Tonga (Zambezi)",
    'toki': "Toki Pona",
    'tpi': "Tok Pisin",
    'tpw': "Old Tupi",
    'tr': "Turkish",
    'ts': "Tsonga",
    'tt': "Tatar",
    'tts': "Isan",
    'tvl': "Tuvaluan",
    'ty': "Tahitian",
    'tyv': "Tuvinian",
    'tzl': "Talossan",
    'udm': "Udmurt",
    'ug': "Uyghur",
    'uk': "Ukrainian",
    'umb': "Umbundu",
    'ur': "Urdu",
    'urh': "Urhobo",
    'uz': "Uzbek",
    'vec': "Venetian",
    'vep': "Veps",
    'vi': "Vietnamese",
    'vls': "Vlaams",
    'vo': "Volapük",
    'vro': "Võro",
    'wa': "Walloon",
    'war': "Waray",
    'wo': "Wolof",
    'wuu': "Shanghainese",
    'xal': "Kalmyk",
    'xh': "Xhosa",
    'xmf': "Mingrelian",
    'yi': "Yiddish",
    'yo': "Yoruba",
    'yue': "Cantonese",
    'zea': "Zeeuws",
    'zh': "Chinese",
    'zh-classical': "Classical Chinese",
    'zh-min-nan': "Min Nan Chinese",
    'zh-yue': "Cantonese",
    'zlm': "Malay (Vernacular)",
    'zsm': "Malay",
    'zu': "Zulu",
    'zza': "Zaza",
}
# Show the detected language and, when available, its confidence score.
detections = output['language']
if not detections:
    # Defensive: without a 'language' annotation there is nothing to report
    # (indexing [0] would raise IndexError).
    st.warning("No language could be detected for this text.")
    st.stop()

top_detection = detections[0]
abbreviation = top_detection.result

# Fall back to the raw label when it is not a known code, instead of raising
# KeyError. NOTE(review): the detector is configured with a threshold, so it
# presumably emits a fallback label (not a language code) for low-confidence
# input - confirm against the LanguageDetectorDL documentation.
language = language_map.get(abbreviation, abbreviation)

# The score is looked up in the annotation metadata under the label itself;
# it may be missing for labels that are not language codes.
raw_score = top_detection.metadata.get(abbreviation)
confidence = round(float(raw_score) * 100, 2) if raw_score is not None else None

st.write("")
st.write("")
st.markdown(f"This text is in **{language} ({abbreviation})** language.")
if confidence is not None:
    st.markdown(f"Classification Confidence: **{confidence}%**")