Spaces:
Sleeping
Sleeping
import html
import os

import streamlit as st
import sparknlp
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
# Page configuration: collect the settings in one mapping, then apply them.
_PAGE_SETTINGS = {
    "layout": "wide",
    "page_title": "Spark NLP Demos App",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)
# CSS for styling: the page title, the boxed "section" container used for the
# selected text, and horizontally centred tables.
_CUSTOM_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 15px;
border-radius: 10px;
margin-top: 20px;
}
.stTable {
margin-left: auto;
margin-right: auto;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def init_spark():
    """Start a Spark session through ``sparknlp.start()`` and return it."""
    session = sparknlp.start()
    return session
def create_pipeline(model):
    """Build the unfitted language-detection pipeline.

    Column flow: ``text`` -> ``document`` -> ``sentence`` -> ``language``.

    Args:
        model: Name of the pretrained ``LanguageDetectorDL`` model to load.

    Returns:
        A ``pyspark.ml.Pipeline`` with the assembler, sentence detector and
        language detector as its stages, in that order.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_splitter = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    # Threshold 0.5 and coalesced sentences, exactly as configured originally.
    detector = (
        LanguageDetectorDL.pretrained(model)
        .setInputCols("sentence")
        .setOutputCol("language")
        .setThreshold(0.5)
        .setCoalesceSentences(True)
    )

    stages = [assembler, sentence_splitter, detector]
    return Pipeline(stages=stages)
def fit_data(pipeline, data):
    """Fit *pipeline* and annotate *data* with it.

    Relies on the module-level ``spark`` session being defined before the
    call.

    Args:
        pipeline: The unfitted pipeline to fit.
        data: The input handed to ``LightPipeline.fullAnnotate``.

    Returns:
        The first element of the ``fullAnnotate`` result.
    """
    # Fit on a single empty-string row in a ``text`` column.
    # NOTE(review): presumably a placeholder frame is enough because the
    # detector stage is loaded pretrained -- confirm.
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    annotations = light_model.fullAnnotate(data)
    return annotations[0]
# Set up the page layout: styled title followed by a subheader.
st.markdown(
    '<div class="main-title">State-Of-The-Art Language Detection With Spark NLP</div>',
    unsafe_allow_html=True,
)
st.subheader('Support for 375 different languages')

# Sidebar content: the pretrained model to use for detection.
_AVAILABLE_MODELS = ["ld_wiki_tatoeba_cnn_375"]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    _AVAILABLE_MODELS,
    help="For more info about the models visit: https://sparknlp.org/models",
)
# Collapsible list of the languages the demo advertises. The text is built
# from adjacent string literals inside parentheses so that it stays a single
# valid string while remaining readable in the source.
_SUPPORTED_LANGUAGES_TEXT = (
    "Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, "
    "Aklanon, Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, "
    "Arabic, Algerian Arabic, Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, Awadhi, "
    "Aymara, Azerbaijani, Bashkir, Baluchi, Balinese, Bavarian, Central Bikol, Belarusian, Berber, "
    "Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, Bengali, Tibetan, Breton, Bodo, Bosnian, Buryat, "
    "Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, Chechen, Cebuano, Chamorro, Chagatai, "
    "Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, Central Mnong, Corsican, "
    "Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, Kashubian, Chuvash, Welsh, "
    "CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, Central Dusun, Dhivehi, "
    "Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, English, Middle English, Esperanto, "
    "Spanish, Estonian, Basque, Evenki, Extremaduran, Persian, Finnish, Fijian, Kven Finnish, "
    "Faroese, French, Middle French, Old French, North Frisian, Pulaar, Friulian, Nigerian Fulfulde, "
    "Frisian, Irish, Ga, Gagauz, Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, "
    "Gilbertese, Galician, Guarani, Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, "
    "Gujarati, Manx, Hausa, Hakka Chinese, Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, "
    "Hiligaynon, Hmong Njua (Green), Ho, Croatian, Hunsrik, Upper Sorbian, Xiang Chinese, "
    "Haitian Creole, Hungarian, Armenian, Interlingua, Iban, Indonesian, Interlingue, Igbo, Nuosu, "
    "Inuktitut, Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, Lojban, "
    "Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, Kamba, "
    "Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, Komi-Permyak, "
    "Komi-Zyrian, Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, Cornish, "
    "Keningau Murut, Kyrgyz, Coastal Kadazan, Latin, Southern Subanen, "
    "Ladino, Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, "
    "Lombard, Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, "
    "Literary Chinese, Laz, Madurese, Maithili, North Moluccan Malay, Moksha, Morisyen, Malagasy, "
    "Mambae, Marshallese, Meadow Mari, Maori, Mi'kmaq, Minangkabau, Macedonian, Malayalam, "
    "Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, Maltese, Tagal Murut, Mirandese, "
    "Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, Norwegian Bokmål, "
    "Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, Guerrero Nahuatl, "
    "Niuean, Dutch, Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, Old Norse, Novial, Nepali, "
    "Naga (Tangshang), Navajo, Chinyanja, Nyungar, Old Aramaic, Occitan, Ojibwe, Odia (Oriya), "
    "Old East Slavic, Ossetian, Old Spanish, Old Saxon, Ottoman Turkish, Old Turkish, "
    "Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, Palauan, Picard, Pennsylvania German, "
    "Palatine German, Phoenician, Pali, Polish, Piedmontese, Punjabi (Western), Pipil, Old Prussian, "
    "Pashto, Portuguese, Quechua, K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, Kirundi, "
    "Romanian, Romani, Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, "
    "Scots, Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, "
    "Slovenian, Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, Southern Sotho, "
    "Saterland Frisian, Sundanese, Sumerian, Swedish, Swahili, Swabian, Swahili, Syriac, Tamil, "
    "Telugu, Tetun, Tajik, Thai, Tahaggart Tamahaq, Tigrinya, Tigre, Turkmen, Tokelauan, Tagalog, "
    "Klingon, Talysh, Jewish Babylonian Aramaic, Temuan, Setswana, Tongan, Tonga (Zambezi), "
    "Toki Pona, Tok Pisin, Old Tupi, Turkish, Tsonga, Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, "
    "Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, "
    "Volapük, Võro, Walloon, Waray, Wolof, Shanghainese, Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, "
    "Cantonese, Chinese, Malay (Vernacular), Malay, Zulu, and Zaza."
)

with st.expander("View Supported Languages"):
    st.write(_SUPPORTED_LANGUAGES_TEXT)
# Reference notebook link in sidebar: a Colab badge that opens the notebook.
link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Language_Detector.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
with st.sidebar:
    st.markdown('Reference notebook:')
    st.markdown(link, unsafe_allow_html=True)
# Load examples
def _read_second_line(path):
    """Return the stripped second line of the UTF-8 text file at *path*.

    Returns ``None`` when the file has fewer than two lines. The file is
    opened in a ``with`` block so the handle is closed as soon as it is read.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    if len(lines) >= 2:
        return lines[1].strip()
    return None


# Sample texts live in one ``.txt`` file each under ``inputs/<model>``; the
# second line of every file is offered as a sample.
folder_path = f"inputs/{model}"
examples = []
# ``os.listdir`` returns entries in arbitrary order; sorting keeps the sample
# list (and therefore the default selection) stable between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    sample = _read_second_line(os.path.join(folder_path, filename))
    if sample is not None:
        examples.append(sample)
selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")

# Text typed by the user takes precedence over the chosen sample.
if custom_input:
    selected_text = custom_input

# With no sample available and nothing typed there is nothing to classify;
# stop here instead of handing an empty value to the pipeline.
if not selected_text:
    st.info("Select a sample text or type your own to detect its language.")
    st.stop()

st.subheader('Selected Text')
# The text is embedded in raw HTML (unsafe_allow_html=True), so escape it:
# otherwise characters such as "<" or "&" in the input are interpreted as
# markup instead of being shown literally.
st.markdown(
    f"""<div class="section">{html.escape(selected_text)}</div>""",
    unsafe_allow_html=True,
)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)
# Display output | |
# Maps the language code emitted by the detector to a human-readable name.
# Codes that previously mapped to themselves (Wikipedia-style codes such as
# "bat-smg" or "zh-yue") now carry a proper name so the UI never shows a bare
# code twice.
language_map = {
    "ab": "Abkhaz",
    "ace": "Achinese",
    "acm": "Iraqi Arabic",
    "ady": "Adyghe",
    "af": "Afrikaans",
    "afb": "Gulf Arabic",
    "afh": "Afrihili",
    "aii": "Assyrian Neo-Aramaic",
    "ain": "Ainu",
    "akl": "Aklanon",
    "aln": "Gheg Albanian",
    "als": "Tosk Albanian",
    "am": "Amharic",
    "an": "Aragonese",
    "ang": "Old English",
    "aoz": "Uab Meto",
    "apc": "North Levantine Arabic",
    "ar": "Arabic",
    "arq": "Algerian Arabic",
    "ary": "Moroccan Arabic",
    "arz": "Egyptian Arabic",
    "as": "Assamese",
    "ast": "Asturian",
    "av": "Avaric",
    "avk": "Kotava",
    "awa": "Awadhi",
    "ay": "Aymara",
    "az": "Azerbaijani",
    "azb": "South Azerbaijani",
    "ba": "Bashkir",
    "bal": "Baluchi",
    "ban": "Balinese",
    "bar": "Bavarian",
    "bat-smg": "Samogitian",
    "bcl": "Central Bikol",
    "be": "Belarusian",
    "ber": "Berber",
    "bg": "Bulgarian",
    "bh": "Bihari",
    "bho": "Bhojpuri",
    "bi": "Bislama",
    "bjn": "Banjar",
    "bm": "Bambara",
    "bn": "Bengali",
    "bo": "Tibetan",
    "bpy": "Bishnupriya",
    "br": "Breton",
    "brx": "Bodo",
    "bs": "Bosnian",
    "bua": "Buryat",
    "bvy": "Baybayanon",
    "bxr": "Russia Buriat",
    "bzt": "Brithenig",
    "ca": "Catalan",
    "cay": "Cayuga",
    "cbk": "Chavacano",
    "cbk-zam": "Zamboanga Chavacano",
    "cdo": "Min Dong Chinese",
    "ce": "Chechen",
    "ceb": "Cebuano",
    "ch": "Chamorro",
    "chg": "Chagatai",
    "chn": "Chinook Jargon",
    "cho": "Choctaw",
    "chr": "Cherokee",
    "cjy": "Jin Chinese",
    "ckb": "Central Kurdish (Soranî)",
    "ckt": "Chukchi",
    "cmo": "Central Mnong",
    "co": "Corsican",
    "cpi": "Chinese Pidgin English",
    "crh": "Crimean Tatar",
    "crs": "Seychellois Creole",
    "cs": "Czech",
    "ces": "Czech",
    "csb": "Kashubian",
    "cv": "Chuvash",
    "cy": "Welsh",
    "cycl": "CycL",
    "cyo": "Cuyonon",
    "da": "Danish",
    "de": "German",
    "deu": "German",
    "diq": "Dimli (individual language)",
    "dng": "Dungan",
    "drt": "Drents",
    "dsb": "Lower Sorbian",
    "dtp": "Central Dusun",
    "dty": "Doteli",
    "dv": "Dhivehi",
    "dws": "Dutton World Speedwords",
    "ee": "Ewe",
    "egl": "Emilian",
    "el": "Greek",
    "ell": "Greek",
    "eml": "Emilian-Romagnol",
    "emx": "Erromintxela",
    "en": "English",
    "enm": "Middle English",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "evn": "Evenki",
    "ext": "Extremaduran",
    "fa": "Persian",
    "fi": "Finnish",
    "fiu-vro": "Võro",
    "fj": "Fijian",
    "fkv": "Kven Finnish",
    "fo": "Faroese",
    "fr": "French",
    "fra": "French",
    "frm": "Middle French",
    "fro": "Old French",
    "frp": "Arpitan",
    "frr": "North Frisian",
    "fuc": "Pulaar",
    "fur": "Friulian",
    "fuv": "Nigerian Fulfulde",
    "fy": "Frisian",
    "ga": "Irish",
    "gaa": "Ga",
    "gag": "Gagauz",
    "gan": "Gan Chinese",
    "gbm": "Garhwali",
    "gcf": "Guadeloupean Creole French",
    "gd": "Scottish Gaelic",
    "gil": "Gilbertese",
    "gl": "Galician",
    "glk": "Gilaki",
    "gn": "Guarani",
    "gom": "Konkani (Goan)",
    "gos": "Gronings",
    "got": "Gothic",
    "grc": "Ancient Greek",
    "gsw": "Swiss German",
    "gu": "Gujarati",
    "gv": "Manx",
    "ha": "Hausa",
    "hak": "Hakka Chinese",
    "haw": "Hawaiian",
    "hbo": "Ancient Hebrew",
    "he": "Hebrew",
    "hi": "Hindi",
    "hif": "Fiji Hindi",
    "hil": "Hiligaynon",
    "hnj": "Hmong Njua (Green)",
    "hoc": "Ho",
    "hr": "Croatian",
    "hrx": "Hunsrik",
    "hsb": "Upper Sorbian",
    "hsn": "Xiang Chinese",
    "ht": "Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "ia": "Interlingua",
    "iba": "Iban",
    "id": "Indonesian",
    "ie": "Interlingue",
    "ig": "Igbo",
    "ii": "Nuosu",
    "ike": "Inuktitut",
    "ilo": "Ilocano",
    "io": "Ido",
    "is": "Icelandic",
    "it": "Italian",
    "izh": "Ingrian",
    "ja": "Japanese",
    "jam": "Jamaican Patois",
    "jbo": "Lojban",
    "jdt": "Juhuri (Judeo-Tat)",
    "jpa": "Jewish Palestinian Aramaic",
    "jv": "Javanese",
    "ka": "Georgian",
    "kaa": "Karakalpak",
    "kab": "Kabyle",
    "kam": "Kamba",
    "kbd": "Kabardian",
    "kek": "Kekchi (Q'eqchi')",
    "kha": "Khasi",
    "kjh": "Khakas",
    "kk": "Kazakh",
    "kl": "Greenlandic",
    "km": "Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "koi": "Komi-Permyak",
    "kpv": "Komi-Zyrian",
    "krc": "Karachay-Balkar",
    "krl": "Karelian",
    "ks": "Kashmiri",
    "ksh": "Kölsch",
    "ku": "Kurdish",
    "kum": "Kumyk",
    "kv": "Komi",
    "kw": "Cornish",
    "kxi": "Keningau Murut",
    "ky": "Kyrgyz",
    "kzj": "Coastal Kadazan",
    "la": "Latin",
    "laa": "Southern Subanen",
    "lad": "Ladino",
    "lb": "Luxembourgish",
    "ldn": "Láadan",
    "lez": "Lezghian",
    "lfn": "Lingua Franca Nova",
    "lg": "Luganda",
    "li": "Limburgan",
    "lij": "Ligurian",
    "liv": "Livonian",
    "lkt": "Lakota",
    "lld": "Ladin",
    "lmo": "Lombard",
    "ln": "Lingala",
    "lo": "Lao",
    "lou": "Louisiana Creole",
    "lrc": "Northern Luri",
    "lt": "Lithuanian",
    "ltg": "Latgalian",
    "lv": "Latvian",
    "lvs": "Latvian",
    "lzh": "Literary Chinese",
    "lzz": "Laz",
    "mad": "Madurese",
    "mai": "Maithili",
    "map-bms": "Banyumasan",
    "max": "North Moluccan Malay",
    "mdf": "Moksha",
    "mfe": "Morisyen",
    "mg": "Malagasy",
    "mgm": "Mambae",
    "mh": "Marshallese",
    "mhr": "Meadow Mari",
    "mi": "Maori",
    "mic": "Mi'kmaq",
    "min": "Minangkabau",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mnc": "Manchu",
    "mnw": "Mon",
    "moh": "Mohawk",
    "mr": "Marathi",
    "mrj": "Hill Mari",
    "ms": "Malay",
    "mt": "Maltese",
    "mvv": "Tagal Murut",
    "mwl": "Mirandese",
    "mww": "Hmong Daw (White)",
    "my": "Burmese",
    "myv": "Erzya",
    "mzn": "Mazanderani",
    "na": "Nauruan",
    "nah": "Nahuatl",
    "nap": "Neapolitan",
    "nb": "Norwegian Bokmål",
    "nch": "Central Huasteca Nahuatl",
    "nds": "Low German (Low Saxon)",
    "nds-nl": "Dutch Low Saxon",
    "ne": "Nepali",
    "new": "Newari",
    "ngt": "Ngeq",
    "ngu": "Guerrero Nahuatl",
    "niu": "Niuean",
    "nl": "Dutch",
    "nlv": "Orizaba Nahuatl",
    "nn": "Norwegian Nynorsk",
    "no": "Norwegian",
    "nog": "Nogai",
    "non": "Old Norse",
    "nov": "Novial",
    "npi": "Nepali",
    "nrm": "Narom",
    "nso": "Pedi",
    "nst": "Naga (Tangshang)",
    "nv": "Navajo",
    "ny": "Chinyanja",
    "nys": "Nyungar",
    "oar": "Old Aramaic",
    "oc": "Occitan",
    "oj": "Ojibwe",
    "olo": "Livvi",
    "om": "Oromo",
    "or": "Odia (Oriya)",
    "orv": "Old East Slavic",
    "os": "Ossetian",
    "osp": "Old Spanish",
    "osx": "Old Saxon",
    "ota": "Ottoman Turkish",
    "otk": "Old Turkish",
    "pa": "Punjabi (Eastern)",
    "pag": "Pangasinan",
    "pam": "Kapampangan",
    "pap": "Papiamento",
    "pau": "Palauan",
    "pcd": "Picard",
    "pdc": "Pennsylvania German",
    "pfl": "Palatine German",
    "phn": "Phoenician",
    "pi": "Pali",
    "pl": "Polish",
    "pms": "Piedmontese",
    "pnb": "Punjabi (Western)",
    "ppl": "Pipil",
    "prg": "Old Prussian",
    "ps": "Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "quc": "K'iche'",
    "qya": "Quenya",
    "rap": "Rapa Nui",
    "rel": "Rendille",
    "rif": "Tarifit",
    "rm": "Romansh",
    "rn": "Kirundi",
    "ro": "Romanian",
    "ron": "Romanian",
    "roa-rup": "Aromanian",
    "roa-tara": "Tarantino",
    "rom": "Romani",
    "ru": "Russian",
    "rue": "Rusyn",
    "rw": "Kinyarwanda",
    "ryu": "Okinawan",
    "sa": "Sanskrit",
    "sah": "Yakut",
    "sc": "Sardinian",
    "scn": "Sicilian",
    "sco": "Scots",
    "sd": "Sindhi",
    "se": "Northern Sami",
    "sg": "Sango",
    "sgs": "Samogitian",
    "sh": "Serbo-Croatian",
    "shs": "Shuswap",
    "shy": "Tachawit",
    "si": "Sinhala",
    "sjn": "Sindarin",
    "sk": "Slovak",
    "slk": "Slovak",
    "sl": "Slovenian",
    "sm": "Samoan",
    "sma": "Southern Sami",
    "sn": "Shona",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "srn": "Sranan Tongo",
    "ss": "Swazi",
    "st": "Southern Sotho",
    "stq": "Saterland Frisian",
    "su": "Sundanese",
    "sux": "Sumerian",
    "sv": "Swedish",
    "sw": "Swahili",
    "swg": "Swabian",
    "swh": "Swahili",
    "syc": "Syriac",
    "szl": "Silesian",
    "ta": "Tamil",
    "tcy": "Tulu",
    "te": "Telugu",
    "tet": "Tetun",
    "tg": "Tajik",
    "th": "Thai",
    "thv": "Tahaggart Tamahaq",
    "ti": "Tigrinya",
    "tig": "Tigre",
    "tk": "Turkmen",
    "tkl": "Tokelauan",
    "tl": "Tagalog",
    "tlh": "Klingon",
    "tly": "Talysh",
    "tmr": "Jewish Babylonian Aramaic",
    "tmw": "Temuan",
    "tn": "Setswana",
    "to": "Tongan",
    "toi": "Tonga (Zambezi)",
    "toki": "Toki Pona",
    "tpi": "Tok Pisin",
    "tpw": "Old Tupi",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tt": "Tatar",
    "tts": "Isan",
    "tvl": "Tuvaluan",
    "ty": "Tahitian",
    "tyv": "Tuvinian",
    "tzl": "Talossan",
    "udm": "Udmurt",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "umb": "Umbundu",
    "ur": "Urdu",
    "urh": "Urhobo",
    "uz": "Uzbek",
    "vec": "Venetian",
    "vep": "Veps",
    "vi": "Vietnamese",
    "vls": "Vlaams",
    "vo": "Volapük",
    "vro": "Võro",
    "wa": "Walloon",
    "war": "Waray",
    "wo": "Wolof",
    "wuu": "Shanghainese",
    "xal": "Kalmyk",
    "xh": "Xhosa",
    "xmf": "Mingrelian",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "yue": "Cantonese",
    "zea": "Zeeuws",
    "zh": "Chinese",
    "zh-classical": "Classical Chinese",
    "zh-min-nan": "Min Nan Chinese",
    "zh-yue": "Cantonese",
    "zlm": "Malay (Vernacular)",
    "zsm": "Malay",
    "zu": "Zulu",
    "zza": "Zaza",
}
# Report the detected language and, when available, the classifier's
# confidence in it.
detections = output['language']

st.write("")
st.write("")

if not detections:
    # No annotation came back at all; indexing [0] would raise IndexError.
    st.markdown("The language of this text could not be detected.")
else:
    top_detection = detections[0]
    abbreviation = top_detection.result

    if abbreviation in language_map:
        language = language_map[abbreviation]
        st.markdown(f"This text is in **{language} ({abbreviation})** language.")
    else:
        # The detector can emit a label that is not a mapped language code
        # (for instance its below-threshold label); show it as-is rather than
        # failing with KeyError.
        language = abbreviation
        st.markdown(
            f"The language of this text could not be identified (model output: **{abbreviation}**)."
        )

    # The score is stored in the metadata under the predicted label; it may
    # be absent for labels that are not language codes.
    score = top_detection.metadata.get(abbreviation)
    if score is not None:
        confidence = round(float(score) * 100, 2)
        st.markdown(f"Classification Confidence: **{confidence}%**")