Spaces:
Sleeping
Sleeping
Upload 25 files
Browse files- .streamlit/config.toml +3 -0
- Demo.py +553 -0
- Dockerfile +70 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example1.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example10.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example11.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example12.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example13.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example14.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example15.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example16.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example17.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example18.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example19.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example2.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example20.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example3.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example4.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example5.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example6.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example7.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example8.txt +2 -0
- inputs/ld_wiki_tatoeba_cnn_375/Example9.txt +2 -0
- pages/Workflow & Model Overview.py +278 -0
- requirements.txt +5 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
base="light"
|
3 |
+
primaryColor="#29B4E8"
|
Demo.py
ADDED
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import sparknlp
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from sparknlp.base import *
|
7 |
+
from sparknlp.annotator import *
|
8 |
+
from pyspark.ml import Pipeline
|
9 |
+
from sparknlp.pretrained import PretrainedPipeline
|
10 |
+
from annotated_text import annotated_text
|
11 |
+
|
12 |
+
# Page configuration
|
13 |
+
st.set_page_config(
|
14 |
+
layout="wide",
|
15 |
+
page_title="Spark NLP Demos App",
|
16 |
+
initial_sidebar_state="auto"
|
17 |
+
)
|
18 |
+
|
19 |
+
# CSS for styling
|
20 |
+
st.markdown("""
|
21 |
+
<style>
|
22 |
+
.main-title {
|
23 |
+
font-size: 36px;
|
24 |
+
color: #4A90E2;
|
25 |
+
font-weight: bold;
|
26 |
+
text-align: center;
|
27 |
+
}
|
28 |
+
.section p, .section ul {
|
29 |
+
color: #666666;
|
30 |
+
}
|
31 |
+
.stTable {
|
32 |
+
margin-left: auto;
|
33 |
+
margin-right: auto;
|
34 |
+
}
|
35 |
+
</style>
|
36 |
+
""", unsafe_allow_html=True)
|
37 |
+
|
38 |
+
@st.cache_resource
def init_spark():
    """Start (or reuse) the process-wide Spark NLP session.

    Cached by Streamlit so the JVM/Spark session is created only once
    per server process, not on every rerun of the script.
    """
    session = sparknlp.start()
    return session
|
41 |
+
|
42 |
+
@st.cache_resource
def create_pipeline(model):
    """Assemble the language-detection pipeline for *model*.

    Stages: raw text -> document -> sentences -> language label.
    Cached by Streamlit so the pretrained model is downloaded once.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentences = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    # Coalescing sentences yields one language prediction for the whole input
    # instead of one per sentence.
    detector = (
        LanguageDetectorDL.pretrained(model)
        .setInputCols("sentence")
        .setOutputCol("language")
        .setThreshold(0.5)
        .setCoalesceSentences(True)
    )

    return Pipeline(stages=[assembler, sentences, detector])
|
65 |
+
|
66 |
+
def fit_data(pipeline, data):
    """Fit *pipeline* on an empty frame and annotate *data*.

    Fitting on an empty DataFrame materialises the pipeline stages; the
    resulting model is wrapped in a LightPipeline for fast single-string
    annotation. Returns the first (only) annotation result.
    """
    # Relies on the module-level `spark` session created by init_spark().
    empty_df = spark.createDataFrame([['']]).toDF('text')
    light = LightPipeline(pipeline.fit(empty_df))
    return light.fullAnnotate(data)[0]
|
73 |
+
|
74 |
+
# Set up the page layout
|
75 |
+
st.markdown('<div class="main-title">State-Of-The-Art Language Detection With Spark NLP</div>', unsafe_allow_html=True)
|
76 |
+
st.subheader('Support for 375 different languages')
|
77 |
+
|
78 |
+
# Sidebar content
|
79 |
+
model = st.sidebar.selectbox(
|
80 |
+
"Choose the pretrained model",
|
81 |
+
["ld_wiki_tatoeba_cnn_375"],
|
82 |
+
help="For more info about the models visit: https://sparknlp.org/models"
|
83 |
+
)
|
84 |
+
|
85 |
+
with st.expander("View Supported Languges"):
|
86 |
+
st.write("Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, Aklanon, Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, Arabic, Algerian Arabic, Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, Awadhi, Aymara, Azerbaijani, Bashkir, Baluchi, Balinese, Bavarian, Central Bikol, Belarusian, Berber, Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, Bengali, Tibetan, Breton, Bodo, Bosnian, Buryat, Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, Chechen, Cebuano, Chamorro, Chagatai, Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, Central Mnong, Corsican, Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, Kashubian, Chuvash, Welsh, CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, Central Dusun, Dhivehi, Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, English, Middle English, Esperanto, Spanish, Estonian, Basque, Evenki, Extremaduran, Persian, Finnish, Fijian, Kven Finnish, Faroese, French, Middle French, Old French, North Frisian, Pulaar, Friulian, Nigerian Fulfulde, Frisian, Irish, Ga, Gagauz, Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, Gilbertese, Galician, Guarani, Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, Gujarati, Manx, Hausa, Hakka Chinese, Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, Hiligaynon, Hmong Njua (Green), Ho, Croatian, Hunsrik, Upper Sorbian, Xiang Chinese, Haitian Creole, Hungarian, Armenian, Interlingua, Iban, Indonesian, Interlingue, Igbo, Nuosu, Inuktitut, Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, Lojban, Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, Kamba, Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, Komi-Permyak, Komi-Zyrian, Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, Cornish, Keningau Murut, Kyrgyz, Coastal Kadazan, Latin, Southern Subanen, 
Ladino, Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, Lombard, Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, Literary Chinese, Laz, Madurese, Maithili, North Moluccan Malay, Moksha, Morisyen, Malagasy, Mambae, Marshallese, Meadow Mari, Maori, Mi'kmaq, Minangkabau, Macedonian, Malayalam, Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, Maltese, Tagal Murut, Mirandese, Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, Norwegian Bokmål, Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, Guerrero Nahuatl, Niuean, Dutch, Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, Old Norse, Novial, Nepali, Naga (Tangshang), Navajo, Chinyanja, Nyungar, Old Aramaic, Occitan, Ojibwe, Odia (Oriya), Old East Slavic, Ossetian, Old Spanish, Old Saxon, Ottoman Turkish, Old Turkish, Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, Palauan, Picard, Pennsylvania German, Palatine German, Phoenician, Pali, Polish, Piedmontese, Punjabi (Western), Pipil, Old Prussian, Pashto, Portuguese, Quechua, K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, Kirundi, Romanian, Romani, Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, Scots, Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, Slovenian, Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, Southern Sotho, Saterland Frisian, Sundanese, Sumerian, Swedish, Swahili, Swabian, Swahili, Syriac, Tamil, Telugu, Tetun, Tajik, Thai, Tahaggart Tamahaq, Tigrinya, Tigre, Turkmen, Tokelauan, Tagalog, Klingon, Talysh, Jewish Babylonian Aramaic, Temuan, Setswana, Tongan, Tonga (Zambezi), Toki Pona, Tok Pisin, Old Tupi, Turkish, Tsonga, Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, Volapük, Võro, Walloon, Waray, Wolof, Shanghainese, Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, 
Cantonese, Chinese, Malay (Vernacular), Malay, Zulu, and Zaza.")
|
87 |
+
|
88 |
+
# Reference notebook link in sidebar
|
89 |
+
link = """
|
90 |
+
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Language_Detector.ipynb">
|
91 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
|
92 |
+
</a>
|
93 |
+
"""
|
94 |
+
st.sidebar.markdown('Reference notebook:')
|
95 |
+
st.sidebar.markdown(link, unsafe_allow_html=True)
|
96 |
+
|
97 |
+
# Load examples
|
98 |
+
# Load sample texts: line 1 of each input file is a truncated preview,
# line 2 holds the full example text that we feed to the model.
folder_path = f"inputs/{model}"
examples = []
# sorted() makes the dropdown order deterministic across platforms
# (os.listdir order is filesystem-dependent).
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    # Context manager closes each file promptly; the original comprehension
    # called open() without ever closing the handles.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as fh:
        lines = fh.readlines()
    if len(lines) >= 2:
        examples.append(lines[1].strip())
|
106 |
+
|
107 |
+
# Choose the text to analyze: a non-empty custom input overrides the
# sample selection (the original also had a no-op `elif` branch that
# reassigned selected_text to itself; removed).
selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")

if custom_input:
    selected_text = custom_input

st.subheader('Selected Text')
# Bug fix: the f-prefix was missing, so the page rendered the literal
# string "{selected_text}" instead of the chosen text.
st.markdown(f"""<div class="section">{selected_text}</div>""", unsafe_allow_html=True)
|
117 |
+
|
118 |
+
# Initialize Spark and create pipeline
|
119 |
+
spark = init_spark()
|
120 |
+
pipeline = create_pipeline(model)
|
121 |
+
output = fit_data(pipeline, selected_text)
|
122 |
+
|
123 |
+
# Display output
|
124 |
+
language_map = {
|
125 |
+
'ab': "Abkhaz",
|
126 |
+
'ace': "Achinese",
|
127 |
+
'acm': "Iraqi Arabic",
|
128 |
+
'ady': "Adyghe",
|
129 |
+
'af': "Afrikaans",
|
130 |
+
'afb': "Gulf Arabic",
|
131 |
+
'afh': "Afrihili",
|
132 |
+
'aii': "Assyrian Neo-Aramaic",
|
133 |
+
'ain': "Ainu",
|
134 |
+
'akl': "Aklanon",
|
135 |
+
'aln': "Gheg Albanian",
|
136 |
+
'als': "Tosk Albanian",
|
137 |
+
'am': "Amharic",
|
138 |
+
'an': "Aragonese",
|
139 |
+
'ang': "Old English",
|
140 |
+
'aoz': "Uab Meto",
|
141 |
+
'apc': "North Levantine Arabic",
|
142 |
+
'ar': "Arabic",
|
143 |
+
'arq': "Algerian Arabic",
|
144 |
+
'ary': "Moroccan Arabic",
|
145 |
+
'arz': "Egyptian Arabic",
|
146 |
+
'as': "Assamese",
|
147 |
+
'ast': "Asturian",
|
148 |
+
'av': "Avaric",
|
149 |
+
'avk': "Kotava",
|
150 |
+
'awa': "Awadhi",
|
151 |
+
'ay': "Aymara",
|
152 |
+
'az': "Azerbaijani",
|
153 |
+
'azb': "South Azerbaijani",
|
154 |
+
'ba': "Bashkir",
|
155 |
+
'bal': "Baluchi",
|
156 |
+
'ban': "Balinese",
|
157 |
+
'bar': "Bavarian",
|
158 |
+
'bat-smg': "bat-smg",
|
159 |
+
'bcl': "Central Bikol",
|
160 |
+
'be': "Belarusian",
|
161 |
+
'ber': "Berber",
|
162 |
+
'bg': "Bulgarian",
|
163 |
+
'bh': "bh",
|
164 |
+
'bho': "Bhojpuri",
|
165 |
+
'bi': "Bislama",
|
166 |
+
'bjn': "Banjar",
|
167 |
+
'bm': "Bambara",
|
168 |
+
'bn': "Bengali",
|
169 |
+
'bo': "Tibetan",
|
170 |
+
'bpy': "Bishnupriya",
|
171 |
+
'br': "Breton",
|
172 |
+
'brx': "Bodo",
|
173 |
+
'bs': "Bosnian",
|
174 |
+
'bua': "Buryat",
|
175 |
+
'bvy': "Baybayanon",
|
176 |
+
'bxr': "Russia Buriat",
|
177 |
+
'bzt': "Brithenig",
|
178 |
+
'ca': "Catalan",
|
179 |
+
'cay': "Cayuga",
|
180 |
+
'cbk': "Chavacano",
|
181 |
+
'cbk-zam': "cbk-zam",
|
182 |
+
'cdo': "Min Dong Chinese",
|
183 |
+
'ce': "Chechen",
|
184 |
+
'ceb': "Cebuano",
|
185 |
+
'ch': "Chamorro",
|
186 |
+
'chg': "Chagatai",
|
187 |
+
'chn': "Chinook Jargon",
|
188 |
+
'cho': "Choctaw",
|
189 |
+
'chr': "Cherokee",
|
190 |
+
'cjy': "Jin Chinese",
|
191 |
+
'ckb': "Central Kurdish (Soranî)",
|
192 |
+
'ckt': "Chukchi",
|
193 |
+
'cmo': "Central Mnong",
|
194 |
+
'co': "Corsican",
|
195 |
+
'cpi': "Chinese Pidgin English",
|
196 |
+
'crh': "Crimean Tatar",
|
197 |
+
'crs': "Seychellois Creole",
|
198 |
+
'cs': "Czech",
|
199 |
+
'ces': "Czech",
|
200 |
+
'csb': "Kashubian",
|
201 |
+
'cv': "Chuvash",
|
202 |
+
'cy': "Welsh",
|
203 |
+
'cycl': "CycL",
|
204 |
+
'cyo': "Cuyonon",
|
205 |
+
'da': "Danish",
|
206 |
+
'de': "German",
|
207 |
+
'deu': "German",
|
208 |
+
'diq': "Dimli (individual language)",
|
209 |
+
'dng': "Dungan",
|
210 |
+
'drt': "Drents",
|
211 |
+
'dsb': "Lower Sorbian",
|
212 |
+
'dtp': "Central Dusun",
|
213 |
+
'dty': "dty",
|
214 |
+
'dv': "Dhivehi",
|
215 |
+
'dws': "Dutton World Speedwords",
|
216 |
+
'ee': "Ewe",
|
217 |
+
'egl': "Emilian",
|
218 |
+
'el': "Greek",
|
219 |
+
'ell': "Greek",
|
220 |
+
'eml': "eml",
|
221 |
+
'emx': "Erromintxela",
|
222 |
+
'en': "English",
|
223 |
+
'enm': "Middle English",
|
224 |
+
'eo': "Esperanto",
|
225 |
+
'es': "Spanish",
|
226 |
+
'et': "Estonian",
|
227 |
+
'eu': "Basque",
|
228 |
+
'evn': "Evenki",
|
229 |
+
'ext': "Extremaduran",
|
230 |
+
'fa': "Persian",
|
231 |
+
'fi': "Finnish",
|
232 |
+
'fiu-vro': "fiu-vro",
|
233 |
+
'fj': "Fijian",
|
234 |
+
'fkv': "Kven Finnish",
|
235 |
+
'fo': "Faroese",
|
236 |
+
'fr': "French",
|
237 |
+
'fra': "French",
|
238 |
+
'frm': "Middle French",
|
239 |
+
'fro': "Old French",
|
240 |
+
'frp': "Arpitan",
|
241 |
+
'frr': "North Frisian",
|
242 |
+
'fuc': "Pulaar",
|
243 |
+
'fur': "Friulian",
|
244 |
+
'fuv': "Nigerian Fulfulde",
|
245 |
+
'fy': "Frisian",
|
246 |
+
'ga': "Irish",
|
247 |
+
'gaa': "Ga",
|
248 |
+
'gag': "Gagauz",
|
249 |
+
'gan': "Gan Chinese",
|
250 |
+
'gbm': "Garhwali",
|
251 |
+
'gcf': "Guadeloupean Creole French",
|
252 |
+
'gd': "Scottish Gaelic",
|
253 |
+
'gil': "Gilbertese",
|
254 |
+
'gl': "Galician",
|
255 |
+
'glk': "Gilaki",
|
256 |
+
'gn': "Guarani",
|
257 |
+
'gom': "Konkani (Goan)",
|
258 |
+
'gos': "Gronings",
|
259 |
+
'got': "Gothic",
|
260 |
+
'grc': "Ancient Greek",
|
261 |
+
'gsw': "Swiss German",
|
262 |
+
'gu': "Gujarati",
|
263 |
+
'gv': "Manx",
|
264 |
+
'ha': "Hausa",
|
265 |
+
'hak': "Hakka Chinese",
|
266 |
+
'haw': "Hawaiian",
|
267 |
+
'hbo': "Ancient Hebrew",
|
268 |
+
'he': "Hebrew",
|
269 |
+
'hi': "Hindi",
|
270 |
+
'hif': "Fiji Hindi",
|
271 |
+
'hil': "Hiligaynon",
|
272 |
+
'hnj': "Hmong Njua (Green)",
|
273 |
+
'hoc': "Ho",
|
274 |
+
'hr': "Croatian",
|
275 |
+
'hrx': "Hunsrik",
|
276 |
+
'hsb': "Upper Sorbian",
|
277 |
+
'hsn': "Xiang Chinese",
|
278 |
+
'ht': "Haitian Creole",
|
279 |
+
'hu': "Hungarian",
|
280 |
+
'hy': "Armenian",
|
281 |
+
'ia': "Interlingua",
|
282 |
+
'iba': "Iban",
|
283 |
+
'id': "Indonesian",
|
284 |
+
'ie': "Interlingue",
|
285 |
+
'ig': "Igbo",
|
286 |
+
'ii': "Nuosu",
|
287 |
+
'ike': "Inuktitut",
|
288 |
+
'ilo': "Ilocano",
|
289 |
+
'io': "Ido",
|
290 |
+
'is': "Icelandic",
|
291 |
+
'it': "Italian",
|
292 |
+
'izh': "Ingrian",
|
293 |
+
'ja': "Japanese",
|
294 |
+
'jam': "Jamaican Patois",
|
295 |
+
'jbo': "Lojban",
|
296 |
+
'jdt': "Juhuri (Judeo-Tat)",
|
297 |
+
'jpa': "Jewish Palestinian Aramaic",
|
298 |
+
'jv': "Javanese",
|
299 |
+
'ka': "Georgian",
|
300 |
+
'kaa': "Karakalpak",
|
301 |
+
'kab': "Kabyle",
|
302 |
+
'kam': "Kamba",
|
303 |
+
'kbd': "Kabardian",
|
304 |
+
'kek': "Kekchi (Q'eqchi')",
|
305 |
+
'kha': "Khasi",
|
306 |
+
'kjh': "Khakas",
|
307 |
+
'kk': "Kazakh",
|
308 |
+
'kl': "Greenlandic",
|
309 |
+
'km': "Khmer",
|
310 |
+
'kn': "Kannada",
|
311 |
+
'ko': "Korean",
|
312 |
+
'koi': "Komi-Permyak",
|
313 |
+
'kpv': "Komi-Zyrian",
|
314 |
+
'krc': "Karachay-Balkar",
|
315 |
+
'krl': "Karelian",
|
316 |
+
'ks': "Kashmiri",
|
317 |
+
'ksh': "Kölsch",
|
318 |
+
'ku': "Kurdish",
|
319 |
+
'kum': "Kumyk",
|
320 |
+
'kv': "Komi",
|
321 |
+
'kw': "Cornish",
|
322 |
+
'kxi': "Keningau Murut",
|
323 |
+
'ky': "Kyrgyz",
|
324 |
+
'kzj': "Coastal Kadazan",
|
325 |
+
'la': "Latin",
|
326 |
+
'laa': "Southern Subanen",
|
327 |
+
'lad': "Ladino",
|
328 |
+
'lb': "Luxembourgish",
|
329 |
+
'ldn': "Láadan",
|
330 |
+
'lez': "Lezghian",
|
331 |
+
'lfn': "Lingua Franca Nova",
|
332 |
+
'lg': "Luganda",
|
333 |
+
'li': "Limburgan",
|
334 |
+
'lij': "Ligurian",
|
335 |
+
'liv': "Livonian",
|
336 |
+
'lkt': "Lakota",
|
337 |
+
'lld': "Ladin",
|
338 |
+
'lmo': "Lombard",
|
339 |
+
'ln': "Lingala",
|
340 |
+
'lo': "Lao",
|
341 |
+
'lou': "Louisiana Creole",
|
342 |
+
'lrc': "Northern Luri",
|
343 |
+
'lt': "Lithuanian",
|
344 |
+
'ltg': "Latgalian",
|
345 |
+
'lv': "Latvian",
|
346 |
+
'lvs': "Latvian",
|
347 |
+
'lzh': "Literary Chinese",
|
348 |
+
'lzz': "Laz",
|
349 |
+
'mad': "Madurese",
|
350 |
+
'mai': "Maithili",
|
351 |
+
'map-bms': "map-bms",
|
352 |
+
'max': "North Moluccan Malay",
|
353 |
+
'mdf': "Moksha",
|
354 |
+
'mfe': "Morisyen",
|
355 |
+
'mg': "Malagasy",
|
356 |
+
'mgm': "Mambae",
|
357 |
+
'mh': "Marshallese",
|
358 |
+
'mhr': "Meadow Mari",
|
359 |
+
'mi': "Maori",
|
360 |
+
'mic': "Mi'kmaq",
|
361 |
+
'min': "Minangkabau",
|
362 |
+
'mk': "Macedonian",
|
363 |
+
'ml': "Malayalam",
|
364 |
+
'mn': "Mongolian",
|
365 |
+
'mnc': "Manchu",
|
366 |
+
'mnw': "Mon",
|
367 |
+
'moh': "Mohawk",
|
368 |
+
'mr': "Marathi",
|
369 |
+
'mrj': "Hill Mari",
|
370 |
+
'ms': "Malay",
|
371 |
+
'mt': "Maltese",
|
372 |
+
'mvv': "Tagal Murut",
|
373 |
+
'mwl': "Mirandese",
|
374 |
+
'mww': "Hmong Daw (White)",
|
375 |
+
'my': "Burmese",
|
376 |
+
'myv': "Erzya",
|
377 |
+
'mzn': "Mazanderani",
|
378 |
+
'na': "Nauruan",
|
379 |
+
'nah': "Nahuatl",
|
380 |
+
'nap': "Neapolitan",
|
381 |
+
'nb': "Norwegian Bokmål",
|
382 |
+
'nch': "Central Huasteca Nahuatl",
|
383 |
+
'nds': "Low German (Low Saxon)",
|
384 |
+
'nds-nl': "nds-nl",
|
385 |
+
'ne': "Nepali",
|
386 |
+
'new': "Newari",
|
387 |
+
'ngt': "Ngeq",
|
388 |
+
'ngu': "Guerrero Nahuatl",
|
389 |
+
'niu': "Niuean",
|
390 |
+
'nl': "Dutch",
|
391 |
+
'nlv': "Orizaba Nahuatl",
|
392 |
+
'nn': "Norwegian Nynorsk",
|
393 |
+
'no': "Norwegian",
|
394 |
+
'nog': "Nogai",
|
395 |
+
'non': "Old Norse",
|
396 |
+
'nov': "Novial",
|
397 |
+
'npi': "Nepali",
|
398 |
+
'nrm': "Narom",
|
399 |
+
'nso': "Pedi",
|
400 |
+
'nst': "Naga (Tangshang)",
|
401 |
+
'nv': "Navajo",
|
402 |
+
'ny': "Chinyanja",
|
403 |
+
'nys': "Nyungar",
|
404 |
+
'oar': "Old Aramaic",
|
405 |
+
'oc': "Occitan",
|
406 |
+
'oj': "Ojibwe",
|
407 |
+
'olo': "Livvi",
|
408 |
+
'om': "Oromo",
|
409 |
+
'or': "Odia (Oriya)",
|
410 |
+
'orv': "Old East Slavic",
|
411 |
+
'os': "Ossetian",
|
412 |
+
'osp': "Old Spanish",
|
413 |
+
'osx': "Old Saxon",
|
414 |
+
'ota': "Ottoman Turkish",
|
415 |
+
'otk': "Old Turkish",
|
416 |
+
'pa': "Punjabi (Eastern)",
|
417 |
+
'pag': "Pangasinan",
|
418 |
+
'pam': "Kapampangan",
|
419 |
+
'pap': "Papiamento",
|
420 |
+
'pau': "Palauan",
|
421 |
+
'pcd': "Picard",
|
422 |
+
'pdc': "Pennsylvania German",
|
423 |
+
'pfl': "Palatine German",
|
424 |
+
'phn': "Phoenician",
|
425 |
+
'pi': "Pali",
|
426 |
+
'pl': "Polish",
|
427 |
+
'pms': "Piedmontese",
|
428 |
+
'pnb': "Punjabi (Western)",
|
429 |
+
'ppl': "Pipil",
|
430 |
+
'prg': "Old Prussian",
|
431 |
+
'ps': "Pashto",
|
432 |
+
'pt': "Portuguese",
|
433 |
+
'qu': "Quechua",
|
434 |
+
'quc': "K'iche'",
|
435 |
+
'qya': "Quenya",
|
436 |
+
'rap': "Rapa Nui",
|
437 |
+
'rel': "Rendille",
|
438 |
+
'rif': "Tarifit",
|
439 |
+
'rm': "Romansh",
|
440 |
+
'rn': "Kirundi",
|
441 |
+
'ro': "Romanian",
|
442 |
+
'ron': "Romanian",
|
443 |
+
'roa-rup': "roa-rup",
|
444 |
+
'roa-tara': "roa-tara",
|
445 |
+
'rom': "Romani",
|
446 |
+
'ru': "Russian",
|
447 |
+
'rue': "Rusyn",
|
448 |
+
'rw': "Kinyarwanda",
|
449 |
+
'ryu': "Okinawan",
|
450 |
+
'sa': "Sanskrit",
|
451 |
+
'sah': "Yakut",
|
452 |
+
'sc': "Sardinian",
|
453 |
+
'scn': "Sicilian",
|
454 |
+
'sco': "Scots",
|
455 |
+
'sd': "Sindhi",
|
456 |
+
'se': "Northern Sami",
|
457 |
+
'sg': "Sango",
|
458 |
+
'sgs': "Samogitian",
|
459 |
+
'sh': "Serbo-Croatian",
|
460 |
+
'shs': "Shuswap",
|
461 |
+
'shy': "Tachawit",
|
462 |
+
'si': "Sinhala",
|
463 |
+
'sjn': "Sindarin",
|
464 |
+
'sk': "Slovak",
|
465 |
+
'slk': "Slovak",
|
466 |
+
'sl': "Slovenian",
|
467 |
+
'sm': "Samoan",
|
468 |
+
'sma': "Southern Sami",
|
469 |
+
'sn': "Shona",
|
470 |
+
'so': "Somali",
|
471 |
+
'sq': "Albanian",
|
472 |
+
'sr': "Serbian",
|
473 |
+
'srn': "Sranan Tongo",
|
474 |
+
'ss': "Swazi",
|
475 |
+
'st': "Southern Sotho",
|
476 |
+
'stq': "Saterland Frisian",
|
477 |
+
'su': "Sundanese",
|
478 |
+
'sux': "Sumerian",
|
479 |
+
'sv': "Swedish",
|
480 |
+
'sw': "Swahili",
|
481 |
+
'swg': "Swabian",
|
482 |
+
'swh': "Swahili",
|
483 |
+
'syc': "Syriac",
|
484 |
+
'szl': "Silesian",
|
485 |
+
'ta': "Tamil",
|
486 |
+
'tcy': "Tulu",
|
487 |
+
'te': "Telugu",
|
488 |
+
'tet': "Tetun",
|
489 |
+
'tg': "Tajik",
|
490 |
+
'th': "Thai",
|
491 |
+
'thv': "Tahaggart Tamahaq",
|
492 |
+
'ti': "Tigrinya",
|
493 |
+
'tig': "Tigre",
|
494 |
+
'tk': "Turkmen",
|
495 |
+
'tkl': "Tokelauan",
|
496 |
+
'tl': "Tagalog",
|
497 |
+
'tlh': "Klingon",
|
498 |
+
'tly': "Talysh",
|
499 |
+
'tmr': "Jewish Babylonian Aramaic",
|
500 |
+
'tmw': "Temuan",
|
501 |
+
'tn': "Setswana",
|
502 |
+
'to': "Tongan",
|
503 |
+
'toi': "Tonga (Zambezi)",
|
504 |
+
'toki': "Toki Pona",
|
505 |
+
'tpi': "Tok Pisin",
|
506 |
+
'tpw': "Old Tupi",
|
507 |
+
'tr': "Turkish",
|
508 |
+
'ts': "Tsonga",
|
509 |
+
'tt': "Tatar",
|
510 |
+
'tts': "Isan",
|
511 |
+
'tvl': "Tuvaluan",
|
512 |
+
'ty': "Tahitian",
|
513 |
+
'tyv': "Tuvinian",
|
514 |
+
'tzl': "Talossan",
|
515 |
+
'udm': "Udmurt",
|
516 |
+
'ug': "Uyghur",
|
517 |
+
'uk': "Ukrainian",
|
518 |
+
'umb': "Umbundu",
|
519 |
+
'ur': "Urdu",
|
520 |
+
'urh': "Urhobo",
|
521 |
+
'uz': "Uzbek",
|
522 |
+
'vec': "Venetian",
|
523 |
+
'vep': "Veps",
|
524 |
+
'vi': "Vietnamese",
|
525 |
+
'vls': "Vlaams",
|
526 |
+
'vo': "Volapük",
|
527 |
+
'vro': "Võro",
|
528 |
+
'wa': "Walloon",
|
529 |
+
'war': "Waray",
|
530 |
+
'wo': "Wolof",
|
531 |
+
'wuu': "Shanghainese",
|
532 |
+
'xal': "Kalmyk",
|
533 |
+
'xh': "Xhosa",
|
534 |
+
'xmf': "Mingrelian",
|
535 |
+
'yi': "Yiddish",
|
536 |
+
'yo': "Yoruba",
|
537 |
+
'yue': "Cantonese",
|
538 |
+
'zea': "Zeeuws",
|
539 |
+
'zh': "Chinese",
|
540 |
+
'zh-classical': "zh-classical",
|
541 |
+
'zh-min-nan': "zh-min-nan",
|
542 |
+
'zh-yue': "zh-yue",
|
543 |
+
'zlm': "Malay (Vernacular)",
|
544 |
+
'zsm': "Malay",
|
545 |
+
'zu': "Zulu",
|
546 |
+
'zza': "Zaza"
|
547 |
+
}
|
548 |
+
|
549 |
+
# Display the prediction. The annotation's .result is the ISO language code
# and its .metadata maps language *codes* to confidence scores.
detection = output['language'][0]
code = detection.result
# .get() falls back to the raw code so an unmapped prediction never crashes
# the app (the original language_map[...] raised KeyError for unknown codes).
language = language_map.get(code, code)
# Bug fix: metadata is keyed by the code (e.g. 'en'), not by the mapped
# display name (e.g. 'English'); the original metadata[language] lookup
# raised KeyError.
confidence = round(float(detection.metadata[code]) * 100, 2)

st.markdown(f"This text is in **{language} ({code})** language.")
st.markdown(f"Classification Confidence: **{confidence}%**")
|
Dockerfile
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Download base image ubuntu 18.04
|
2 |
+
FROM ubuntu:18.04
|
3 |
+
|
4 |
+
# Set environment variables
|
5 |
+
ENV NB_USER jovyan
|
6 |
+
ENV NB_UID 1000
|
7 |
+
ENV HOME /home/${NB_USER}
|
8 |
+
|
9 |
+
# Install required packages
|
10 |
+
RUN apt-get update && apt-get install -y \
|
11 |
+
tar \
|
12 |
+
wget \
|
13 |
+
bash \
|
14 |
+
rsync \
|
15 |
+
gcc \
|
16 |
+
libfreetype6-dev \
|
17 |
+
libhdf5-serial-dev \
|
18 |
+
libpng-dev \
|
19 |
+
libzmq3-dev \
|
20 |
+
python3 \
|
21 |
+
python3-dev \
|
22 |
+
python3-pip \
|
23 |
+
unzip \
|
24 |
+
pkg-config \
|
25 |
+
software-properties-common \
|
26 |
+
graphviz \
|
27 |
+
openjdk-8-jdk \
|
28 |
+
ant \
|
29 |
+
ca-certificates-java \
|
30 |
+
&& apt-get clean \
|
31 |
+
&& update-ca-certificates -f;
|
32 |
+
|
33 |
+
# Install Python 3.8 and pip
|
34 |
+
RUN add-apt-repository ppa:deadsnakes/ppa \
|
35 |
+
&& apt-get update \
|
36 |
+
&& apt-get install -y python3.8 python3-pip \
|
37 |
+
&& apt-get clean;
|
38 |
+
|
39 |
+
# Set up JAVA_HOME
|
40 |
+
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
|
41 |
+
RUN mkdir -p ${HOME} \
|
42 |
+
&& echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
|
43 |
+
&& chown -R ${NB_UID}:${NB_UID} ${HOME}
|
44 |
+
|
45 |
+
# Create a new user named "jovyan" with user ID 1000
|
46 |
+
RUN useradd -m -u ${NB_UID} ${NB_USER}
|
47 |
+
|
48 |
+
# Switch to the "jovyan" user
|
49 |
+
USER ${NB_USER}
|
50 |
+
|
51 |
+
# Set home and path variables for the user
|
52 |
+
ENV HOME=/home/${NB_USER} \
|
53 |
+
PATH=/home/${NB_USER}/.local/bin:$PATH
|
54 |
+
|
55 |
+
# Set the working directory to the user's home directory
|
56 |
+
WORKDIR ${HOME}
|
57 |
+
|
58 |
+
# Upgrade pip and install Python dependencies
|
59 |
+
RUN python3.8 -m pip install --upgrade pip
|
60 |
+
COPY requirements.txt /tmp/requirements.txt
|
61 |
+
RUN python3.8 -m pip install -r /tmp/requirements.txt
|
62 |
+
|
63 |
+
# Copy the application code into the container at /home/jovyan
|
64 |
+
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
|
65 |
+
|
66 |
+
# Expose port for Streamlit
|
67 |
+
EXPOSE 7860
|
68 |
+
|
69 |
+
# Define the entry point for the container
|
70 |
+
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
inputs/ld_wiki_tatoeba_cnn_375/Example1.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa is a 16th century oil painting created by Leonardo. It is held at the Louvre...
|
2 |
+
Mona Lisa is a 16th century oil painting created by Leonardo. It is held at the Louvre in Paris.
|
inputs/ld_wiki_tatoeba_cnn_375/Example10.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
La Joconde est une peinture à l'huile du XVIe siècle créée par Léonard. Il se tient au Louvr...
|
2 |
+
La Joconde est une peinture à l'huile du XVIe siècle créée par Léonard. Il se tient au Louvre à Paris.
|
inputs/ld_wiki_tatoeba_cnn_375/Example11.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Quando Sebastian Thrun ha iniziato a lavorare su auto a guida autonoma presso Google nel 2007, poche...
|
2 |
+
Quando Sebastian Thrun ha iniziato a lavorare su auto a guida autonoma presso Google nel 2007, poche persone al di fuori dell'azienda lo hanno preso sul serio.
|
inputs/ld_wiki_tatoeba_cnn_375/Example12.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa je olejomalba ze 16. století, kterou vytvořil Leonardo. Koná se v Louvru...
|
2 |
+
Mona Lisa je olejomalba ze 16. století, kterou vytvořil Leonardo. Koná se v Louvru v Paříži.
|
inputs/ld_wiki_tatoeba_cnn_375/Example13.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Джефри Еверест Хинтън е британски канадски когнитивен психолог и компютърен учен, най-известен с раб...
|
2 |
+
Джефри Еверест Хинтън е британски канадски когнитивен психолог и компютърен учен, най-известен с работата си върху изкуствени невронни мрежи. От 2013 г. той прекарва времето си в работа за Google и университета в Торонто. През 2017 г. е съосновател и става главен научен съветник на Vector Institute of Toronto.
|
inputs/ld_wiki_tatoeba_cnn_375/Example14.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Titanic ist ein 1997 in den USA veröffentlichter epischer Roman und ein katastrophaler Film, der von...
|
2 |
+
Titanic ist ein 1997 in den USA veröffentlichter epischer Roman und ein katastrophaler Film, der von James Cameron inszeniert, geschrieben, co-produziert und mitherausgegeben wurde. Es deckt sowohl historische als auch fiktive Aspekte ab und basiert auf Berichten über den Untergang der RMS Titanic und der Stars Leonard DiCaprio und Kate Winslet als Mitglieder verschiedener sozialer Klassen, die sich während der Schiffsreise während ihrer unglücklichen ersten Reise verlieben.
|
inputs/ld_wiki_tatoeba_cnn_375/Example15.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Geoffrey Everest Hinton es un psicólogo cognitivo y científico informático británico canadiense, mej...
|
2 |
+
Geoffrey Everest Hinton es un psicólogo cognitivo y científico informático británico canadiense, mejor conocido por su trabajo en redes neuronales artificiales. Desde 2013 ha pasado su tiempo trabajando para Google y la Universidad de Toronto. En 2017 fue cofundador y se convirtió en Asesor Científico Jefe del Instituto Vector de Toronto.
|
inputs/ld_wiki_tatoeba_cnn_375/Example16.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Το Titanic είναι ένα αμερικανικό επικό μυθιστόρημα του 1997 και μια καταστροφική ταινία σε σκηνοθεσί...
|
2 |
+
Το Titanic είναι ένα αμερικανικό επικό μυθιστόρημα του 1997 και μια καταστροφική ταινία σε σκηνοθεσία, συγγραφή, συμπαραγωγή και συν-επεξεργασία από τον James Cameron. Καλύπτει τόσο ιστορικές όσο και φανταστικές πτυχές και βασίζεται σε αναφορές για τη βύθιση του Τιτανικού RMS και πρωταγωνιστούν οι Leonard DiCaprio και Kate Winslet ως μέλη διαφόρων κοινωνικών τάξεων που ερωτεύονται κατά τη διάρκεια του ταξιδιού του πλοίου κατά τη διάρκεια του πρώτου τους ατυχούς ταξιδιού.
|
inputs/ld_wiki_tatoeba_cnn_375/Example17.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa on 1500-luvun öljymaalaus, jonka on luonut Leonardo. Se pidetään Pariisin ...
|
2 |
+
Mona Lisa on 1500-luvun öljymaalaus, jonka on luonut Leonardo. Se pidetään Pariisin Louvressa.
|
inputs/ld_wiki_tatoeba_cnn_375/Example18.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
A természetes nyelvfeldolgozás története általában az 1950-es években kezdődött, bár a korábbi idősz...
|
2 |
+
A természetes nyelvfeldolgozás története általában az 1950-es években kezdődött, bár a korábbi időszakokból származó munkák is megtalálhatók. 1950-ben Alan Turing közzétett egy cikket, melynek címe: „Számítástechnika és intelligenciagépek”, és amely intelligenciakritériumként javasolta a Turing-tesztet.
|
inputs/ld_wiki_tatoeba_cnn_375/Example19.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Povijest obrade prirodnog jezika općenito je počela 1950-ih, iako se mogu naći djela iz ranijih razd...
|
2 |
+
Povijest obrade prirodnog jezika općenito je počela 1950-ih, iako se mogu naći djela iz ranijih razdoblja. 1950. Alan Turing objavio je članak pod naslovom "Računalna i inteligencijska mašinerija" u kojem se kao kriterij inteligencije predlaže ono što se danas naziva Turingov test.
|
inputs/ld_wiki_tatoeba_cnn_375/Example2.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Poza tym, że jest królem północy, John Snow jest angielskim lekarzem i liderem w dziedzinie anestezj...
|
2 |
+
Poza tym, że jest królem północy, John Snow jest angielskim lekarzem i liderem w dziedzinie anestezjologii i higieny medycznej. Uważany jest za pierwszego, który wykorzystał dane do leczenia epidemii cholery w 1834 r.
|
inputs/ld_wiki_tatoeba_cnn_375/Example20.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Titanic er en amerikansk episk roman fra 1997 og katastrofal film regissert, skrevet, co-produsert o...
|
2 |
+
Titanic er en amerikansk episk roman fra 1997 og katastrofal film regissert, skrevet, co-produsert og co-redigert av James Cameron. Det dekker både historiske og fiksjoniserte aspekter og er basert på rapporter om synkingen av RMS Titanic og stjernene Leonard DiCaprio og Kate Winslet som medlemmer av forskjellige sosiale klasser som forelsker seg under skipets seilas under deres uheldige første seilas.
|
inputs/ld_wiki_tatoeba_cnn_375/Example3.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Geoffrey Everest Hinton é um psicólogo cognitivo britânico canadense e cientista da computação, mais...
|
2 |
+
Geoffrey Everest Hinton é um psicólogo cognitivo britânico canadense e cientista da computação, mais conhecido por seu trabalho em redes neurais artificiais. Desde 2013, ele trabalha para o Google e a Universidade de Toronto. Em 2017, foi co-fundador e tornou-se Conselheiro Científico Chefe do Vector Institute of Toronto.
|
inputs/ld_wiki_tatoeba_cnn_375/Example4.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Когда в 2007 году Себастьян Трун начал работать над машинами для самостоятельного вождения в Google,...
|
2 |
+
Когда в 2007 году Себастьян Трун начал работать над машинами для самостоятельного вождения в Google, мало кто за пределами компании воспринимал его всерьез.
|
inputs/ld_wiki_tatoeba_cnn_375/Example5.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa este o pictură în ulei din secolul al XVI-lea creată de Leonardo. Se ține la Louvre ...
|
2 |
+
Mona Lisa este o pictură în ulei din secolul al XVI-lea creată de Leonardo. Se ține la Louvre din Paris.
|
inputs/ld_wiki_tatoeba_cnn_375/Example6.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
John Snow je okrem anglického kráľa anglickým lekárom a lídrom vo vývoji anestézie a lekárskej hygie...
|
2 |
+
John Snow je okrem anglického kráľa anglickým lekárom a lídrom vo vývoji anestézie a lekárskej hygieny. Je považovaný za prvého, ktorý používa údaje na liečenie prepuknutia cholery v roku 1834.
|
inputs/ld_wiki_tatoeba_cnn_375/Example7.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa, Leonardo tarafından yaratılan 16. yüzyıldan kalma bir yağlı boyadır. Paris'teki Louvre'da...
|
2 |
+
Mona Lisa, Leonardo tarafından yaratılan 16. yüzyıldan kalma bir yağlı boyadır. Paris'teki Louvre'da düzenleniyor.
|
inputs/ld_wiki_tatoeba_cnn_375/Example8.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Mona Lisa är en oljemålning från 1500-talet skapad av Leonardo. Det hålls vid Louvr...
|
2 |
+
Mona Lisa är en oljemålning från 1500-talet skapad av Leonardo. Det hålls vid Louvre i Paris.
|
inputs/ld_wiki_tatoeba_cnn_375/Example9.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Facebook - це послуга соціальних мереж, запущена під назвою TheFacebook 4 лютого 2004 року. Його зас...
|
2 |
+
Facebook - це послуга соціальних мереж, запущена під назвою TheFacebook 4 лютого 2004 року. Його заснував Марк Цукерберг разом зі своїми одноквартами та колегами Гарвардського університету Едуардо Саверином, Ендрю Макколлумом, Дастіном Московіцем та Крісом Х'юзом.
|
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit page: workflow and model overview for Spark NLP language detection.

Renders a static, article-style page describing the ``LanguageDetectorDL``
annotator: what language detection is, why it matters, how to set up Spark NLP,
an example pipeline, a one-liner alternative via the ``johnsnowlabs`` library,
benchmark numbers for the ``ld_wiki_tatoeba_cnn_375`` model, and reference
links. The page performs no computation of its own — every section is emitted
as HTML/markdown/code/text blocks.
"""
import streamlit as st

# Custom CSS for better styling (shared look with the other pages of this Space)
st.markdown("""
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .sub-title {
        font-size: 24px;
        color: #4A90E2;
        margin-top: 20px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 10px;
        margin-top: 20px;
    }
    .section h2 {
        font-size: 22px;
        color: #4A90E2;
    }
    .section p, .section ul {
        color: #666666;
    }
    .link {
        color: #4A90E2;
        text-decoration: none;
    }
</style>
""", unsafe_allow_html=True)

# Title
st.markdown('<div class="main-title">Automatic Language Detection Using Spark NLP in Python</div>', unsafe_allow_html=True)

# Introduction
st.markdown("""
<div class="section">
    <p>Language detection is a critical component of Natural Language Processing (NLP), which involves automatically identifying the language of a given piece of text. This functionality is essential in various multilingual applications where the language of input text might not be known in advance. Accurate language detection can enhance the performance of downstream NLP tasks such as machine translation, sentiment analysis, and information retrieval.</p>
</div>
""", unsafe_allow_html=True)

# What is Language Detection
st.markdown('<div class="sub-title">What is Language Detection?</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>Language detection models analyze text to determine its language by examining features such as:</p>
    <ul>
        <li><b>Character Set</b>: Identifying language-specific characters and symbols.</li>
        <li><b>Word Frequency</b>: Recognizing common words and their usage patterns in different languages.</li>
        <li><b>N-grams</b>: Analyzing sequences of n words to detect language-specific phrases and structures.</li>
    </ul>
    <p>Models are typically trained on extensive datasets (e.g., Wikipedia, Tatoeba) using statistical and deep learning methods to recognize these patterns. Once trained, these models can predict the language of new text by comparing its features with those learned during training.</p>
</div>
""", unsafe_allow_html=True)

# Importance and Use Cases
st.markdown('<div class="sub-title">Importance and Use Cases</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>Accurate language detection is pivotal for many applications, including:</p>
    <ul>
        <li><b>Machine Translation</b>: Automatically translating text into various languages.</li>
        <li><b>Sentiment Analysis</b>: Analyzing sentiments in multilingual datasets.</li>
        <li><b>Information Retrieval</b>: Enhancing search results by filtering content based on language.</li>
        <li><b>Spam Filtering</b>: Identifying spam content in multiple languages.</li>
        <li><b>Social Media Analysis</b>: Processing and categorizing user-generated content in different languages.</li>
    </ul>
</div>
""", unsafe_allow_html=True)

# Spark NLP's LanguageDetectorDL
st.markdown('<div class="sub-title">Spark NLP\'s LanguageDetectorDL</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>The <code>LanguageDetectorDL</code> annotator from Spark NLP is designed for high accuracy in language detection. It utilizes pretrained deep learning models to identify languages with precision. This annotator can effectively handle documents containing mixed languages by analyzing sentence segments and selecting the most probable language.</p>
</div>
""", unsafe_allow_html=True)

# Setup Instructions
st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
# NOTE: the original text said "extract keywords" here — a copy-paste slip from a
# keyword-extraction page; this page is about language detection.
st.markdown('<p>To install Spark NLP and detect languages in Python, simply use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
st.code("""
pip install spark-nlp
pip install pyspark
""", language="bash")
st.markdown('<p>For other installation options and environments, refer to the <a href="https://nlp.johnsnowlabs.com/docs/en/install" class="link">official documentation</a>.</p>', unsafe_allow_html=True)

st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
st.code("""
import sparknlp

# Start Spark Session
spark = sparknlp.start()
""", language='python')

# Using LanguageDetectorDL: full pipeline example, shown as a static code snippet
st.markdown('<div class="sub-title">Using LanguageDetectorDL</div>', unsafe_allow_html=True)
st.code("""
# Import necessary modules
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import LanguageDetectorDL
import pyspark.sql.functions as F

# Step 1: Transform raw text into `document` annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Step 2: Detect the language of the text
language_detector = (
    LanguageDetectorDL.pretrained()
    .setInputCols("document")
    .setOutputCol("language")
)

# Create the NLP pipeline
nlpPipeline = Pipeline(stages=[document_assembler, language_detector])

# Sample texts in different languages
data = spark.createDataFrame([
    ["Spark NLP is an open-source text processing library for advanced natural language processing for the Python, Java and Scala programming languages."],
    ["Spark NLP est une bibliothèque de traitement de texte open source pour le traitement avancé du langage naturel pour les langages de programmation Python, Java et Scala."],
    ["Spark NLP ist eine Open-Source-Textverarbeitungsbibliothek für fortgeschrittene natürliche Sprachverarbeitung für die Programmiersprachen Python, Java und Scala."],
    ["Spark NLP es una biblioteca de procesamiento de texto de código abierto para el procesamiento avanzado de lenguaje natural para los lenguajes de programación Python, Java y Scala."],
    ["Spark NLP é uma biblioteca de processamento de texto de código aberto para processamento avançado de linguagem natural para as linguagens de programação Python, Java e Scala"]
]).toDF("text")

# Transform the data with the pipeline
result = nlpPipeline.fit(data).transform(data)

# Show the results
result.select("text", "language.result").show(truncate=100)
""", language='python')

# Expected console output of the snippet above (Spark DataFrame.show rendering)
st.text("""
+----------------------------------------------------------------------------------------------------+------+
|                                                                                                text|result|
+----------------------------------------------------------------------------------------------------+------+
|Spark NLP is an open-source text processing library for advanced natural language processing for ...|  [en]|
|Spark NLP est une bibliothèque de traitement de texte open source pour le traitement avancé du la...|  [fr]|
|Spark NLP ist eine Open-Source-Textverarbeitungsbibliothek für fortgeschrittene natürliche Sprach...|  [de]|
|Spark NLP es una biblioteca de procesamiento de texto de código abierto para el procesamiento ava...|  [es]|
|Spark NLP é uma biblioteca de processamento de texto de código aberto para processamento avançado...|  [pt]|
+----------------------------------------------------------------------------------------------------+------+
""")

# One-Liner Alternative
st.markdown('<div class="sub-title">One-Liner Alternative</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>John Snow Labs has introduced a unified library to simplify workflows across various products, including Spark NLP. Install the library with:</p>
    <pre><code>pip install johnsnowlabs</code></pre>
    <p>Use the following one-liner code for quick language detection:</p>
</div>
""", unsafe_allow_html=True)

st.code("""
# Import the NLP module which contains Spark NLP and NLU libraries
from johnsnowlabs import nlp

# Sample text in Polish
sample_text = "Spark NLP to biblioteka edytorów tekstu typu open source do zaawansowanego przetwarzania języka naturalnego w językach programowania Python, Java i Scala."

# Detect language with one line of code
result = nlp.load('xx.classify.wiki_95').predict(sample_text, output_level='sentence')
""", language='python')

# Rendered result of the one-liner call above
st.markdown("""
<table style="width:100%; border-collapse: collapse; margin-top: 20px;">
    <thead>
        <tr style="background-color: #4A90E2; color: white; text-align: left;">
            <th style="padding: 12px;">Language</th>
            <th style="padding: 12px;">Confidence</th>
            <th style="padding: 12px;">Sentence</th>
        </tr>
    </thead>
    <tbody>
        <tr style="background-color: #f9f9f9;">
            <td style="padding: 12px; border: 1px solid #ddd;">pl</td>
            <td style="padding: 12px; border: 1px solid #ddd;">9.0</td>
            <td style="padding: 12px; border: 1px solid #ddd;">Spark NLP to biblioteka edytorów tekstu typu open source do zaawansowanego przetwarzania języka naturalnego w językach programowania Python, Java i Scala.</td>
        </tr>
    </tbody>
</table>
""", unsafe_allow_html=True)

st.markdown("""
<div class="section">
    <p><b>Benefits of the One-Liner</b></p>
    <p>This approach is convenient for quick implementations and testing. The one-liner model is based on default configurations, which may suffice for general use cases. However, for more specialized needs, customizing the pipeline or choosing specific models might be necessary.</p>
</div>
""", unsafe_allow_html=True)

# Notes and Recommendations
st.markdown('<div class="sub-title">Notes and Recommendations</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><b>Customizing Pipelines</b>: While the one-liner is efficient, building a custom pipeline with specific models and configurations allows for greater flexibility and optimization according to the application's requirements.</li>
        <li><b>Handling Mixed Languages</b>: <code>LanguageDetectorDL</code> can effectively manage texts with multiple languages by analyzing sentence segments. Ensure your pipeline is configured to handle such cases appropriately.</li>
        <li><b>Performance Considerations</b>: When working with large datasets, optimizing Spark configurations and resources is crucial for maintaining performance and avoiding bottlenecks.</li>
    </ul>
</div>
""", unsafe_allow_html=True)

# Benchmarking Section
st.markdown('<div class="sub-title">Benchmarking</div>', unsafe_allow_html=True)
st.write("")
st.markdown('<p><a href="https://sparknlp.org/2020/12/05/ld_wiki_tatoeba_cnn_375_xx.html" class="link" target="_blank">ld_wiki_tatoeba_cnn_375</a> Model Evaluated on Europarl dataset which the model has never seen:</p>', unsafe_allow_html=True)
# Per-language precision table with a summary-statistics table laid out to its right.
# NOTE(review): column spacing reconstructed to standard Spark `show()` alignment;
# the figures themselves match the published model page.
st.text("""
+--------+-----+-------+------------------+
|src_lang|count|correct|         precision|
+--------+-----+-------+------------------+
|      fr| 1000|   1000|               1.0|
|      de| 1000|    999|             0.999|
|      fi| 1000|    999|             0.999|
|      nl| 1000|    998|             0.998|    +-------+--------------------+
|      el| 1000|    997|             0.997|    |summary|           precision|
|      en| 1000|    995|             0.995|    +-------+--------------------+
|      es| 1000|    994|             0.994|    |  count|                  21|
|      it| 1000|    993|             0.993|    |   mean|  0.9758952066282511|
|      sv| 1000|    991|             0.991|    | stddev|0.029434744995013935|
|      da| 1000|    987|             0.987|    |    min|  0.8862144420131292|
|      pl|  914|    901|0.9857768052516411|    |    max|                 1.0|
|      hu|  880|    866|0.9840909090909091|    +-------+--------------------+
|      pt| 1000|    980|              0.98|
|      et|  928|    907|0.9773706896551724|
|      ro|  784|    766|0.9770408163265306|
|      lt| 1000|    976|             0.976|
|      bg| 1000|    965|             0.965|
|      cs| 1000|    945|             0.945|
|      sk| 1000|    944|             0.944|
|      lv|  916|    843|0.9203056768558951|
|      sl|  914|    810|0.8862144420131292|
+--------+-----+-------+------------------+
""")

# Conclusion
st.markdown("""
<div class="section">
    <h2>Conclusion</h2>
    <p>Accurate language detection is a foundational step in many NLP workflows. Spark NLP’s <code>LanguageDetectorDL</code> annotator offers a robust solution for detecting languages in diverse text corpora. With its integration into Spark's powerful data processing framework, it enables efficient handling of large-scale multilingual datasets, providing accurate language identification for various applications.</p>
</div>
""", unsafe_allow_html=True)

# References and Additional Information
st.markdown('<div class="sub-title">References and Additional Information</div>', unsafe_allow_html=True)

st.markdown("""
<div class="section">
    <ul>
        <li><a href="https://nlp.johnsnowlabs.com/docs/en/annotators#languagedetectordl" class="link" target="_blank">Documentation: LanguageDetectorDL</a></li>
        <li><a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/language_detector_dl/index.html#sparknlp.annotator.language_detector_dl.LanguageDetectorDL" class="link" target="_blank">Python Docs: LanguageDetectorDL</a></li>
        <li><a href="https://sparknlp.org/2020/12/05/ld_wiki_tatoeba_cnn_375_xx.html" class="link" target="_blank">ld_wiki_tatoeba_cnn_375</a></li>
        <li><a href="https://www.johnsnowlabs.com/how-to-detect-languages-with-python-a-comprehensive-guide/" class="link" target="_blank">Reference Article</a></li>
    </ul>
</div>
""", unsafe_allow_html=True)

st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
    </ul>
</div>
""", unsafe_allow_html=True)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
spark-nlp
|
5 |
+
pyspark
|