cdleong committed on
Commit
741cd0d
1 Parent(s): 6998de9

Add support for Matlab/Python

Browse files
Files changed (1) hide show
  1. app.py +123 -105
app.py CHANGED
@@ -5,55 +5,20 @@ import urllib
5
  import requests
6
 
7
  # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
8
- # FEATURE: add programming languages easter egg
9
  # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
10
  # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
11
  # TODO: add in vachan search even if lang not found
 
 
 
 
 
 
 
12
 
13
- st.write("# Language code/tag search")
14
- st.write("Fed up with language tag confusion? Here's your one-stop shop!")
15
- st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
16
- st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
17
-
18
- # https://huggingface.co/blog/streamlit-spaces
19
- # https://github.com/psf/requests-html
20
- # https://docs.streamlit.io/library/api-reference/write-magic/st.write
21
- example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
22
- langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
23
-
24
- # TODO: st.code() for these "lookup in progress" outputs.
25
- st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
26
-
27
- if langcodes.tag_is_valid(langtext):
28
- st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
29
- else:
30
- st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
31
-
32
-
33
- try:
34
- lang = langcodes.Language.get(langtext)
35
- # st.write(f"{lang} is the BCP-47 tag.")
36
- if "unknown" in lang.display_name().lower():
37
- st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
38
- lang = None
39
- except langcodes.LanguageTagError as e:
40
- st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
41
- lang = None
42
-
43
-
44
-
45
- if lang is None:
46
- try:
47
- found = langcodes.find(langtext)
48
- lang = found
49
- st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
50
- except LookupError as e:
51
- st.write("## Result: failure!")
52
- st.write(f"Unable to look up language code. But all hope is not lost...")
53
- st.write(f"* You can also try https://r12a.github.io/app-subtags/")
54
- st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
55
- lang = None
56
 
 
 
57
 
58
  def pull_obsolete_codes(iso_code):
59
  session = HTMLSession()
@@ -93,73 +58,126 @@ def try_searching_vachan_engine(langtext):
93
  results_list = vachan_r.json()
94
  return results_list
95
 
96
- #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
97
- if lang is not None:
98
- display = lang.display_name()
99
- b_variant = lang.to_alpha3(variant='B')
100
- t_variant = lang.to_alpha3(variant='T')
101
- broader_tags = lang.broader_tags()
102
- results_from_vachan = try_searching_vachan_engine(langtext)
103
- standardized_tag = langcodes.standardize_tag(lang)
104
- languoid_id = try_retrieving_glottolog_id(langtext)
105
-
106
-
107
- st.write(f"## Results: probably use '{standardized_tag}'")
108
- # TODO: make a results dictionary so it's easy to copy-paste?
109
- st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
110
- st.write(f"Breakdown of tag components:")
111
- st.write(lang.describe())
112
- st.write(f"Display name for {lang}: {lang.display_name()}")
113
- st.write(f"Autonym for {lang}: {lang.autonym()}")
114
- st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
115
-
116
-
117
- st.write("## Further Information:")
118
-
119
- st.write(f"Broader tags for this language, if any:")
120
- st.write(broader_tags)
121
-
122
- st.write(f"### Language Subtag Search Tool")
123
- st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
124
-
125
- st.write(f"### Glottolog")
126
- if languoid_id:
127
- st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
128
- st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
129
- if t_variant != b_variant:
130
- st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
131
- st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
132
-
133
- st.write("### Older / Related Codes")
134
 
135
- st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
136
- st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
 
 
 
137
 
138
- # ethnologue prefers T for german (deu), and T for French
139
- st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
140
- if t_variant != b_variant:
141
- st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
142
-
143
- st.write("#### Codes scraped from iso639-3.sil.org")
144
- #TODO: Cleanup this bit
145
- t_obsolete_codes = pull_obsolete_codes(t_variant)
146
- b_obsolete_codes = pull_obsolete_codes(b_variant)
147
- if t_obsolete_codes:
148
- st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
149
- st.write(t_obsolete_codes)
150
- elif b_obsolete_codes:
151
- st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
152
- st.write(b_obsolete_codes)
153
 
 
 
 
154
 
155
- if results_from_vachan:
156
- st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
157
- st.write(results_from_vachan)
158
-
159
-
 
 
160
 
161
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
 
 
 
 
 
 
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import requests
6
 
7
  # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
 
8
  # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
9
  # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
10
  # TODO: add in vachan search even if lang not found
11
+ # TODO: results from glottolog even if none from others
12
+ things_to_test = [
13
+ "knh", # deprecated code on ISO
14
+ "khn", # only has 639-3 on ISO
15
+ "xxx", # no such code on ISO or glottolog
16
+ "Chinese", # Vachan struggles.
17
+ ]
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ def get_bcp47_from_langcode(langtext):
21
+ pass
22
 
23
  def pull_obsolete_codes(iso_code):
24
  session = HTMLSession()
58
  results_list = vachan_r.json()
59
  return results_list
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ def main():
63
+ st.write("# Language code/tag search")
64
+ st.write("Fed up with language tag confusion? Here's your one-stop shop!")
65
+ st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
66
+ st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
67
 
68
+ # https://huggingface.co/blog/streamlit-spaces
69
+ # https://github.com/psf/requests-html
70
+ # https://docs.streamlit.io/library/api-reference/write-magic/st.write
71
+ example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
72
+ langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
73
+
74
+ if langtext.lower() == "matlab":
75
+ st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
76
+ return
 
 
 
 
 
 
77
 
78
+ if langtext.lower() == "python":
79
+ st.success("[Python is the best language!](https://www.python.org/)")
80
+ return
81
 
82
+ # TODO: st.code() for these "lookup in progress" outputs.
83
+ st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
84
+
85
+ if langcodes.tag_is_valid(langtext):
86
+ st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
87
+ else:
88
+ st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
89
 
90
 
91
+ try:
92
+ lang = langcodes.Language.get(langtext)
93
+ # st.write(f"{lang} is the BCP-47 tag.")
94
+ if "unknown" in lang.display_name().lower():
95
+ st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
96
+ lang = None
97
+ except langcodes.LanguageTagError as e:
98
+ st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
99
+ lang = None
100
+
101
+
102
 
103
+ if lang is None:
104
+ try:
105
+ found = langcodes.find(langtext)
106
+ lang = found
107
+ st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
108
+ except LookupError as e:
109
+ st.write("## Result: failure!")
110
+ st.write(f"Unable to look up language code. But all hope is not lost...")
111
+ st.write(f"* You can also try https://r12a.github.io/app-subtags/")
112
+ st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
113
+ lang = None
114
+
115
+
116
+
117
+
118
+ #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
119
+ if lang is not None:
120
+ display = lang.display_name()
121
+ b_variant = lang.to_alpha3(variant='B')
122
+ t_variant = lang.to_alpha3(variant='T')
123
+ broader_tags = lang.broader_tags()
124
+ results_from_vachan = try_searching_vachan_engine(langtext)
125
+ standardized_tag = langcodes.standardize_tag(lang)
126
+ languoid_id = try_retrieving_glottolog_id(langtext)
127
+
128
+
129
+ st.write(f"## Results: probably use '{standardized_tag}'")
130
+ # TODO: make a results dictionary so it's easy to copy-paste?
131
+ st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
132
+ st.write(f"Breakdown of tag components:")
133
+ st.write(lang.describe())
134
+ st.write(f"Display name for {lang}: {lang.display_name()}")
135
+ st.write(f"Autonym for {lang}: {lang.autonym()}")
136
+ st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
137
+
138
+
139
+ st.write("## Further Information:")
140
+
141
+ st.write(f"Broader tags for this language, if any:")
142
+ st.write(broader_tags)
143
+
144
+ st.write(f"### Language Subtag Search Tool")
145
+ st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
146
+
147
+ st.write(f"### Glottolog")
148
+ if languoid_id:
149
+ st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
150
+ st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
151
+ if t_variant != b_variant:
152
+ st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
153
+ st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
154
+
155
+ st.write("### Older / Related Codes")
156
 
157
+ st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
158
+ st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
159
+
160
+ # ethnologue prefers T for german (deu), and T for French
161
+ st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
162
+ if t_variant != b_variant:
163
+ st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
164
 
165
+ st.write("#### Codes scraped from iso639-3.sil.org")
166
+ #TODO: Cleanup this bit
167
+ t_obsolete_codes = pull_obsolete_codes(t_variant)
168
+ b_obsolete_codes = pull_obsolete_codes(b_variant)
169
+ if t_obsolete_codes:
170
+ st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
171
+ st.write(t_obsolete_codes)
172
+ elif b_obsolete_codes:
173
+ st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
174
+ st.write(b_obsolete_codes)
175
+
176
+
177
+ if results_from_vachan:
178
+ st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
179
+ st.write(results_from_vachan)
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()