Spaces:
Runtime error
Runtime error
File size: 9,199 Bytes
68a8c29 6570b48 25a3c87 65485b5 bdc0541 cba4b15 1e9b3ba 54d3963 1e9b3ba d7bf3e7 1e9b3ba 741cd0d 54d3963 741cd0d 54d3963 b81a5c9 54d3963 741cd0d 1e9b3ba 2f590b1 741cd0d 16fc4ca 46c94f2 92a84ae 9989672 25a3c87 5d305df 25a3c87 5d305df 92a84ae dedac74 46c94f2 7a00e93 881bbde 46c94f2 cba4b15 7a00e93 46c94f2 bdc0541 8329262 55f8482 741cd0d cba4b15 741cd0d 10bed1d 741cd0d bdc0541 741cd0d 54d3963 741cd0d bdc0541 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 537dd73 92a84ae 741cd0d 54d3963 741cd0d 54d3963 741cd0d ca54e5d 741cd0d 54d3963 741cd0d 54d3963 cba4b15 741cd0d 54d3963 741cd0d cba4b15 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d fa7755f 5459c25 741cd0d 07e3fb8 741cd0d 5459c25 741cd0d 5459c25 741cd0d 54d3963 cba4b15 54d3963 741cd0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import streamlit as st
import langcodes
from requests_html import HTMLSession
import urllib
import requests
# Shared scraping session, reused by the Glottolog/SIL helper functions below.
session = HTMLSession()
# FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
# Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
# TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
# TODO: add in vachan search even if lang not found
# TODO: results from glottolog even if none from others
# TODO: check glottolog results to see if they find anything!
# Manual regression inputs for the lookup flow; each entry exercises a known
# edge case (deprecated codes, 639-3-only codes, nonexistent codes, natural
# language names in several languages, and the empty string).
things_to_test = [
    "knh",  # deprecated code on ISO
    "khn",  # only has 639-3 on ISO
    "xxx",  # no such code on ISO or glottolog
    "Chinese",  # Vachan struggles.
    "Mandarin",  # Vachan struggles.
    "zh-CN",  # Chinese again.
    "Chinese",
    "zh-Latn-pinyin",
    "en-Latn-US",
    "en",
    "English",
    "fr-CA",
    "French (Canada)",
    "français",
    "法语",
    "",  # empty string
]
def get_bcp47_from_langcode(langtext):
    """Unimplemented stub.

    Intended (per the TODOs above) to resolve *langtext* to a BCP-47 tag;
    currently a no-op that returns None and is never called by main().
    """
    pass
@st.cache
def pull_obsolete_codes(iso_code):
    """Scrape iso639-3.sil.org for older ISO-639 codes related to *iso_code*.

    Returns a dict mapping code-table names ("639-1", "639-2/B", "639-2/T",
    "639-3") to the code listed for each on the SIL page for *iso_code*.
    Returns an empty dict when the page has no matching sections.
    """
    # Reuse the module-level HTMLSession for consistency with the other
    # scraping helpers (the original built a throwaway local session that
    # shadowed the shared one).
    # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
    r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
    obsolete_codes = {}
    code_table_names = ("639-1", "639-2/B", "639-2/T", "639-3")
    for found_element in r.html.find(".views-field-nothing", clean=True):
        for line in found_element.text.splitlines():
            for obsolete_code_name in code_table_names:
                if obsolete_code_name in line and ":" in line:
                    # The code itself is the last whitespace-separated token.
                    obsolete_codes[obsolete_code_name] = line.split()[-1]
    return obsolete_codes
@st.cache
def try_retrieving_glottolog_id(langtext):
    """Search Glottolog for *langtext* and return a languoid ID.

    A successful search redirects to a languoid page whose URL ends with the
    ID; returns "" when the search did not land on a languoid page.
    """
    encoded_term = urllib.parse.quote(langtext)
    response = session.get(f"https://glottolog.org/glottolog?search={encoded_term}")
    final_url = response.html.url
    if "languoid" not in final_url:
        return ""
    # The languoid ID is the last path segment of the redirect target.
    return final_url.split("/")[-1]
@st.cache
def get_glottolog_json(languoid_id):
    """Fetch and return the JSON record for a Glottolog languoid ID."""
    endpoint = f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
    response = session.get(endpoint)
    return response.json()
@st.cache
def try_searching_vachan_engine(langtext):
    """Query the Vachan Engine language-search API for *langtext*.

    Returns the parsed JSON list of matches, or an empty list when the
    request does not succeed (non-200 status).
    """
    encoded_term = urllib.parse.quote(langtext)
    query_url = f"https://api.vachanengine.org/v2/languages?search_word={encoded_term}"
    vachan_response = requests.get(query_url)
    if vachan_response.status_code != 200:
        return []
    return vachan_response.json()
def main():
    """Render the Streamlit page: resolve a user-supplied language code or
    name to a BCP-47 tag via langcodes, then show related ISO-639 variants,
    scraped obsolete codes, Glottolog languoid IDs, and Vachan matches."""
    st.write("# Language code/tag search")
    st.write("Fed up with language tag confusion? Here's your one-stop shop!")
    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
    st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
    # https://huggingface.co/blog/streamlit-spaces
    # https://github.com/psf/requests-html
    # https://docs.streamlit.io/library/api-reference/write-magic/st.write
    example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
    langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
    # Easter-egg responses for two well-known non-language inputs.
    if langtext.lower() == "matlab":
        st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
        return
    if langtext.lower() == "python":
        st.success("[Python is the best language!](https://www.python.org/)")
        return
    # TODO: st.code() for these "lookup in progress" outputs.
    st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
    if langcodes.tag_is_valid(langtext):
        st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
    else:
        st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
    # First try to parse the input directly as a BCP-47 tag; a parse that
    # yields an "unknown" display name is treated the same as a parse failure.
    try:
        lang = langcodes.Language.get(langtext)
        # st.write(f"{lang} is the BCP-47 tag.")
        if "unknown" in lang.display_name().lower():
            st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
            lang = None
    except langcodes.LanguageTagError as e:
        st.info(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
        lang = None
    # Fall back to natural-language name search when direct parsing failed.
    if lang is None:
        try:
            found = langcodes.find(langtext)
            lang = found
            st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
        except LookupError as e:
            st.error("## Result: failure!")
            st.error(f"Unable to look up BCP-47 tag. But all hope is not lost...")
            st.write(f"* You can also try https://r12a.github.io/app-subtags/")
            lang = None
    t_variant = None
    b_variant = None
    #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
    if lang is not None:
        display = lang.display_name()
        b_variant = lang.to_alpha3(variant='B')
        t_variant = lang.to_alpha3(variant='T')
        broader_tags = lang.broader_tags()
        standardized_tag = langcodes.standardize_tag(lang)
        st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
        # TODO: make a results dictionary so it's easy to copy-paste?
        st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
        st.write(f"Breakdown of tag components:")
        st.write(lang.describe())
        st.write(f"Display name for {lang}: {lang.display_name()}")
        st.write(f"Autonym for {lang}: {lang.autonym()}")
        st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
        st.write("## Further Information:")
        st.write(f"Broader tags for this language, if any:")
        st.write(broader_tags)
        st.write(f"### Language Subtag Search Tool")
        st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
        st.write("### Older / Related Codes")
        st.write(f"ISO 639-2 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-2 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
        # ethnologue prefers T for german (deu), and T for French
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
        if t_variant != b_variant:
            st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
        st.write("#### Codes scraped from iso639-3.sil.org")
        #TODO: Cleanup this bit
        t_obsolete_codes = pull_obsolete_codes(t_variant)
        b_obsolete_codes = pull_obsolete_codes(b_variant)
        if t_obsolete_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
            st.write(t_obsolete_codes)
        elif b_obsolete_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
            st.write(b_obsolete_codes)
        st.write(f"### Glottolog")
        # Try the raw input plus both alpha3 variants against Glottolog,
        # de-duplicating languoid IDs across the searches.
        search_terms_for_glottolog = [langtext, t_variant, b_variant]
        languoids = []
        for search_term in search_terms_for_glottolog :
            if search_term :
                languoid_id = try_retrieving_glottolog_id(search_term )
                if languoid_id:
                    if languoid_id not in languoids:
                        st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
                        # get_glottolog_json(languoid_id)
                        languoids.append(languoid_id)
        # NOTE(review): nesting reconstructed from flattened source — the
        # Vachan search appears to run only when a tag was found, matching
        # the file's TODO "add in vachan search even if lang not found".
        results_from_vachan = try_searching_vachan_engine(langtext)
        if results_from_vachan:
            st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
            st.write(results_from_vachan)
# Script entry point. (A stray "|" scraping artifact after main() was removed;
# it would have been a syntax error.)
if __name__ == "__main__":
    main()