Spaces:
Runtime error
Runtime error
import urllib
import urllib.parse

import langcodes
import streamlit as st
from requests_html import HTMLSession
# Page header and introduction.
# Background reading:
#   https://huggingface.co/blog/streamlit-spaces
#   https://github.com/psf/requests-html
#   https://docs.streamlit.io/library/api-reference/write-magic/st.write
page_title = "# Language code/tag search"
page_tagline = "Fed up with language tag confusion? Here's your one-stop shop!"
page_instructions = (
    "Try typing your language below! Accepts either codes like `eng`/`en`, "
    "or full names like `English`, and we will use the "
    "[langcodes](https://github.com/rspeer/langcodes) library to figure out "
    "the correct modern BCP-47 code according to "
    "[official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)"
)
for intro_paragraph in (page_title, page_tagline, page_instructions):
    st.write(intro_paragraph)

# Sample inputs shown in the text box's help tooltip.
example_valid_lookups = [
    "zh-CN",
    "Chinese",
    "zh-Latn-pinyin",
    "en-Latn-US",
    "en",
    "English",
    "fr-CA",
    "French (Canada)",
    "français",
    "法语",
]

# Read the user's query; surrounding whitespace is stripped before any lookup.
raw_langtext = st.text_input(
    "Language Code/Tag Lookup using langcodes",
    "english",
    help=f"Try language codes or language names! Examples: {example_valid_lookups}",
)
langtext = raw_langtext.strip()

# Report whether the raw input is already a valid tag.
st.write(
    "* Checking whether the tag is valid. That is, the language, script, "
    "territory, and variants (if present) are all tags that have meanings "
    "assigned by IANA."
)
if not langcodes.tag_is_valid(langtext):
    validity_report = (
        f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag "
        "according to IANA, some of its subcomponents may be invalid or it "
        "might be a natural language description."
    )
else:
    validity_report = (
        f"* ...True! '{langtext}' parses meaningfully as a language tag "
        "according to IANA."
    )
st.write(validity_report)
# Resolve the user's text to a langcodes Language object, stored in `lang`.
# Step 1: treat the text as a language tag. If that fails to parse, or parses
# to something whose display name contains "unknown", fall through to step 2.
try:
    lang = langcodes.Language.get(langtext)
except langcodes.LanguageTagError:
    st.write("* Could not lookup code directly, attempting to search for it as a natural language string.")
    lang = None
else:
    direct_display_name = lang.display_name()
    if "unknown" in direct_display_name.lower():
        st.write(f"* Attempting to lookup the code directly gives us '{direct_display_name}', attempting to search for it as a natural language string.")
        lang = None

# Step 2: treat the text as a language *name* (e.g. "French", "français").
# On failure `lang` stays None, which tells the results section to render nothing.
if lang is None:
    try:
        found = langcodes.find(langtext)
    except LookupError:
        st.write("## Result: failure!")
        st.write("Unable to look up language code. But all hope is not lost...")
        st.write("* You can also try https://r12a.github.io/app-subtags/")
        st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
        lang = None
    else:
        lang = found
        st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
def pull_obsolete_codes(iso_code, timeout=10.0):
    """Scrape older ISO 639 codes for *iso_code* from the SIL code tables.

    Fetches ``https://iso639-3.sil.org/code/<iso_code>`` and scans the text of
    every element matching the CSS selector ``.views-field-nothing``. For each
    text line that contains a colon and one of the code-set names ``639-1``,
    ``639-2/B``, ``639-2/T`` or ``639-3``, the last whitespace-separated token
    on the line is recorded as that code set's code.

    Args:
        iso_code: Code used to build the page URL (the caller passes a
            three-letter alpha3 code).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the app indefinitely.

    Returns:
        A dict mapping code-set name (e.g. ``"639-1"``) to the code found for
        it. Empty when the page has no matching lines.

    Raises:
        Whatever the underlying HTTP request raises on network failure or
        timeout; nothing is swallowed here.
    """
    # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
    code_set_names = ("639-1", "639-2/B", "639-2/T", "639-3")
    obsolete_codes = {}

    # The context manager closes the session (and its connection pool) even if
    # the request or the parsing raises.
    with HTMLSession() as session:
        response = session.get(
            f"https://iso639-3.sil.org/code/{iso_code}", timeout=timeout
        )
        for found_element in response.html.find(".views-field-nothing", clean=True):
            for line in found_element.text.splitlines():
                if ":" not in line:
                    continue
                for code_set_name in code_set_names:
                    if code_set_name in line:
                        # NOTE(review): assumes the code is the final token on
                        # the line — confirm against the live page layout.
                        obsolete_codes[code_set_name] = line.split()[-1]
    return obsolete_codes
#st.write(f"langcodes found the following tag: {type(found)}") # a Language object | |
# Render the results for a successfully resolved language. Nothing is shown
# when the lookup above failed (lang is None).
if lang is not None:
    # to_alpha3() raises LookupError when langcodes knows no three-letter code
    # for the language; in that case the alpha3-dependent sections are skipped
    # instead of crashing the whole page.
    try:
        t_variant = lang.to_alpha3(variant="T")
        b_variant = lang.to_alpha3(variant="B")
    except LookupError:
        t_variant = b_variant = None
    broader_tags = lang.broader_tags()

    st.write("## Results")
    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
    st.write("Breakdown of tag components:")
    st.write(lang.describe())
    st.write(f"Display name for {lang}: {lang.display_name()}")
    st.write(f"Autonym for {lang}: {lang.autonym()}")
    st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{langcodes.standardize_tag(lang)}`")

    st.write("## Further Information:")
    st.write("Broader tags for this language, if any:")
    st.write(broader_tags)
    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
    if t_variant is not None:
        st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
    st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")

    # ethnologue prefers T for german (deu), and T for French
    st.write("## Older Codes")
    if t_variant is None:
        st.write(f"No three-letter ISO 639 code is known for {lang}, so there are no older codes to show.")
    else:
        st.write(f"ISO 639-3 'alpha3' code, 'terminology' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' variant (deprecated): {b_variant}")
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}")
        st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

        # Try the T code first, then the B code. dict.fromkeys() drops the
        # duplicate when both are the same, so the same page is never fetched
        # twice.
        for alpha3_code in dict.fromkeys((t_variant, b_variant)):
            try:
                obsolete_codes = pull_obsolete_codes(alpha3_code)
            except OSError:
                # requests' exception classes derive from IOError/OSError. The
                # scrape is an optional extra, so report the failure instead of
                # replacing the results above with a traceback.
                st.write(f"Could not retrieve https://iso639-3.sil.org/code/{alpha3_code} to look for obsolete codes.")
                break
            if obsolete_codes:
                st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{alpha3_code}:")
                st.write(obsolete_codes)
                break