# Streamlit app: look up the modern BCP-47 tag for a language code or name
# with the langcodes library, and show related codes and reference links.
#
# Background reading:
#   https://huggingface.co/blog/streamlit-spaces
#   https://github.com/psf/requests-html
#   https://docs.streamlit.io/library/api-reference/write-magic/st.write
from urllib.parse import quote_plus

import langcodes
import streamlit as st

# Labels searched for on the ISO 639-3 code-table page.
OBSOLETE_CODE_NAMES = ("639-1", "639-2/B", "639-2/T", "639-3")


def pull_obsolete_codes(iso_code):
    """Scrape the ISO 639-3 code-table page for the codes listed for *iso_code*.

    Args:
        iso_code: Code appended to ``https://iso639-3.sil.org/code/``.

    Returns:
        A dict mapping each label in ``OBSOLETE_CODE_NAMES`` found on the page
        to the last whitespace-separated token of the line mentioning it.
        Empty when the page has no ``.views-field-nothing`` element (for
        example when the code has no entry), instead of raising.

    Raises:
        OSError: propagated from the HTTP request on network failure
            (requests' exceptions derive from it).
    """
    # Kept function-local, as in the original, so requests_html is only
    # imported when a lookup actually reaches this point.
    from requests_html import HTMLSession

    # Initialised before the loop so a page without matching elements yields
    # an empty dict rather than an UnboundLocalError.
    obsolete_codes = {}
    # The context manager closes the underlying HTTP session when done.
    with HTMLSession() as session:
        response = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
        # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
        for found_element in response.html.find(".views-field-nothing", clean=True):
            for line in found_element.text.splitlines():
                for code_name in OBSOLETE_CODE_NAMES:
                    if code_name in line:
                        # The code is the last token on the matching line.
                        obsolete_codes[code_name] = line.split()[-1]
    return obsolete_codes


st.write("# Language code/tag search")
st.write("Fed up with language tag confusion? Here's your one-stop shop!")
st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English`, and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")

langtext = st.text_input("Language Code/Tag Lookup using langcodes", "english")

st.write("Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
if langcodes.tag_is_valid(langtext):
    st.write(f"...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
else:
    st.write(f"...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")

# Step 1: try to interpret the input directly as a language tag.
try:
    lang = langcodes.Language.get(langtext)
except langcodes.LanguageTagError:
    st.write("Could not lookup code directly, attempting to search for it as a natural language string.")
    lang = None
else:
    # A syntactically valid but unassigned tag gets an "Unknown language"
    # display name; treat that as a miss and fall through to the name search.
    if "unknown" in lang.display_name().lower():
        st.write(f"Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
        lang = None

# Step 2: fall back to searching for the input as a natural-language name.
if lang is None:
    try:
        lang = langcodes.find(langtext)
    except LookupError:
        st.write("Unable to look up language code.")
        st.write("Try also: https://r12a.github.io/app-subtags/")
        # URL-encode the free-form user text so names containing spaces or
        # punctuation still produce a working link.
        st.write(f"Try also: https://glottolog.org/glottolog?search={quote_plus(langtext)}")
        lang = None
    else:
        st.write(f"natural language search found the following BCP-47 tag: {lang}")

# Step 3: report everything we know about the resolved language.
if lang is not None:
    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
    st.write(f"Breakdown of tag components: {lang.describe()}")
    st.write(f"Display name for {lang}: {lang.display_name()}")
    st.write(f"Autonym for {lang}: {lang.autonym()}")

    b_variant = lang.to_alpha3(variant='B')
    t_variant = lang.to_alpha3(variant='T')
    st.write(f"ISO 639-3 'alpha3' code, 'terminology' variant (deprecated): {t_variant}")
    st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' variant (deprecated): {b_variant}")
    st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}")
    st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

    # Ethnologue prefers T for German (deu), and T for French.
    try:
        obsolete_codes = pull_obsolete_codes(t_variant)
    except OSError:
        # The scrape is auxiliary information: a network failure (requests'
        # exceptions derive from OSError) should not abort the whole page.
        obsolete_codes = {}
        st.write("Could not retrieve obsolete codes from https://iso639-3.sil.org (network error).")

    if obsolete_codes:
        st.write("Obsolete codes from previous ISO-639 iterations:")
        st.write(obsolete_codes)

    broader_tags = lang.broader_tags()
    st.write("Broader tags for this language, if any:")
    st.write(broader_tags)

    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}")
    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be of interest, with links to Ethnologue, etc. If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}")

    st.write(f"## Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library: {langcodes.standardize_tag(lang)}")