# cdleong's picture
import urllib
import urllib.parse

import langcodes
import requests
import streamlit as st
from requests_html import HTMLSession
# Module-level requests_html session; the Glottolog helpers in this file call
# session.get() on it.
session = HTMLSession()
# FEATURE: get wikipedia codes, e.g. from or, some of which are nonstandard. Then output f"{code}"
# Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
# TODO: add in some nice things from like error codes and status messages.
# TODO: add in vachan search even if lang not found
# TODO: results from glottolog even if none from others
# TODO: check glottolog results to see if they find anything!
# Inputs worth trying by hand in the search box; each entry notes the edge
# case it exercises.
things_to_test = [
    "knh",  # deprecated code on ISO
    "khn",  # only has 639-3 on ISO
    "xxx",  # no such code on ISO or glottolog
    "Chinese",  # Vachan struggles.
    "Mandarin",  # Vachan struggles.
    "zh-CN",  # Chinese again.
    "French (Canada)",
    "",  # empty string
]
def get_bcp47_from_langcode(langtext):
    """Placeholder for a language-code to BCP-47 lookup.

    NOTE(review): this function has no body in the file and nothing visible
    calls it, so it is kept as an explicit no-op stub rather than guessing at
    an implementation. Confirm whether it should be implemented or removed.

    Args:
        langtext: Language code or name (currently ignored).

    Returns:
        None, always.
    """
    return None
def pull_obsolete_codes(iso_code):
    """Scrape the older ISO 639 codes listed on a code's ISO 639-3 page.

    Every element matching ``.views-field-nothing`` is split into lines; a
    line that contains a ":" and one of the known part labels contributes its
    last whitespace-separated token as the code for that label.

    Args:
        iso_code: Three-letter ISO 639 code to look up. Empty or None yields
            an empty result without any network access.

    Returns:
        Dict mapping part labels ("639-1", "639-2/B", "639-2/T", "639-3") to
        the code found for that part; empty when nothing matched.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    if not iso_code:
        return {}

    code_labels = ("639-1", "639-2/B", "639-2/T", "639-3")
    obsolete_codes = {}

    # NOTE(review): the file only had f"{iso_code}" here (no scheme or host),
    # which cannot be requested. The base URL below is reconstructed from the
    # public ISO 639-3 site layout - confirm it is the intended page.
    page_url = f"https://iso639-3.sil.org/code/{iso_code}"

    # A fresh session per call, closed on exit so connections are not leaked.
    with HTMLSession() as scrape_session:
        r = scrape_session.get(page_url, timeout=10)
        for found_element in r.html.find(".views-field-nothing", clean=True):
            for line in found_element.text.splitlines():
                if ":" not in line:
                    continue
                for label in code_labels:
                    if label in line:
                        obsolete_codes[label] = line.split()[-1]
    return obsolete_codes
def try_retrieving_glottolog_id(langtext):
    """Best-effort lookup of a Glottolog languoid ID for a search term.

    Requests a Glottolog search for the term and inspects the URL of the
    response: when it contains "languoid", its last path segment is taken as
    the languoid ID.

    Args:
        langtext: Language name or code to search for. Empty or None yields
            "" without any network access.

    Returns:
        The languoid ID string, or "" when the search did not land on a
        languoid URL or the request failed.
    """
    if not langtext:
        return ""

    langtext_quoted = urllib.parse.quote(langtext)
    # NOTE(review): the assignment to query_url was missing from the file
    # (NameError at the request below). This search endpoint is reconstructed
    # from Glottolog's public URL scheme - confirm.
    query_url = f"https://glottolog.org/glottolog?search={langtext_quoted}"

    try:
        glottolog_r = session.get(query_url, timeout=10)
    except requests.RequestException:
        # This is a "try_" helper: a network failure just means "no ID found".
        return ""

    returned_url = glottolog_r.html.url
    if "languoid" not in returned_url:
        return ""
    return returned_url.split("/")[-1]
def get_glottolog_json(languoid_id):
    """Fetch the JSON description of a Glottolog languoid.

    Args:
        languoid_id: Glottolog languoid ID. Empty or None yields an empty
            dict without any network access.

    Returns:
        The decoded JSON body of the response, or {} for an empty ID.

    Raises:
        requests.RequestException: if the request fails.
        ValueError: if the response body is not valid JSON.
    """
    if not languoid_id:
        return {}

    # NOTE(review): the assignment to query_url was missing from the file
    # (NameError at the request below). This resource URL is reconstructed
    # from Glottolog's public URL scheme - confirm.
    query_url = f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
    glottolog_r = session.get(query_url, timeout=10)
    return glottolog_r.json()
def try_searching_vachan_engine(langtext):
    """Best-effort search for a term via an HTTP GET returning JSON.

    Args:
        langtext: Language name or code to search for. Empty or None yields
            [] without any network access.

    Returns:
        The decoded JSON body when the response status is 200; otherwise an
        empty list (also on network errors or an undecodable body).
    """
    if not langtext:
        return []

    langtext_quoted = urllib.parse.quote(langtext)
    # NOTE(review): as found, the query URL is only the quoted search term -
    # the scheme/host/path prefix looks like it is missing. Left unchanged;
    # until it is restored the request fails and this returns [].
    query_url = f"{langtext_quoted}"

    try:
        # Without a timeout a stalled server would hang the page indefinitely.
        vachan_r = requests.get(query_url, timeout=10)
    except requests.RequestException:
        return []

    if vachan_r.status_code != 200:
        return []

    try:
        return vachan_r.json()
    except ValueError:
        # Status 200 with a body that is not JSON: treat as "no results".
        return []
def main():
    """Render the Streamlit language code/tag search page.

    Reads a language code or name from a text input, resolves it to a BCP-47
    tag with langcodes (direct tag parsing first, natural-language search as
    the fallback), then writes the result together with related ISO 639,
    Glottolog and Vachan Engine lookups to the page.

    NOTE(review): several markdown link targets / URLs inside the user-facing
    strings below look truncated (e.g. "[langcodes]( library"). The strings
    are reproduced exactly as found; the targets need restoring.
    """
    st.write("# Language code/tag search")
    st.write("Fed up with language tag confusion? Here's your one-stop shop!")
    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes]( library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](")
    st.write(f"**Feedback:** Provide feedback at, or via slack:")

    example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
    langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()

    if langtext.lower() == "matlab":
        st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")

    if langtext.lower() == "python":
        st.success("[Python is the best language!](")

    # TODO: st.code() for these "lookup in progress" outputs.
    # NOTE(review): the call wrapping each progress message was missing from
    # the file (only the string and its closing parenthesis remained).
    # st.info is assumed because every st.write/st.error/st.success call
    # survived intact - confirm.
    st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")

    if langcodes.tag_is_valid(langtext):
        st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
    else:
        st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")

    # Step 1: try to parse the input directly as a language tag.
    try:
        lang = langcodes.Language.get(langtext)
        if "unknown" in lang.display_name().lower():
            st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
            lang = None
    except langcodes.LanguageTagError:
        st.info("* Could not lookup code directly, attempting to search for it as a natural language string.")
        lang = None

    # Step 2: fall back to treating the input as a language name.
    if lang is None:
        try:
            lang = langcodes.find(langtext)
            st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
        except LookupError:
            st.error("## Result: failure!")
            st.error(f"Unable to look up BCP-47 tag. But all hope is not lost...")
            st.write(f"* You can also try")
            lang = None

    # Stay None when no language was resolved, so the Glottolog search below
    # simply skips them.
    t_variant = None
    b_variant = None

    if lang is not None:
        b_variant = lang.to_alpha3(variant='B')
        t_variant = lang.to_alpha3(variant='T')
        broader_tags = lang.broader_tags()
        standardized_tag = langcodes.standardize_tag(lang)

        st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
        # TODO: make a results dictionary so it's easy to copy-paste?
        st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
        st.write(f"Breakdown of tag components:")
        # The header above was followed by nothing; show the component
        # breakdown it announces.
        st.write(lang.describe())
        st.write(f"Display name for {lang}: {lang.display_name()}")
        st.write(f"Autonym for {lang}: {lang.autonym()}")
        st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")

        st.write("## Further Information:")
        st.write(f"Broader tags for this language, if any:")
        # broader_tags was computed but never displayed.
        st.write(broader_tags)

        st.write(f"### Language Subtag Search Tool")
        st.write(f"Try also:{lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")

        st.write("### Older / Related Codes")
        st.write(f"ISO 639-2 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-2 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")

        # ethnologue prefers T for german (deu), and T for French
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
        if t_variant != b_variant:
            st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at{b_variant}")

        st.write("#### Codes scraped from")
        t_obsolete_codes = pull_obsolete_codes(t_variant)
        if t_obsolete_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from{t_variant}:")
            st.write(t_obsolete_codes)
        elif t_variant != b_variant:
            # Only fetch the B-variant page when it is a different code and
            # the T-variant page yielded nothing.
            b_obsolete_codes = pull_obsolete_codes(b_variant)
            if b_obsolete_codes:
                st.write(f"Older codes from previous ISO-639 iterations, pulled from{b_variant}:")
                st.write(b_obsolete_codes)

    st.write(f"### Glottolog")
    search_terms_for_glottolog = [langtext, t_variant, b_variant]
    languoids = []
    for search_term in search_terms_for_glottolog:
        if not search_term:
            continue
        languoid_id = try_retrieving_glottolog_id(search_term)
        if not languoid_id or languoid_id in languoids:
            continue
        st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}]({languoid_id})")
        # Record the ID so the same languoid is not reported again for a
        # later search term (the list was never filled before).
        languoids.append(languoid_id)
        # get_glottolog_json(languoid_id) is available but not used yet.

    results_from_vachan = try_searching_vachan_engine(langtext)
    if results_from_vachan:
        st.write("### Other potential matches, from [Vachan Engine]( (experimental)")
        # The header was shown without the matches themselves.
        st.write(results_from_vachan)
# Script entry point: render the page when the file is run directly.
if __name__ == "__main__":
    main()