Spaces:

cdleong
/

langcode-search

Runtime error

App Files Files Community

langcode-search / app.py

cdleong

Update app.py

92a84ae about 3 years ago

raw

history blame

3.76 kB

	from urllib.parse import quote_plus

	import langcodes
	import streamlit as st


	# https://huggingface.co/blog/streamlit-spaces
	# Free-text query from the user: either a language tag or a language name.
	# The second argument ("english") is the value pre-filled in the input box.
	langtext = st.text_input("language lookup using https://github.com/rspeer/langcodes, see also https://r12a.github.io/app-subtags/", "english")

	st.write("Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")

	if langcodes.tag_is_valid(langtext):
	    st.write(f"...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
	else:
	    st.write(f"...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")


	# Step 1: try to interpret the input directly as a language tag.
	# `lang` ends up as a Language object on success, or None when the direct
	# lookup fails or only yields an "unknown" display name.
	try:
	    lang = langcodes.Language.get(langtext)
	except langcodes.LanguageTagError:
	    st.write("Could not lookup code directly, attempting to search for it as a natural language string.")
	    lang = None
	else:
	    direct_name = lang.display_name()
	    # A display name containing "unknown" means the tag parsed but was not
	    # recognised, so treat it as a miss and fall through to the name search.
	    if "unknown" in direct_name.lower():
	        st.write(f"Attempting to lookup the code directly gives us '{direct_name}', attempting to search for it as a natural language string.")
	        lang = None


	# Step 2: fall back to searching for the input as a natural-language name.
	if lang is None:
	    try:
	        lang = langcodes.find(langtext)
	    except LookupError:
	        # `lang` is still None here because the assignment above never ran.
	        st.write("Unable to look up language code.")
	        st.write("Try also: https://r12a.github.io/app-subtags/")
	        # URL-encode the raw user text so spaces and reserved characters
	        # do not break the search link.
	        st.write(f"Try also: https://glottolog.org/glottolog?search={quote_plus(langtext)}")
	    else:
	        st.write(f"natural language search found the following BCP-47 tag: {lang}")


	def pull_obsolete_codes(iso_code):
	    """Scrape https://iso639-3.sil.org/code/<iso_code> for the codes it lists.

	    Every element matching the CSS selector ``.views-field-nothing`` is
	    scanned line by line. When a line contains one of the labels
	    ``639-1``, ``639-2/B``, ``639-2/T`` or ``639-3``, the last
	    whitespace-separated token of that line is recorded as the code for
	    that label. Matching lines and extracted codes are also printed to
	    stdout.

	    Args:
	        iso_code: Code appended to the SIL URL (callers in this file pass
	            an alpha3 code).

	    Returns:
	        dict mapping each label found to its extracted code. Empty when the
	        page has no matching element or no matching line. If several
	        elements carry the same label, the last one wins.

	    Note:
	        Errors raised by the HTTP request are not caught here and propagate
	        to the caller.
	    """
	    # Imported lazily, as in the original, so the dependency is only needed
	    # when this function is actually called.
	    from requests_html import HTMLSession

	    code_labels = ("639-1", "639-2/B", "639-2/T", "639-3")
	    # Bound before the loops so the return below is always defined, even
	    # when the page has no matching element.
	    obsolete_codes = {}
	    # The context manager closes the session once scraping is done.
	    with HTMLSession() as session:
	        r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
	        for thing in r.html.find(".views-field-nothing", clean=True):
	            for line in thing.text.splitlines():
	                for obsolete_code_name in code_labels:
	                    if obsolete_code_name in line:
	                        print(line)
	                        # The code is taken to be the last token on the line.
	                        code = line.split()[-1]
	                        print(code)
	                        obsolete_codes[obsolete_code_name] = code
	    return obsolete_codes


	#st.write(f"langcodes found the following tag: {type(found)}") # a Language object
	# Everything below needs a resolved Language object, so it is all nested
	# under this guard; nothing is shown when both lookups failed.
	if lang is not None:
	    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
	    st.write(f"Breakdown of tag components: {lang.describe()}")
	    st.write(f"Display name for {lang}: {lang.display_name()}")
	    st.write(f"Autonym for {lang}: {lang.autonym()}")

	    # Three-letter codes in both variants, used for the links below.
	    b_variant = lang.to_alpha3(variant='B')
	    t_variant = lang.to_alpha3(variant='T')
	    st.write(f"ISO 639-3 'alpha3' code, 'terminology' variant (deprecated): {t_variant}")
	    st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' variant (deprecated): {b_variant}")
	    st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}")
	    st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

	    # ethnologue prefers T for german (deu), and T for French
	    # The scrape is auxiliary and its result is not displayed anywhere yet,
	    # so a failure is reported instead of aborting the rest of the page.
	    try:
	        obsolete_codes = pull_obsolete_codes(t_variant)
	    except Exception as err:
	        obsolete_codes = {}
	        st.write(f"Could not retrieve additional codes from https://iso639-3.sil.org/code/{t_variant}: {err}")

	    broader_tags = lang.broader_tags()
	    st.write(f"Broader tags for this language, if any: {broader_tags}")
	    st.write(f"Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library: {langcodes.standardize_tag(lang)}")
	    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}")
	    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be of interest, with links to Ethnologue, etc. If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}")