Spaces:

cdleong
/

langcode-search

Runtime error

App Files Files Community

langcode-search / app.py

cdleong

Update app.py

236ba5e over 2 years ago

raw history blame

No virus

9.2 kB

	import urllib
	import urllib.parse

	import langcodes
	import requests
	import streamlit as st
	from requests_html import HTMLSession
	# Module-level requests-html session; the Glottolog helpers below
	# (try_retrieving_glottolog_id, get_glottolog_json) issue their requests through it.
	session = HTMLSession()
	# FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
	# Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
	# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
	# TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
	# TODO: add in vachan search even if lang not found
	# TODO: results from glottolog even if none from others
	# TODO: check glottolog results to see if they find anything!
	# Manual test inputs for the lookup box, covering codes, tags, names and edge cases.
	things_to_test = [
	    # --- ISO edge cases ---
	    "knh",  # ISO lists this code as deprecated
	    "khn",  # ISO only has a 639-3 entry for this one
	    "xxx",  # unknown to both ISO and Glottolog
	    # --- inputs the Vachan search handles poorly ---
	    "Chinese",
	    "Mandarin",
	    # --- tags and names that should resolve ---
	    "zh-CN",  # Chinese once more, as a tag
	    "Chinese",
	    "zh-Latn-pinyin",
	    "en-Latn-US",
	    "en",
	    "English",
	    "fr-CA",
	    "French (Canada)",
	    "français",
	    "法语",
	    # --- degenerate input ---
	    "",  # the empty string
	]


	def get_bcp47_from_langcode(langtext):
	    """Placeholder for a langtext -> BCP-47 lookup; not implemented yet.

	    The argument is ignored and the function always returns None.
	    """
	    return None

	#@st.cache
	def pull_obsolete_codes(iso_code):
	    """Scrape the iso639-3.sil.org page for ``iso_code`` and collect its code-set entries.

	    Args:
	        iso_code: Code appended to ``https://iso639-3.sil.org/code/``.

	    Returns:
	        A dict keyed by code-set name ("639-1", "639-2/B", "639-2/T",
	        "639-3"). Each value is the last whitespace-separated token of the
	        scraped line that mentions that name. The dict is empty when
	        ``iso_code`` is empty/None or when no matching line is found.
	    """
	    if not iso_code:
	        # Nothing to look up; avoid requesting ".../code/None" or ".../code/".
	        return {}

	    code_set_names = ("639-1", "639-2/B", "639-2/T", "639-3")
	    obsolete_codes = {}

	    # The context manager closes the per-call session (and its connection
	    # pool) instead of leaving it open until garbage collection.
	    with HTMLSession() as scrape_session:
	        r = scrape_session.get(f"https://iso639-3.sil.org/code/{iso_code}")
	        # https://www.w3schools.com/cssref/css_selectors.asp
	        for found_element in r.html.find(".views-field-nothing", clean=True):
	            for line in found_element.text.splitlines():
	                if ":" not in line:
	                    continue
	                for code_set_name in code_set_names:
	                    if code_set_name in line:
	                        obsolete_codes[code_set_name] = line.split()[-1]
	    return obsolete_codes

	#@st.cache
	def try_retrieving_glottolog_id(langtext):
	    """Search Glottolog for ``langtext`` and return the languoid ID it lands on.

	    The search URL is requested through the module-level ``session``. If the
	    final URL's path mentions "languoid", the last path segment is returned
	    as the ID.

	    Args:
	        langtext: Free text or code to search for.

	    Returns:
	        The languoid ID string, or "" when the final URL is not a languoid page.
	    """
	    langtext_quoted = urllib.parse.quote(langtext)
	    query_url = f"https://glottolog.org/glottolog?search={langtext_quoted}"
	    glottolog_r = session.get(query_url)
	    returned_url = glottolog_r.html.url

	    # Only look at the path: the query string echoes the search text, so a
	    # search containing "languoid" would otherwise be mistaken for a hit and
	    # "glottolog?search=..." returned as the ID.
	    url_path = urllib.parse.urlsplit(returned_url).path
	    if "languoid" not in url_path:
	        return ""

	    path_segments = [segment for segment in url_path.split("/") if segment]
	    return path_segments[-1] if path_segments else ""

	#@st.cache
	def get_glottolog_json(languoid_id):
	    """Fetch the Glottolog JSON resource for ``languoid_id`` and return it decoded.

	    The request goes through the module-level ``session``.
	    """
	    resource_url = f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
	    response = session.get(resource_url)
	    decoded = response.json()
	    return decoded

	#@st.cache
	def try_searching_vachan_engine(langtext):
	    """Best-effort language search against the Vachan Engine API.

	    Args:
	        langtext: Text to pass as the ``search_word`` query parameter.

	    Returns:
	        The decoded JSON body when the API answers with HTTP 200. Returns an
	        empty list on any other status, on a network error or timeout, or
	        when the body is not valid JSON.
	    """
	    langtext_quoted = urllib.parse.quote(langtext)
	    query_url = f"https://api.vachanengine.org/v2/languages?search_word={langtext_quoted}"
	    try:
	        # Without a timeout a stalled server would hang the whole page.
	        vachan_r = requests.get(query_url, timeout=10)
	        if vachan_r.status_code == 200:
	            return vachan_r.json()
	    except (requests.RequestException, ValueError):
	        # This search is an optional extra: an unreachable API or malformed
	        # body means "no results", not a crashed app.
	        return []
	    return []


	def _resolve_language(langtext):
	    """Resolve user text to a langcodes ``Language``, reporting each step in the UI.

	    First tries to read ``langtext`` as a language tag; if that fails or
	    yields an "unknown" display name, falls back to a natural-language name
	    search. Returns None after showing the failure messages when both fail.
	    """
	    try:
	        lang = langcodes.Language.get(langtext)
	    except langcodes.LanguageTagError:
	        st.info("* Could not lookup code directly, attempting to search for it as a natural language string.")
	        lang = None
	    else:
	        if "unknown" in lang.display_name().lower():
	            st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
	            lang = None

	    if lang is not None:
	        return lang

	    try:
	        lang = langcodes.find(langtext)
	    except LookupError:
	        st.error("## Result: failure!")
	        st.error("Unable to look up BCP-47 tag. But all hope is not lost...")
	        st.write("* You can also try https://r12a.github.io/app-subtags/")
	        return None

	    st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
	    return lang


	def _safe_alpha3(lang, variant):
	    """Return ``lang.to_alpha3(variant=variant)``, or None if it raises LookupError."""
	    try:
	        return lang.to_alpha3(variant=variant)
	    except LookupError:
	        return None


	def _show_bcp47_results(langtext, lang):
	    """Write the BCP-47 result, tag breakdown and further-information sections."""
	    standardized_tag = langcodes.standardize_tag(lang)

	    st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
	    # TODO: make a results dictionary so it's easy to copy-paste?
	    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
	    st.write("Breakdown of tag components:")
	    st.write(lang.describe())
	    st.write(f"Display name for {lang}: {lang.display_name()}")
	    st.write(f"Autonym for {lang}: {lang.autonym()}")
	    st.write(f"Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library: `{standardized_tag}`")

	    st.write("## Further Information:")

	    st.write("Broader tags for this language, if any:")
	    st.write(lang.broader_tags())

	    st.write("### Language Subtag Search Tool")
	    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")


	def _show_older_codes(t_variant, b_variant):
	    """Write the ISO 639-2 alpha3 codes and any codes scraped from iso639-3.sil.org.

	    Either variant may be None when no alpha3 code is known; the link and
	    scraping steps are skipped for a missing variant.
	    """
	    st.write("### Older / Related Codes")

	    st.write(f"ISO 639-2 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
	    st.write(f"ISO 639-2 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")

	    # ethnologue prefers T for german (deu), and T for French
	    if t_variant is not None:
	        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
	    if b_variant is not None and t_variant != b_variant:
	        st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

	    st.write("#### Codes scraped from iso639-3.sil.org")
	    # T is preferred; B is fetched only when T is missing or gave nothing.
	    # dict.fromkeys drops the duplicate when both variants are the same code.
	    for variant in dict.fromkeys((t_variant, b_variant)):
	        if variant is None:
	            continue
	        scraped_codes = pull_obsolete_codes(variant)
	        if scraped_codes:
	            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{variant}:")
	            st.write(scraped_codes)
	            break


	def _show_glottolog(search_terms):
	    """Search Glottolog for each non-empty term and write every distinct languoid ID found."""
	    st.write("### Glottolog")
	    seen_languoids = set()
	    # dict.fromkeys keeps order while skipping repeated terms (e.g. T == B).
	    for search_term in dict.fromkeys(search_terms):
	        if not search_term:
	            continue
	        languoid_id = try_retrieving_glottolog_id(search_term)
	        if languoid_id and languoid_id not in seen_languoids:
	            st.write(f"Glottolog Languoid ID: Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
	            # get_glottolog_json(languoid_id)
	            seen_languoids.add(languoid_id)


	def main():
	    """Render the Streamlit page: read a language code/name and show what can be found for it.

	    Steps: intro text and input box, two easter-egg short-circuits, IANA
	    validity check, BCP-47 resolution via langcodes, then related ISO codes,
	    Glottolog languoid IDs and Vachan Engine matches.
	    """
	    st.write("# Language code/tag search")
	    st.write("Fed up with language tag confusion? Here's your one-stop shop!")
	    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
	    st.write("Feedback: Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

	    # https://huggingface.co/blog/streamlit-spaces
	    # https://github.com/psf/requests-html
	    # https://docs.streamlit.io/library/api-reference/write-magic/st.write
	    example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
	    langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()

	    lowered = langtext.lower()
	    if lowered == "matlab":
	        st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
	        return

	    if lowered == "python":
	        st.success("[Python is the best language!](https://www.python.org/)")
	        return

	    # TODO: st.code() for these "lookup in progress" outputs.
	    st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")

	    if langcodes.tag_is_valid(langtext):
	        st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
	    else:
	        st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")

	    lang = _resolve_language(langtext)

	    t_variant = None
	    b_variant = None

	    if lang is not None:
	        # to_alpha3 raises LookupError for languages without a known alpha3
	        # code; treat that as "no code" rather than aborting the page.
	        b_variant = _safe_alpha3(lang, "B")
	        t_variant = _safe_alpha3(lang, "T")
	        _show_bcp47_results(langtext, lang)
	        _show_older_codes(t_variant, b_variant)

	    # These lookups only need the raw text, so they run even when no BCP-47
	    # tag was resolved; missing alpha3 variants are skipped as empty terms.
	    _show_glottolog([langtext, t_variant, b_variant])

	    results_from_vachan = try_searching_vachan_engine(langtext)
	    if results_from_vachan:
	        st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
	        st.write(results_from_vachan)


	if __name__ == "__main__":
	    main()