Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import langcodes | |
| from requests_html import HTMLSession | |
| import urllib | |
| import requests | |
# Module-level requests-html session, used by try_retrieving_glottolog_id
# and get_glottolog_json below.
session = HTMLSession()
| # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org" | |
| # Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements? | |
| # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3 | |
| # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages. | |
| # TODO: add in vachan search even if lang not found | |
| # TODO: results from glottolog even if none from others | |
| # TODO: check glottolog results to see if they find anything! | |
# Sample inputs for exercising the lookup by hand: ISO codes, BCP-47 tags,
# language names in several languages, and edge cases. Nothing visible in
# this file reads the list.
things_to_test = [
    "knh",  # deprecated code on ISO
    "khn",  # only has 639-3 on ISO
    "xxx",  # no such code on ISO or glottolog
    "Chinese",  # Vachan struggles.
    "Mandarin",  # Vachan struggles.
    "zh-CN",  # Chinese again.
    "Chinese",  # duplicate of the earlier "Chinese" entry
    "zh-Latn-pinyin",
    "en-Latn-US",
    "en",
    "English",
    "fr-CA",
    "French (Canada)",
    "français",
    "法语",
    "",  # empty string
]
def get_bcp47_from_langcode(langtext):
    """Placeholder that is not implemented yet; always returns None.

    Args:
        langtext: Currently ignored.
    """
    return None
def pull_obsolete_codes(iso_code):
    """Scrape the ISO 639 code-set entries shown for *iso_code* on iso639-3.sil.org.

    Args:
        iso_code: Code to look up; interpolated into the page URL as-is.

    Returns:
        A dict mapping each code-set label found on the page ("639-1",
        "639-2/B", "639-2/T", "639-3") to the last whitespace-separated
        token of the line that mentions it. Empty when *iso_code* is falsy
        or when no line matched.
    """
    if not iso_code:
        # Nothing to look up; avoid requesting ".../code/" or ".../code/None".
        return {}

    code_set_labels = ("639-1", "639-2/B", "639-2/T", "639-3")
    obsolete_codes = {}
    # The session is scoped to this call and closed on exit, so a new
    # connection pool is not left open after every lookup.
    with HTMLSession() as scrape_session:
        response = scrape_session.get(f"https://iso639-3.sil.org/code/{iso_code}")
        # CSS class selector; see https://www.w3schools.com/cssref/css_selectors.asp
        for element in response.html.find(".views-field-nothing", clean=True):
            for line in element.text.splitlines():
                if ":" not in line:
                    continue
                code = line.split()[-1]
                if code.endswith(":"):
                    # The last token is the label itself (e.g. "639-1:"),
                    # i.e. the entry has no value; do not record it as a code.
                    continue
                for label in code_set_labels:
                    if label in line:
                        obsolete_codes[label] = code
    return obsolete_codes
def try_retrieving_glottolog_id(langtext):
    """Search Glottolog for *langtext* and return a languoid ID, or "" if none.

    The search term is URL-quoted and sent to Glottolog's search endpoint.
    When the URL reported for the response contains "languoid", the text
    after its last "/" is returned as the ID; otherwise the empty string is
    returned.
    """
    search_url = f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}"
    final_url = session.get(search_url).html.url
    if "languoid" not in final_url:
        return ""
    return final_url.rsplit("/", 1)[-1]
def get_glottolog_json(languoid_id):
    """Fetch Glottolog's JSON resource for *languoid_id* and return it decoded."""
    response = session.get(
        f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
    )
    return response.json()
def try_searching_vachan_engine(langtext):
    """Query the Vachan Engine language search for *langtext*.

    Returns the decoded JSON body when the response status is 200, and an
    empty list for any other status.
    """
    encoded_query = urllib.parse.quote(langtext)
    response = requests.get(
        f"https://api.vachanengine.org/v2/languages?search_word={encoded_query}"
    )
    return response.json() if response.status_code == 200 else []
def main():
    """Render the Streamlit page: resolve a language tag and show related codes.

    Reads one text input and tries to resolve it to a BCP-47 tag with
    langcodes, first as a tag and then as a natural-language name. When a tag
    is found, writes its details plus ISO 639 codes scraped from
    iso639-3.sil.org. Glottolog languoid IDs and Vachan Engine matches are
    looked up afterwards whether or not a tag was resolved.
    """
    st.write("# Language code/tag search")
    st.write("Fed up with language tag confusion? Here's your one-stop shop!")
    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
    st.write("**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

    # https://huggingface.co/blog/streamlit-spaces
    # https://github.com/psf/requests-html
    # https://docs.streamlit.io/library/api-reference/write-magic/st.write
    example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
    langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()

    # Easter eggs: these inputs short-circuit the lookup entirely.
    if langtext.lower() == "matlab":
        st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
        return
    if langtext.lower() == "python":
        st.success("[Python is the best language!](https://www.python.org/)")
        return

    # TODO: st.code() for these "lookup in progress" outputs.
    st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
    if langcodes.tag_is_valid(langtext):
        st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
    else:
        st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")

    # First attempt: treat the input as a language tag.
    try:
        lang = langcodes.Language.get(langtext)
        if "unknown" in lang.display_name().lower():
            # The tag parsed but names no known language; fall through to
            # the natural-language search below.
            st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
            lang = None
    except langcodes.LanguageTagError:
        st.info("* Could not lookup code directly, attempting to search for it as a natural language string.")
        lang = None

    # Second attempt: treat the input as a language name.
    if lang is None:
        try:
            lang = langcodes.find(langtext)
            st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
        except LookupError:
            st.error("## Result: failure!")
            st.error("Unable to look up BCP-47 tag. But all hope is not lost...")
            st.write("* You can also try https://r12a.github.io/app-subtags/")
            lang = None

    # Stay None when no tag was resolved, so the Glottolog search below
    # simply skips them.
    t_variant = None
    b_variant = None

    if lang is not None:
        # NOTE(review): langcodes documents to_alpha3 as raising LookupError
        # when no three-letter code is known; that case is not handled here.
        b_variant = lang.to_alpha3(variant='B')
        t_variant = lang.to_alpha3(variant='T')
        broader_tags = lang.broader_tags()
        standardized_tag = langcodes.standardize_tag(lang)

        st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
        # TODO: make a results dictionary so it's easy to copy-paste?
        st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
        st.write("Breakdown of tag components:")
        st.write(lang.describe())
        st.write(f"Display name for {lang}: {lang.display_name()}")
        st.write(f"Autonym for {lang}: {lang.autonym()}")
        st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")

        st.write("## Further Information:")
        st.write("Broader tags for this language, if any:")
        st.write(broader_tags)

        st.write("### Language Subtag Search Tool")
        st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")

        st.write("### Older / Related Codes")
        st.write(f"ISO 639-2 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-2 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
        # ethnologue prefers T for german (deu), and T for French
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
        if t_variant != b_variant:
            st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

        st.write("#### Codes scraped from iso639-3.sil.org")
        # Prefer the T page. Only fetch the B page when it is a different
        # code and the T page yielded nothing; when both variants are the
        # same code, a second request would hit the same URL.
        scraped_variant = t_variant
        scraped_codes = pull_obsolete_codes(t_variant)
        if not scraped_codes and b_variant != t_variant:
            scraped_variant = b_variant
            scraped_codes = pull_obsolete_codes(b_variant)
        if scraped_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{scraped_variant}:")
            st.write(scraped_codes)

    st.write("### Glottolog")
    # dict.fromkeys de-duplicates while keeping order, so identical T/B
    # codes are searched once; None/empty terms are dropped.
    search_terms_for_glottolog = [
        term for term in dict.fromkeys([langtext, t_variant, b_variant]) if term
    ]
    languoids = []
    for search_term in search_terms_for_glottolog:
        languoid_id = try_retrieving_glottolog_id(search_term)
        if languoid_id and languoid_id not in languoids:
            st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
            # TODO: show details from get_glottolog_json(languoid_id)
            languoids.append(languoid_id)

    results_from_vachan = try_searching_vachan_engine(langtext)
    if results_from_vachan:
        st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
        st.write(results_from_vachan)
# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()