Spaces:
Runtime error
Runtime error
File size: 9,199 Bytes
68a8c29 6570b48 25a3c87 65485b5 bdc0541 cba4b15 1e9b3ba 54d3963 1e9b3ba d7bf3e7 1e9b3ba 741cd0d 54d3963 741cd0d 54d3963 b81a5c9 54d3963 741cd0d 1e9b3ba 2f590b1 741cd0d 16fc4ca 46c94f2 92a84ae 9989672 25a3c87 5d305df 25a3c87 5d305df 92a84ae dedac74 46c94f2 7a00e93 881bbde 46c94f2 cba4b15 7a00e93 46c94f2 bdc0541 8329262 55f8482 741cd0d cba4b15 741cd0d 10bed1d 741cd0d bdc0541 741cd0d 54d3963 741cd0d bdc0541 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 537dd73 92a84ae 741cd0d 54d3963 741cd0d 54d3963 741cd0d ca54e5d 741cd0d 54d3963 741cd0d 54d3963 cba4b15 741cd0d 54d3963 741cd0d cba4b15 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d 54d3963 741cd0d fa7755f 5459c25 741cd0d 07e3fb8 741cd0d 5459c25 741cd0d 5459c25 741cd0d 54d3963 cba4b15 54d3963 741cd0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import streamlit as st
import langcodes
from requests_html import HTMLSession
import urllib
import requests
# Shared scraping session, reused by the Glottolog/SIL helper functions below.
session = HTMLSession()
# FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
# Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
# TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
# TODO: add in vachan search even if lang not found
# TODO: results from glottolog even if none from others
# TODO: check glottolog results to see if they find anything!
# Manual regression inputs for the lookup flow; each entry exercises a known
# edge case (deprecated codes, 639-3-only codes, nonexistent codes, natural
# language names in several languages, and the empty string).
things_to_test = [
    "knh",  # deprecated code on ISO
    "khn",  # only has 639-3 on ISO
    "xxx",  # no such code on ISO or glottolog
    "Chinese",  # Vachan struggles.
    "Mandarin",  # Vachan struggles.
    "zh-CN",  # Chinese again.
    "Chinese",
    "zh-Latn-pinyin",
    "en-Latn-US",
    "en",
    "English",
    "fr-CA",
    "French (Canada)",
    "français",
    "法语",
    "",  # empty string
]
def get_bcp47_from_langcode(langtext):
    """Unimplemented stub.

    Intended (per the TODOs above) to resolve *langtext* to a BCP-47 tag;
    currently a no-op that returns None and is never called by main().
    """
    pass
@st.cache
def pull_obsolete_codes(iso_code):
    """Scrape iso639-3.sil.org for older ISO-639 codes related to *iso_code*.

    Returns a dict mapping code-table names ("639-1", "639-2/B", "639-2/T",
    "639-3") to the code listed for each on the SIL page for *iso_code*.
    Returns an empty dict when the page has no matching sections.
    """
    # Reuse the module-level HTMLSession for consistency with the other
    # scraping helpers (the original built a throwaway local session that
    # shadowed the shared one).
    # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
    r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
    obsolete_codes = {}
    code_table_names = ("639-1", "639-2/B", "639-2/T", "639-3")
    for found_element in r.html.find(".views-field-nothing", clean=True):
        for line in found_element.text.splitlines():
            for obsolete_code_name in code_table_names:
                if obsolete_code_name in line and ":" in line:
                    # The code itself is the last whitespace-separated token.
                    obsolete_codes[obsolete_code_name] = line.split()[-1]
    return obsolete_codes
@st.cache
def try_retrieving_glottolog_id(langtext):
    """Search Glottolog for *langtext* and return a languoid ID.

    A successful search redirects to a languoid page whose URL ends with the
    ID; returns "" when the search did not land on a languoid page.
    """
    encoded_term = urllib.parse.quote(langtext)
    response = session.get(f"https://glottolog.org/glottolog?search={encoded_term}")
    final_url = response.html.url
    if "languoid" not in final_url:
        return ""
    # The languoid ID is the last path segment of the redirect target.
    return final_url.split("/")[-1]
@st.cache
def get_glottolog_json(languoid_id):
    """Fetch and return the JSON record for a Glottolog languoid ID."""
    endpoint = f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
    response = session.get(endpoint)
    return response.json()
@st.cache
def try_searching_vachan_engine(langtext):
    """Query the Vachan Engine language-search API for *langtext*.

    Returns the parsed JSON list of matches, or an empty list when the
    request does not succeed (non-200 status).
    """
    encoded_term = urllib.parse.quote(langtext)
    query_url = f"https://api.vachanengine.org/v2/languages?search_word={encoded_term}"
    vachan_response = requests.get(query_url)
    if vachan_response.status_code != 200:
        return []
    return vachan_response.json()
def main():
    """Render the Streamlit page: resolve a user-supplied language code or
    name to a BCP-47 tag via langcodes, then show related ISO-639 variants,
    scraped obsolete codes, Glottolog languoid IDs, and Vachan matches."""
    st.write("# Language code/tag search")
    st.write("Fed up with language tag confusion? Here's your one-stop shop!")
    st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
    st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
    # https://huggingface.co/blog/streamlit-spaces
    # https://github.com/psf/requests-html
    # https://docs.streamlit.io/library/api-reference/write-magic/st.write
    example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
    langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
    # Easter-egg responses for two well-known non-language inputs.
    if langtext.lower() == "matlab":
        st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
        return
    if langtext.lower() == "python":
        st.success("[Python is the best language!](https://www.python.org/)")
        return
    # TODO: st.code() for these "lookup in progress" outputs.
    st.info("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
    if langcodes.tag_is_valid(langtext):
        st.info(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
    else:
        st.info(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
    # First try to parse the input directly as a BCP-47 tag; a parse that
    # yields an "unknown" display name is treated the same as a parse failure.
    try:
        lang = langcodes.Language.get(langtext)
        # st.write(f"{lang} is the BCP-47 tag.")
        if "unknown" in lang.display_name().lower():
            st.info(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
            lang = None
    except langcodes.LanguageTagError as e:
        st.info(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
        lang = None
    # Fall back to natural-language name search when direct parsing failed.
    if lang is None:
        try:
            found = langcodes.find(langtext)
            lang = found
            st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
        except LookupError as e:
            st.error("## Result: failure!")
            st.error(f"Unable to look up BCP-47 tag. But all hope is not lost...")
            st.write(f"* You can also try https://r12a.github.io/app-subtags/")
            lang = None
    t_variant = None
    b_variant = None
    #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
    if lang is not None:
        display = lang.display_name()
        b_variant = lang.to_alpha3(variant='B')
        t_variant = lang.to_alpha3(variant='T')
        broader_tags = lang.broader_tags()
        standardized_tag = langcodes.standardize_tag(lang)
        st.write(f"## BCP-47 Results: probably use '{standardized_tag}'")
        # TODO: make a results dictionary so it's easy to copy-paste?
        st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
        st.write(f"Breakdown of tag components:")
        st.write(lang.describe())
        st.write(f"Display name for {lang}: {lang.display_name()}")
        st.write(f"Autonym for {lang}: {lang.autonym()}")
        st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
        st.write("## Further Information:")
        st.write(f"Broader tags for this language, if any:")
        st.write(broader_tags)
        st.write(f"### Language Subtag Search Tool")
        st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
        st.write("### Older / Related Codes")
        st.write(f"ISO 639-2 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
        st.write(f"ISO 639-2 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
        # ethnologue prefers T for german (deu), and T for French
        st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
        if t_variant != b_variant:
            st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
        st.write("#### Codes scraped from iso639-3.sil.org")
        #TODO: Cleanup this bit
        t_obsolete_codes = pull_obsolete_codes(t_variant)
        b_obsolete_codes = pull_obsolete_codes(b_variant)
        if t_obsolete_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
            st.write(t_obsolete_codes)
        elif b_obsolete_codes:
            st.write(f"Older codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
            st.write(b_obsolete_codes)
        st.write(f"### Glottolog")
        # Try the raw input plus both alpha3 variants against Glottolog,
        # de-duplicating languoid IDs across the searches.
        search_terms_for_glottolog = [langtext, t_variant, b_variant]
        languoids = []
        for search_term in search_terms_for_glottolog :
            if search_term :
                languoid_id = try_retrieving_glottolog_id(search_term )
                if languoid_id:
                    if languoid_id not in languoids:
                        st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
                        # get_glottolog_json(languoid_id)
                        languoids.append(languoid_id)
        # NOTE(review): nesting reconstructed from flattened source — the
        # Vachan search appears to run only when a tag was found, matching
        # the file's TODO "add in vachan search even if lang not found".
        results_from_vachan = try_searching_vachan_engine(langtext)
        if results_from_vachan:
            st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
            st.write(results_from_vachan)
# Script entry point. (A stray "|" scraping artifact after main() was removed;
# it would have been a syntax error.)
if __name__ == "__main__":
    main()