cdleong committed on
Commit
cba4b15
1 Parent(s): 5459c25

Improved Glottolog retrieval

Browse files
Files changed (1) hide show
  1. app.py +20 -16
app.py CHANGED
@@ -3,7 +3,7 @@ import langcodes
3
  from requests_html import HTMLSession
4
  import urllib
5
  import requests
6
-
7
  # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
8
  # Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
9
  # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
@@ -51,17 +51,20 @@ def pull_obsolete_codes(iso_code):
51
 
52
  def try_retrieving_glottolog_id(langtext):
53
  languoid_id = ""
54
- session = HTMLSession()
55
  langtext_quoted = urllib.parse.quote(langtext)
56
  query_url=f"https://glottolog.org/glottolog?search={langtext_quoted}"
57
  glottolog_r= session.get(query_url)
58
  returned_url = glottolog_r.html.url
59
-
60
 
61
  if "languoid" in returned_url:
62
  last_section = returned_url.split("/")[-1]
63
  languoid_id = last_section
64
  return languoid_id
 
 
 
 
 
65
 
66
  def try_searching_vachan_engine(langtext):
67
  results_list = []
@@ -76,7 +79,7 @@ def try_searching_vachan_engine(langtext):
76
  def main():
77
  st.write("# Language code/tag search")
78
  st.write("Fed up with language tag confusion? Here's your one-stop shop!")
79
- st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
80
  st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
81
 
82
  # https://huggingface.co/blog/streamlit-spaces
@@ -121,15 +124,15 @@ def main():
121
  st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
122
  except LookupError as e:
123
  st.error("## Result: failure!")
124
- st.error(f"Unable to look up language code. But all hope is not lost...")
125
  st.write(f"* You can also try https://r12a.github.io/app-subtags/")
126
- st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
127
  lang = None
128
 
129
 
130
 
131
 
132
- t_variant = None
 
133
 
134
 
135
  #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
@@ -184,15 +187,16 @@ def main():
184
  st.write(b_obsolete_codes)
185
 
186
  st.write(f"### Glottolog")
187
- languoid_id = try_retrieving_glottolog_id(langtext)
188
- if languoid_id:
189
- st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
190
- # FIXME: fix this to display something if there's an ISO code to try
191
- if t_variant:
192
- st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
193
- if t_variant != b_variant:
194
- st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
195
- st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
 
196
 
197
  results_from_vachan = try_searching_vachan_engine(langtext)
198
  if results_from_vachan:
3
  from requests_html import HTMLSession
4
  import urllib
5
  import requests
6
+ session = HTMLSession()
7
  # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
8
  # Big TODO: collate all the results into a big dictionary? Then display that. Reduces if statements?
9
  # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
51
 
52
  def try_retrieving_glottolog_id(langtext):
53
  languoid_id = ""
 
54
  langtext_quoted = urllib.parse.quote(langtext)
55
  query_url=f"https://glottolog.org/glottolog?search={langtext_quoted}"
56
  glottolog_r= session.get(query_url)
57
  returned_url = glottolog_r.html.url
 
58
 
59
  if "languoid" in returned_url:
60
  last_section = returned_url.split("/")[-1]
61
  languoid_id = last_section
62
  return languoid_id
63
+
64
+ def get_glottolog_json(languoid_id):
65
+ query_url=f"https://glottolog.org/resource/languoid/id/{languoid_id}.json"
66
+ glottolog_r = session.get(query_url)
67
+ return glottolog_r.json()
68
 
69
  def try_searching_vachan_engine(langtext):
70
  results_list = []
79
  def main():
80
  st.write("# Language code/tag search")
81
  st.write("Fed up with language tag confusion? Here's your one-stop shop!")
82
+ st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 tag according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
83
  st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
84
 
85
  # https://huggingface.co/blog/streamlit-spaces
124
  st.success(f"* Natural language search found the following BCP-47 tag: {lang}")
125
  except LookupError as e:
126
  st.error("## Result: failure!")
127
+ st.error(f"Unable to look up BCP-47 tag. But all hope is not lost...")
128
  st.write(f"* You can also try https://r12a.github.io/app-subtags/")
 
129
  lang = None
130
 
131
 
132
 
133
 
134
+ t_variant = None
135
+ b_variant = None
136
 
137
 
138
  #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
187
  st.write(b_obsolete_codes)
188
 
189
  st.write(f"### Glottolog")
190
+ search_terms_for_glottolog = [langtext, t_variant, b_variant]
191
+ languoids = []
192
+ for search_term in search_terms_for_glottolog :
193
+ if search_term :
194
+ languoid_id = try_retrieving_glottolog_id(search_term )
195
+ if languoid_id:
196
+ if languoid_id not in languoids:
197
+ st.write(f"**Glottolog Languoid ID:** Searching for '{search_term}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
198
+ # get_glottolog_json(languoid_id)
199
+ languoids.append(languoid_id)
200
 
201
  results_from_vachan = try_searching_vachan_engine(langtext)
202
  if results_from_vachan: