File size: 5,321 Bytes
c9a6574 28f58e9 c9a6574 fccef12 da22b94 c9a6574 fccef12 c9a6574 fccef12 c9a6574 2029420 c9a6574 8323cf4 5791539 ed8af94 c9a6574 94aefa4 c9a6574 2029420 c9a6574 2029420 c9a6574 17107df e15c6b0 c9a6574 0582f98 e15c6b0 c9a6574 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import pandas as pd
import streamlit as st
from utils import df_to_html, render_svg, combine_json_files, render_metadata, color_mapping
# Module-level language records shared by every render function below, which
# index it by ISO code (e.g. data[isocode]['Language Name'], data[isocode]['Sites']).
# NOTE(review): presumably merges the JSON files under ./languages into one
# mapping -- confirm against utils.combine_json_files.
data = combine_json_files('./languages')
@st.cache_data
def render_home_table():
    """Render the home page table with one row per language.

    For every entry of the module-level ``data`` mapping the table shows the
    language metadata, the number of indexed sites and links (both rendered
    as links to the matching detail view) and a coloured square summarising
    support in MADLAD-400, FLORES-200 and Glot500.  Rows are ordered by
    support level and then by ISO code; languages with the highest support
    level are left out.

    The result is written to the page with ``st.write``; nothing is returned.
    """
    # Support squares in ascending order (0/3 .. 3/3).  These must be the real
    # emoji: mis-encoded literals never match the values in the support
    # column, which silently disables both the ordering and the filter below.
    # NOTE(review): assumes color_mapping returns one of these four squares --
    # confirm against utils.color_mapping.
    support_levels = ['🟥', '🟧', '🟨', '🟩']
    support_col = "Support by MADLAD400, FLORES200, GLOT500"

    # Compute number of unique domains/urls.  Work on per-language copies so
    # the shared module-level ``data`` is not modified as a side effect.
    rows = {}
    for iso, info in data.items():
        sites = info.get('Sites', [])
        rows[iso] = {
            **info,
            'Number of Sites': len(sites),
            'Number of Links': sum(len(site["Links"]) for site in sites),
        }

    # Convert dict to df (languages become rows after the transpose).
    df_data = pd.DataFrame(rows).transpose()
    df_data['ISO Code'] = df_data.index
    df_data['ISO Code'] = df_data['ISO Code'].astype(str)

    # Turn both counts into links to the per-language detail views.
    df_data['Number of Sites'] = [
        f'<a href="/?isocode={iso}&site=True" target="_self">{count}</a>'
        for iso, count in zip(df_data['ISO Code'], df_data['Number of Sites'])
    ]
    df_data['Number of Links'] = [
        f'<a href="/?isocode={iso}&links=True" target="_self">{count}</a>'
        for iso, count in zip(df_data['ISO Code'], df_data['Number of Links'])
    ]

    # color_mapping receives a one-element list holding the sum of the three
    # per-dataset support values, exactly as before.
    df_data[support_col] = [
        color_mapping([madlad + flores + glot])
        for madlad, flores, glot in zip(
            df_data["Supported by allenai/MADLAD-400"],
            df_data["Supported by facebook/flores"],
            df_data["Supported by cis-lmu/Glot500"],
        )
    ]
    df_data['Color_Order'] = pd.Categorical(
        df_data[support_col], categories=support_levels, ordered=True
    )

    # Sort by Color_Order then ISO Code
    df_data = df_data.sort_values(by=['Color_Order', 'ISO Code'])

    # Filter out languages with the highest support level.
    df_data = df_data[df_data[support_col] != support_levels[-1]]

    # Display the table
    df_data = df_data[[
        'ISO Code',
        'Language Name',
        'Family',
        'Subgrouping',
        'Number of Sites',
        'Number of Links',
        'Number of Speakers',
        support_col,
    ]]
    st.write(df_to_html(df_data), unsafe_allow_html=True)
@st.cache_data
def render_site_table(isocode):
    """Render the table of indexed sites for one language.

    Sites are ordered by their number of links (largest first).  Each link
    count is rendered as a link to the per-site view and each site URL,
    except the placeholder ``'misc'``, as a link to the site itself.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping.

    Raises:
        KeyError: If ``isocode`` is not a key of ``data``.
    """
    from urllib.parse import quote

    # back
    back_text = '<a href="/?home=True" target="_self">[Back]</a>'
    st.markdown(back_text, unsafe_allow_html=True)

    # site
    urls = data[isocode].get('Sites', [])
    if not urls:
        # An empty frame has no 'Links' column, so the steps below would
        # fail with a KeyError instead of showing anything useful.
        st.info('No sites are indexed for this language yet.')
        return

    df_urls = pd.DataFrame(urls)
    df_urls['Number of Links'] = df_urls['Links'].apply(len)
    df_urls = df_urls.sort_values(by='Number of Links', ascending=False)
    df_urls = df_urls.reset_index(drop=True)

    # Percent-encode the site URL before placing it in the query string;
    # otherwise characters such as '&', '#' or '?' inside the URL cut the
    # ``siteurl`` parameter short and the per-site view cannot find the site.
    df_urls['Number of Links'] = [
        '<a href="/?isocode={}&siteurl={}" target="_self">{}</a>'.format(
            isocode, quote(str(site_url), safe=''), link_count
        )
        for site_url, link_count in zip(df_urls['Site URL'], df_urls['Number of Links'])
    ]
    df_urls['Site URL'] = df_urls['Site URL'].apply(
        lambda url: f'<a href="{url}">{url}</a>' if url != 'misc' else url
    )
    df_urls['ISO Code'] = isocode

    # Display the table
    df_urls = df_urls[[
        'ISO Code',
        'Site URL',
        'Category',
        'Number of Links',
        'Possible Parallel Languages',
        'Confidence',
    ]]
    st.write(df_to_html(df_urls), unsafe_allow_html=True)
@st.cache_data
def render_siteurl_table(isocode, url):
    """Show the links collected for a single site of one language.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping.
        url: Value of the ``'Site URL'`` field identifying the site.

    Raises:
        KeyError: If ``isocode`` is not a key of ``data``.
    """
    from urllib.parse import quote

    # Resolve the language first: ``isocode`` comes from the page URL, so an
    # unknown value must fail here, before it is written into raw HTML.
    language = data[isocode]

    # back -- the code is percent-encoded because the markup below is rendered
    # with unsafe_allow_html=True.
    back_text = '<a href="/?isocode={}&site=True" target="_self">[Back]</a>'.format(
        quote(str(isocode), safe='')
    )
    st.markdown(back_text, unsafe_allow_html=True)

    # Find selected domain
    urls = language.get('Sites', [])
    selected_domain = next(
        (d for d in urls if 'Site URL' in d and d['Site URL'] == url), None
    )
    if not selected_domain:
        # Previously the page stayed blank here, with no hint why.
        st.warning('No entry found for this site URL.')
        return

    st.write({
        'Language Name': language['Language Name'],
        'ISO Code': isocode,
        'Site URL': url,
        'Links': selected_domain['Links'],
    })
@st.cache_data
def render_links_table(isocode):
    """Show every indexed site of one language together with its links.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping.
    """
    # Navigation link back to the home view.
    st.markdown(
        '<a href="/?home=True" target="_self">[Back]</a>',
        unsafe_allow_html=True,
    )

    language = data[isocode]
    sites = language.get('Sites', [])
    language_name = language['Language Name']

    # Keep only the two fields that this view displays for each site.
    trimmed_sites = []
    for site in sites:
        trimmed_sites.append({'Site URL': site['Site URL'], 'Links': site['Links']})

    summary = {
        'Language Name': language_name,
        'ISO Code': isocode,
        'URLs': trimmed_sites,
    }
    st.write(summary)
# show logo
# Read the SVG through a context manager so the file handle is closed right
# away, and decode it explicitly instead of relying on the platform default.
with open("assets/glotweb_logo.svg", encoding="utf-8") as logo_file:
    render_svg(logo_file.read())
def main():
    """Choose the view to render from the URL query parameters.

    Without an ``isocode`` parameter the home page is shown.  With one, the
    per-site view (``siteurl``), the site table (``site``) and the links view
    (``links``) are each rendered when their parameter is present.
    """
    params = st.query_params

    if 'isocode' not in params:
        # show home
        render_metadata()
        st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes **non-religious** sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web.\n")
        render_home_table()
        st.markdown("\n\n<font color='gray'>We compare the level of support for these languages in the three big datasets ([MADLAD400](https://huggingface.co/datasets/allenai/MADLAD-400), [FLORES200](https://huggingface.co/datasets/facebook/flores), [GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) of low-resource languages (🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3). Although the support in these datasets for some of these languages could be just the religious texts.</font>", unsafe_allow_html=True)
        return

    isocode = params['isocode']
    if isocode not in data:
        # The code comes straight from the URL.  Rejecting unknown values here
        # avoids a KeyError traceback and keeps arbitrary text out of the raw
        # HTML that the detail views emit.
        st.error('Unknown ISO code.')
        return

    # Independent checks, as before: each view renders when its parameter is set.
    if 'siteurl' in params:
        render_siteurl_table(isocode, params['siteurl'])
    if 'site' in params:
        render_site_table(isocode)
    if 'links' in params:
        render_links_table(isocode)
# Script entry point: Streamlit executes this file top to bottom on every run.
main()