|
import pandas as pd |
|
import streamlit as st |
|
from utils import df_to_html, render_svg, combine_json_files, render_metadata, color_mapping |
|
|
|
# Directory handed to ``combine_json_files``; the path is relative, so it is
# resolved against the working directory the app is started from.
LANGUAGES_DIR = './languages'

# Per-language records keyed by ISO code (the render functions below index it
# as ``data[isocode]``). Presumably one JSON file per language is merged
# here -- confirm against ``utils.combine_json_files``.
data = combine_json_files(LANGUAGES_DIR)
|
|
|
|
|
@st.cache_data
def render_home_table():
    """Render the home page table with one row per indexed language.

    For every entry of the module-level ``data`` mapping the number of sites
    and the total number of links are computed. Both counts are shown as
    links to the per-language site view (``?isocode=..&site=True``) and link
    view (``?isocode=..&links=True``). Each row also gets a support marker
    from ``color_mapping``; rows are ordered by that marker and then by ISO
    code, and rows carrying the "3/3" marker are dropped.

    The counts are computed on a per-language copy, so the shared ``data``
    mapping is not modified. If ``data`` is empty an informational message
    is shown instead of a table.
    """
    support_col = "Support by MADLAD400, FLORES200, GLOT500"
    # Coloured squares written as escapes so they survive any re-encoding of
    # this file. Presumably these are the exact values returned by
    # ``utils.color_mapping`` -- verify against that helper.
    red = "\U0001F7E5"     # 0/3 datasets
    orange = "\U0001F7E7"  # 1/3 datasets
    yellow = "\U0001F7E8"  # 2/3 datasets
    green = "\U0001F7E9"   # 3/3 datasets

    rows = {}
    for code, info in data.items():
        sites = info.get('Sites', [])
        rows[code] = {
            **info,
            'Number of Sites': len(sites),
            # A site record without a "Links" key contributes zero links.
            'Number of Links': sum(len(site.get('Links', [])) for site in sites),
        }

    if not rows:
        st.info("No languages have been indexed yet.")
        return

    df_data = pd.DataFrame(rows).transpose()
    df_data['ISO Code'] = df_data.index.astype(str)

    iso_codes = df_data['ISO Code']
    df_data['Number of Sites'] = [
        '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(code, count)
        for code, count in zip(iso_codes, df_data['Number of Sites'])
    ]
    df_data['Number of Links'] = [
        '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(code, count)
        for code, count in zip(iso_codes, df_data['Number of Links'])
    ]

    # NOTE(review): the three flags are summed and passed as a one-element
    # list, exactly as before -- confirm this is what color_mapping expects.
    df_data[support_col] = df_data.apply(
        lambda row: color_mapping([
            row["Supported by allenai/MADLAD-400"]
            + row["Supported by facebook/flores"]
            + row["Supported by cis-lmu/Glot500"]
        ]),
        axis=1,
    )

    # Ordered categorical so that sorting goes from least to most supported.
    df_data['Color_Order'] = pd.Categorical(
        df_data[support_col],
        categories=[red, orange, yellow, green],
        ordered=True,
    )
    df_data = df_data.sort_values(by=['Color_Order', 'ISO Code'])

    # Languages already covered by all three datasets are not listed.
    df_data = df_data[df_data[support_col] != green]

    df_data = df_data[[
        'ISO Code',
        'Language Name',
        'Family',
        'Subgrouping',
        'Number of Sites',
        'Number of Links',
        'Number of Speakers',
        support_col,
    ]]
    st.write(df_to_html(df_data), unsafe_allow_html=True)
|
|
|
@st.cache_data
def render_site_table(isocode):
    """Render the table of sites indexed for one language.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping.
            It normally comes straight from the ``isocode`` query parameter,
            so it may be any string.

    A "[Back]" link to the home page is always rendered first. An unknown
    ``isocode`` produces an error message, and a language without any sites
    produces an informational message, instead of raising ``KeyError``.
    Otherwise the sites are listed by descending link count; each count
    links to the per-site view (``?isocode=..&siteurl=..``).
    """
    from html import escape
    from urllib.parse import quote

    st.markdown('<a href="/?home=True" target="_self">[Back]</a>', unsafe_allow_html=True)

    language = data.get(isocode)
    if language is None:
        st.error(f"Unknown ISO code: {isocode!r}")
        return

    urls = language.get('Sites', [])
    if not urls:
        # An empty frame has no "Links" column; bail out before touching it.
        st.info(f"No sites are indexed for {isocode}.")
        return

    df_urls = pd.DataFrame(urls)
    df_urls['Number of Links'] = df_urls['Links'].apply(len)
    df_urls = df_urls.sort_values(by='Number of Links', ascending=False)
    df_urls = df_urls.reset_index(drop=True)

    # Percent-encode both values: a site URL containing "&", "#" or "?" would
    # otherwise be cut short when the query string is parsed.
    iso_param = quote(str(isocode), safe='')
    df_urls['Number of Links'] = [
        '<a href="/?isocode={}&siteurl={}" target="_self">{}</a>'.format(
            iso_param, quote(str(site_url), safe=''), count
        )
        for site_url, count in zip(df_urls['Site URL'], df_urls['Number of Links'])
    ]

    def _as_anchor(site_url):
        # "misc" is a placeholder bucket, not a real address, so it stays text.
        if site_url == 'misc':
            return site_url
        safe_url = escape(str(site_url))
        return f'<a href="{safe_url}">{safe_url}</a>'

    df_urls['Site URL'] = df_urls['Site URL'].apply(_as_anchor)
    df_urls['ISO Code'] = isocode

    df_urls = df_urls[[
        'ISO Code',
        'Site URL',
        'Category',
        'Number of Links',
        'Possible Parallel Languages',
        'Confidence',
    ]]
    st.write(df_to_html(df_urls), unsafe_allow_html=True)
|
|
|
|
|
@st.cache_data
def render_siteurl_table(isocode, url):
    """Render the links collected for one site of one language.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping,
            normally taken from the ``isocode`` query parameter.
        url: Value of the site's ``"Site URL"`` field, normally taken from
            the ``siteurl`` query parameter.

    ``isocode`` is checked against ``data`` before it is placed in any HTML,
    because the markup is rendered with ``unsafe_allow_html=True``. An
    unknown code shows an error with a link back to the home page. A known
    code gets a "[Back]" link to that language's site table, followed by
    the matching site's links or a warning if no site matches ``url``.
    """
    from urllib.parse import quote

    language = data.get(isocode)
    if language is None:
        st.markdown('<a href="/?home=True" target="_self">[Back]</a>', unsafe_allow_html=True)
        st.error(f"Unknown ISO code: {isocode!r}")
        return

    back_text = '<a href="/?isocode={}&site=True" target="_self">[Back]</a>'.format(
        quote(str(isocode), safe='')
    )
    st.markdown(back_text, unsafe_allow_html=True)

    sites = language.get('Sites', [])
    selected_domain = next(
        (site for site in sites if site.get('Site URL') == url),
        None,
    )
    if selected_domain is None:
        st.warning(f"No site {url!r} is indexed for {isocode}.")
        return

    st.write({
        'Language Name': language['Language Name'],
        'ISO Code': isocode,
        'Site URL': url,
        'Links': selected_domain.get('Links', []),
    })
|
|
|
|
|
|
|
@st.cache_data
def render_links_table(isocode):
    """Render every indexed link of one language, grouped by site.

    Args:
        isocode: Key of the language in the module-level ``data`` mapping,
            normally taken from the ``isocode`` query parameter.

    A "[Back]" link to the home page is always rendered first. An unknown
    ``isocode`` produces an error message instead of raising ``KeyError``.
    Site records lacking ``"Site URL"`` or ``"Links"`` are shown with
    ``None`` / an empty list rather than aborting the whole page.
    """
    st.markdown('<a href="/?home=True" target="_self">[Back]</a>', unsafe_allow_html=True)

    language = data.get(isocode)
    if language is None:
        st.error(f"Unknown ISO code: {isocode!r}")
        return

    all_urls = [
        {'Site URL': site.get('Site URL'), 'Links': site.get('Links', [])}
        for site in language.get('Sites', [])
    ]

    st.write({
        'Language Name': language['Language Name'],
        'ISO Code': isocode,
        'URLs': all_urls,
    })
|
|
|
|
|
|
|
|
|
# Show the logo on every run, before any view is dispatched. The context
# manager closes the file handle deterministically, and the explicit encoding
# keeps the read independent of the platform default.
with open("assets/glotweb_logo.svg", encoding="utf-8") as logo_file:
    render_svg(logo_file.read())
|
|
|
def main():
    """Dispatch to a view based on the URL query parameters.

    With an ``isocode`` parameter, exactly one detail view is rendered, in
    this order of precedence: ``siteurl`` (links of one site), ``site``
    (site table), ``links`` (all links of the language). Without
    ``isocode``, or when ``isocode`` comes without any of those selectors,
    the home page is rendered: metadata, introduction, language table and
    the legend for the support markers.
    """
    params = st.query_params

    if 'isocode' in params:
        isocode = params['isocode']
        # Early returns keep the views mutually exclusive even if a URL
        # carries several selectors at once.
        if 'siteurl' in params:
            render_siteurl_table(isocode, params['siteurl'])
            return
        if 'site' in params:
            render_site_table(isocode)
            return
        if 'links' in params:
            render_links_table(isocode)
            return
        # No selector given: fall through to the home page rather than
        # leaving the page empty.

    render_metadata()
    st.markdown(
        "**GlotWeb** is an indexing service for low-resource languages. "
        "It indexes **non-religious** sites or links written in each language. "
        "This list can be used to create raw text or parallel corpora and to "
        "study low-resource languages on the web.\n"
    )
    render_home_table()
    # The coloured squares are written as escapes so the legend cannot be
    # corrupted by a wrong file encoding.
    st.markdown(
        "\n\n<font color='gray'>We compare the level of support for these "
        "languages in the three big datasets "
        "([MADLAD400](https://huggingface.co/datasets/allenai/MADLAD-400), "
        "[FLORES200](https://huggingface.co/datasets/facebook/flores), "
        "[GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) "
        "of low-resource languages "
        "(\U0001F7E5 0/3 < \U0001F7E7 1/3 < \U0001F7E8 2/3 < \U0001F7E9 3/3). "
        "Although the support in these datasets for some of these languages "
        "could be just the religious texts.</font>",
        unsafe_allow_html=True,
    )
|
|
|
# Streamlit executes the script with ``__name__ == "__main__"``, so the app
# still starts under ``streamlit run``; the guard only stops ``main()`` from
# running when the module is imported.
if __name__ == "__main__":
    main()