kargaranamir committed on
Commit
e15c6b0
1 Parent(s): 4cea18a

add cas, lzz, anp and edit misc.

Browse files
app.py CHANGED
@@ -24,10 +24,10 @@ def render_home_table():
24
  df_data['ISO Code'] = df_data['ISO Code'].astype(str) # Convert to string
25
  df_data['Number of Sites'] = df_data.apply(lambda row: '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
26
  df_data['Number of Links'] = df_data.apply(lambda row: '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
27
- df_data["Supported by MADLAD-400, flores, and Glot500"] = df_data.apply(lambda row: color_mapping([row["Supported by allenai/MADLAD-400"] + row["Supported by facebook/flores"] + row["Supported by cis-lmu/Glot500"]]), axis =1)
28
 
29
  # Display the table
30
- df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites', 'Number of Links', 'Number of Speakers', 'Supported by MADLAD-400, flores, and Glot500']]
31
  st.write(df_to_html(df_data), unsafe_allow_html=True)
32
 
33
 
@@ -103,6 +103,8 @@ def main():
103
  else:
104
  # show home
105
  render_metadata()
106
- st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web. We also compare the level of support for these languages in the 3 big datasets of low-resource languages (🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3).\n")
107
  render_home_table()
 
 
108
  main()
 
24
  df_data['ISO Code'] = df_data['ISO Code'].astype(str) # Convert to string
25
  df_data['Number of Sites'] = df_data.apply(lambda row: '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
26
  df_data['Number of Links'] = df_data.apply(lambda row: '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
27
+ df_data["Supported by MADLAD400 & FLORES & GLOT500"] = df_data.apply(lambda row: color_mapping([row["Supported by allenai/MADLAD-400"] + row["Supported by facebook/flores"] + row["Supported by cis-lmu/Glot500"]]), axis =1)
28
 
29
  # Display the table
30
+ df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites', 'Number of Links', 'Number of Speakers', 'Supported by MADLAD400 & FLORES & GLOT500']]
31
  st.write(df_to_html(df_data), unsafe_allow_html=True)
32
 
33
 
 
103
  else:
104
  # show home
105
  render_metadata()
106
+ st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes **non-religous** sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web.\n")
107
  render_home_table()
108
+ st.markdown("\n\nWe compare the level of support for these languages in the three big datasets ([MADLAD-400](https://huggingface.co/datasets/allenai/MADLAD-400), [FLORES200](https://huggingface.co/datasets/facebook/flores), [GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) of low-resource languages (🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3). Although the support in these datasets for some of these languages could be just the religious texts.", unsafe_allow_html=True)
109
+
110
  main()
languages/anp_Deva.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Language Name": "Angika",
3
+ "Family": "Indo-European",
4
+ "Subgrouping": "Eastern Indo-Aryan",
5
+ "Number of Speakers": "15_000_000",
6
+ "Supported by allenai/MADLAD-400": 1,
7
+ "Supported by facebook/flores": 0,
8
+ "Supported by cis-lmu/Glot500": 0,
9
+ "Sites": [
10
+ {
11
+ "Site Name": "angika.com",
12
+ "Site URL": "https://www.angika.com/#angika",
13
+ "Category": "blog",
14
+ "Confidence": "🟩",
15
+ "Info": "confirmed by glotlid and webpage metadata.",
16
+ "Possible Parallel Languages": "eng_Latn, hin_Deva",
17
+ "Links": []
18
+ }
19
+ ]
20
+ }
languages/cas_Latn.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Language Name": "Tsimané",
3
+ "Family": "Mosetén-Chimané",
4
+ "Subgrouping": "",
5
+ "Number of Speakers": "5_300",
6
+ "Supported by allenai/MADLAD-400": 0,
7
+ "Supported by facebook/flores": 0,
8
+ "Supported by cis-lmu/Glot500": 0,
9
+ "Sites": [
10
+ {
11
+ "Site Name": "tsimanelinguisticouniverso.wordpress.com",
12
+ "Site URL": "https://tsimanelinguisticouniverso.wordpress.com/",
13
+ "Category": "blog",
14
+ "Confidence": "🟩",
15
+ "Info": "confirmed by glotlid and webpage metadata - some posts have spanish translation.",
16
+ "Possible Parallel Languages": "spa_Latn",
17
+ "Links": ["https://tsimanelinguisticouniverso.wordpress.com/2015/07/28/49/",
18
+ "https://tsimanelinguisticouniverso.wordpress.com/2015/07/28/conozcamos-la-lengua-tsimane-no-1/",
19
+ "https://tsimanelinguisticouniverso.wordpress.com/2015/05/07/jun-chuc-carijtacdye-yu/",
20
+ "https://tsimanelinguisticouniverso.wordpress.com/2015/02/17/patuju/"]
21
+ }
22
+ ]
23
+ }
languages/hac_Arab.json CHANGED
@@ -8,7 +8,7 @@
8
  "Supported by cis-lmu/Glot500": 0,
9
  "Sites": [
10
  {
11
- "Site Name": "anfsorani.com/هۆرامی",
12
  "Site URL": "https://anfsorani.com/هۆرامی",
13
  "Category": "news",
14
  "Confidence": "🟩",
 
8
  "Supported by cis-lmu/Glot500": 0,
9
  "Sites": [
10
  {
11
+ "Site Name": "anfsorani.com",
12
  "Site URL": "https://anfsorani.com/هۆرامی",
13
  "Category": "news",
14
  "Confidence": "🟩",
languages/lzz_Latn.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Language Name": "Laz",
3
+ "Family": "Kartvelian",
4
+ "Subgrouping": "Zan",
5
+ "Number of Speakers": "22_000",
6
+ "Supported by allenai/MADLAD-400": 0,
7
+ "Supported by facebook/flores": 0,
8
+ "Supported by cis-lmu/Glot500": 0,
9
+ "Sites": [
10
+ {
11
+ "Site Name": "kolkhoba.org",
12
+ "Site URL": "https://www.kolkhoba.org/lazuri.htm",
13
+ "Category": "articles",
14
+ "Confidence": "🟩",
15
+ "Info": "confirmed by webpage metadata",
16
+ "Possible Parallel Languages": "tur_Latn",
17
+ "Links": []
18
+ }
19
+ ]
20
+ }
21
+
languages/snk_Latn.json CHANGED
@@ -8,7 +8,7 @@
8
  "Supported by cis-lmu/Glot500": 0,
9
  "Sites": [
10
  {
11
- "Site Name": "soninkara.com/snk/",
12
  "Site URL": "http://www.soninkara.com/snk/",
13
  "Category": "news,forums",
14
  "Confidence": "🟩",
 
8
  "Supported by cis-lmu/Glot500": 0,
9
  "Sites": [
10
  {
11
+ "Site Name": "soninkara.com",
12
  "Site URL": "http://www.soninkara.com/snk/",
13
  "Category": "news,forums",
14
  "Confidence": "🟩",
languages/tet_Latn.json CHANGED
@@ -17,13 +17,13 @@
17
  "Links": []
18
  },
19
  {
20
- "Site Name": "timor-leste.gov.tl/?lang=tp",
21
  "Site URL": "http://timor-leste.gov.tl/?lang=tp",
22
  "Category": "government",
23
  "Confidence": "🟩",
24
  "Info": "confirmed by webpage metadata as tet_Latn",
25
  "Possible Parallel Languages": "eng_Latn, por_Latn",
26
- "Links": []
27
  },
28
  {
29
  "Site Name": "belun.tl",
@@ -32,7 +32,7 @@
32
  "Confidence": "🟩",
33
  "Info": "confirmed by webpage metadata as tet_Latn",
34
  "Possible Parallel Languages": "eng_Latn",
35
- "Links": []
36
  },
37
  {
38
  "Site Name": "tempotimor.com",
 
17
  "Links": []
18
  },
19
  {
20
+ "Site Name": "timor-leste.gov.tl",
21
  "Site URL": "http://timor-leste.gov.tl/?lang=tp",
22
  "Category": "government",
23
  "Confidence": "🟩",
24
  "Info": "confirmed by webpage metadata as tet_Latn",
25
  "Possible Parallel Languages": "eng_Latn, por_Latn",
26
+ "Links": ["http://timor-leste.gov.tl/wp-content/uploads/2021/08/TT-2021-08-24-debate-PN_autorizacao_EE17.pdf"]
27
  },
28
  {
29
  "Site Name": "belun.tl",
 
32
  "Confidence": "🟩",
33
  "Info": "confirmed by webpage metadata as tet_Latn",
34
  "Possible Parallel Languages": "eng_Latn",
35
+ "Links": ["https://belun.tl/wp-content/uploads/2015/12/Relatoriu-Politika-CPD-RDTL.pdf"]
36
  },
37
  {
38
  "Site Name": "tempotimor.com",