gsarti committed
Commit 497b1c6 · 1 Parent(s): 1f483bd

Updated app visuals

Files changed (1)
  1. app.py +50 -47
app.py CHANGED
@@ -1,6 +1,7 @@
from datasets import load_dataset
import streamlit as st
import urllib
+ import math
from inseq import FeatureAttributionOutput

st.set_page_config(layout="wide")
@@ -8,15 +9,15 @@ st.set_page_config(layout="wide")
dataset = load_dataset("GroNLP/divemt")
attribution_path = "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/divemt-attributions/{lang}/{idx}_{lang}_gradl2_{setting}_{sentence_type}.json.gz"
df = dataset["train"].to_pandas()
- unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
+ unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id").rename(columns={"item_id": "Item ID", "src_text": "Source text"})
langs = list(df["lang_id"].unique())
st.title("DivEMT Explorer 🔍 🌍")
st.markdown("""
- ##### The DivEMT Explorer is a tool to explore translations and edits in the DivEMT corpus.
+ ##### The DivEMT Explorer is a tool to explore translations, edits and errors in the [DivEMT dataset](https://huggingface.co/datasets/GroNLP/divemt).

- ##### Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you find an interesting sentence, insert its numeric id (between 0 and 429) in the box below, and select all the available languages you want to use for visualizing the results.
+ The table below shows the 430 source sentences taken from Flores-101 and translated into six typologically diverse languages to build the DivEMT corpus. When you find a sentence you would like to inspect closely, insert its numeric id (between 0 and 429) in the box below, and select all the available languages you want to use for visualizing the results.

- ##### Inside every generated language section, you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the shown settings to see the aligned edits annotations.
+ Inside every language section, you will find the translations for all the available settings, alongside aligned edits and all collected metadata. You can filter the settings to see only cases you are interested in. In the **Attributions** section, you can find attribution maps computed using the [Inseq library](https://github.com/inseq-team/inseq) and the mBART model.
""")

divemt_to_spacy_lang_map = {
@@ -28,23 +29,19 @@ divemt_to_spacy_lang_map = {
    "vie": "vi",
}

- with st.expander("Explore examples"):
-     col1, col2, _ = st.columns([3,2,5])
-     with col1:
-         offset = st.slider(
-             "Select an offset",
-             min_value=0,
-             max_value=len(unique_src) - 5,
-             value=0,
-         )
-     with col2:
-         count = st.number_input(
-             'Select the number of examples to display',
-             min_value=3,
-             max_value=len(unique_src),
-             value=5,
-         )
-     st.table(unique_src[offset:int(offset+count)])
+ divemt_to_labels_lang_map = {
+     "ara": "Arabic",
+     "nld": "Dutch",
+     "ita": "Italian",
+     "tur": "Turkish",
+     "ukr": "Ukrainian",
+     "vie": "Vietnamese",
+ }
+
+ st.dataframe(
+     unique_src,
+     use_container_width=True,
+ )
col1_main, col2_main, _ = st.columns([1,1,3])
with col1_main:
    item_id = st.number_input(
@@ -55,44 +52,46 @@ with col1_main:
with col2_main:
    langs = st.multiselect(
        'Select languages',
-         options=langs
+         options=langs,
+         format_func=lambda x: divemt_to_labels_lang_map[x],
    )
st.markdown("##### Source text")
- st.markdown("##### <span style='color: #ff4b4b'> " + unique_src.iloc[int(item_id)]["src_text"] + "</span>", unsafe_allow_html=True)
+ st.markdown("##### <span style='color: #ff4b4b'> " + unique_src.iloc[int(item_id)]["Source text"] + "</span>", unsafe_allow_html=True)
task_names = ["From Scratch (HT)", "Google PE (PE1)", "mBART PE (PE2)"]
for lang in langs:
-     with st.expander(f"View {lang.upper()} data"):
-         c1, _ = st.columns([1, 2])
-         with c1:
-             tasks = st.multiselect(
-                 'Select settings',
-                 options=task_names,
-                 default=task_names,
-                 key=f"{lang}_tasks"
-             )
-         #columns = st.columns(len(tasks))
-         lang_data = df[(df["item_id"] == unique_src.iloc[int(item_id)]["item_id"]) & (df["lang_id"] == lang)]
-         lang_dicts = lang_data.to_dict("records")
-         ht = [x for x in lang_dicts if x["task_type"] == "ht"][0]
-         pe1 = [x for x in lang_dicts if x["task_type"] == "pe1"][0]
-         pe2 = [x for x in lang_dicts if x["task_type"] == "pe2"][0]
-         task_dict = {k:v for k,v in zip(task_names, [ht, pe1, pe2])}
-         max_mt_length = max([len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None])
-         for task_name, dic in zip(tasks, [task_dict[name] for name in tasks]):
-             st.header(task_name)
+     st.markdown(f"## {divemt_to_labels_lang_map[lang]}")
+     c1, _ = st.columns([1.5,1])
+     with c1:
+         tasks = st.multiselect(
+             'Select settings',
+             options=task_names,
+             default=task_names,
+             key=f"{lang}_tasks"
+         )
+     #columns = st.columns(len(tasks))
+     lang_data = df[(df["item_id"] == unique_src.iloc[int(item_id)]["Item ID"]) & (df["lang_id"] == lang)]
+     lang_dicts = lang_data.to_dict("records")
+     ht = [x for x in lang_dicts if x["task_type"] == "ht"][0]
+     pe1 = [x for x in lang_dicts if x["task_type"] == "pe1"][0]
+     pe2 = [x for x in lang_dicts if x["task_type"] == "pe2"][0]
+     task_dict = {k:v for k,v in zip(task_names, [ht, pe1, pe2])}
+     max_mt_length = max([len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None])
+     for task_name, dic in zip(tasks, [task_dict[name] for name in tasks]):
+         with st.expander(f"{task_name}"):
+             st.markdown(f"### {task_name}")
            st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)
            mt_text = dic["mt_text"]
            if mt_text is None:
                mt_text = "<span style='opacity:0'>" + "".join(["O " for i in range(max_mt_length // 2)]) + "</span>"
-             st.markdown(f"<b>MT</b>: {'<bdi>' if lang == 'ara' else ''}{mt_text}{'</bdi>' if lang == 'ara' else ''}", unsafe_allow_html=True)
+             st.markdown(f"<b>MT</b>: {'<bdi>' if lang == 'ara' else ''}{mt_text if mt_text != 'nan' else 'N/A'}{'</bdi>' if lang == 'ara' else ''}", unsafe_allow_html=True)
            st.markdown(f"<b>PE</b>: {'<bdi>' if lang == 'ara' else ''}{dic['tgt_text']}{'</bdi>' if lang == 'ara' else ''}", unsafe_allow_html=True)
            st.markdown(f"<b>Aligned edits</b>:", unsafe_allow_html=True)
-             if dic["aligned_edit"] is not None:
+             if dic["aligned_edit"] != "nan":
                aligned_edit = dic["aligned_edit"]
                if lang == 'ara' and len(dic["aligned_edit"].split("EVAL: ")) == 2:
                    edits_reverse = aligned_edit.split("EVAL: ")[1]
                    # - 4 is a hack that makes things aligned most of the time, grounded in empirical observation only
-                     edits_reverse = edits_reverse + " " * ((len(aligned_edit.split("\\n")[0]) - len(edits_reverse)) - 4)
+                     edits_reverse = edits_reverse + " " * ((len(aligned_edit.split("\\n")[0]) - len(edits_reverse)) - 10)
                    aligned_edit = aligned_edit.split("EVAL: ")[0] + "EVAL: " + edits_reverse[::-1]
                aligned_edit = aligned_edit.replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :")
                st.text(aligned_edit)
@@ -100,10 +99,10 @@ for lang in langs:
                st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
            st.markdown(f"<b>Metadata</b>:", unsafe_allow_html=True)
            st.json({k:v for k,v in dic.items() if k not in ["src_text", "mt_text", "tgt_text", "aligned_edit"]}, expanded=False)
+             st.markdown(f"<b>Attributions</b>:", unsafe_allow_html=True)
            if task_name != "From Scratch (HT)":
                setting = "pe1" if task_name == "Google PE (PE1)" else "pe2"
-                 st.markdown(f"<b>Attributions</b>:", unsafe_allow_html=True)
-                 st.text("Click on checkboxes to show/hide the respective attributions computed with mBART 1-to-50.")
+                 st.markdown("<i>Click on checkboxes to show/hide the respective attributions computed with mBART.</i>", unsafe_allow_html=True)
                for sentence_type in ["mt", "pe", "diff"]:
                    url = attribution_path.format(idx=item_id, setting=setting, sentence_type=sentence_type, lang=divemt_to_spacy_lang_map[lang])
                    try:
@@ -116,6 +115,10 @@ for lang in langs:
                            st.markdown(f"{attr.show(return_html=True, display=False, do_aggregation=False)}", unsafe_allow_html=True)
                    except (urllib.error.HTTPError, urllib.error.URLError) as e:
                        st.checkbox(sentence_type.upper() + " (NOT AVAILABLE)", key=f"{lang}_{task_name}_{sentence_type}", disabled=True)
+             else:
+                 st.markdown("<i>Attributions are available only for machine-translated outputs.</i>", unsafe_allow_html=True)
+ st.markdown("</br>", unsafe_allow_html=True)
+ st.markdown("*Built by [Gabriele Sarti](https://gsarti.com)*")
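The attribution files referenced by `attribution_path` are gzip-compressed Inseq exports, fetched per sentence type inside the `try` block above; the actual download-and-load lines (diff lines 110-115) are elided from this hunk. Below is a minimal standalone sketch of that step, assuming a plain `urllib` + `gzip` download and `FeatureAttributionOutput.load` from a local JSON file; the app's exact loading code is not visible in this diff, and the filled-in URL is one illustrative instance of the template (its availability is not guaranteed, which is why the app wraps the fetch in `try`/`except`).

```python
import gzip
import urllib.request

from inseq import FeatureAttributionOutput

# Filled-in instance of the app's `attribution_path` template (assumption):
# Italian ("it" via divemt_to_spacy_lang_map), item 0, Google PE, MT sentence.
url = (
    "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/"
    "divemt-attributions/it/0_it_gradl2_pe1_mt.json.gz"
)

with urllib.request.urlopen(url) as response:
    # The hosted files are gzip-compressed JSON: decompress before loading.
    raw_json = gzip.decompress(response.read()).decode("utf-8")
with open("0_it_gradl2_pe1_mt.json", "w") as f:
    f.write(raw_json)

attr = FeatureAttributionOutput.load("0_it_gradl2_pe1_mt.json")
# Same call the app uses to embed the saliency map as raw HTML.
html = attr.show(return_html=True, display=False, do_aggregation=False)
```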
 
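The `aligned_edit` handling in the fourth hunk is easier to follow on a concrete value. The record stores `REF:`/`HYP:`/`EVAL:` rows (TER-style alignment output) joined by literal `\n` character pairs, which is why the code replaces `"\\n"` before relabeling the rows; the separate `ara` branch additionally reverses the `EVAL` row so the edit markers line up under right-to-left text. The string below is invented for illustration only; real DivEMT records will differ.

```python
# Hypothetical aligned_edit value in the format app.py expects: REF/HYP/EVAL
# rows separated by literal "\n" pairs (backslash + n, not real newlines).
aligned_edit = "REF: il gatto dorme\\nHYP: il gatto dormiva\\nEVAL:         S"

# Same replacement chain as app.py: materialize the newlines, then relabel
# the rows with the MT / PE terminology used elsewhere in the interface.
aligned_edit = (
    aligned_edit.replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :")
)
print(aligned_edit)
# MT : il gatto dorme
# PE : il gatto dormiva
# EVAL:         S
```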