Spaces:

GroNLP
/

divemt_explorer

Sleeping

App Files Files Community

gsarti committed on May 23, 2022

Commit

6c35910

•

1 Parent(s): 8d16aac

Create app.py

Browse files

Files changed (1) hide show

app.py +84 -0

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from datasets import load_dataset
+import streamlit as st
+st.set_page_config(layout="wide")
+dataset = load_dataset("GroNLP/divemt")
+df = dataset["train"].to_pandas()
+unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
+langs = list(df["lang"].unique())
+st.title("DivEMT Explorer")
+cc1, _ = st.columns([2, 1])
+with cc1:
+    st.write("""
+    The DivEMT Explorer is a tool to explore translations and edits contained in the DivEMT corpus.
+    Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you found a sentence that you might be interested in, insert its numeric id (between 0 and 429) in the box below, and select all the languages for which you want to visualize the results.
+    Inside every generated section you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the showed settings to better see the aligned edits annotations.
+    """)
+with st.expander("Explore examples"):
+    col1, col2, _ = st.columns([3,2,5])
+    with col1:
+        offset = st.slider(
+            "Select an offset",
+            min_value=0,
+            max_value=len(unique_src) - 5,
+            value=0,
+        )
+    with col2:
+        count = st.number_input(
+            'Select the number of examples to display',
+            min_value=3,
+            max_value=len(unique_src),
+            value=5,
+        )
+    st.table(unique_src[offset:int(offset+count)])
+col1_main, col2_main, _ = st.columns([1,1,3])
+with col1_main:
+    item_id = st.number_input(
+        'Select an item (0-429) to inspect',
+        min_value=0,
+        max_value=len(unique_src) - 1,
+    )
+with col2_main:
+    langs = st.multiselect(
+        'Select languages',
+        options=langs
+    )
+st.markdown("<b>Source text:</b> <span style='color: #ff4b4b'> " + unique_src.iloc[int(item_id)]["src_text"] + "</span>", unsafe_allow_html=True)
+task_names = ["From Scratch (HT)", "Google PE (PE1)", "mBART PE (PE2)"]
+for lang in langs:
+    with st.expander(f"View {lang.upper()} data"):
+        c1, _ = st.columns([1, 2])
+        with c1:
+            tasks = st.multiselect(
+                'Select settings',
+                options=task_names,
+                default=task_names,
+                key=f"{lang}_tasks"
+            )
+        columns = st.columns(len(tasks))
+        lang_data = df[(df["item_id"] == unique_src.iloc[int(item_id)]["item_id"]) & (df["lang"] == lang)]
+        lang_dicts = lang_data.to_dict("records")
+        ht = [x for x in lang_dicts if x["task_type"] == "ht"][0]
+        pe1 = [x for x in lang_dicts if x["task_type"] == "pe1"][0]
+        pe2 = [x for x in lang_dicts if x["task_type"] == "pe2"][0]
+        task_dict = {k:v for k,v in zip(task_names, [ht, pe1, pe2])}
+        max_mt_length = max([len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None])
+        for task_name, dic, col in zip(tasks, [task_dict[name] for name in tasks], columns):
+            with col:
+                st.header(task_name)
+                st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)
+                mt_text = dic["mt_text"]
+                if mt_text is None:
+                    mt_text = "<span style='opacity:0'>" + "".join(["O " for i in range(max_mt_length // 2)]) + "</span>"
+                st.markdown(f"<b>MT</b>: {mt_text}", unsafe_allow_html=True)
+                st.markdown(f"<b>PE</b>: {dic['tgt_text']}", unsafe_allow_html=True)
+                st.markdown(f"<b>Aligned edits</b>:", unsafe_allow_html=True)
+                if dic["aligned_edit"] is not None:
+                    st.text(dic["aligned_edit"].replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :"))
+                else:
+                    st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
+                st.markdown(f"<b>Metadata</b>:", unsafe_allow_html=True)
+                st.json({k:v for k,v in dic.items() if k not in ["src_text", "mt_text", "tgt_text", "aligned_edit"]})