gsarti committed on
Commit
6c35910
1 Parent(s): 8d16aac

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import html

from datasets import load_dataset
import streamlit as st

# DivEMT Explorer: a Streamlit page for browsing the GroNLP/divemt dataset.
#
# The page shows a window of source sentences, lets the user pick one item
# and a set of languages, and renders the records of each translation
# setting (HT, PE1, PE2) side by side for every selected language.

st.set_page_config(layout="wide")

# Display name of each translation setting, mapped to the `task_type` value
# that identifies its records in the dataset.
TASK_TYPES = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(TASK_TYPES)

# Fields that are rendered on their own and therefore left out of the
# metadata JSON view.
TEXT_FIELDS = ("src_text", "mt_text", "tgt_text", "aligned_edit")

INTRO_TEXT = """
The DivEMT Explorer is a tool to explore translations and edits contained in the DivEMT corpus.
Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you found a sentence that you might be interested in, insert its numeric id (between 0 and 429) in the box below, and select all the languages for which you want to visualize the results.
Inside every generated section you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the showed settings to better see the aligned edits annotations.
"""


def _escape(value):
    """Return ``value`` as text that is safe to embed in HTML-enabled markdown.

    Dataset sentences are interpolated into ``st.markdown`` calls that use
    ``unsafe_allow_html=True``; without escaping, characters such as ``<``
    and ``&`` would be parsed as markup instead of being displayed.
    """
    return html.escape(str(value), quote=False)


dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()
# One row per source sentence, used both for the preview table and to map
# the positional index chosen by the user to the dataset's own item_id.
unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
available_langs = list(df["lang"].unique())

st.title("DivEMT Explorer")

cc1, _ = st.columns([2, 1])
with cc1:
    st.write(INTRO_TEXT)

with st.expander("Explore examples"):
    col1, col2, _ = st.columns([3, 2, 5])
    with col1:
        offset = st.slider(
            "Select an offset",
            min_value=0,
            max_value=len(unique_src) - 5,
            value=0,
        )
    with col2:
        count = st.number_input(
            'Select the number of examples to display',
            min_value=3,
            max_value=len(unique_src),
            value=5,
        )
    st.table(unique_src[offset:int(offset + count)])

col1_main, col2_main, _ = st.columns([1, 1, 3])
with col1_main:
    item_id = st.number_input(
        'Select an item (0-429) to inspect',
        min_value=0,
        max_value=len(unique_src) - 1,
    )
with col2_main:
    selected_langs = st.multiselect(
        'Select languages',
        options=available_langs,
    )

# The number typed by the user is a position in `unique_src`; resolve it once
# to the source row (and its dataset item_id) instead of on every iteration.
source_row = unique_src.iloc[int(item_id)]
selected_item_id = source_row["item_id"]

st.markdown(
    "<b>Source text:</b> <span style='color: #ff4b4b'> "
    + _escape(source_row["src_text"])
    + "</span>",
    unsafe_allow_html=True,
)

for lang in selected_langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks",
            )
        if not tasks:
            # st.columns() rejects a column count of zero, so stop here when
            # the user has deselected every setting.
            st.info("Select at least one setting to display.")
            continue

        lang_data = df[(df["item_id"] == selected_item_id) & (df["lang"] == lang)]
        lang_dicts = lang_data.to_dict("records")

        # Keep the first record found for each task type; a missing task type
        # is reported in its column instead of raising an IndexError.
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)

        # Length of the longest MT output, used to size the invisible
        # placeholder shown for records that have no MT text.
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )

        columns = st.columns(len(tasks))
        for task_name, col in zip(tasks, columns):
            dic = records_by_type.get(TASK_TYPES[task_name])
            with col:
                st.header(task_name)
                if dic is None:
                    st.warning("No data available for this setting.")
                    continue
                st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)
                if dic["mt_text"] is None:
                    # Invisible filler so the columns stay vertically aligned
                    # with those that do display an MT output.
                    mt_text = "<span style='opacity:0'>" + "O " * (max_mt_length // 2) + "</span>"
                else:
                    mt_text = _escape(dic["mt_text"])
                st.markdown(f"<b>MT</b>: {mt_text}", unsafe_allow_html=True)
                st.markdown(f"<b>PE</b>: {_escape(dic['tgt_text'])}", unsafe_allow_html=True)
                st.markdown(f"<b>Aligned edits</b>:", unsafe_allow_html=True)
                if dic["aligned_edit"] is not None:
                    # The stored alignment uses literal "\n" sequences and
                    # REF/HYP labels; show real line breaks and MT/PE labels.
                    st.text(dic["aligned_edit"].replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :"))
                else:
                    st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
                st.markdown(f"<b>Metadata</b>:", unsafe_allow_html=True)
                st.json({k: v for k, v in dic.items() if k not in TEXT_FIELDS})