# DivEMT Explorer: a Streamlit page for browsing the "GroNLP/divemt" dataset.
#
# The script loads the "train" split into a DataFrame, lets the user pick a
# source sentence by position and a set of languages, and then renders, per
# language, one column per selected setting (HT / PE1 / PE2) with the
# translator id, MT output, final text, aligned edits and remaining metadata.
#
# Comments are used instead of a module docstring so that nothing but the
# explicit st.* calls below can end up rendered on the page.
from datasets import load_dataset
import streamlit as st

st.set_page_config(layout="wide")

# Text shown at the top of the page. Split across adjacent literals only for
# readability; the concatenated value is the original string, unchanged.
INTRO_TEXT = (
    " The DivEMT Explorer is a tool to explore translations and edits "
    "contained in the DivEMT corpus. "
    'Use the expandable section "Explore examples" below to visualize some '
    "of the original source sentences. "
    "When you found a sentence that you might be interested in, insert its "
    "numeric id (between 0 and 429) in the box below, and select all the "
    "languages for which you want to visualize the results. "
    "Inside every generated section you will find the translations for all "
    "the available settings, alongside aligned edits and a collection of "
    "collected metadata. "
    "You can filter the showed settings to better see the aligned edits "
    "annotations. "
)

# Display name of each setting -> value of the "task_type" column it maps to.
# Insertion order defines the order of the options in the settings selector.
TASK_TYPES = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(TASK_TYPES)

# Fields rendered explicitly in each column, hence excluded from the
# metadata JSON dump.
TEXT_FIELDS = ("src_text", "mt_text", "tgt_text", "aligned_edit")

# NOTE(review): the dataset is loaded and converted at script top level;
# consider Streamlit's caching helpers if this turns out to be slow --
# confirm which caching API the deployed Streamlit version provides.
dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()

# One row per distinct item_id, keeping the first occurrence of its source.
unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
available_langs = list(df["lang_id"].unique())

st.title("DivEMT Explorer")

intro_col, _ = st.columns([2, 1])
with intro_col:
    st.write(INTRO_TEXT)

with st.expander("Explore examples"):
    offset_col, count_col, _ = st.columns([3, 2, 5])
    with offset_col:
        offset = st.slider(
            "Select an offset",
            min_value=0,
            max_value=len(unique_src) - 5,
            value=0,
        )
    with count_col:
        count = st.number_input(
            "Select the number of examples to display",
            min_value=3,
            max_value=len(unique_src),
            value=5,
        )
    # Positional slice; a window running past the end is simply truncated.
    st.table(unique_src[offset:int(offset + count)])

item_col, lang_col, _ = st.columns([1, 1, 3])
with item_col:
    item_id = st.number_input(
        "Select an item (0-429) to inspect",
        min_value=0,
        max_value=len(unique_src) - 1,
    )
with lang_col:
    # Kept separate from `available_langs` so the option list is not
    # overwritten by the user's selection.
    selected_langs = st.multiselect(
        "Select languages",
        options=available_langs,
    )

# The number input is a position into `unique_src`, not an item_id value, so
# resolve the row once here instead of once per language.
selected_row = unique_src.iloc[int(item_id)]
selected_item_id = selected_row["item_id"]

# NOTE(review): unsafe_allow_html is kept on every st.markdown call as in the
# original; presumably the rendered strings once carried inline HTML --
# confirm before removing the flag.
st.markdown("Source text: " + selected_row["src_text"], unsafe_allow_html=True)

for lang in selected_langs:
    with st.expander(f"View {lang.upper()} data"):
        selector_col, _ = st.columns([1, 2])
        with selector_col:
            tasks = st.multiselect(
                "Select settings",
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks",
            )

        # st.columns rejects a column count of zero, so bail out of this
        # language early when every setting has been deselected.
        if not tasks:
            st.info("Select at least one setting to display.")
            continue

        lang_data = df[(df["item_id"] == selected_item_id) & (df["lang_id"] == lang)]
        lang_dicts = lang_data.to_dict("records")
        if not lang_dicts:
            st.warning("No data available for this item and language.")
            continue

        # First record found for each task type; a missing task type is left
        # out of the mapping rather than raising IndexError.
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)

        # Length of the longest available MT output for this item/language;
        # 0 when no record carries an MT output at all.
        max_mt_length = max(
            (len(r["mt_text"]) for r in lang_dicts if r["mt_text"] is not None),
            default=0,
        )

        columns = st.columns(len(tasks))
        for task_name, col in zip(tasks, columns):
            dic = records_by_type.get(TASK_TYPES[task_name])
            with col:
                st.header(task_name)
                if dic is None:
                    st.warning("No data available for this setting.")
                    continue

                st.markdown(f"Translator: {dic['subject_id']}", unsafe_allow_html=True)

                mt_text = dic["mt_text"]
                if mt_text is None:
                    # Filler of roughly the longest MT output's length;
                    # presumably there to keep the columns visually aligned
                    # when a setting has no MT output -- confirm.
                    mt_text = "O " * (max_mt_length // 2)
                st.markdown(f"MT: {mt_text}", unsafe_allow_html=True)
                st.markdown(f"PE: {dic['tgt_text']}", unsafe_allow_html=True)

                st.markdown("Aligned edits:", unsafe_allow_html=True)
                aligned_edit = dic["aligned_edit"]
                if aligned_edit is not None:
                    # Turn literal backslash-n sequences into real newlines and
                    # relabel the REF/HYP rows as MT/PE.
                    st.text(
                        aligned_edit.replace("\\n", "\n")
                        .replace("REF:", "MT :")
                        .replace("HYP:", "PE :")
                    )
                else:
                    st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")

                st.markdown("Metadata:", unsafe_allow_html=True)
                st.json({k: v for k, v in dic.items() if k not in TEXT_FIELDS})