ScientiaEtVeritas committed on
Commit
c57bf8a
1 Parent(s): 2412f21

initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. app.py +322 -0
  3. demo_data/lectures/Lecture-01-18.04.2023/English.vtt +2582 -0
  4. demo_data/lectures/Lecture-01-18.04.2023/video.mp4 +3 -0
  5. demo_data/lectures/Lecture-02-20.04.2023/English.vtt +2984 -0
  6. demo_data/lectures/Lecture-02-20.04.2023/video.mp4 +3 -0
  7. demo_data/lectures/Lecture-03-25.04.2023/English.vtt +3102 -0
  8. demo_data/lectures/Lecture-03-25.04.2023/video.mp4 +3 -0
  9. demo_data/lectures/Lecture-04-27.04.2023/English.vtt +2919 -0
  10. demo_data/lectures/Lecture-04-27.04.2023/video.mp4 +3 -0
  11. demo_data/lectures/Lecture-05-02.05.2023/English.vtt +1124 -0
  12. demo_data/lectures/Lecture-05-02.05.2023/video.mp4 +3 -0
  13. demo_data/lectures/Lecture-06-09.05.2023/English.vtt +2970 -0
  14. demo_data/lectures/Lecture-06-09.05.2023/video.mp4 +3 -0
  15. demo_data/lectures/Lecture-07-11.05.2023/English.vtt +2596 -0
  16. demo_data/lectures/Lecture-07-11.05.2023/video.mp4 +3 -0
  17. demo_data/lectures/Lecture-07-16.05.2023/English.vtt +2523 -0
  18. demo_data/lectures/Lecture-07-16.05.2023/video.mp4 +3 -0
  19. demo_data/lectures/Lecture-09-25.05.2023/English.vtt +3039 -0
  20. demo_data/lectures/Lecture-09-25.05.2023/video.mp4 +3 -0
  21. demo_data/lectures/Lecture-10-13.06.2023/English.vtt +2458 -0
  22. demo_data/lectures/Lecture-10-13.06.2023/video.mp4 +3 -0
  23. demo_data/lectures/Lecture-11-15.06.2023/English.vtt +0 -0
  24. demo_data/lectures/Lecture-11-15.06.2023/video.mp4 +3 -0
  25. demo_data/lectures/Lecture-12-20.06.2023/English.vtt +0 -0
  26. demo_data/lectures/Lecture-12-20.06.2023/video.mp4 +3 -0
  27. demo_data/lectures/Lecture-13-04.07.2023/English.vtt +2699 -0
  28. demo_data/lectures/Lecture-13-04.07.2023/video.mp4 +3 -0
  29. demo_data/lectures/Lecture-14-27.06.2023/English.vtt +2753 -0
  30. demo_data/lectures/Lecture-14-27.06.2023/video.mp4 +3 -0
  31. demo_data/lectures/Lecture-15-11.07.2023/English.vtt +2295 -0
  32. demo_data/lectures/Lecture-15-11.07.2023/video.mp4 +3 -0
  33. demo_data/lectures/Lecture-18-18.07.2023/English.vtt +2738 -0
  34. demo_data/lectures/Lecture-18-18.07.2023/video.mp4 +3 -0
  35. demo_data/lectures/Lecture-19-21.07.2023/English.vtt +2860 -0
  36. demo_data/lectures/Lecture-19-21.07.2023/video.mp4 +3 -0
  37. demo_data/nips-2021/25953/metadata.json +3 -0
  38. demo_data/nips-2021/25953/transcript_whisper_large-v2.vtt +581 -0
  39. demo_data/nips-2021/25953/video.mp4 +3 -0
  40. demo_data/nips-2021/25957/metadata.json +3 -0
  41. demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt +539 -0
  42. demo_data/nips-2021/25957/video.mp4 +3 -0
  43. demo_data/nips-2021/25958/metadata.json +3 -0
  44. demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt +374 -0
  45. demo_data/nips-2021/25958/video.mp4 +3 -0
  46. demo_data/nips-2021/25959/metadata.json +3 -0
  47. demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt +353 -0
  48. demo_data/nips-2021/25959/video.mp4 +3 -0
  49. demo_data/nips-2021/25962/metadata.json +3 -0
  50. demo_data/nips-2021/25962/transcript_whisper_large-v2.vtt +155 -0
.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  video.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.psd filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_data/lectures/*/*.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_data/*/.mp4 filter=lfs diff=lfs merge=lfs -text
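For orientation, here is a rough, hypothetical sketch of what the two path-specific patterns cover. Real .gitattributes matching follows gitignore-style rules, so the pathlib matching below is only an approximation, and the broad *.mp4 rule already covers every .mp4 in the repository.

# Illustration only: approximate the path-specific LFS patterns with pathlib matching.
from pathlib import PurePosixPath

examples = [
    "demo_data/lectures/Lecture-01-18.04.2023/video.mp4",
    "demo_data/nips-2021/25953/video.mp4",
]

for path in examples:
    p = PurePosixPath(path)
    print(path,
          p.match("demo_data/lectures/*/*.mp4"),  # matches the lecture videos
          p.match("demo_data/*/.mp4"))            # matches only files literally named ".mp4"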
app.py ADDED
@@ -0,0 +1,322 @@
+ import itertools
+ import json
+ import re
+ from functools import partial
+ from pathlib import Path
+
+ import requests
+ import streamlit as st
+ import webvtt
+ from load_data import get_partition
+ from transformers import AutoTokenizer
+
+ from generate_text_api import TextGenerator
+ from model_inferences.utils.chunking import Truncater
+ from model_inferences.utils.files import get_captions_from_vtt, get_transcript
+
+ USE_PARAGRAPHING_MODEL = True
+
+ def get_sublist_by_flattened_index(A, i):
+     current_index = 0
+     for sublist in A:
+         sublist_length = len(sublist)
+         if current_index <= i < current_index + sublist_length:
+             return sublist, A.index(sublist)
+         current_index += sublist_length
+     return None, None
+
+
+ def get_talk_metadata(video_id):
+     url = "https://www.ted.com/graphql"
+
+     headers = {
+         "Content-Type": "application/json",
+         "Accept": "application/json",
+         "x-operation-name": "Transcript",  # Replace with the actual operation name
+     }
+
+     data = {
+         "query": """
+             query GetTalk($videoId: ID!) {
+                 video(id: $videoId) {
+                     title,
+                     presenterDisplayName,
+                     nativeDownloads {medium}
+                 }
+             }
+         """,
+         "variables": {
+             "videoId": video_id,  # Corrected key to "videoId"
+         },
+     }
+
+     response = requests.post(url, json=data, headers=headers)
+
+     if response.status_code == 200:
+         result = response.json()
+         return result
+     else:
+         print(f"Error: {response.status_code}, {response.text}")
+
+
+ class OfflineTextSegmenterClient:
+     def __init__(self, host_url):
+         self.host_url = host_url.rstrip("/") + "/segment"
+
+     def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
+         payload = {
+             'text': text,
+             'captions': captions,
+             'generate_titles': generate_titles,
+             "prefix_titles": True,
+             "threshold": threshold,
+         }
+
+         headers = {
+             'Content-Type': 'application/json'
+         }
+
+         response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
+         # segments = response["annotated_segments"] if "annotated_segments" in response else response["segments"]
+         return {'segments': response["segments"], 'titles': response["titles"], 'sentences': response["sentences"]}
+
+
+ class Toc:
+
+     def __init__(self):
+         self._items = []
+         self._placeholder = None
+
+     def title(self, text):
+         self._markdown(text, "h1")
+
+     def header(self, text):
+         self._markdown(text, "h2", " " * 2)
+
+     def subheader(self, text):
+         self._markdown(text, "h3", " " * 4)
+
+     def placeholder(self, sidebar=False):
+         self._placeholder = st.sidebar.empty() if sidebar else st.empty()
+
+     def generate(self):
+         if self._placeholder:
+             self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
+
+     def _markdown(self, text, level, space=""):
+         key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())
+         st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
+         self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
+
+
+ custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
+ st.write(custom_css, unsafe_allow_html=True)
+
+ def concat_prompt(prompt_text, text, model_name):
+     if 'flan' in model_name:
+         input_ = prompt_text + "\n\n" + text
+     elif 'galactica' in model_name:
+         input_ = text + "\n\n" + prompt_text
+     return input_
+
+ endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
+ ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
+
+ client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
+ if USE_PARAGRAPHING_MODEL:
+     paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
+ summarizer = TextGenerator(endpoint)
+
+ tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
+
+ # TLDR PROMPT
+
+ SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
+
+ TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
+ {system_prompt}
+ <</SYS>>
+
+ {user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
+
+ TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
+
+ TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
+ TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
+
+ # BULLET POINT PROMPT
+
+ BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
+ {system_prompt}
+ <</SYS>>
+
+ {user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
+
+ BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
+
+ BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=BP_USER_PROMPT)
+ BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
+
+ # Input budget: context window minus generation budget, prompt scaffolding, and BOS token.
+ CONTEXT_LENGTH = 3072
+ MAX_SUMMARY_LENGTH = 1024
+ TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
+ BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
+
+
+ text_generator = TextGenerator(endpoint)
+ temperature = 0.7
+
+
+ def replace_newlines(text):
+     updated_text = re.sub(r'\n+', r'\n\n', text)
+     return updated_text
+
+ def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
+     all_generated_text = prefix
+     truncater = Truncater(tokenizer, max_length=max_input_length)
+     input_ = truncater(input_)
+     input_ = prompt.format(input=input_)
+     for generated_text in summarizer.generate_text_stream(input_, max_new_tokens=MAX_SUMMARY_LENGTH, do_sample=True, temperature=temperature):
+         all_generated_text += replace_newlines(generated_text)
+         generated_text_box.info(all_generated_text)
+     print(all_generated_text)
+     return all_generated_text.strip()
+
+ st.header("Demo: Intelligent Recap")
+
+ if not hasattr(st, 'global_state'):
+     st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
+
+     # NIPS 2021 Talks
+     transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
+     # get titles from metadata.json
+     transcripts_map = {}
+     for transcript_file in transcript_files:
+         base_path = transcript_file.parent
+         metadata = base_path / "metadata.json"
+         txt_file = base_path / "transcript_whisper_large-v2.txt"
+         with open(metadata) as f:
+             metadata = json.load(f)
+         title = metadata["title"]
+         transcript = get_transcript(txt_file)
+         captions = get_captions_from_vtt(transcript_file)
+         transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
+     st.global_state['NIPS 2021 Talks'] = transcripts_map
+
+     # TED Talks
+     data = get_partition("train").sample(15, random_state=41)
+     video_ids = data.talk_id.tolist()
+     transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
+     transcripts_map = {}
+     for video_id, transcript in zip(video_ids, transcripts):
+         metadata = get_talk_metadata(video_id)
+         title = metadata["data"]["video"]["title"]
+         presenter = metadata["data"]["video"]["presenterDisplayName"]
+         print(metadata["data"])
+         if metadata["data"]["video"]["nativeDownloads"] is None:
+             continue
+         video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
+         transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
+     st.global_state['TED Talks'] = transcripts_map
+
+     # KIT Lectures
+     def get_lecture_id(path):
+         return int(path.parts[-2].split('-')[1])
+
+     transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
+     sorted_path_list = sorted(transcript_files, key=get_lecture_id)
+
+     transcripts_map = {}
+     for transcript_file in sorted_path_list:
+         base_path = transcript_file.parent
+         lecture_id = base_path.parts[-1]
+         transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
+         video_path = Path(base_path, "video.mp4")
+         transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
+     st.global_state['KIT Lectures'] = transcripts_map
+
+ type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
+
+ transcripts_map = st.global_state[type_of_document]
+
+ selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
+
+ st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
+
+ input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
+
+ toc = Toc()
+
+ summarization_todos = []
+
+ with st.expander("Adjust Thresholds"):
+     threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
+     paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
+
+ if st.button("Process Transcript"):
+     with st.sidebar:
+         st.header("Table of Contents")
+         toc.placeholder()
+
+     st.header(selected_talk, divider='rainbow')
+     # if 'presenter' in transcripts_map[selected_talk]:
+     #     st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
+
+     captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
+     result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
+     if USE_PARAGRAPHING_MODEL:
+         presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
+         paragraphs = presult['segments']
+     segments, titles, sentences = result['segments'], result['titles'], result['sentences']
+
+     if USE_PARAGRAPHING_MODEL:
+         prev_chapter_idx = 0
+         prev_paragraph_idx = 0
+         segment = []
+         for i, sentence in enumerate(sentences):
+             chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
+             paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
+
+             if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
+                 print("Chapter / Chapter & Paragraph")
+                 segment_text = " ".join(segment)
+                 toc.subheader(titles[prev_chapter_idx])
+                 if len(segment_text) > 1200:
+                     generated_text_box = st.info("")
+                     summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+                 elif len(segment_text) > 450:
+                     generated_text_box = st.info("")
+                     summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+                 st.write(segment_text)
+                 segment = []
+             elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
+                 print("Paragraph")
+                 segment.append("\n\n")
+
+             segment.append(sentence)
+
+             prev_chapter_idx = chapter_idx
+             prev_paragraph_idx = paragraph_idx
+
+         segment_text = " ".join(segment)
+         toc.subheader(titles[prev_chapter_idx])
+         if len(segment_text) > 1200:
+             generated_text_box = st.info("")
+             summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+         elif len(segment_text) > 450:
+             generated_text_box = st.info("")
+             summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+         st.write(segment_text)
+
+     else:
+         segments = [" ".join([sentence for sentence in segment]) for segment in segments]
+         for title, segment in zip(titles, segments):
+             toc.subheader(title)
+             if len(segment) > 1200:
+                 generated_text_box = st.info("")
+                 summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+             elif len(segment) > 450:
+                 generated_text_box = st.info("")
+                 summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+             st.write(segment)
+     toc.generate()
+
+     for summarization_todo in summarization_todos:
+         summarization_todo()
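The chaptering view above hinges on mapping a flat sentence index back to its enclosing chapter and paragraph. A minimal, self-contained sketch of that bookkeeping, using invented segment data instead of the real segmentation service:

# Sketch only: invented example data, no Streamlit or segmentation endpoints involved.
def get_sublist_by_flattened_index(A, i):
    """Return (sublist, index_of_sublist) holding the i-th element of the flattened A."""
    current_index = 0
    for sublist_index, sublist in enumerate(A):
        if current_index <= i < current_index + len(sublist):
            return sublist, sublist_index
        current_index += len(sublist)
    return None, None

chapters = [["s0", "s1", "s2"], ["s3", "s4"]]      # chapter segmentation: 2 chapters
paragraphs = [["s0", "s1"], ["s2", "s3"], ["s4"]]  # finer paragraph segmentation

sentences = [s for chapter in chapters for s in chapter]
for i, sentence in enumerate(sentences):
    _, chapter_idx = get_sublist_by_flattened_index(chapters, i)
    _, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
    print(i, sentence, "chapter:", chapter_idx, "paragraph:", paragraph_idx)

# Sentence "s3" starts a new chapter, which is the case where the app flushes the
# accumulated segment, adds a TOC entry, and queues a summary for it.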
demo_data/lectures/Lecture-01-18.04.2023/English.vtt ADDED
@@ -0,0 +1,2582 @@
1
+ WEBVTT
2
+
3
+ 0:00:00.000 --> 0:00:10.115
4
+ That easy to say this is a good translation
5
+ and this is a bad translation.
6
+
7
+ 0:00:10.115 --> 0:00:12.947
8
+ How can we evaluate?
9
+
10
+ 0:00:13.413 --> 0:00:26.083
11
+ We will put an emphasis on machine translation
12
+ because that is currently the state of the
13
+
14
+ 0:00:26.083 --> 0:00:26.787
15
+ art.
16
+
17
+ 0:00:28.028 --> 0:00:35.120
18
+ But we are now focused on the details of neural
19
+ networks where we are describing the basic
20
+
21
+ 0:00:35.120 --> 0:00:39.095
22
+ ideas and how to use the info machine translation.
23
+
24
+ 0:00:39.095 --> 0:00:41.979
25
+ This is not a neural network course.
26
+
27
+ 0:00:42.242 --> 0:00:49.574
28
+ If you have some background in Neo Networks,
29
+ that is of course of an advantage, but it should
30
+
31
+ 0:00:49.574 --> 0:00:51.134
32
+ not be a challenge.
33
+
34
+ 0:00:51.134 --> 0:00:58.076
35
+ If you have not done the details, we'll shortly
36
+ cover the background and the main ideas.
37
+
38
+ 0:00:58.076 --> 0:01:00.338
39
+ How can we use them for for?
40
+
41
+ 0:01:00.280 --> 0:01:06.880
42
+ Machine translation: We will starve the first
43
+ two, three lectures with some like more traditional
44
+
45
+ 0:01:06.880 --> 0:01:12.740
46
+ approaches how they work because they still
47
+ give some good intuition, some good ideas.
48
+
49
+ 0:01:12.872 --> 0:01:17.141
50
+ And they help us to understand where our systems
51
+ might be better.
52
+
53
+ 0:01:17.657 --> 0:01:22.942
54
+ And yeah, we have an innocence on really what
55
+ do we need to do to build a strong system.
56
+
57
+ 0:01:23.343 --> 0:01:35.534
58
+ And then we have a part on experience where
59
+ it's about how to build the systems and how
60
+
61
+ 0:01:35.534 --> 0:01:37.335
62
+ to apply it.
63
+
64
+ 0:01:39.799 --> 0:01:47.774
65
+ For additional reading materials, so we have
66
+ the slides on the website.
67
+
68
+ 0:01:47.774 --> 0:01:55.305
69
+ There is also links to papers which cover
70
+ the topic of the lecture.
71
+
72
+ 0:01:55.235 --> 0:01:58.436
73
+ If You'd Like to Study Additional Books.
74
+
75
+ 0:01:59.559 --> 0:02:07.158
76
+ Think the most relevant is this machine translation
77
+ from Philip Kurnan, which gives an introduction
78
+
79
+ 0:02:07.158 --> 0:02:09.210
80
+ about machine translation.
81
+
82
+ 0:02:09.210 --> 0:02:15.897
83
+ But this lecture is, of course, not a one
84
+ to one like we don't go through the book, but
85
+
86
+ 0:02:15.897 --> 0:02:17.873
87
+ it covers related topics.
88
+
89
+ 0:02:18.678 --> 0:02:25.094
90
+ Is a previous version of that statistical
91
+ machine translation focusing on that part,
92
+
93
+ 0:02:25.094 --> 0:02:28.717
94
+ and we cover some of that part rather than
95
+ all.
96
+
97
+ 0:02:28.717 --> 0:02:35.510
98
+ If you want to have more basics about natural
99
+ language processing, this might be helpful.
100
+
101
+ 0:02:39.099 --> 0:02:53.738
102
+ In addition, there is an online course on
103
+ machine translation which we also develop here
104
+
105
+ 0:02:53.738 --> 0:02:57.521
106
+ at which is available.
107
+
108
+ 0:02:57.377 --> 0:03:04.894
109
+ Input where you're, of course, free to use
110
+ that I might give you some other type of presentation
111
+
112
+ 0:03:04.894 --> 0:03:07.141
113
+ of the lecture important is.
114
+
115
+ 0:03:07.141 --> 0:03:14.193
116
+ It's, of course, a lot shorter and book doesn't
117
+ cover all the topics which you're covering
118
+
119
+ 0:03:14.193 --> 0:03:15.432
120
+ in the lecture.
121
+
122
+ 0:03:15.655 --> 0:03:19.407
123
+ So, of course, for the exam everything which
124
+ was in the lecture is important.
125
+
126
+ 0:03:19.679 --> 0:03:25.012
127
+ This covers like the first half where don't
128
+ know exactly the first X lectures.
129
+
130
+ 0:03:26.026 --> 0:03:28.554
131
+ Feel free to have a look at that.
132
+
133
+ 0:03:28.554 --> 0:03:29.596
134
+ It's shorter.
135
+
136
+ 0:03:29.596 --> 0:03:36.438
137
+ Maybe there's some of you interesting to have
138
+ very short videos or after the lecture single
139
+
140
+ 0:03:36.438 --> 0:03:39.934
141
+ this topic I didn't understand want to repeat.
142
+
143
+ 0:03:40.260 --> 0:03:50.504
144
+ Then this might be helpful, but it's important
145
+ that there is more content in the lecture.
146
+
147
+ 0:03:53.753 --> 0:04:02.859
148
+ The exam will be minutes and oral exam and
149
+ just make an appointment and then.
150
+
151
+ 0:04:05.305 --> 0:04:09.735
152
+ If you think this is a really cool topic,
153
+ want to hear more.
154
+
155
+ 0:04:09.735 --> 0:04:14.747
156
+ There's two similars, one on advanced topics
157
+ in machine translation.
158
+
159
+ 0:04:15.855 --> 0:04:24.347
160
+ Which is every Thursday and there is one which
161
+ was already on Monday.
162
+
163
+ 0:04:24.347 --> 0:04:34.295
164
+ But if you're interested in speech translation
165
+ to contact us and there, I think,.
166
+
167
+ 0:04:34.734 --> 0:04:47.066
168
+ Then there are other lectures, one more learning
169
+ by Professor Vival, and for us some of you
170
+
171
+ 0:04:47.066 --> 0:04:48.942
172
+ have already.
173
+
174
+ 0:04:48.888 --> 0:04:55.496
175
+ Lecture, which is related but of discovering
176
+ more general natural language processing than
177
+
178
+ 0:04:55.496 --> 0:04:57.530
179
+ will be again available in.
180
+
181
+ 0:04:57.597 --> 0:05:07.108
182
+ Winter semester, and then we are concentrating
183
+ on the task of machine translation and mighty.
184
+
185
+ 0:05:11.191 --> 0:05:14.630
186
+ Yeah, and also there's an automatic speech
187
+ emission problem.
188
+
189
+ 0:05:16.616 --> 0:05:27.150
190
+ And this is a bit what we are planning to
191
+ talk about in this semester.
192
+
193
+ 0:05:27.150 --> 0:05:30.859
194
+ Today we have a general.
195
+
196
+ 0:05:31.371 --> 0:05:37.362
197
+ Then on Thursday we are doing a bit of a different
198
+ lecture and that's about the linguistic.
199
+
200
+ 0:05:37.717 --> 0:05:42.475
201
+ It may be quite different from what you're
202
+ more computer scientist, what you've done there,
203
+
204
+ 0:05:42.475 --> 0:05:43.354
205
+ but don't worry.
206
+
207
+ 0:05:43.763 --> 0:05:49.051
208
+ We're coming in a very basic thing that I
209
+ think it's important if you're dealing with
210
+
211
+ 0:05:49.051 --> 0:05:53.663
212
+ natural language to have a bit of an understanding
213
+ of what language isn't.
214
+
215
+ 0:05:53.663 --> 0:05:59.320
216
+ Maybe I've learned about that in high school,
217
+ but also for you this I guess some years ago.
218
+
219
+ 0:05:59.619 --> 0:06:07.381
220
+ And so it's a bit of yeah, it better understand
221
+ also what other challenges there.
222
+
223
+ 0:06:07.307 --> 0:06:16.866
224
+ And especially since we are all dealing with
225
+ our mother time, it may be English, but there
226
+
227
+ 0:06:16.866 --> 0:06:25.270
228
+ is a lot of interesting phenomena which would
229
+ not occur in these two languages.
230
+
231
+ 0:06:25.625 --> 0:06:30.663
232
+ And therefore we'll also look a bit into what
233
+ are things which might happen in other languages.
234
+
235
+ 0:06:30.930 --> 0:06:35.907
236
+ If we want to build machine translation, of
237
+ course we want to build machine Translation
238
+
239
+ 0:06:35.907 --> 0:06:36.472
240
+ for many.
241
+
242
+ 0:06:38.178 --> 0:06:46.989
243
+ Then we will see a lot of these machine learning
244
+ based how to get the data and process the data
245
+
246
+ 0:06:46.989 --> 0:06:47.999
247
+ next week.
248
+
249
+ 0:06:48.208 --> 0:07:03.500
250
+ And then we'll have one lecture about statistical
251
+ machine translation, which was the approach
252
+
253
+ 0:07:03.500 --> 0:07:06.428
254
+ for twenty years.
255
+
256
+ 0:07:07.487 --> 0:07:17.308
257
+ And then maybe surprisingly very early we'll
258
+ talk about evaluation and this is because evaluation
259
+
260
+ 0:07:17.308 --> 0:07:24.424
261
+ is really essential for machine translation
262
+ and it's very challenging.
263
+
264
+ 0:07:24.804 --> 0:07:28.840
265
+ To decide if machine translation output is
266
+ good or bad is really challenging.
267
+
268
+ 0:07:29.349 --> 0:07:38.563
269
+ If you see another translation for a machine
270
+ to decide is not as difficult and even for
271
+
272
+ 0:07:38.563 --> 0:07:48.387
273
+ a machine translation output and ask them to
274
+ rate, you'll get three different answers: And
275
+
276
+ 0:07:48.387 --> 0:07:55.158
277
+ so it's worse to investigate it, and of course
278
+ it's also important to have that at the beginning
279
+
280
+ 0:07:55.158 --> 0:08:01.928
281
+ because if we're later talking about some techniques,
282
+ it will be always saying this technique is
283
+
284
+ 0:08:01.928 --> 0:08:03.813
285
+ better by x percent or so.
286
+
287
+ 0:08:04.284 --> 0:08:06.283
288
+ And we'll also have a practical good course
289
+ of this.
290
+
291
+ 0:08:06.746 --> 0:08:16.553
292
+ Then we're going to build language models
293
+ which are in point to translation models.
294
+
295
+ 0:08:16.736 --> 0:08:28.729
296
+ After the half you have a basic understanding
297
+ of what and basic machine translation.
298
+
299
+ 0:08:29.029 --> 0:08:39.065
300
+ And then on the second part of the lecture
301
+ we will cover more advanced topics.
302
+
303
+ 0:08:39.065 --> 0:08:42.369
304
+ What are the challenging?
305
+
306
+ 0:08:43.463 --> 0:08:48.035
307
+ One challenge is, of course, about additional
308
+ resources about data.
309
+
310
+ 0:08:48.208 --> 0:08:53.807
311
+ So the question is how can we get more data
312
+ or better data and their different ways of
313
+
314
+ 0:08:53.807 --> 0:08:54.258
315
+ doing?
316
+
317
+ 0:08:54.214 --> 0:09:00.230
318
+ Our thralling data will look into our building
319
+ systems which not translate between one language
320
+
321
+ 0:09:00.230 --> 0:09:06.122
322
+ but which translate between fifteen languages
323
+ and youth knowledge and share knowledge between
324
+
325
+ 0:09:06.122 --> 0:09:09.632
326
+ the language so that for each pair they need
327
+ less data.
328
+
329
+ 0:09:11.751 --> 0:09:19.194
330
+ And then we'll have something about efficiency.
331
+
332
+ 0:09:19.194 --> 0:09:27.722
333
+ That is, of course, with more and more complex
334
+ models.
335
+
336
+ 0:09:27.647 --> 0:09:33.053
337
+ Because then nobody can afford to do that,
338
+ so how can you build really efficient things?
339
+
340
+ 0:09:33.393 --> 0:09:38.513
341
+ Who also like energy is getting more expensive
342
+ so it's even more important to build systems.
343
+
344
+ 0:09:39.419 --> 0:09:43.447
345
+ We're Looking to Biases So.
346
+
347
+ 0:09:43.423 --> 0:09:50.364
348
+ That is a machine translation quite interesting
349
+ because some information are represented different
350
+
351
+ 0:09:50.364 --> 0:09:51.345
352
+ in languages.
353
+
354
+ 0:09:51.345 --> 0:09:55.552
355
+ So if you think about German, there is always
356
+ clear or not.
357
+
358
+ 0:09:55.552 --> 0:10:00.950
359
+ But in a lot of situations, it's clear if
360
+ you talk about to teach her about.
361
+
362
+ 0:10:01.321 --> 0:10:03.807
363
+ Another Person If It's Male or Female.
364
+
365
+ 0:10:04.204 --> 0:10:13.832
366
+ From English to German you don't have this
367
+ information, so how do you generate that and
368
+
369
+ 0:10:13.832 --> 0:10:15.364
370
+ what systems?
371
+
372
+ 0:10:15.515 --> 0:10:24.126
373
+ Will just assume things and we'll see that
374
+ exactly this is happening, so in order to address
375
+
376
+ 0:10:24.126 --> 0:10:27.459
377
+ these challenges and try to reduce.
378
+
379
+ 0:10:28.368 --> 0:10:35.186
380
+ The main adaptation is what I said that beginning
381
+ systems are good at the task they are trained.
382
+
383
+ 0:10:35.186 --> 0:10:37.928
384
+ But how can we adapt them to new task?
385
+
386
+ 0:10:38.959 --> 0:10:51.561
387
+ Document level is doing more context and we
388
+ have two lectures about speech translation,
389
+
390
+ 0:10:51.561 --> 0:10:56.859
391
+ so mostly before we are translating.
392
+
393
+ 0:10:57.117 --> 0:11:00.040
394
+ Are now translating audio things.
395
+
396
+ 0:11:00.040 --> 0:11:05.371
397
+ We have just additional challenges and these
398
+ we will address.
399
+
400
+ 0:11:10.450 --> 0:11:22.165
401
+ So to the motivation, why should you work
402
+ on the theme translation and why should you
403
+
404
+ 0:11:22.165 --> 0:11:23.799
405
+ put effort?
406
+
407
+ 0:11:24.224 --> 0:11:30.998
408
+ So we want or we are living in a more global
409
+ society.
410
+
411
+ 0:11:30.998 --> 0:11:37.522
412
+ You have now the chance to communicate with
413
+ people.
414
+
415
+ 0:11:37.897 --> 0:11:44.997
416
+ And the danger of course is that languages
417
+ are dying, and more and more languages are
418
+
419
+ 0:11:44.997 --> 0:11:45.988
420
+ going away.
421
+
422
+ 0:11:46.006 --> 0:11:53.669
423
+ I think at least that some opportunity in
424
+ order to keep more languages is that we have
425
+
426
+ 0:11:53.669 --> 0:12:01.509
427
+ technology solutions which help you to speak
428
+ in your language and still communicate with
429
+
430
+ 0:12:01.509 --> 0:12:04.592
431
+ people who speak another language.
432
+
433
+ 0:12:04.864 --> 0:12:16.776
434
+ And on the one hand there is the need and
435
+ more and more people want to speak in some
436
+
437
+ 0:12:16.776 --> 0:12:19.159
438
+ other languages.
439
+
440
+ 0:12:19.759 --> 0:12:27.980
441
+ For example, Iceland was really keen on getting
442
+ Icelandic into commercial systems and they
443
+
444
+ 0:12:27.980 --> 0:12:36.471
445
+ even provided data and so on because they wanted
446
+ that their language is spoken longer and not
447
+
448
+ 0:12:36.471 --> 0:12:38.548
449
+ just people switching.
450
+
451
+ 0:12:38.959 --> 0:12:47.177
452
+ So there's even like yeah, they were spending
453
+ for promoting this language in order to have
454
+
455
+ 0:12:47.177 --> 0:12:55.125
456
+ all these digital tools available for languages
457
+ which are not spoken by so many people.
458
+
459
+ 0:12:56.156 --> 0:13:07.409
460
+ So it's questionable and it's not completely
461
+ clear technology always provides.
462
+
463
+ 0:13:10.430 --> 0:13:25.622
464
+ If we think about machine translation, there
465
+ are different use cases in which you can use
466
+
467
+ 0:13:25.622 --> 0:13:26.635
468
+ that.
469
+
470
+ 0:13:27.207 --> 0:13:36.978
471
+ And this has some characteristics: So typically
472
+ in this case it is where machine translation
473
+
474
+ 0:13:36.978 --> 0:13:40.068
475
+ was used first anybody.
476
+
477
+ 0:13:40.780 --> 0:13:50.780
478
+ Because most youth outlets around the world
479
+ report at least some of the same events, like
480
+
481
+ 0:13:50.780 --> 0:13:58.669
482
+ was probably covered around the world in a
483
+ lot of different languages.
484
+
485
+ 0:13:59.279 --> 0:14:08.539
486
+ That is one point yes, so the training gator
487
+ is there.
488
+
489
+ 0:14:08.539 --> 0:14:16.284
490
+ That's definitely a good point here and then.
491
+
492
+ 0:14:17.717 --> 0:14:19.425
493
+ Yes, there was my regional idea.
494
+
495
+ 0:14:19.425 --> 0:14:23.256
496
+ The motivation program was a bit different
497
+ by you, but it's a good point.
498
+
499
+ 0:14:23.256 --> 0:14:26.517
500
+ So on the one end you'll understand maybe
501
+ not perfect English.
502
+
503
+ 0:14:26.517 --> 0:14:30.762
504
+ Also, it's for his personal use, so you're
505
+ using machine translation for you use.
506
+
507
+ 0:14:31.311 --> 0:14:37.367
508
+ It's not as important that this is really
509
+ perfect written text, but you're more interested
510
+
511
+ 0:14:37.367 --> 0:14:38.564
512
+ in understanding.
513
+
514
+ 0:14:38.858 --> 0:14:45.570
515
+ Maybe it's more clearer if you think about
516
+ the other situation where it's about dissimination
517
+
518
+ 0:14:45.570 --> 0:14:48.926
519
+ that means producing text in another language.
520
+
521
+ 0:14:48.926 --> 0:14:55.138
522
+ So just imagine you have a website or you
523
+ have a restaurant and you want to offer your
524
+
525
+ 0:14:55.138 --> 0:14:55.566
526
+ menu.
527
+
528
+ 0:14:56.476 --> 0:15:01.948
529
+ And in this case maybe you want to have a
530
+ higher quality because in some of your.
531
+
532
+ 0:15:01.901 --> 0:15:06.396
533
+ You're presenting something of yourself and
534
+ you want to have good quality.
535
+
536
+ 0:15:06.396 --> 0:15:11.490
537
+ Just remember you're writing a letter and
538
+ if you're translating your letter then you
539
+
540
+ 0:15:11.490 --> 0:15:17.123
541
+ don't want to have it full of mistakes because
542
+ it's somehow a bad, bad oppression but if it's
543
+
544
+ 0:15:17.123 --> 0:15:20.300
545
+ assimilation it's about you getting the information.
546
+
547
+ 0:15:20.660 --> 0:15:25.564
548
+ So here you want your disciplination, you're
549
+ producing texts for another language.
550
+
551
+ 0:15:26.006 --> 0:15:31.560
552
+ And then you have the disadvantage that you
553
+ maybe want to have a higher quality.
554
+
555
+ 0:15:31.831 --> 0:15:43.432
556
+ Therefore, typically there is less amount,
557
+ so normally you're getting more information
558
+
559
+ 0:15:43.432 --> 0:15:46.499
560
+ than you're producing.
561
+
562
+ 0:15:49.109 --> 0:15:57.817
563
+ Then of course there is a dynamic scenario
564
+ where there is some type of interaction and
565
+
566
+ 0:15:57.817 --> 0:16:07.099
567
+ the one thing which is interesting about the
568
+ dialogue scenario is there is: So if you're
569
+
570
+ 0:16:07.099 --> 0:16:18.045
571
+ translating a website you have all the data
572
+ available but in a dialogue scenario you.
573
+
574
+ 0:16:18.378 --> 0:16:23.655
575
+ And we'll see that in speech recognition this
576
+ is a big challenge.
577
+
578
+ 0:16:23.655 --> 0:16:30.930
579
+ Just to mention German where in German the
580
+ work is often more at the end, so each harmony.
581
+
582
+ 0:16:32.052 --> 0:16:36.343
583
+ Know that you want to generate the English
584
+ sentence.
585
+
586
+ 0:16:36.343 --> 0:16:42.740
587
+ Now you need to know if you cancel this registration
588
+ to produce a second word.
589
+
590
+ 0:16:42.740 --> 0:16:49.785
591
+ So you have to either guess or do something
592
+ in order to provide the translation before
593
+
594
+ 0:16:49.785 --> 0:16:52.052
595
+ the translation is already.
596
+
597
+ 0:16:57.817 --> 0:17:00.530
598
+ The question, of course, is in the new world.
599
+
600
+ 0:17:00.530 --> 0:17:05.659
601
+ I mean, of course, we can, on the one hand,
602
+ say we don't want to have English, but the
603
+
604
+ 0:17:05.659 --> 0:17:10.789
605
+ question is do we really need that many languages
606
+ and how many are here at the moment?
607
+
608
+ 0:17:11.291 --> 0:17:20.248
609
+ Does anybody have an idea how many languages
610
+ are spoken in the world?
611
+
612
+ 0:17:23.043 --> 0:17:26.510
613
+ This is already the first big challenge.
614
+
615
+ 0:17:26.510 --> 0:17:34.120
616
+ What a language is and what no language is
617
+ is already difficult, and then maybe one point
618
+
619
+ 0:17:34.120 --> 0:17:40.124
620
+ people have to argue first about written language
621
+ or spoken languages.
622
+
623
+ 0:17:40.400 --> 0:17:47.765
624
+ For written languages I think that number
625
+ is still too low, but for a spoken language
626
+
627
+ 0:17:47.765 --> 0:17:53.879
628
+ people normally think: So you see that it's
629
+ really a lot of languages which will be difficult
630
+
631
+ 0:17:53.879 --> 0:17:54.688
632
+ to all happen.
633
+
634
+ 0:17:55.035 --> 0:18:00.662
635
+ And these are just like you see Europe where
636
+ there's relatively few languages.
637
+
638
+ 0:18:00.662 --> 0:18:05.576
639
+ You already have quite a lot of languages,
640
+ even walls and countries.
641
+
642
+ 0:18:06.126 --> 0:18:13.706
643
+ Of course sometimes you share the language,
644
+ but then you have Briton or Gillesian vest
645
+
646
+ 0:18:13.706 --> 0:18:17.104
647
+ where you have languages in a country.
648
+
649
+ 0:18:18.478 --> 0:18:24.902
650
+ And yeah, of course, there's the question:
651
+ When does it start to be a language?
652
+
653
+ 0:18:24.902 --> 0:18:27.793
654
+ And when is it more like a dialect?
655
+
656
+ 0:18:27.793 --> 0:18:28.997
657
+ So is Catalan?
658
+
659
+ 0:18:28.997 --> 0:18:31.727
660
+ Is Swiss German a known language?
661
+
662
+ 0:18:31.727 --> 0:18:33.253
663
+ Or is it the same?
664
+
665
+ 0:18:33.293 --> 0:18:36.887
666
+ So then, of course, it's are like Czech and
667
+ Slovakian.
668
+
669
+ 0:18:36.887 --> 0:18:42.704
670
+ I know heard that people can understand each
671
+ other so they can just continue talking and
672
+
673
+ 0:18:42.704 --> 0:18:45.711
674
+ understand by some of their own language and.
675
+
676
+ 0:18:46.026 --> 0:18:56.498
677
+ Of course, it's partly also like about your
678
+ own nationality, so I think some people said
679
+
680
+ 0:18:56.498 --> 0:18:57.675
681
+ creation.
682
+
683
+ 0:18:58.018 --> 0:19:04.957
684
+ But think for a lot of people you shouldn't
685
+ say that they are part of being creation language.
686
+
687
+ 0:19:05.165 --> 0:19:10.876
688
+ But you see therefore that it is not completely
689
+ clear that there is no hardwater between this
690
+
691
+ 0:19:10.876 --> 0:19:13.974
692
+ and the new language, and this is a different
693
+ one.
694
+
695
+ 0:19:14.094 --> 0:19:19.403
696
+ And of course it's getting more fluent when
697
+ you talk about scientific things.
698
+
699
+ 0:19:19.403 --> 0:19:25.189
700
+ I guess sometimes it's no longer clear if
701
+ it's German or English because we start to
702
+
703
+ 0:19:25.189 --> 0:19:27.707
704
+ use a lot of English terms in there.
705
+
706
+ 0:19:27.707 --> 0:19:31.519
707
+ So of course there's interesting mixes which
708
+ will talk.
709
+
710
+ 0:19:33.193 --> 0:19:38.537
711
+ So should everybody just speak English, and
712
+ these numbers are a bit older, have to admit:
713
+
714
+ 0:19:38.938 --> 0:19:47.124
715
+ However, I don't think they're completely different
716
+ now and it says like how many people know in
717
+
718
+ 0:19:47.124 --> 0:19:54.718
719
+ Europe can speak English for countries where
720
+ English is not the mothertown or for people.
721
+
722
+ 0:19:54.995 --> 0:20:06.740
723
+ In some countries like smaller ones, for smaller
724
+ countries you have quite high numbers.
725
+
726
+ 0:20:07.087 --> 0:20:13.979
727
+ However, there are many countries where you
728
+ have like twenty to thirty percent of the population,
729
+
730
+ 0:20:13.979 --> 0:20:16.370
731
+ only being able to speak English.
732
+
733
+ 0:20:16.370 --> 0:20:22.559
734
+ So if we would only do everything only in
735
+ English, we would exclude half the population
736
+
737
+ 0:20:22.559 --> 0:20:23.333
738
+ of Europe.
739
+
740
+ 0:20:23.563 --> 0:20:30.475
741
+ And therefore providing translations is very
742
+ important and therefore, for example, the European
743
+
744
+ 0:20:30.475 --> 0:20:35.587
745
+ Parliament puts a really large amount of money
746
+ into doing translation.
747
+
748
+ 0:20:35.695 --> 0:20:40.621
749
+ So that's why you can speak in your mother
750
+ too in the European Parliament.
751
+
752
+ 0:20:40.621 --> 0:20:46.204
753
+ Everybody like everyone elected there can
754
+ speak in there and they were translated to
755
+
756
+ 0:20:46.204 --> 0:20:52.247
757
+ all the other languages and it's a huge effort
758
+ and so the question is can we do better with
759
+
760
+ 0:20:52.247 --> 0:20:52.838
761
+ machine.
762
+
763
+ 0:20:53.493 --> 0:20:58.362
764
+ And for other countries things are even more.
765
+
766
+ 0:20:58.362 --> 0:21:05.771
767
+ They may be not worse, difficult, but they
768
+ are even more challenging.
769
+
770
+ 0:21:06.946 --> 0:21:13.764
771
+ So there's even more diversity of languages
772
+ and it might be even more important to do machines.
773
+
774
+ 0:21:16.576 --> 0:21:31.034
775
+ If you see how many people speak French, Portuguese
776
+ or English, it's relatively few compared to
777
+
778
+ 0:21:31.034 --> 0:21:33.443
779
+ the population.
780
+
781
+ 0:21:33.813 --> 0:21:46.882
782
+ So think that this should be around millions
783
+ would understand you, but all the others wouldn't.
784
+
785
+ 0:21:49.289 --> 0:21:54.877
786
+ So it seems to be very important to provide
787
+ some taebo translation.
788
+
789
+ 0:21:54.877 --> 0:21:58.740
790
+ It's a quite big industry as a European Union.
791
+
792
+ 0:21:58.740 --> 0:22:05.643
793
+ This is already also quite long ago, but it
794
+ won't get less spent like in that year.
795
+
796
+ 0:22:05.643 --> 0:22:08.931
797
+ One point three billion on translation.
798
+
799
+ 0:22:09.289 --> 0:22:21.315
800
+ So it might be very helpful to have tools
801
+ in order to provide them, and as said, not
802
+
803
+ 0:22:21.315 --> 0:22:26.267
804
+ all directions might be important.
805
+
806
+ 0:22:26.426 --> 0:22:35.059
807
+ Is even not possible for students, so in the
808
+ European Parliament they don't have all combinations
809
+
810
+ 0:22:35.059 --> 0:22:36.644
811
+ of the different.
812
+
813
+ 0:22:36.977 --> 0:22:42.210
814
+ And language is so if they want to translate
815
+ from Maltese to Estonian or so.
816
+
817
+ 0:22:42.402 --> 0:22:47.361
818
+ And maybe they have a translator for that,
819
+ but there are some directions which don't have
820
+
821
+ 0:22:47.361 --> 0:22:47.692
822
+ that.
823
+
824
+ 0:22:47.692 --> 0:22:52.706
825
+ Then they handle directly, but they would
826
+ translate first to French, German or or English,
827
+
828
+ 0:22:52.706 --> 0:22:57.721
829
+ and then there would be a second translator
830
+ getting the translation and really translating
831
+
832
+ 0:22:57.721 --> 0:22:59.154
833
+ to your Italian language.
834
+
835
+ 0:22:59.299 --> 0:23:06.351
836
+ And it's not always English, so they are really
837
+ selecting what is most helpful.
838
+
839
+ 0:23:06.351 --> 0:23:13.931
840
+ But you see that even in this small setup,
841
+ with this large amount of effort in there,
842
+
843
+ 0:23:13.931 --> 0:23:17.545
844
+ there's not enough ability to translate.
845
+
846
+ 0:23:19.819 --> 0:23:21.443
847
+ And of course this was text.
848
+
849
+ 0:23:21.443 --> 0:23:26.538
850
+ Then you have a lot of other things where
851
+ you want to, for example, do speech translation.
852
+
853
+ 0:23:26.538 --> 0:23:31.744
854
+ There is a lot of conferences which currently
855
+ are all held in English, which of course might
856
+
857
+ 0:23:31.744 --> 0:23:35.831
858
+ also not be the best solution if you've gone
859
+ to some of the conferences.
860
+
861
+ 0:23:36.176 --> 0:23:45.964
862
+ You might have heard some accented speech
863
+ where people speak a language that is very
864
+
865
+ 0:23:45.964 --> 0:23:49.304
866
+ different from their mother.
867
+
868
+ 0:23:49.749 --> 0:23:52.059
869
+ Might be difficult to understand.
870
+
871
+ 0:23:52.212 --> 0:23:59.123
872
+ We're currently having an effort for example
873
+ by ACL, which is the conference organized in
874
+
875
+ 0:23:59.123 --> 0:24:06.112
876
+ this field to provide these translations into
877
+ ten hour languages so that also students who
878
+
879
+ 0:24:06.112 --> 0:24:06.803
880
+ are not.
881
+
882
+ 0:24:06.746 --> 0:24:12.446
883
+ That familiar English is able to read the
884
+ papers and watch the present case.
885
+
886
+ 0:24:16.416 --> 0:24:25.243
887
+ So the question is what can you do here and
888
+ one interesting solution which we'll cover
889
+
890
+ 0:24:25.243 --> 0:24:26.968
891
+ in this lecture?
892
+
893
+ 0:24:27.087 --> 0:24:38.112
894
+ This always comes with a question: is it will
895
+ it replace the human?
896
+
897
+ 0:24:38.112 --> 0:24:40.382
898
+ And yes, the.
899
+
900
+ 0:24:40.300 --> 0:24:49.300
901
+ Idea, but the question doesn't really happen
902
+ and I'm any skeptical about that.
903
+
904
+ 0:24:49.300 --> 0:24:52.946
905
+ So currently we are not seeing.
906
+
907
+ 0:24:53.713 --> 0:24:55.807
908
+ So much more effort needed.
909
+
910
+ 0:24:55.807 --> 0:25:00.294
911
+ Of course, machine translation is now used
912
+ as some type of.
913
+
914
+ 0:25:01.901 --> 0:25:11.785
915
+ If you think about in the European Parliament,
916
+ they will have some humans doing their translation
917
+
918
+ 0:25:11.785 --> 0:25:18.060
919
+ because: If you think about the chancel of
920
+ Germany trembling somewhere and quite sure
921
+
922
+ 0:25:18.060 --> 0:25:18.784
923
+ you want,.
924
+
925
+ 0:25:19.179 --> 0:25:31.805
926
+ And so it's more like we are augmenting the
927
+ possibilities to have more possibilities to
928
+
929
+ 0:25:31.805 --> 0:25:37.400
930
+ provide translation and travel around.
931
+
932
+ 0:25:39.499 --> 0:25:53.650
933
+ How can this technology help so machine translation
934
+ is one way of dealing with?
935
+
936
+ 0:25:54.474 --> 0:26:01.144
937
+ Of course, there is other tasks which do even
938
+ without machine translation.
939
+
940
+ 0:26:01.144 --> 0:26:04.613
941
+ Just think about summarize my lecture.
942
+
943
+ 0:26:04.965 --> 0:26:08.019
944
+ Approaches doing that what they call end to
945
+ end.
946
+
947
+ 0:26:08.019 --> 0:26:11.635
948
+ So you just put an English text and get a
949
+ German summary.
950
+
951
+ 0:26:11.635 --> 0:26:17.058
952
+ However, a good baseline and an important
953
+ thing is to either first lecture into German
954
+
955
+ 0:26:17.058 --> 0:26:22.544
956
+ and then do a summary art, first do a summary
957
+ in English and then translation language.
958
+
959
+ 0:26:23.223 --> 0:26:28.764
960
+ Translation is very important in order to
961
+ different application scenarios.
962
+
963
+ 0:26:28.764 --> 0:26:33.861
964
+ We have that dissemination dialogue but also
965
+ information extraction.
966
+
967
+ 0:26:33.861 --> 0:26:39.993
968
+ So if you want to do like get information
969
+ not only from English websites but from.
970
+
971
+ 0:26:40.300 --> 0:26:42.427
972
+ Very different websites.
973
+
974
+ 0:26:42.427 --> 0:26:46.171
975
+ It's helpful to have this type of solution.
976
+
977
+ 0:26:50.550 --> 0:26:52.772
978
+ Yeah, what can you translate?
979
+
980
+ 0:26:52.772 --> 0:26:59.660
981
+ Of course, we will focus on text, as I said
982
+ for most of them, because it's about translation
983
+
984
+ 0:26:59.660 --> 0:27:06.178
985
+ and anything first translates to text, and
986
+ then change to text, and then we can do text
987
+
988
+ 0:27:06.178 --> 0:27:07.141
989
+ translation.
990
+
991
+ 0:27:09.189 --> 0:27:19.599
992
+ And text is not equals text, so we can do
993
+ translation that is some of the most common.
994
+
995
+ 0:27:19.499 --> 0:27:27.559
996
+ Is working on translation, so just imagine
997
+ you are developing your new.
998
+
999
+ 0:27:27.947 --> 0:27:34.628
1000
+ Nowadays you don't want to have to only be
1001
+ available in English or German books in as
1002
+
1003
+ 0:27:34.628 --> 0:27:40.998
1004
+ many languages as possible, and if you use
1005
+ the standard tools it's not that easy.
1006
+
1007
+ 0:27:41.141 --> 0:27:50.666
1008
+ We have a different type of domain and there
1009
+ again we have very few contexts.
1010
+
1011
+ 0:27:50.666 --> 0:27:56.823
1012
+ Normally we translate: To pick up an app you
1013
+ have the menu and there's like safe.
1014
+
1015
+ 0:27:57.577 --> 0:28:02.535
1016
+ And then you only have safe.
1017
+
1018
+ 0:28:02.535 --> 0:28:14.845
1019
+ How should translate safe should it be written
1020
+ or should it be spicing?
1021
+
1022
+ 0:28:16.856 --> 0:28:24.407
1023
+ Then, of course, if you have like files, it
1024
+ might be that you have meta data to transport.
1025
+
1026
+ 0:28:26.466 --> 0:28:27.137
1027
+ Novels.
1028
+
1029
+ 0:28:27.137 --> 0:28:32.501
1030
+ Some work on that, but yeah, that's always
1031
+ a typical criticism.
1032
+
1033
+ 0:28:32.501 --> 0:28:36.440
1034
+ You'll never be able to translate Shakespeare.
1035
+
1036
+ 0:28:36.656 --> 0:28:43.684
1037
+ Think this is somehow the last use case of
1038
+ machine translation.
1039
+
1040
+ 0:28:43.684 --> 0:28:47.637
1041
+ For a translation of books there's.
1042
+
1043
+ 0:28:47.847 --> 0:28:57.047
1044
+ But the nice thing about machine translation
1045
+ is that it can translate to things which are
1046
+
1047
+ 0:28:57.047 --> 0:29:05.327
1048
+ boring, so think about translating some bureaucrative
1049
+ forms or some regulations.
1050
+
1051
+ 0:29:05.565 --> 0:29:11.302
1052
+ This is normally not very interesting, it's
1053
+ very repetitive, so their automation works
1054
+
1055
+ 0:29:11.302 --> 0:29:11.697
1056
+ well.
1057
+
1058
+ 0:29:11.931 --> 0:29:17.519
1059
+ Of course, there is also translations on Paibos
1060
+ images.
1061
+
1062
+ 0:29:17.519 --> 0:29:24.604
1063
+ I guess you point your camera to an object
1064
+ where it translates things.
1065
+
1066
+ 0:29:25.005 --> 0:29:43.178
1067
+ And we'll cover that at the end, as said,
1068
+ the speech translation.
1069
+
1070
+ 0:29:43.663 --> 0:29:46.795
1071
+ So you can't provide the translation of the
1072
+ lecture.
1073
+
1074
+ 0:29:46.795 --> 0:29:50.518
1075
+ If I'm five slides further then you would
1076
+ see the translation.
1077
+
1078
+ 0:29:50.518 --> 0:29:52.291
1079
+ It might not be very helpful.
1080
+
1081
+ 0:29:54.794 --> 0:29:57.062
1082
+ We are not speaking as we are written.
1083
+
1084
+ 0:29:57.062 --> 0:29:59.097
1085
+ It's again like a domain mismatch.
1086
+
1087
+ 0:29:59.359 --> 0:30:10.161
1088
+ So typically the sentences are not full sentences
1089
+ and I'm saying this is not the right way to
1090
+
1091
+ 0:30:10.161 --> 0:30:19.354
1092
+ praise it and if you just read what was written
1093
+ it might be hard to understand.
1094
+
1095
+ 0:30:23.803 --> 0:30:36.590
1096
+ We are focusing on the first application scenario
1097
+ that is fully automatic machine translation.
1098
+
1099
+ 0:30:37.177 --> 0:30:46.373
1100
+ Of course, there are quite interesting application
1101
+ scenarios for other things where it should
1102
+
1103
+ 0:30:46.373 --> 0:30:47.645
1104
+ be referred.
1105
+
1106
+ 0:30:47.867 --> 0:30:49.695
1107
+ Where it's no longer going to be.
1108
+
1109
+ 0:30:49.695 --> 0:30:52.436
1110
+ We have this tool and it works, but it's a
1111
+ market.
1112
+
1113
+ 0:30:52.436 --> 0:30:57.381
1114
+ We have the machine translation system and
1115
+ the human translator, and they somehow cooperate
1116
+
1117
+ 0:30:57.381 --> 0:30:59.853
1118
+ and try to be as fast as possible in doing
1119
+ a.
1120
+
1121
+ 0:31:00.380 --> 0:31:12.844
1122
+ The easiest idea there would be the first
1123
+ point you take the machine translation.
1124
+
1125
+ 0:31:13.553 --> 0:31:17.297
1126
+ That sometimes might not be the best
1127
+ way of using it.
1128
+
1129
+ 0:31:17.357 --> 0:31:25.308
1130
+ Any ideas or what else you could do, then
1131
+ maybe the machine could aid the human and say
1132
+
1133
+ 0:31:25.308 --> 0:31:27.838
1134
+ I'm not sure about this part.
1135
+
1136
+ 0:31:28.368 --> 0:31:32.319
1137
+ Yeah, very interesting, very good.
1138
+
1139
+ 0:31:32.319 --> 0:31:42.252
1140
+ Of course, the dangerous thing there is you
1141
+ asking something from a machine translation
1142
+
1143
+ 0:31:42.252 --> 0:31:45.638
1144
+ system where it's really bad.
1145
+
1146
+ 0:31:45.845 --> 0:31:50.947
1147
+ There is quality estimation that maybe it
1148
+ will couple that in evaluation so in evaluation
1149
+
1150
+ 0:31:50.947 --> 0:31:55.992
1151
+ you know what is correct translation and you
1152
+ have another output and you try to estimate
1153
+
1154
+ 0:31:55.992 --> 0:31:57.409
1155
+ how good is the quality.
1156
+
1157
+ 0:31:57.409 --> 0:32:02.511
1158
+ In quality estimation you don't have that; you only
1159
+ have the source and the output, and the question is
1160
+
1161
+ 0:32:02.511 --> 0:32:03.531
1162
+ exactly this one.
1163
+
1164
+ 0:32:03.531 --> 0:32:05.401
1165
+ Is it a good translation or not?
1166
+
1167
+ 0:32:05.665 --> 0:32:12.806
1168
+ This might be easier because the system might
1169
+ not know what translation is.
1170
+
1171
+ 0:32:13.053 --> 0:32:23.445
1172
+ Humans are very good at that; for machines it
1173
+ is difficult, but of course that's an interesting
1174
+
1175
+ 0:32:23.445 --> 0:32:24.853
1176
+ application.
1177
+
1178
+ 0:32:25.065 --> 0:32:32.483
1179
+ Be more interactive so that you may be translating
1180
+ if the human changes the fifth word.
1181
+
1182
+ 0:32:32.483 --> 0:32:36.361
1183
+ What does it mean for the remaining sentence?
1184
+
1185
+ 0:32:36.361 --> 0:32:38.131
1186
+ Do I need to change?
1187
+
1188
+ 0:32:38.131 --> 0:32:43.948
1189
+ There are also things like you don't have
1190
+ to repeat the same errors.
1191
+
1192
+ 0:32:47.767 --> 0:32:57.651
1193
+ Ideally, you only want
1194
+ to correct an error once and not at all positions.
1195
+
1196
+ 0:33:00.000 --> 0:33:21.784
1197
+ And then they ask, for example, so before
1198
+ the translation is done they ask: I'm not directly
1199
+
1200
+ 0:33:21.784 --> 0:33:23.324
1201
+ aware of that.
1202
+
1203
+ 0:33:23.324 --> 0:33:33.280
1204
+ I think it's a good way of ending and I think
1205
+ it's where, especially with more advanced dialogue
1206
+
1207
+ 0:33:33.280 --> 0:33:34.717
1208
+ strategy and.
1209
+
1210
+ 0:33:35.275 --> 0:33:38.831
1211
+ Currently I think most of the focus is on
1212
+ at least determining.
1213
+
1214
+ 0:33:39.299 --> 0:33:45.646
1215
+ Don't have this information that is already
1216
+ challenging, so there is quite some work on
1217
+
1218
+ 0:33:45.646 --> 0:33:49.541
1219
+ quality estimation, on detecting that information is missing.
1220
+
1221
+ 0:33:49.789 --> 0:33:53.126
1222
+ But is there something missing?
1223
+
1224
+ 0:33:53.126 --> 0:33:59.904
1225
+ It's really quite challenging and think that
1226
+ is where currently.
1227
+
1228
+ 0:34:00.260 --> 0:34:05.790
1229
+ What is there is there is opportunities to
1230
+ provide or there is models to directly provide
1231
+
1232
+ 0:34:05.790 --> 0:34:06.527
1233
+ additional?
1234
+
1235
+ 0:34:06.786 --> 0:34:13.701
1236
+ You can give them anything you have and provide
1237
+ them.
1238
+
1239
+ 0:34:13.701 --> 0:34:21.129
1240
+ It's a similar situation if you're translating
1241
+ to German.
1242
+
1243
+ 0:34:21.641 --> 0:34:31.401
1244
+ And it would just guess normally or do some
1245
+ random guessing always means it's using some
1246
+
1247
+ 0:34:31.401 --> 0:34:36.445
1248
+ information which should not be really there.
1249
+
1250
+ 0:34:36.776 --> 0:34:46.449
1251
+ So then you can provide it with an additional
1252
+ input, whether it should use formal or informal address.
1253
+
1254
+ 0:34:47.747 --> 0:35:04.687
1255
+ To know that this information is missing.
1256
+
1257
+ 0:35:04.544 --> 0:35:19.504
1258
+ Since you're not specifically modeling this,
1259
+ it's likely that there is a gender difference
1260
+
1261
+ 0:35:19.504 --> 0:35:21.805
1262
+ in languages.
1263
+
1264
+ 0:35:26.046 --> 0:35:39.966
1265
+ Why are we doing research on machine translation?
1266
+ It's a very important task in natural
1267
+
1268
+ 0:35:39.966 --> 0:35:42.860
1269
+ language processing.
1270
+
1271
+ 0:35:43.283 --> 0:35:49.234
1272
+ So of course you have a lot of computer science
1273
+ thing in there and that's the backbone of.
1274
+
1275
+ 0:35:49.569 --> 0:36:01.848
1276
+ However, for the task and for understanding, you can
1277
+ also get information from computational linguistics,
1278
+
1279
+ 0:36:01.848 --> 0:36:08.613
1280
+ which tell you about what language it's good
1281
+ to know.
1282
+
1283
+ 0:36:08.989 --> 0:36:15.425
1284
+ Doesn't mean that in a computer we have to
1285
+ model it exactly the same, but for example
1286
+
1287
+ 0:36:15.425 --> 0:36:22.453
1288
+ to know that there is something like morphology,
1289
+ which means how words are built, and that for
1290
+
1291
+ 0:36:22.453 --> 0:36:24.746
1292
+ some languages it's very easy.
1293
+
1294
+ 0:36:24.746 --> 0:36:28.001
1295
+ In English there is nearly no inflection.
1296
+
1297
+ 0:36:28.688 --> 0:36:35.557
1298
+ Well in Germany you already start for soon
1299
+ you have like different forms and so on.
1300
+
1301
+ 0:36:36.316 --> 0:36:41.991
1302
+ And for other languages, for Finnish, it's
1303
+ even more complicated with Basque.
1304
+
1305
+ 0:36:41.991 --> 0:36:44.498
1306
+ I think for some words more than.
1307
+
1308
+ 0:36:45.045 --> 0:36:52.098
1309
+ So knowing this, of course, gives you some
1310
+ advice.
1311
+
1312
+ 0:36:52.098 --> 0:37:04.682
1313
+ How do I look at that now because we'll see
1314
+ in the basic treat each word as an individual?
1315
+
1316
+ 0:37:06.106 --> 0:37:09.259
1317
+ Of course there is a lot of interest also
1318
+ prone from industry.
1319
+
1320
+ 0:37:09.259 --> 0:37:10.860
1321
+ There is a lot of applications.
1322
+
1323
+ 0:37:11.191 --> 0:37:17.068
1324
+ There's research groups at Google, Facebook,
1325
+ and Amazon.
1326
+
1327
+ 0:37:17.068 --> 0:37:26.349
1328
+ So there's quite a lot of interest in providing
1329
+ that for German and English it is solved.
1330
+
1331
+ 0:37:26.546 --> 0:37:27.569
1332
+ Annoucing it's hard.
1333
+
1334
+ 0:37:27.569 --> 0:37:31.660
1335
+ We're saying that not hard, but of course
1336
+ we haven't acquired high quality in them.
1337
+
1338
+ 0:37:32.212 --> 0:37:39.296
1339
+ But there's currently really a large trend
1340
+ in building systems for low-resource
1341
+
1342
+ 0:37:39.296 --> 0:37:40.202
1343
+ languages.
1344
+
1345
+ 0:37:40.480 --> 0:37:53.302
1346
+ So there are tasks on last year's task on
1347
+ translating from Native American languages:
1348
+
1349
+ 0:37:53.193 --> 0:37:58.503
1350
+ Don't know yet but but five other languages,
1351
+ so how can you translate from them?
1352
+
1353
+ 0:37:58.538 --> 0:38:05.074
1354
+ Then you don't have like millions of sentences,
1355
+ but you might have only the Bible or some more
1356
+
1357
+ 0:38:05.074 --> 0:38:05.486
1358
+ data.
1359
+
1360
+ 0:38:05.486 --> 0:38:08.169
1361
+ Then the question is, what can you do?
1362
+
1363
+ 0:38:08.169 --> 0:38:09.958
1364
+ And how good can you get?
1365
+
1366
+ 0:38:14.794 --> 0:38:17.296
1367
+ One thing is very important.
1368
+
1369
+ 0:38:17.296 --> 0:38:25.751
1370
+ Of course, in a lot of A I is to measure the
1371
+ quality and what you can measure is quite important.
1372
+
1373
+ 0:38:25.986 --> 0:38:37.213
1374
+ So that's why for many years of regular there
1375
+ is different evaluation campaigns where people
1376
+
1377
+ 0:38:37.213 --> 0:38:38.178
1378
+ submit.
1379
+
1380
+ 0:38:39.419 --> 0:38:45.426
1381
+ We're often part of the statistical machine
1382
+ translation original, yet now I think it's
1383
+
1384
+ 0:38:45.426 --> 0:38:51.019
1385
+ a machine translation where it's mostly about
1386
+ European languages and used texts.
1387
+
1388
+ 0:38:51.051 --> 0:38:57.910
1389
+ The International Workshop of Spoken Language
1390
+ Translation, which is translation about lectures
1391
+
1392
+ 0:38:57.910 --> 0:39:04.263
1393
+ which we are co organizing, and there is a
1394
+ bovia as I said building strong systems this
1395
+
1396
+ 0:39:04.263 --> 0:39:04.696
1397
+ time.
1398
+
1399
+ 0:39:04.664 --> 0:39:11.295
1400
+ This has established translating conference
1401
+ presentations from English into ten different
1402
+
1403
+ 0:39:11.295 --> 0:39:17.080
1404
+ languages: And then, of course, you have to
1405
+ deal with things like special vocabulary.
1406
+
1407
+ 0:39:17.037 --> 0:39:23.984
1408
+ Think about terms like recurrent neural networks
1409
+ or convolutional
1410
+
1411
+ 0:39:23.984 --> 0:39:24.740
1412
+ networks.
1413
+
1414
+ 0:39:25.545 --> 0:39:29.917
1415
+ That might be more difficult to translate
1416
+ and you also have to decide who I need to translate
1417
+
1418
+ 0:39:29.917 --> 0:39:33.359
1419
+ or should I keep it in English, and that's
1420
+ not the same in each language.
1421
+
1422
+ 0:39:33.873 --> 0:39:37.045
1423
+ In German maybe mostly you keep it.
1424
+
1425
+ 0:39:37.045 --> 0:39:44.622
1426
+ I think in French people are typically like
1427
+ wanting to translate as much as possible.
1428
+
1429
+ 0:39:44.622 --> 0:39:52.200
1430
+ These are then challenges and then, of course,
1431
+ in Poland where it's also challenging.
1432
+
1433
+ 0:39:53.153 --> 0:39:59.369
1434
+ I think all of the speakers in the test set
1435
+ are not native English speakers, so you need
1436
+
1437
+ 0:39:59.369 --> 0:40:05.655
1438
+ to translate people with a German accent or
1439
+ with a French accent or with a Japanese accent
1440
+
1441
+ 0:40:05.655 --> 0:40:09.178
1442
+ or some other accent, which poses additional challenges.
1443
+
1444
+ 0:40:12.272 --> 0:40:21.279
1445
+ Yes, so there is criticism always with new
1446
+ technologies because people say will never
1447
+
1448
+ 0:40:21.279 --> 0:40:23.688
1449
+ translate Shakespeare.
1450
+
1451
+ 0:40:24.204 --> 0:40:26.845
1452
+ Partly agree with the second.
1453
+
1454
+ 0:40:26.845 --> 0:40:34.682
1455
+ Maybe it's not good at translating Shakespeare,
1456
+ but there's many people working on that.
1457
+
1458
+ 0:40:35.255 --> 0:40:38.039
1459
+ Of course, the poison cookie is a challenge.
1460
+
1461
+ 0:40:38.858 --> 0:40:44.946
1462
+ The point of that criticism is that
1463
+ you can never be sure whether the machine translation
1464
+
1465
+ 0:40:44.946 --> 0:40:47.546
1466
+ system doesn't make a mistake somewhere.
1467
+
1468
+ 0:40:47.546 --> 0:40:53.316
1469
+ So if you can't be sure that there's no error
1470
+ in there, how can you trust the translation?
1471
+
1472
+ 0:40:55.275 --> 0:41:01.892
1473
+ That is partly true; on the other hand, otherwise
1474
+ you have to trust a human translator,
1475
+
1476
+ 0:41:01.892 --> 0:41:06.116
1477
+ and we are sometimes overestimating human
1478
+ performance.
1479
+
1480
+ 0:41:06.746 --> 0:41:15.111
1481
+ There are very good translators, but under a
1482
+ lot of pressure even human translations are not perfect.
1483
+
1484
+ 0:41:15.715 --> 0:41:22.855
1485
+ The question is: When can you trust it enough
1486
+ anyway?
1487
+
1488
+ 0:41:22.855 --> 0:41:28.540
1489
+ You should be careful about trusting them.
1490
+
1491
+ 0:41:31.011 --> 0:41:38.023
1492
+ And I think some of them are too old now because
1493
+ it has been shown that it is helpful to have
1494
+
1495
+ 0:41:38.023 --> 0:41:41.082
1496
+ some type of machine translation system.
1497
+
1498
+ 0:41:41.082 --> 0:41:47.722
1499
+ Of course, it is not buying the car, so typically
1500
+ still a system is not working forever.
1501
+
1502
+ 0:41:48.048 --> 0:41:56.147
1503
+ If you have a dedicated system, which is
1504
+ good for the task it was built for, it is typically
1505
+
1506
+ 0:41:56.147 --> 0:41:57.947
1507
+ not as generalized.
1508
+
1509
+ 0:41:58.278 --> 0:42:07.414
1510
+ That can translate news and chats, and I don't
1511
+ know what.
1512
+
1513
+ 0:42:07.414 --> 0:42:12.770
1514
+ So typically if you want to show.
1515
+
1516
+ 0:42:12.772 --> 0:42:18.796
1517
+ It's not made for, it has not seen very well
1518
+ and then you see a bad quality.
1519
+
1520
+ 0:42:19.179 --> 0:42:27.139
1521
+ But that's also like yeah, therefore you don't
1522
+ build it.
1523
+
1524
+ 0:42:27.139 --> 0:42:42.187
1525
+ If you have a sports car and you are driving
1526
+ off road you should: Yeah, you can also say
1527
+
1528
+ 0:42:42.187 --> 0:42:49.180
1529
+ the other way around, that machine translation
1530
+ is already solved, and more and more
1531
+
1532
+ 0:42:49.180 --> 0:42:50.487
1533
+ people think so.
1534
+
1535
+ 0:42:50.750 --> 0:43:04.275
1536
+ However, there is an impressive performance
1537
+ of machine translation, but it's not stated
1538
+
1539
+ 0:43:04.275 --> 0:43:06.119
1540
+ of the art.
1541
+
1542
+ 0:43:06.586 --> 0:43:11.811
1543
+ And yeah, they're good for some domains and
1544
+ some languages that are even like already.
1545
+
1546
+ 0:43:12.572 --> 0:43:27.359
1547
+ Microsoft, for example, claimed super-human performance
1548
+ for their machine translation system.
1549
+
1550
+ 0:43:27.467 --> 0:43:38.319
1551
+ However, that was one domain, news, and one
1552
+ language pair, with Spanish, where there is a huge amount
1553
+
1554
+ 0:43:38.319 --> 0:43:45.042
1555
+ of training data and you can build a very strong
1556
+ system.
1557
+
1558
+ 0:43:45.505 --> 0:43:48.605
1559
+ And you even don't have to go to these extreme
1560
+ cases.
1561
+
1562
+ 0:43:48.688 --> 0:43:54.328
1563
+ We have worked on Kannada, which is a language
1564
+ in India spoken.
1565
+
1566
+ 0:43:54.328 --> 0:44:01.669
1567
+ I think by also around eighty million people
1568
+ so similar to to German that it has.
1569
+
1570
+ 0:44:01.669 --> 0:44:07.757
1571
+ The quality is significantly worse, it has
1572
+ significantly less data.
1573
+
1574
+ 0:44:08.108 --> 0:44:15.132
1575
+ There are still quite a lot of languages where
1576
+ the quality is not where you want it to be.
1577
+
1578
+ 0:44:15.295 --> 0:44:17.971
1579
+ Scaling this up is not as easy as it seems.
1580
+
1581
+ 0:44:17.971 --> 0:44:23.759
1582
+ That's why we're also interested in multilingual
1583
+ systems with the hope that we don't have to
1584
+
1585
+ 0:44:23.759 --> 0:44:29.548
1586
+ build a system for each possible combination,
1587
+ but we can build a system which can cover many
1588
+
1589
+ 0:44:29.548 --> 0:44:33.655
1590
+ tasks, many languages, and then also need less
1591
+ data for each of them.
1592
+
1593
+ 0:44:39.639 --> 0:44:51.067
1594
+ Let me finish with a brief presentation of the history;
1595
+ it is a bit compressed, covering just the most important points.
1596
+
1597
+ 0:44:51.331 --> 0:45:09.053
1598
+ So machine translation started coming from
1599
+ information theory; there was this idea of
1600
+
1601
+ 0:45:09.053 --> 0:45:13.286
1602
+ treating machine translation as encryption
1603
+ or decryption.
1604
+
1605
+ 0:45:13.533 --> 0:45:21.088
1606
+ If I don't understand it and want to have it in English,
1607
+ I treat it as if it's encrypted English,
1608
+
1609
+ 0:45:21.088 --> 0:45:28.724
1610
+ and then apply my decryption algorithm, which
1611
+ they were working a lot during the Second World
1612
+
1613
+ 0:45:28.724 --> 0:45:29.130
1614
+ War.
1615
+
1616
+ 0:45:29.209 --> 0:45:34.194
1617
+ And so if I can do this decryption, then
1618
+ I get the translation.
1619
+
1620
+ 0:45:34.934 --> 0:45:42.430
1621
+ And they based on that they had rules and
1622
+ so on.
1623
+
1624
+ 0:45:42.430 --> 0:45:50.843
1625
+ So there were the Georgetown experiments,
1626
+ where they translated some sentences
1627
+
1628
+ 0:45:51.691 --> 0:45:57.419
1629
+ from Russian to English, and then they were like: wow.
1630
+
1631
+ 0:45:57.419 --> 0:46:01.511
1632
+ This is solved in some years.
1633
+
1634
+ 0:46:01.511 --> 0:46:04.921
1635
+ Now we can do sentences.
1636
+
1637
+ 0:46:06.546 --> 0:46:18.657
1638
+ As you can imagine this didn't really work
1639
+ out that way, so it's not really happening.
1640
+
1641
+ 0:46:18.657 --> 0:46:24.503
1642
+ The spirit is willing, but flesh is weak.
1643
+
1644
+ 0:46:24.444 --> 0:46:30.779
1645
+ Translated it to Russian and then back again,
1646
+ and then vodka is good but the meat is rotten.
1647
+
1648
+ 0:46:31.271 --> 0:46:39.694
1649
+ Think it never really happened this way, but
1650
+ you can see you can imagine that something
1651
+
1652
+ 0:46:39.694 --> 0:46:49.533
1653
+ like that could happen, and then in in the
1654
+ there was this report saying: It's more challenging
1655
+
1656
+ 0:46:49.533 --> 0:46:56.877
1657
+ than expected and the problem is that we have
1658
+ to invest more.
1659
+
1660
+ 0:46:56.877 --> 0:47:02.801
1661
+ There's no benefit for doing machine translation.
1662
+
1663
+ 0:47:04.044 --> 0:47:09.255
1664
+ At least in some other countries there was
1665
+ a bit, but then for some time there wasn't
1666
+
1667
+ 0:47:09.255 --> 0:47:10.831
1668
+ that big out of progress.
1669
+
1670
+ 0:47:12.152 --> 0:47:26.554
1671
+ Then in the '70s there were rule-
1672
+ based systems that were built on linguistic
1673
+
1674
+ 0:47:26.554 --> 0:47:28.336
1675
+ background.
1676
+
1677
+ 0:47:28.728 --> 0:47:34.013
1678
+ They are now doing very good machine translation,
1679
+ but they had a really huge rule base.
1680
+
1681
+ 0:47:34.314 --> 0:47:43.538
1682
+ So they really had handwritten rules:
1683
+ how to parse sentences, how to transfer parse
1684
+
1685
+ 0:47:43.538 --> 0:47:45.587
1686
+ trees into the target language.
1687
+
1688
+ 0:47:46.306 --> 0:47:55.868
1689
+ When which word should be translated, these
1690
+ rule based systems were quite strong for a
1691
+
1692
+ 0:47:55.868 --> 0:47:57.627
1693
+ very long time.
1694
+
1695
+ 0:47:57.917 --> 0:48:03.947
1696
+ So even until quite recently, for some language pairs and
1697
+ some domains, it was better than machine
1698
+
1699
+ 0:48:03.947 --> 0:48:04.633
1700
+ learning.
1701
+
1702
+ 0:48:05.505 --> 0:48:09.576
1703
+ Well, of course, there was a lot of effort
1704
+ in and a lot of experts were building this.
1705
+
1706
+ 0:48:11.791 --> 0:48:13.170
1707
+ And then.
1708
+
1709
+ 0:48:13.053 --> 0:48:18.782
1710
+ The first statistical machine translations
1711
+ were coming in the early nineties.
1712
+
1713
+ 0:48:18.782 --> 0:48:25.761
1714
+ There were the systems by IBM, which we will refer to
1715
+ as the IBM models, which are quite famous,
1716
+
1717
+ 0:48:25.761 --> 0:48:32.886
1718
+ and they were used in machine translation
1719
+ from the nineties to around two thousand
1720
+
1721
+ 0:48:32.912 --> 0:48:35.891
1722
+ fifteen or so; people were working with the IBM
1723
+ models.
1724
+
1725
+ 0:48:36.496 --> 0:48:44.608
1726
+ And that was the first way of doing a machine
1727
+ translation with statisticals or machine learning.
1728
+
1729
+ 0:48:44.924 --> 0:48:52.143
1730
+ And it was possible through the French-English
1731
+ Hansard corpus from the Canadian Parliament:
1732
+
1733
+ 0:48:52.143 --> 0:48:59.516
1734
+ they also had proceedings in French and English
1735
+ and people tried to use that to translate and.
1736
+
1737
+ 0:49:01.681 --> 0:49:06.919
1738
+ And yes, so that was than the start of statistical
1739
+ machine translation.
1740
+
1741
+ 0:49:07.227 --> 0:49:17.797
1742
+ Then what is called phrase-based machine translation
1743
+ was introduced, where you could add more information
1744
+
1745
+ 0:49:17.797 --> 0:49:26.055
1746
+ and use longer chunks to translate, and phrase-
1747
+ based translation was somehow the standard
1748
+
1749
+ 0:49:26.326 --> 0:49:27.603
1750
+ until around twenty fourteen.
1751
+
1752
+ 0:49:27.767 --> 0:49:37.721
1753
+ With this phrase-based machine translation
1754
+ we saw the first commercial systems.
1755
+
1756
+ 0:49:38.178 --> 0:49:45.301
1757
+ And yeah, that was the first big advantage
1758
+ where really you can see the machine translation.
1759
+
1760
+ 0:49:47.287 --> 0:49:55.511
1761
+ And neural machine translation was mainly
1762
+ introduced.
1763
+
1764
+ 0:49:55.511 --> 0:50:07.239
1765
+ That means there was a shift from traditional
1766
+ statistical modeling to using neural networks.
1767
+
1768
+ 0:50:07.507 --> 0:50:09.496
1769
+ And that was quite impressive.
1770
+
1771
+ 0:50:09.496 --> 0:50:11.999
1772
+ It was really within one or two years.
1773
+
1774
+ 0:50:11.999 --> 0:50:17.453
1775
+ The whole research community shifted from
1776
+ what they had been working on since twenty
1777
+
1778
+ 0:50:17.453 --> 0:50:17.902
1779
+ years.
1780
+
1781
+ 0:50:17.902 --> 0:50:23.485
1782
+ And everybody was using these neural
1783
+ networks, because the performance
1784
+
1785
+ 0:50:23.485 --> 0:50:25.089
1786
+ was really so much better.
1787
+
1788
+ 0:50:25.425 --> 0:50:35.048
1789
+ Especially they are what we also see now with
1790
+ chat boards like the impressive thing.
1791
+
1792
+ 0:50:35.135 --> 0:50:45.261
1793
+ That was very, very challenging if you see
1794
+ machine translation before that, especially
1795
+
1796
+ 0:50:45.261 --> 0:50:47.123
1797
+ if the English.
1798
+
1799
+ 0:50:47.547 --> 0:50:53.352
1800
+ But if you were translating to German you
1801
+ would see that the agreement, for example
1802
+
1803
+ 0:50:53.352 --> 0:50:58.966
1804
+ shown abound and dishewn and boima and this
1805
+ didn't always really work perfect maybe for
1806
+
1807
+ 0:50:58.966 --> 0:51:04.835
1808
+ the short range of work but then it has to
1809
+ be accusative and it's like far away then things
1810
+
1811
+ 0:51:04.835 --> 0:51:06.430
1812
+ didn't really work well.
1813
+
1814
+ 0:51:06.866 --> 0:51:13.323
1815
+ Now with neural machine translation we have a
1816
+ bit of a different problem: So the sentences
1817
+
1818
+ 0:51:13.323 --> 0:51:16.901
1819
+ are typically really nice.
1820
+
1821
+ 0:51:16.901 --> 0:51:24.056
1822
+ They are perfectly written not always but
1823
+ very often.
1824
+
1825
+ 0:51:24.224 --> 0:51:36.587
1826
+ So adequacy, that source and translation should
1827
+ have the same meaning, is typically the bigger problem.
1828
+
1829
+ 0:51:42.002 --> 0:51:46.039
1830
+ So how can we do so last?
1831
+
1832
+ 0:51:46.039 --> 0:51:54.889
1833
+ What are the approaches and how can we do machine
1834
+ translation?
1835
+
1836
+ 0:51:55.235 --> 0:52:01.297
1837
+ So we had first rule-based systems, and in
1838
+ these systems we manually created
1839
+
1840
+ 0:52:01.297 --> 0:52:01.769
1841
+ rules.
1842
+
1843
+ 0:52:01.861 --> 0:52:07.421
1844
+ And there were rules for how to disambiguate
1845
+ ambiguities.
1846
+
1847
+ 0:52:07.421 --> 0:52:16.417
1848
+ For example, we had the word banks look at
1849
+ the context and do rules like to decide when.
1850
+
1851
+ 0:52:17.197 --> 0:52:28.418
1852
+ Or how to translate the structure: you know
1853
+ how to transfer the structure, that the verb
1854
+
1855
+ 0:52:28.418 --> 0:52:33.839
1856
+ has to be split in German and moved to the end.
1857
+
1858
+ 0:52:35.295 --> 0:52:36.675
1859
+ Here's a difficult thing.
1860
+
1861
+ 0:52:36.675 --> 0:52:39.118
1862
+ The nice thing is you don't need any training data.
1863
+
1864
+ 0:52:39.118 --> 0:52:41.295
1865
+ It's not like now with machine learning.
1866
+
1867
+ 0:52:41.295 --> 0:52:46.073
1868
+ If you build a machine translation system,
1869
+ the first question you should ask is do I have
1870
+
1871
+ 0:52:46.073 --> 0:52:46.976
1872
+ data to do that?
1873
+
1874
+ 0:52:46.976 --> 0:52:48.781
1875
+ Do I have parallel data to train?
1876
+
1877
+ 0:52:49.169 --> 0:52:50.885
1878
+ Here there's no data.
1879
+
1880
+ 0:52:50.885 --> 0:52:57.829
1881
+ It is all translated based on rules, but
1882
+ the problem is people creating the rules, and
1883
+
1884
+ 0:52:57.829 --> 0:52:59.857
1885
+ this needs to be experts.
1886
+
1887
+ 0:52:59.799 --> 0:53:06.614
1888
+ Understand at least the grammar in one language,
1889
+ basically the grammar in both languages.
1890
+
1891
+ 0:53:06.614 --> 0:53:09.264
1892
+ It needs to be a real language to.
1893
+
1894
+ 0:53:10.090 --> 0:53:17.308
1895
+ Then we have the two corpus based machine
1896
+ translation approaches, and then we use machine
1897
+
1898
+ 0:53:17.308 --> 0:53:22.682
1899
+ learning to learn how to translate from one
1900
+ language to the other.
1901
+
1902
+ 0:53:22.882 --> 0:53:29.205
1903
+ We should find out ourselves what is the meaning
1904
+ of individual words, which words translate
1905
+
1906
+ 0:53:29.205 --> 0:53:30.236
1907
+ to each other.
1908
+
1909
+ 0:53:30.236 --> 0:53:36.215
1910
+ The only information we give is the German
1911
+ sentence, the English sentence, and then we
1912
+
1913
+ 0:53:36.215 --> 0:53:37.245
1914
+ look for many.
1915
+
1916
+ 0:53:37.697 --> 0:53:42.373
1917
+ So maybe you think there's a Bible for each
1918
+ language.
1919
+
1920
+ 0:53:42.373 --> 0:53:44.971
1921
+ There shouldn't be a problem.
1922
+
1923
+ 0:53:45.605 --> 0:53:52.752
1924
+ But this is not the scale when we're talking
1925
+ about.
1926
+
1927
+ 0:53:52.752 --> 0:54:05.122
1928
+ Small systems have maybe one hundred thousand
1929
+ sentences when we're building large models.
1930
+
1931
+ 0:54:05.745 --> 0:54:19.909
1932
+ The statistical models do statistics about
1933
+ how often words co-occur and how often the
1934
+
1935
+ 0:54:19.909 --> 0:54:21.886
1936
+ words occur together.
1937
+
1938
+ 0:54:22.382 --> 0:54:29.523
1939
+ What we will focus on is what is currently
1940
+ in most cases referred to as neural machine translation.
1941
+
1942
+ 0:54:30.050 --> 0:54:44.792
1943
+ So in this case the idea is that you have
1944
+ a neural model which is a big neural network.
1945
+
1946
+ 0:54:45.345 --> 0:54:55.964
1947
+ And for these machine drums there quite challenging
1948
+ tasks.
1949
+
1950
+ 0:54:55.964 --> 0:55:03.883
1951
+ For example, the Transformer architecture.
1952
+
1953
+ 0:55:03.903 --> 0:55:07.399
1954
+ Cast by Google in two thousand eight.
1955
+
1956
+ 0:55:08.028 --> 0:55:19.287
1957
+ Here want to ask the screw-based machine translation
1958
+ of that part.
1959
+
1960
+ 0:55:22.862 --> 0:55:33.201
1961
+ Would say it's mainly rule based systems because
1962
+ purely rule based systems maybe exist with
1963
+
1964
+ 0:55:33.201 --> 0:55:36.348
1965
+ some very exotic languages.
1966
+
1967
+ 0:55:36.776 --> 0:55:43.947
1968
+ Of course, the idea of investigating if we
1969
+ have this type of rules, that might be still
1970
+
1971
+ 0:55:43.947 --> 0:55:45.006
1972
+ interesting.
1973
+
1974
+ 0:55:45.105 --> 0:55:52.090
1975
+ Maybe you can try to let someone force the
1976
+ rules in there.
1977
+
1978
+ 0:55:52.090 --> 0:55:57.655
1979
+ You might use rules to create artificial data.
1980
+
1981
+ 0:55:57.557 --> 0:56:03.577
1982
+ It might be helpful to have some concepts
1983
+ which were developed by linguistic researchers and to
1984
+
1985
+ 0:56:03.577 --> 0:56:09.464
1986
+ somehow integrate them; whether that helps is still an open
1987
+ question, and of course it
1988
+
1989
+ 0:56:09.464 --> 0:56:13.235
1990
+ is also interesting from more the analyzed
1991
+ perspectives.
1992
+
1993
+ 0:56:13.235 --> 0:56:13.499
1994
+ So.
1995
+
1996
+ 0:56:13.793 --> 0:56:20.755
1997
+ Do the neural networks have these types of concepts
1998
+ of gender or anything?
1999
+
2000
+ 0:56:20.755 --> 0:56:23.560
2001
+ And can we test that though?
2002
+
2003
+ 0:56:30.330 --> 0:56:34.255
2004
+ Yes, and then the other way of describing
2005
+ how this can be done.
2006
+
2007
+ 0:56:34.574 --> 0:56:52.021
2008
+ This was originally mainly for rule-based
2009
+ systems, but it can be used to describe a lot of scenarios.
2010
+
2011
+ 0:56:52.352 --> 0:57:04.135
2012
+ At the lowest level, there are direct
2013
+ translation systems that work for related languages.
2014
+
2015
+ 0:57:04.135 --> 0:57:11.367
2016
+ You mainly look at each word and replace the
2017
+ word by the one.
2018
+
2019
+ 0:57:11.631 --> 0:57:22.642
2020
+ Another idea is that you first do some type
2021
+ of analysis on the source side, so for example
2022
+
2023
+ 0:57:22.642 --> 0:57:28.952
2024
+ you can create what is referred to as a parse
2025
+ tree.
2026
+
2027
+ 0:57:30.150 --> 0:57:36.290
2028
+ Or you can instead, and that is what is called
2029
+ the interlingua-based approach.
2030
+
2031
+ 0:57:36.290 --> 0:57:44.027
2032
+ You take the source sentence and parse it into
2033
+ a semantic representation, which is hopefully
2034
+
2035
+ 0:57:44.027 --> 0:57:44.448
2036
+ the.
2037
+
2038
+ 0:57:44.384 --> 0:57:50.100
2039
+ Only of the meaning of what is said and then
2040
+ you can generate it to any other language because
2041
+
2042
+ 0:57:50.100 --> 0:57:55.335
2043
+ it has the meaning, and then you only need a
2044
+ generation part which can generate all other languages.
2045
+
2046
+ 0:57:57.077 --> 0:58:09.248
2047
+ The idea is somewhat nice to have this type
2048
+ of interlingua, general representation of all
2049
+
2050
+ 0:58:09.248 --> 0:58:17.092
2051
+ meanings, and they always translate into the
2052
+ interlingua.
2053
+
2054
+ 0:58:17.177 --> 0:58:19.189
2055
+ Is the interlingua a real language, and has it been used somewhere?
2056
+
2057
+ 0:58:20.580 --> 0:58:26.684
2058
+ It shouldn't be a natural language because
2059
+ it shouldn't have ambiguities so that's a big
2060
+
2061
+ 0:58:26.684 --> 0:58:32.995
2062
+ difference, since the source and the target language
2063
+ have ambiguities so the idea is they do some
2064
+
2065
+ 0:58:32.995 --> 0:58:39.648
2066
+ semantic representation or what does it mean
2067
+ and so on and therefore it's very easy to generate.
2068
+
2069
+ 0:58:41.962 --> 0:58:45.176
2070
+ However, the challenge is whether this really
2071
+ exists.
2072
+
2073
+ 0:58:45.176 --> 0:58:48.628
2074
+ You cannot define the language for anything
2075
+ in the world.
2076
+
2077
+ 0:58:49.249 --> 0:58:56.867
2078
+ And that's why the Lingo-based approach typically
2079
+ worked for small domains to do hotel reservation,
2080
+
2081
+ 0:58:56.867 --> 0:59:00.676
2082
+ but if you want to define the Lingo for anything.
2083
+
2084
+ 0:59:01.061 --> 0:59:07.961
2085
+ There have been approaches and semantics,
2086
+ but it's yeah, it's not really possible CR.
2087
+
2088
+ 0:59:07.961 --> 0:59:15.905
2089
+ So approaches to this because I mean a seasonal
2090
+ vector's face and bitch eyes and slaves everything
2091
+
2092
+ 0:59:15.905 --> 0:59:20.961
2093
+ that I mitonized that they all could end up
2094
+ in the same space.
2095
+
2096
+ 0:59:21.821 --> 0:59:24.936
2097
+ That is not the question.
2098
+
2099
+ 0:59:24.936 --> 0:59:35.957
2100
+ If you talk about neural networks, it's direct
2101
+ translation on the one you're putting in the
2102
+
2103
+ 0:59:35.957 --> 0:59:36.796
2104
+ input.
2105
+
2106
+ 0:59:36.957 --> 0:59:44.061
2107
+ And you can argue for both that we have been
2108
+ making this representation language agnostic
2109
+
2110
+ 0:59:44.061 --> 0:59:45.324
2111
+ or independent.
2112
+
2113
+ 0:59:47.227 --> 0:59:52.912
2114
+ Until now we were able to make it less language
2115
+ dependent but it's very hard to make it completely
2116
+
2117
+ 0:59:52.912 --> 0:59:54.175
2118
+ language independent.
2119
+
2120
+ 0:59:54.175 --> 0:59:59.286
2121
+ Maybe it's also not necessary and of course
2122
+ if there's again the problem there's not all
2123
+
2124
+ 0:59:59.286 --> 1:00:04.798
2125
+ information and the source and the target there
2126
+ is different types of information if you remove
2127
+
2128
+ 1:00:04.798 --> 1:00:05.602
2129
+ all language.
2130
+
2131
+ 1:00:05.585 --> 1:00:09.408
2132
+ Information might be that you have removed
2133
+ too many information.
2134
+
2135
+ 1:00:10.290 --> 1:00:15.280
2136
+ Talk about this and there's a very interesting
2137
+ research direction in which we are working
2138
+
2139
+ 1:00:15.280 --> 1:00:20.325
2140
+ on on the multilingual part because there is
2141
+ especially the case if we have several source
2142
+
2143
+ 1:00:20.325 --> 1:00:25.205
2144
+ languages, several type of languages who try
2145
+ to generate a representation in the middle
2146
+
2147
+ 1:00:25.205 --> 1:00:27.422
2148
+ which have the few language dependence.
2149
+
2150
+ 1:00:32.752 --> 1:00:46.173
2151
+ Yes, so for a direct base approach, so as
2152
+ said the first one is dictionary based approach.
2153
+
2154
+ 1:00:46.806 --> 1:00:48.805
2155
+ Replace some words with other words.
2156
+
2157
+ 1:00:48.805 --> 1:00:51.345
2158
+ Then you have exactly the same structure.
2159
+
2160
+ 1:00:51.771 --> 1:00:55.334
2161
+ Other problems are one to one correspondence.
2162
+
2163
+ 1:00:55.334 --> 1:01:01.686
2164
+ Some phrases are expressed with several words
2165
+ in English, but one word in German.
2166
+
2167
+ 1:01:01.686 --> 1:01:03.777
2168
+ That's extremely the case.
2169
+
2170
+ 1:01:03.777 --> 1:01:07.805
2171
+ Just think about all our German compounds like the
2172
+ Donau.
2173
+
2174
+ 1:01:08.608 --> 1:01:18.787
2175
+ What is used very often is what is referred
2176
+ to as translation memory.
2177
+
2178
+ 1:01:18.787 --> 1:01:25.074
2179
+ It might seem very simple, but it's like.
2180
+
2181
+ 1:01:26.406 --> 1:01:33.570
2182
+ That means you might think this is not helpful
2183
+ at all, but think about translating law texts.
2184
+
2185
+ 1:01:33.513 --> 1:01:38.701
2186
+ The law text is more like the interactive
2187
+ scenario for the human translator.
2188
+
2189
+ 1:01:38.701 --> 1:01:44.091
2190
+ In law text there is a lot of repetition and
2191
+ a lot of phrases occur very often.
2192
+
2193
+ 1:01:44.424 --> 1:01:55.412
2194
+ The translator just has the translation
2195
+ memory in the background and can retrieve these translations.
2196
+
2197
+ 1:01:55.895 --> 1:02:07.147
2198
+ There is even another benefit in addition
2199
+ to less work: That is also precise in the way
2200
+
2201
+ 1:02:07.147 --> 1:02:19.842
2202
+ that you avoid introducing small differences or
2203
+ mistakes.
2204
+
2205
+ 1:02:20.300 --> 1:02:22.584
2206
+ Especially, it's about consistency.
2207
+
2208
+ 1:02:23.243 --> 1:02:32.954
2209
+ If you once translate the sentence this way
2210
+ you again translate it and especially for some
2211
+
2212
+ 1:02:32.954 --> 1:02:36.903
2213
+ situations like a company they have.
2214
+
2215
+ 1:02:37.217 --> 1:02:47.695
2216
+ With this one, of course, you get more consistent
2217
+ translations.
2218
+
2219
+ 1:02:47.695 --> 1:02:56.700
2220
+ Each one is a style where phrases maybe are
2221
+ retrieved.
2222
+
2223
+ 1:03:01.861 --> 1:03:15.502
2224
+ Then we have these transfer based approaches
2225
+ where we have three steps: analysis means
2226
+
2227
+ 1:03:15.502 --> 1:03:25.975
2228
+ that you determine the syntactic structure, so
2229
+ for example for morphology the base forms.
2230
+
2231
+ 1:03:26.286 --> 1:03:37.277
2232
+ Then you will do a parse tree or dependency structure,
2233
+ that this is the adjective belonging to the noun.
2234
+
2235
+ 1:03:37.917 --> 1:03:42.117
2236
+ Then you can do the transfer where you transfer
2237
+ the structure to the other.
2238
+
2239
+ 1:03:42.382 --> 1:03:46.633
2240
+ There you have to do, for example, it's re-ordering
2241
+ because the satisfaction is different.
2242
+
2243
+ 1:03:46.987 --> 1:03:50.088
2244
+ In German, the adjective is before the noun.
2245
+
2246
+ 1:03:50.088 --> 1:03:52.777
2247
+ In Spanish, it's the other way around.
2248
+
2249
+ 1:03:52.777 --> 1:03:59.256
2250
+ You have first found and then that it's nice
2251
+ and these types of rehonoring can be done there.
2252
+
2253
+ 1:03:59.256 --> 1:04:04.633
2254
+ You might have to do other things like passive
2255
+ voice to active voice and so on.
2256
+
2257
+ 1:04:05.145 --> 1:04:14.074
2258
+ And you do some type of lexical transfer.
2259
+ And then you are doing the
2260
+
2261
+ 1:04:14.074 --> 1:04:16.014
2262
+ generation.
2263
+
2264
+ 1:04:16.014 --> 1:04:25.551
2265
+ Of course, you would do the agreement if it
2266
+ is accusative.
2267
+
2268
+ 1:04:25.551 --> 1:04:29.430
2269
+ What type of adjective?
2270
+
2271
+ 1:04:30.090 --> 1:04:32.048
2272
+ Is some kind of saving.
2273
+
2274
+ 1:04:32.048 --> 1:04:39.720
2275
+ Of course, here, because the analyze has only
2276
+ to be done in the source language, the transfer
2277
+
2278
+ 1:04:39.720 --> 1:04:41.679
2279
+ has to do on the pairs.
2280
+
2281
+ 1:04:41.679 --> 1:04:48.289
2282
+ So if you look at German, English and French
2283
+ in all directions, you only need one analysis per language.
2284
+
2285
+ 1:04:53.273 --> 1:04:59.340
2286
+ Then there is the interlingua approach, which is
2287
+ really about the pure meaning, so you have
2288
+
2289
+ 1:04:59.340 --> 1:05:00.751
2290
+ a semantic grammar.
2291
+
2292
+ 1:05:01.061 --> 1:05:07.930
2293
+ To represent everything and one thing, one
2294
+ nice implication is more extreme than before.
2295
+
2296
+ 1:05:07.930 --> 1:05:15.032
2297
+ You don't have the transfer anymore, so if
2298
+ you add one language to it and you have already.
2299
+
2300
+ 1:05:15.515 --> 1:05:26.188
2301
+ If you add the one analysis and the one generation
2302
+ phase, you can now translate from and to all languages: you need
2303
+
2304
+ 1:05:26.188 --> 1:05:40.172
2305
+ components which do the analysis and components which
2306
+ do the generation, and then you can translate:
2307
+
2308
+ 1:05:41.001 --> 1:05:45.994
2309
+ You can also do other things like paraphrasing.
2310
+
2311
+ 1:05:45.994 --> 1:05:52.236
2312
+ You can translate back to the source language
2313
+ and hopefully.
2314
+
2315
+ 1:05:53.533 --> 1:06:05.013
2316
+ If you're sparkling trying to analyze it,
2317
+ it was also done a lot for ungrammatical speech
2318
+
2319
+ 1:06:05.013 --> 1:06:11.518
2320
+ because the idea is you're in this representation.
2321
+
2322
+ 1:06:12.552 --> 1:06:18.679
2323
+ Of course, it's very much work and it's only
2324
+ realistic for limited domains.
2325
+
2326
+ 1:06:20.000 --> 1:06:25.454
2327
+ Then we have the corpus-based approaches.
2328
+
2329
+ 1:06:25.745 --> 1:06:32.486
2330
+ So we'll talk a lot about parallel data,
2331
+ and what parallel data really is is what you know
2332
+
2333
+ 1:06:32.486 --> 1:06:34.634
2334
+ from the Rosetta stone page.
2335
+
2336
+ 1:06:34.634 --> 1:06:41.227
2337
+ That is, you have a source sentence and you
2338
+ have a target sentence and you know they need
2339
+
2340
+ 1:06:41.227 --> 1:06:42.856
2341
+ to be translations of each other.
2342
+
2343
+ 1:06:43.343 --> 1:06:46.651
2344
+ And that's important, so the alignment is
2345
+ typically at a sentence level.
2346
+
2347
+ 1:06:46.987 --> 1:06:50.252
2348
+ So you know, for each sentence what is a translation?
2349
+
2350
+ 1:06:50.252 --> 1:06:55.756
2351
+ Not always perfect because maybe there's two
2352
+ German sentences and one English, but at that
2353
+
2354
+ 1:06:55.756 --> 1:06:57.570
2355
+ level it's normally possible.
2356
+
2357
+ 1:06:57.570 --> 1:07:03.194
2358
+ At word level you can't do that because it's
2359
+ a very complicated thing and sense level that's
2360
+
2361
+ 1:07:03.194 --> 1:07:04.464
2362
+ normally relatively easy.
2363
+
2364
+ 1:07:05.986 --> 1:07:12.693
2365
+ Some type of machine learning which tries
2366
+ to learn this mapping between sentences on the
2367
+
2368
+ 1:07:12.693 --> 1:07:14.851
2369
+ English side and sentences.
2370
+
2371
+ 1:07:15.355 --> 1:07:22.088
2372
+ Of course this doesn't look like good mapping
2373
+ too complex but you try to find something like
2374
+
2375
+ 1:07:22.088 --> 1:07:28.894
2376
+ that where it's a very nice mapping so there's
2377
+ always the mixing things are met to each other
2378
+
2379
+ 1:07:28.894 --> 1:07:32.224
2380
+ and then if you have the English you can try.
2381
+
2382
+ 1:07:32.172 --> 1:07:36.900
2383
+ For another English sentence you can apply
2384
+ the same mapping and hopefully arrive at
2385
+
2386
+ 1:07:36.900 --> 1:07:38.514
2387
+ the right sentence in terms.
2388
+
2389
+ 1:07:38.918 --> 1:07:41.438
2390
+ The big problem here.
2391
+
2392
+ 1:07:41.438 --> 1:07:44.646
2393
+ How can we find this model?
2394
+
2395
+ 1:07:44.646 --> 1:07:50.144
2396
+ How to map English sentences into German sentences?
2397
+
2398
+ 1:07:54.374 --> 1:08:08.492
2399
+ How we do that is that we are trying to maximize
2400
+ the probability, so we have all the parallel data.
2401
+
2402
+ 1:08:09.109 --> 1:08:15.230
2403
+ Then we're having some type of model here
2404
+ which takes the source language and translates
2405
+
2406
+ 1:08:15.230 --> 1:08:16.426
2407
+ it for a target.
2408
+
2409
+ 1:08:16.896 --> 1:08:34.008
2410
+ And then we compare with the reference translation, and we
2411
+ are adjusting our model so that the probability of the correct translation is maximized.
2412
+
2413
+ 1:08:34.554 --> 1:08:48.619
2414
+ How that is the idea behind it, how we are
2415
+ pushed now, implement that is part of the bottle.
2416
+
2417
+ 1:08:51.131 --> 1:09:01.809
2418
+ And then if we want to do translation, what
2419
+ we are doing is we are trying to find the translation.
2420
+
2421
+ 1:09:01.962 --> 1:09:06.297
2422
+ So we are scoring many possible translations.
2423
+
2424
+ 1:09:06.297 --> 1:09:12.046
2425
+ There is an infinite number of sentences that
2426
+ we are trying.
2427
+
2428
+ 1:09:12.552 --> 1:09:18.191
2429
+ That may be a bit of a problem when we talk
2430
+ about confidence because we are always trying
2431
+
2432
+ 1:09:18.191 --> 1:09:19.882
2433
+ to find the most probable.
2434
+
2435
+ 1:09:20.440 --> 1:09:28.241
2436
+ And then, of course, we are not really having
2437
+ intrinsically the possibility to say, oh, I
2438
+
2439
+ 1:09:28.241 --> 1:09:31.015
2440
+ have no idea in this situation.
2441
+
2442
+ 1:09:31.015 --> 1:09:35.782
2443
+ But our general model is always about how
2444
+ can we find?
2445
+
2446
+ 1:09:40.440 --> 1:09:41.816
2447
+ I think it's almost time.
2448
+
2449
+ 1:09:42.963 --> 1:09:44.242
2450
+ I've got four more slides.
2451
+
2452
+ 1:09:46.686 --> 1:09:52.025
2453
+ So just high level, so for a proper space
2454
+ this one we won't cover again.
2455
+
2456
+ 1:09:52.352 --> 1:10:00.808
2457
+ Its example based machine translation was
2458
+ at the beginning of SMT.
2459
+
2460
+ 1:10:00.808 --> 1:10:08.254
2461
+ The idea is that you take subparts and combine
2462
+ them again.
2463
+
2464
+ 1:10:08.568 --> 1:10:11.569
2465
+ So this will not be really covered here.
2466
+
2467
+ 1:10:11.569 --> 1:10:15.228
2468
+ Then the statistical machine translation we
2469
+ will.
2470
+
2471
+ 1:10:17.077 --> 1:10:18.773
2472
+ Yeah, we will cover next week.
2473
+
2474
+ 1:10:19.079 --> 1:10:27.594
2475
+ The idea there is that,
2476
+ if we have the sentence alignment, we automatically align the words
2477
+
2478
+ 1:10:27.527 --> 1:10:34.207
2479
+ In the sentences, and then we can learn statistical
2480
+ models of how probable words are translated
2481
+
2482
+ 1:10:34.207 --> 1:10:39.356
2483
+ to each other, and then the surge is that we
2484
+ create different hypotheses.
2485
+
2486
+ 1:10:39.356 --> 1:10:45.200
2487
+ This could be a translation of this part,
2488
+ this could be a translation of that part.
2489
+
2490
+ 1:10:45.200 --> 1:10:47.496
2491
+ We give a score to each of them.
2492
+
2493
+ 1:10:47.727 --> 1:10:51.584
2494
+ The statistical model is where a
2495
+ lot of work is done.
2496
+
2497
+ 1:10:51.584 --> 1:10:54.155
2498
+ How can we score how good translation is?
2499
+
2500
+ 1:10:54.494 --> 1:11:04.764
2501
+ The words can recur this type of structure,
2502
+ how is it reordered, and then based on that
2503
+
2504
+ 1:11:04.764 --> 1:11:08.965
2505
+ we search for the best translation.
2506
+
2507
+ 1:11:12.252 --> 1:11:19.127
2508
+ Then yeah, that one what we'll cover most
2509
+ of the time is is a neural, a model where we
2510
+
2511
+ 1:11:19.127 --> 1:11:21.102
2512
+ can use neural networks.
2513
+
2514
+ 1:11:21.102 --> 1:11:27.187
2515
+ The nice thing is everything is trained together;
2516
+ before, we had several components.
2517
+
2518
+ 1:11:27.187 --> 1:11:30.269
2519
+ Each of them was trained independently.
2520
+
2521
+ 1:11:30.210 --> 1:11:34.349
2522
+ Which of course has a disadvantage that they
2523
+ might not best work together.
2524
+
2525
+ 1:11:34.694 --> 1:11:36.601
2526
+ Here everything is trained together.
2527
+
2528
+ 1:11:36.601 --> 1:11:39.230
2529
+ The continuous representation will look into
2530
+ that.
2531
+
2532
+ 1:11:39.339 --> 1:11:41.846
2533
+ That's very helpful soft.
2534
+
2535
+ 1:11:41.846 --> 1:11:50.426
2536
+ The neural networks are able to learn somehow
2537
+ the relation between words and that's very
2538
+
2539
+ 1:11:50.426 --> 1:11:57.753
2540
+ helpful because then we can more easily deal
2541
+ with words which didn't occur.
2542
+
2543
+ 1:12:00.000 --> 1:12:05.240
2544
+ One thing just to correlate that to interlingua
2545
+ based.
2546
+
2547
+ 1:12:05.345 --> 1:12:07.646
2548
+ So we have this as an actual language.
2549
+
2550
+ 1:12:07.627 --> 1:12:11.705
2551
+ And if you do an interlingual based approach
2552
+ but don't take an artificial.
2553
+
2554
+ 1:12:11.731 --> 1:12:17.814
2555
+ With no ambiguities, but with a natural language
2556
+ that's referred to as pivot-based MT, and
2557
+
2558
+ 1:12:17.814 --> 1:12:20.208
2559
+ can be done with all the approaches.
2560
+
2561
+ 1:12:20.208 --> 1:12:25.902
2562
+ So the ideas instead of directly translating
2563
+ from German to French, you first translate
2564
+
2565
+ 1:12:25.902 --> 1:12:29.073
2566
+ from German to English and then from English
2567
+ to.
2568
+
2569
+ 1:12:29.409 --> 1:12:40.954
2570
+ French where the big advantage is that you
2571
+ might have a lot more data for these two directions
2572
+
2573
+ 1:12:40.954 --> 1:12:43.384
2574
+ than you have here.
2575
+
2576
+ 1:12:44.864 --> 1:12:54.666
2577
+ With this, thank you; if there are more questions, I'm
2578
+ a bit late, I'm sorry, and otherwise I'll see
2579
+
2580
+ 1:12:54.666 --> 1:12:55.864
2581
+ you again.
2582
+
demo_data/lectures/Lecture-01-18.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f95bffd5a310af38b1ee51daef47a0af905687cbee799c161515f743cb30d0c
3
+ size 103388000
demo_data/lectures/Lecture-02-20.04.2023/English.vtt ADDED
@@ -0,0 +1,2984 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.561 --> 0:00:05.186
4
+ Okay So Um.
5
+
6
+ 0:00:08.268 --> 0:00:17.655
7
+ Welcome to today's presentation of the second
8
+ class on machine translation, where we'll today
9
+
10
+ 0:00:17.655 --> 0:00:25.044
11
+ do a bit of a specific topic and we'll talk
12
+ about linguistic backgrounds.
13
+
14
+ 0:00:26.226 --> 0:00:34.851
15
+ We will cover three different parts in
16
+ the lecture.
17
+
18
+ 0:00:35.615 --> 0:00:42.538
19
+ We'll do first a very, very brief introduction
20
+ about linguistic background in a way that what
21
+
22
+ 0:00:42.538 --> 0:00:49.608
23
+ is language, what are ways of describing language,
24
+ what are the theories behind it, very, very
25
+
26
+ 0:00:49.608 --> 0:00:50.123
27
+ short.
28
+
29
+ 0:00:50.410 --> 0:00:57.669
30
+ I don't know whether some of you listened, I think,
31
+ to the NLP lecture in the last semester or so.
32
+
33
+ 0:00:58.598 --> 0:01:02.553
34
+ So there we did a lot longer explanation.
35
+
36
+ 0:01:02.553 --> 0:01:08.862
37
+ Here is just because we are not talking about
38
+ machine translation.
39
+
40
+ 0:01:09.109 --> 0:01:15.461
41
+ So it's really focused on the parts which
42
+ are important when we talk about machine translation.
43
+
44
+ 0:01:15.755 --> 0:01:19.377
45
+ Though for everybody who has listened to that
46
+ already, it's a bit of a repetition.
47
+
48
+ 0:01:19.377 --> 0:01:19.683
49
+ Maybe.
50
+
51
+ 0:01:19.980 --> 0:01:23.415
52
+ But it's really trying to look.
53
+
54
+ 0:01:23.415 --> 0:01:31.358
55
+ These are properties of languages and how
56
+ can they influence translation.
57
+
58
+ 0:01:31.671 --> 0:01:38.928
59
+ We'll use that in the second part to discuss
60
+ why machine translation is hard, given what we
61
+
62
+ 0:01:38.928 --> 0:01:40.621
63
+ know about language.
64
+
65
+ 0:01:40.940 --> 0:01:47.044
66
+ We will see that I mean there's two main things
67
+ is that the language might express ideas and
68
+
69
+ 0:01:47.044 --> 0:01:53.279
70
+ information differently, and if they are expressed
71
+ different in different languages we have to
72
+
73
+ 0:01:53.279 --> 0:01:54.920
74
+ do somehow the transfer.
75
+
76
+ 0:01:55.135 --> 0:02:02.771
77
+ And it's not purely that we know there's words
78
+ used for it, but it's not that simple and very
79
+
80
+ 0:02:02.771 --> 0:02:03.664
81
+ different.
82
+
83
+ 0:02:04.084 --> 0:02:10.088
84
+ And the other problem we mentioned last time
85
+ about biases is that there's not always the
86
+
87
+ 0:02:10.088 --> 0:02:12.179
88
+ same amount of information in.
89
+
90
+ 0:02:12.592 --> 0:02:18.206
91
+ So it can be that there's some more information
92
+ in the one or you can't express that few information
93
+
94
+ 0:02:18.206 --> 0:02:19.039
95
+ on the target.
96
+
97
+ 0:02:19.039 --> 0:02:24.264
98
+ We had that also, for example, with the example
99
+ with the rice plant: in German, we would just
100
+
101
+ 0:02:24.264 --> 0:02:24.820
102
+ say rice.
103
+
104
+ 0:02:24.904 --> 0:02:33.178
105
+ Or in English, while in other languages you
106
+ have to distinguish between rice plant or rice
107
+
108
+ 0:02:33.178 --> 0:02:33.724
109
+ as a.
110
+
111
+ 0:02:34.194 --> 0:02:40.446
112
+ And then it's not always possible to directly
113
+ infer this on the surface.
114
+
115
+ 0:02:41.781 --> 0:02:48.501
116
+ And if we make it to the last point otherwise
117
+ we'll do that next Tuesday or we'll partly
118
+
119
+ 0:02:48.501 --> 0:02:55.447
120
+ do it only here is like we'll describe briefly
121
+ the three main approaches on a rule based so
122
+
123
+ 0:02:55.447 --> 0:02:59.675
124
+ linguistic motivated ways of doing machine
125
+ translation.
126
+
127
+ 0:02:59.779 --> 0:03:03.680
128
+ We mentioned them last time like the direct
129
+ translation.
130
+
131
+ 0:03:03.680 --> 0:03:10.318
132
+ The translation by transfer and the interlingua-
133
+ based one; we will do that a bit more in detail today.
134
+
135
+ 0:03:10.590 --> 0:03:27.400
136
+ But very briefly because this is not a focus
137
+ of this class and then next week because.
138
+
139
+ 0:03:29.569 --> 0:03:31.757
140
+ Why do we think this is important?
141
+
142
+ 0:03:31.757 --> 0:03:37.259
143
+ On the one hand, of course, we are dealing
144
+ with natural language, so therefore it might
145
+
146
+ 0:03:37.259 --> 0:03:43.074
147
+ be good to spend a bit of time in understanding
148
+ what we are really dealing with because this
149
+
150
+ 0:03:43.074 --> 0:03:45.387
151
+ is challenging these other problems.
152
+
153
+ 0:03:45.785 --> 0:03:50.890
154
+ And on the other hand, this was the first
155
+ way of how we're doing machine translation.
156
+
157
+ 0:03:51.271 --> 0:04:01.520
158
+ Therefore, it's interesting to understand
159
+ what was the idea behind that and also to later
160
+
161
+ 0:04:01.520 --> 0:04:08.922
162
+ see what is done differently and to understand
163
+ when some models.
164
+
165
+ 0:04:13.453 --> 0:04:20.213
166
+ When we're talking about linguistics, we can
167
+ of course do that on different levels and there's
168
+
169
+ 0:04:20.213 --> 0:04:21.352
170
+ different ways.
171
+
172
+ 0:04:21.521 --> 0:04:26.841
173
+ On the right side here you are seeing the
174
+ basic levels of linguistics.
175
+
176
+ 0:04:27.007 --> 0:04:31.431
177
+ So we have at the bottom the phonetics and
178
+ phonology.
179
+
180
+ 0:04:31.431 --> 0:04:38.477
181
+ Phones will not cover this year because we
182
+ are mainly focusing on text input where we
183
+
184
+ 0:04:38.477 --> 0:04:42.163
185
+ are directly having directors and then work.
186
+
187
+ 0:04:42.642 --> 0:04:52.646
188
+ Then what we touch today, at least mention
189
+ what it is, is a morphology which is the first
190
+
191
+ 0:04:52.646 --> 0:04:53.424
192
+ level.
193
+
194
+ 0:04:53.833 --> 0:04:59.654
195
+ Already mentioned it a bit on Tuesday that
196
+ of course there are some languages where this
197
+
198
+ 0:04:59.654 --> 0:05:05.343
199
+ is very, very basic and there is not really
200
+ a lot of rules of how you can build words.
201
+
202
+ 0:05:05.343 --> 0:05:11.099
203
+ But since I assume you all have some basic
204
+ knowledge of German there is like a lot more
205
+
206
+ 0:05:11.099 --> 0:05:12.537
207
+ challenges than that.
208
+
209
+ 0:05:13.473 --> 0:05:20.030
210
+ You know, maybe if you're a native speaker
211
+ that's quite easy and everything is clear,
212
+
213
+ 0:05:20.030 --> 0:05:26.969
214
+ but if you have to learn it like the endings
215
+ of a word, we are famous for doing compositar
216
+
217
+ 0:05:26.969 --> 0:05:29.103
218
+ and putting words together.
219
+
220
+ 0:05:29.103 --> 0:05:31.467
221
+ So this is like the first lab.
222
+
223
+ 0:05:32.332 --> 0:05:40.268
224
+ Then we have the syntax, which is both on
225
+ the word and on the sentence level, and that's
226
+
227
+ 0:05:40.268 --> 0:05:43.567
228
+ about the structure of the sentence.
229
+
230
+ 0:05:43.567 --> 0:05:46.955
231
+ What are the functions of some words?
232
+
233
+ 0:05:47.127 --> 0:05:51.757
234
+ You might remember part of speech text from
235
+ From Your High School Time.
236
+
237
+ 0:05:51.757 --> 0:05:57.481
238
+ There is like noun and adjective and and things
239
+ like that and this is something helpful.
240
+
241
+ 0:05:57.737 --> 0:06:03.933
242
+ Just imagine in the beginning that it was
243
+ not only used for rule based but for statistical
244
+
245
+ 0:06:03.933 --> 0:06:10.538
246
+ machine translation, for example, the reordering
247
+ between languages was quite a challenging task.
248
+
249
+ 0:06:10.770 --> 0:06:16.330
250
+ Especially if you have long range reorderings
251
+ and their part of speech information is very
252
+
253
+ 0:06:16.330 --> 0:06:16.880
254
+ helpful.
255
+
256
+ 0:06:16.880 --> 0:06:20.301
257
+ You know, in German you have to move the word
258
+ the verb.
259
+
260
+ 0:06:20.260 --> 0:06:26.599
261
+ To the second position, if you have Spanish
262
+ you have to change the noun and the adjective
263
+
264
+ 0:06:26.599 --> 0:06:30.120
265
+ so information from part of speech could be
266
+ very.
267
+
268
+ 0:06:30.410 --> 0:06:38.621
269
+ Then you have a syntax base structure where
270
+ you have a full syntax tree in the beginning
271
+
272
+ 0:06:38.621 --> 0:06:43.695
273
+ and then it came into statistical machine translation.
274
+
275
+ 0:06:44.224 --> 0:06:50.930
276
+ And it got more and more important for statistical
277
+ machine translation that you are really trying
278
+
279
+ 0:06:50.930 --> 0:06:53.461
280
+ to model the whole syntax tree of a.
281
+
282
+ 0:06:53.413 --> 0:06:57.574
283
+ Sentence in order to better match how to do
284
+ that in UM.
285
+
286
+ 0:06:57.574 --> 0:07:04.335
287
+ In the target language, a bit yeah, the syntax
288
+ based statistical machine translation had a
289
+
290
+ 0:07:04.335 --> 0:07:05.896
291
+ bit of a problem.
292
+
293
+ 0:07:05.896 --> 0:07:08.422
294
+ It got better and better and was.
295
+
296
+ 0:07:08.368 --> 0:07:13.349
297
+ Just on the way of getting better in some
298
+ languages than traditional statistical models.
299
+
300
+ 0:07:13.349 --> 0:07:18.219
301
+ But then the neural models came up and they
302
+ were just so much better in modelling that
303
+
304
+ 0:07:18.219 --> 0:07:19.115
305
+ all implicitly.
306
+
307
+ 0:07:19.339 --> 0:07:23.847
308
+ So that they are never were used in practice
309
+ so much.
310
+
311
+ 0:07:24.304 --> 0:07:34.262
312
+ And then we'll talk about the semantics, so
313
+ what is the meaning of the words?
314
+
315
+ 0:07:34.262 --> 0:07:40.007
316
+ Last time words can have different meanings.
317
+
318
+ 0:07:40.260 --> 0:07:46.033
319
+ And yeah, how you represent meaning of cause
320
+ is very challenging.
321
+
322
+ 0:07:45.966 --> 0:07:53.043
323
+ And normally that like formalizing this is
324
+ typically done in quite limited domains because
325
+
326
+ 0:07:53.043 --> 0:08:00.043
327
+ like doing that for like all possible words
328
+ has not really been achieved yet in this very
329
+
330
+ 0:08:00.043 --> 0:08:00.898
331
+ challenge.
332
+
333
+ 0:08:02.882 --> 0:08:09.436
334
+ About pragmatics, so pragmatics is then what
335
+ is meaning in the context of the current situation.
336
+
337
+ 0:08:09.789 --> 0:08:16.202
338
+ So one famous example is there, for example,
339
+ if you say the light is red.
340
+
341
+ 0:08:16.716 --> 0:08:21.795
342
+ The traffic light is red so that typically
343
+ not you don't want to tell the other person
344
+
345
+ 0:08:21.795 --> 0:08:27.458
346
+ if you're sitting in a car that it's surprising
347
+ oh the light is red but typically you're meaning
348
+
349
+ 0:08:27.458 --> 0:08:30.668
350
+ okay you should stop and you shouldn't pass
351
+ the light.
352
+
353
+ 0:08:30.850 --> 0:08:40.994
354
+ So the meaning of this sentence, the light,
355
+ is red in the context of sitting in the car.
356
+
357
+ 0:08:42.762 --> 0:08:51.080
358
+ So let's start with the morphology so that
359
+ with the things we are starting there and one
360
+
361
+ 0:08:51.080 --> 0:08:53.977
362
+ easy and first thing is there.
363
+
364
+ 0:08:53.977 --> 0:09:02.575
365
+ Of course we have to split the sentence into
366
+ words or join characters so that we have words.
367
+
368
+ 0:09:02.942 --> 0:09:09.017
369
+ Because in most of our work we'll deal like
370
+ machine translation with some type of words.
371
+
372
+ 0:09:09.449 --> 0:09:15.970
373
+ In neural machine translation, people are working
374
+ also on character-based models and subwords, but a
375
+ basic unit like the words of the sentence is a very
376
+ 0:09:15.970 --> 0:09:20.772
377
+ basic unique words of the sentence is a very
378
+ important first step.
379
+
380
+ 0:09:21.421 --> 0:09:32.379
381
+ And for many languages that is quite simple
382
+ in German, it's not that hard to determine
383
+
384
+ 0:09:32.379 --> 0:09:33.639
385
+ the word.
386
+
387
+ 0:09:34.234 --> 0:09:46.265
388
+ In tokenization, the main challenge is if
389
+ we are doing corpus-based methods that we are
390
+
391
+ 0:09:46.265 --> 0:09:50.366
392
+ also dealing as normal words.
393
+
394
+ 0:09:50.770 --> 0:10:06.115
395
+ And there of course it's getting a bit more
396
+ challenging.
397
+
398
+ 0:10:13.173 --> 0:10:17.426
399
+ So that is maybe the main thing where, for
400
+ example, in Germany, if you think of German
401
+
402
+ 0:10:17.426 --> 0:10:19.528
403
+ tokenization, it's easy to get every word.
404
+
405
+ 0:10:19.779 --> 0:10:26.159
406
+ You split it at a space, but then you would
407
+ have the dots at the end join to the last word,
408
+
409
+ 0:10:26.159 --> 0:10:30.666
410
+ and of course that you don't want because it's
411
+ a different word.
412
+
413
+ 0:10:30.666 --> 0:10:37.046
414
+ The last word would not be go, but go dot,
415
+ but what you can do is split up the dots always.
416
+
417
+ 0:10:37.677 --> 0:10:45.390
418
+ Can you really do that always or it might
419
+ be sometimes better to keep the dot as a point?
420
+
421
+ 0:10:47.807 --> 0:10:51.001
422
+ For example, email addresses or abbreviations
423
+ here.
424
+
425
+ 0:10:51.001 --> 0:10:56.284
426
+ For example, doctor, maybe it doesn't make
427
+ sense to split up the dot because then you
428
+
429
+ 0:10:56.284 --> 0:11:01.382
430
+ would assume all year starts a new sentence,
431
+ but it's just the DR dot from doctor.
432
+
433
+ 0:11:01.721 --> 0:11:08.797
434
+ Or if you have numbers like he's a seventh
435
+ person like the zipter, then you don't want
436
+
437
+ 0:11:08.797 --> 0:11:09.610
438
+ to split.
439
+
440
+ 0:11:09.669 --> 0:11:15.333
441
+ So there are some things where it could be
442
+ a bit more difficult, but it's not really challenging.
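A minimal sketch of this kind of dot-splitting heuristic (an illustrative assumption, not taken from the lecture; the abbreviation list and example are made up):

```python
import re

# Toy rule-based tokenizer: split off sentence-final punctuation, but keep the
# dot attached to known abbreviations ("Dr.") and to ordinals written as "7.".
ABBREVIATIONS = {"dr.", "prof.", "etc.", "z.b."}  # assumed, incomplete list

def tokenize(sentence):
    tokens = []
    for raw in sentence.split():
        if raw.lower() in ABBREVIATIONS or re.fullmatch(r"\d+\.", raw):
            tokens.append(raw)                  # keep "Dr." or "7." intact
        elif raw.endswith((".", ",", "!", "?")):
            tokens.extend([raw[:-1], raw[-1]])  # split "go." -> "go", "."
        else:
            tokens.append(raw)
    return tokens

print(tokenize("Dr. Smith arrived on the 7. of May."))
# ['Dr.', 'Smith', 'arrived', 'on', 'the', '7.', 'of', 'May', '.']
```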
443
+
444
+ 0:11:16.796 --> 0:11:23.318
445
+ In other languages it's getting a lot more
446
+ challenging, especially in Asian languages
447
+
448
+ 0:11:23.318 --> 0:11:26.882
449
+ where often there are no spaces between words.
450
+
451
+ 0:11:27.147 --> 0:11:32.775
452
+ So you just have the sequence of characters.
453
+
454
+ 0:11:32.775 --> 0:11:38.403
455
+ The quick brown fox jumps over the lazy dog.
456
+
457
+ 0:11:38.999 --> 0:11:44.569
458
+ And then it still might be helpful to work
459
+ on something like words.
460
+
461
+ 0:11:44.569 --> 0:11:48.009
462
+ Then you need to have a bit more complex.
463
+
464
+ 0:11:48.328 --> 0:11:55.782
465
+ And here you see we are again having our typical
466
+ problem.
467
+
468
+ 0:11:55.782 --> 0:12:00.408
469
+ That means that there is ambiguity.
470
+
471
+ 0:12:00.600 --> 0:12:02.104
472
+ So you're seeing here.
473
+
474
+ 0:12:02.104 --> 0:12:08.056
475
+ We have exactly the same sequence of characters
476
+ or here, but depending on how we split it,
477
+
478
+ 0:12:08.056 --> 0:12:12.437
479
+ it means he is your servant or he is the one
480
+ who used your things.
481
+
482
+ 0:12:12.437 --> 0:12:15.380
483
+ Or here we have round eyes and take the air.
484
+
485
+ 0:12:15.895 --> 0:12:22.953
486
+ So then of course yeah this type of tokenization
487
+ gets more important because you could introduce
488
+
489
+ 0:12:22.953 --> 0:12:27.756
490
+ already arrows and you can imagine if you're
491
+ doing it here wrong.
492
+
493
+ 0:12:27.756 --> 0:12:34.086
494
+ If you once do a wrong decision it's quite
495
+ difficult to recover from a wrong decision.
496
+
497
+ 0:12:34.634 --> 0:12:47.088
498
+ And so in these cases looking about how we're
499
+ doing tokenization is an important issue.
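One common baseline for unspaced scripts, shown here only as a hedged sketch (toy dictionary, Latin letters instead of real Chinese characters), is greedy longest-match segmentation; its weakness is exactly the ambiguity above, since one wrong early split is hard to recover from:

```python
# Illustrative greedy "maximum matching" segmenter over a toy dictionary.
VOCAB = {"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog"}

def max_match(text, vocab, max_len=10):
    tokens, i = [], 0
    while i < len(text):
        # Try the longest dictionary entry starting at position i.
        for j in range(min(len(text), i + max_len), i, -1):
            if text[i:j] in vocab:
                tokens.append(text[i:j])
                i = j
                break
        else:
            # Unknown character: emit it as a single-character token.
            tokens.append(text[i])
            i += 1
    return tokens

print(max_match("thequickbrownfoxjumpsoverthelazydog", VOCAB))
# -> ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
```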
500
+
501
+ 0:12:47.127 --> 0:12:54.424
502
+ And then it might be helpful to do things
503
+ like character-based models where we treat each
504
+ character as a symbol.
505
+ 0:12:54.424 --> 0:12:56.228
506
+ director as a symbol.
507
+
508
+ 0:12:56.228 --> 0:13:01.803
509
+ For example, do this decision in the later
510
+ or never really do this?
511
+
512
+ 0:13:06.306 --> 0:13:12.033
513
+ The other thing is that if we have words we
514
+ might, it might not be the optimal unit to
515
+
516
+ 0:13:12.033 --> 0:13:18.155
517
+ work with because it can be that we should
518
+ look into the internal structure of words because
519
+
520
+ 0:13:18.155 --> 0:13:20.986
521
+ if we have a morphological rich language,.
522
+
523
+ 0:13:21.141 --> 0:13:27.100
524
+ That means we have a lot of different types
525
+ of words, and if you have a lot of many different
526
+
527
+ 0:13:27.100 --> 0:13:32.552
528
+ types of words, it on the other hand means
529
+ of course each of these words we have seen
530
+
531
+ 0:13:32.552 --> 0:13:33.757
532
+ very infrequently.
533
+
534
+ 0:13:33.793 --> 0:13:39.681
535
+ So if you only have ten words and you have
536
+ a large corpus, each word occurs more often.
537
+
538
+ 0:13:39.681 --> 0:13:45.301
539
+ If you have three million different words,
540
+ then each of them will occur less often.
541
+
542
+ 0:13:45.301 --> 0:13:51.055
543
+ Hopefully you know, from machine learning,
544
+ it's helpful if you have seen each example
545
+
546
+ 0:13:51.055 --> 0:13:51.858
547
+ very often.
548
+
549
+ 0:13:52.552 --> 0:13:54.524
550
+ And so why does it help?
551
+
552
+ 0:13:54.524 --> 0:13:56.495
553
+ Why does it help happen?
554
+
555
+ 0:13:56.495 --> 0:14:02.410
556
+ Yeah, in some languages we have quite a complex
557
+ information inside a word.
558
+
559
+ 0:14:02.410 --> 0:14:09.271
560
+ So here's a word from a finish talosanikiko
561
+ or something like that, and it means in my
562
+
563
+ 0:14:09.271 --> 0:14:10.769
564
+ house to question.
565
+
566
+ 0:14:11.491 --> 0:14:15.690
567
+ So you have all these information attached
568
+ to the word.
569
+
570
+ 0:14:16.036 --> 0:14:20.326
571
+ And that of course in extreme case that's
572
+ why typically, for example, Finnish is the
573
+
574
+ 0:14:20.326 --> 0:14:20.831
575
+ language.
576
+
577
+ 0:14:20.820 --> 0:14:26.725
578
+ Where machine translation quality is less
579
+ good because generating all these different
580
+
581
+ 0:14:26.725 --> 0:14:33.110
582
+ morphological variants is is a challenge and
583
+ the additional challenge is typically in finish
584
+
585
+ 0:14:33.110 --> 0:14:39.564
586
+ not really low resource but for in low resource
587
+ languages you quite often have more difficult
588
+
589
+ 0:14:39.564 --> 0:14:40.388
590
+ morphology.
591
+
592
+ 0:14:40.440 --> 0:14:43.949
593
+ Mean English is an example of a relatively
594
+ easy one.
595
+
596
+ 0:14:46.066 --> 0:14:54.230
597
+ And so in general we can say that words are
598
+ composed of more themes, and more themes are
599
+
600
+ 0:14:54.230 --> 0:15:03.069
601
+ the smallest meaning carrying unit, so normally
602
+ it means: All morphine should have some type
603
+
604
+ 0:15:03.069 --> 0:15:04.218
605
+ of meaning.
606
+
607
+ 0:15:04.218 --> 0:15:09.004
608
+ For example, here does not really have a meaning.
609
+
610
+ 0:15:09.289 --> 0:15:12.005
611
+ Bian has some type of meaning.
612
+
613
+ 0:15:12.005 --> 0:15:14.371
614
+ It's changing the meaning.
615
+
616
+ 0:15:14.371 --> 0:15:21.468
617
+ The NES has the meaning that it's making out
618
+ of an adjective, a noun, and happy.
619
+
620
+ 0:15:21.701 --> 0:15:31.215
621
+ So each of these parts conveys some meaning,
622
+ but you cannot split them further up and have
623
+
624
+ 0:15:31.215 --> 0:15:32.156
625
+ somehow.
626
+
627
+ 0:15:32.312 --> 0:15:36.589
628
+ You see that of course a little bit more is
629
+ happening.
630
+
631
+ 0:15:36.589 --> 0:15:43.511
632
+ Typically the Y is going into an E so there
633
+ can be some variation, but these are typical
634
+
635
+ 0:15:43.511 --> 0:15:46.544
636
+ examples of what we have as morphines.
637
+
638
+ 0:16:02.963 --> 0:16:08.804
639
+ That is, of course, a problem and that's the
640
+ question why how you do your splitting.
641
+
642
+ 0:16:08.804 --> 0:16:15.057
643
+ But that problem we have anyway always because
644
+ even full words can have different meanings
645
+
646
+ 0:16:15.057 --> 0:16:17.806
647
+ depending on the context they're using.
648
+
649
+ 0:16:18.038 --> 0:16:24.328
650
+ So we always have to somewhat have a model
651
+ which can infer or represent the meaning of
652
+
653
+ 0:16:24.328 --> 0:16:25.557
654
+ the word in the.
655
+
656
+ 0:16:25.825 --> 0:16:30.917
657
+ But you are right that this problem might
658
+ get even more severe if you're splitting up.
659
+
660
+ 0:16:30.917 --> 0:16:36.126
661
+ Therefore, it might not be the best to go
662
+ for the very extreme and represent each letter
663
+
664
+ 0:16:36.126 --> 0:16:41.920
665
+ and have a model which is only on letters because,
666
+ of course, a letter can have a lot of different
667
+
668
+ 0:16:41.920 --> 0:16:44.202
669
+ meanings depending on where it's used.
670
+
671
+ 0:16:44.524 --> 0:16:50.061
672
+ And yeah, there is no right solution like
673
+ what is the right splitting.
674
+
675
+ 0:16:50.061 --> 0:16:56.613
676
+ It depends on the language and the application
677
+ on the amount of data you're having.
678
+
679
+ 0:16:56.613 --> 0:17:01.058
680
+ For example, typically it means the fewer
681
+ data you have.
682
+
683
+ 0:17:01.301 --> 0:17:12.351
684
+ The more splitting you should do, if you have
685
+ more data, then you can be better distinguish.
686
+
687
+ 0:17:13.653 --> 0:17:19.065
688
+ Then there are different types of morphines:
689
+ So we have typically one stemmed theme: It's
690
+
691
+ 0:17:19.065 --> 0:17:21.746
692
+ like house or tish, so the main meaning.
693
+
694
+ 0:17:21.941 --> 0:17:29.131
695
+ And then you can have functional or bound
696
+ morphemes which can be f which can be prefix,
697
+
698
+ 0:17:29.131 --> 0:17:34.115
699
+ suffix, infix or circumfix so it can be before
700
+ can be after.
701
+
702
+ 0:17:34.114 --> 0:17:39.416
703
+ It can be inside or it can be around it, something
704
+ like a coughed there.
705
+
706
+ 0:17:39.416 --> 0:17:45.736
707
+ Typically you would say that it's not like
708
+ two more themes, G and T, because they both
709
+
710
+ 0:17:45.736 --> 0:17:50.603
711
+ describe the function, but together G and T
712
+ are marking the cough.
713
+
714
+ 0:17:53.733 --> 0:18:01.209
715
+ For what are people using them you can use
716
+ them for inflection to describe something like
717
+
718
+ 0:18:01.209 --> 0:18:03.286
719
+ tense count person case.
720
+
721
+ 0:18:04.604 --> 0:18:09.238
722
+ That is yeah, if you know German, this is
723
+ commonly used in German.
724
+
725
+ 0:18:10.991 --> 0:18:16.749
726
+ But of course there is a lot more complicated
727
+ things: I think in in some languages it also.
728
+
729
+ 0:18:16.749 --> 0:18:21.431
730
+ I mean, in Germany it only depends counting
731
+ person on the subject.
732
+
733
+ 0:18:21.431 --> 0:18:27.650
734
+ For the word, for example, in other languages
735
+ it can also determine the first and on the
736
+
737
+ 0:18:27.650 --> 0:18:28.698
738
+ second object.
739
+
740
+ 0:18:28.908 --> 0:18:35.776
741
+ So that it like if you buy an apple or an
742
+ house, that not only the, the, the.
743
+
744
+ 0:18:35.776 --> 0:18:43.435
745
+ Kauft depends on on me like in German, but
746
+ it can also depend on whether it's an apple
747
+
748
+ 0:18:43.435 --> 0:18:44.492
749
+ or a house.
750
+
751
+ 0:18:44.724 --> 0:18:48.305
752
+ And then of course you have an exploding number
753
+ of word forms.
754
+
755
+ 0:18:49.409 --> 0:19:04.731
756
+ Furthermore, it can be used to do derivations
757
+ so you can make other types of words from it.
758
+
759
+ 0:19:05.165 --> 0:19:06.254
760
+ And then yeah.
761
+
762
+ 0:19:06.254 --> 0:19:12.645
763
+ This is like creating new words by joining
764
+ them like rainbow waterproof but for example
765
+
766
+ 0:19:12.645 --> 0:19:19.254
767
+ in German like 'Einkaufswagen', 'eiskalt' and
768
+ so on where you can join where you can do that
769
+
770
+ 0:19:19.254 --> 0:19:22.014
771
+ with nouns and German adjectives and.
772
+
773
+ 0:19:22.282 --> 0:19:29.077
774
+ Then of course you might have additional challenges
775
+ like the Fugan where you have to add this one.
776
+
777
+ 0:19:32.452 --> 0:19:39.021
778
+ Yeah, then there is a yeah of course additional
779
+ special things.
780
+
781
+ 0:19:39.639 --> 0:19:48.537
782
+ You have to sometimes put extra stuff because
783
+ of phonology, so it's dig the plural, not plural.
784
+
785
+ 0:19:48.537 --> 0:19:56.508
786
+ The third person singular, as in English,
787
+ is normally S, but by Goes, for example, is
788
+
789
+ 0:19:56.508 --> 0:19:57.249
790
+ an E S.
791
+
792
+ 0:19:57.277 --> 0:20:04.321
793
+ In German you can also have other things that
794
+ like 'Mutter' becomes 'Mütter', so you're changing
795
+ the umlaut in order to express the plural, and
796
+ 0:20:04.321 --> 0:20:11.758
797
+ the Umlaud in order to express the plural and
798
+ in other languages for example the vowel harmony
799
+
800
+ 0:20:11.758 --> 0:20:17.315
801
+ where the vowels inside are changing depending
802
+ on which form you have.
803
+
804
+ 0:20:17.657 --> 0:20:23.793
805
+ Which makes things more difficult than splitting
806
+ a word into its part doesn't really work anymore.
807
+
808
+ 0:20:23.793 --> 0:20:28.070
809
+ So like for 'Mutter' and 'Mütter', for example, that
810
+ is not really possible.
811
+
812
+ 0:20:28.348 --> 0:20:36.520
813
+ The nice thing is, of course, more like a
814
+ general thing, but often irregular things are
815
+
816
+ 0:20:36.520 --> 0:20:39.492
817
+ happening as words which occur.
818
+
819
+ 0:20:39.839 --> 0:20:52.177
820
+ So that you can have enough examples, while
821
+ the regular things you can do by some type
822
+
823
+ 0:20:52.177 --> 0:20:53.595
824
+ of rules.
825
+
826
+ 0:20:55.655 --> 0:20:57.326
827
+ Yeah, This Can Be Done.
828
+
829
+ 0:20:57.557 --> 0:21:02.849
830
+ So there are tasks on this: how to do automatic
831
+ inflection, how to analyze them.
832
+
833
+ 0:21:02.849 --> 0:21:04.548
834
+ So you give it a word to.
835
+
836
+ 0:21:04.548 --> 0:21:10.427
837
+ It's telling you what are the possible forms
838
+ of that, like how they are built, and so on.
839
+
840
+ 0:21:10.427 --> 0:21:15.654
841
+ And at least for high-resource languages,
842
+ there are a lot of tools for that.
843
+
844
+ 0:21:15.654 --> 0:21:18.463
845
+ Of course, if you now want to do that for.
846
+
847
+ 0:21:18.558 --> 0:21:24.281
848
+ Some language which is very low resourced
849
+ might be very difficult and there might be
850
+
851
+ 0:21:24.281 --> 0:21:25.492
852
+ no tool for them.
853
+
854
+ 0:21:28.368 --> 0:21:37.652
855
+ Good before we are going for the next part
856
+ about part of speech, are there any questions
857
+
858
+ 0:21:37.652 --> 0:21:38.382
859
+ about?
860
+
861
+ 0:22:01.781 --> 0:22:03.187
862
+ Yeah, we'll come to that a bit.
863
+
864
+ 0:22:03.483 --> 0:22:09.108
865
+ So it's a very good question and difficult
866
+ and especially we'll see that later if you
867
+
868
+ 0:22:09.108 --> 0:22:14.994
869
+ just put in words it would be very bad because
870
+ words are put into neural networks just as
871
+
872
+ 0:22:14.994 --> 0:22:15.844
873
+ some digits.
874
+
875
+ 0:22:15.844 --> 0:22:21.534
876
+ Each word is mapped onto an integer index and you
877
+ put it in so it doesn't really know any more
878
+
879
+ 0:22:21.534 --> 0:22:22.908
880
+ about the structure.
881
+
882
+ 0:22:23.543 --> 0:22:29.898
883
+ What we will see therefore the most successful
884
+ approach which is mostly done is a subword
885
+
886
+ 0:22:29.898 --> 0:22:34.730
887
+ unit where we split: But we will do this.
888
+
889
+ 0:22:34.730 --> 0:22:40.154
890
+ Don't know if you have been in advanced.
891
+
892
+ 0:22:40.154 --> 0:22:44.256
893
+ We'll cover this on a Tuesday.
894
+
895
+ 0:22:44.364 --> 0:22:52.316
896
+ So there is an algorithm called byte pair
897
+ encoding, which is about splitting words into
898
+
899
+ 0:22:52.316 --> 0:22:52.942
900
+ parts.
901
+
902
+ 0:22:53.293 --> 0:23:00.078
903
+ So it's doing the splitting of words but not
904
+ morphologically motivated but more based on
905
+
906
+ 0:23:00.078 --> 0:23:00.916
907
+ frequency.
908
+
909
+ 0:23:00.940 --> 0:23:11.312
910
+ However, it performs very good and that's
911
+ why it's used and there is a bit of correlation.
912
+
913
+ 0:23:11.312 --> 0:23:15.529
914
+ Sometimes they agree on count based.
915
+
916
+ 0:23:15.695 --> 0:23:20.709
917
+ So we're splitting words and we're splitting
918
+ especially words which are infrequent and that's
919
+
920
+ 0:23:20.709 --> 0:23:23.962
921
+ maybe a good motivation why that's good for
922
+ neural networks.
923
+
924
+ 0:23:23.962 --> 0:23:28.709
925
+ That means if you have seen a word very often
926
+ you don't need to split it and it's easier
927
+
928
+ 0:23:28.709 --> 0:23:30.043
929
+ to just process it fast.
930
+
931
+ 0:23:30.690 --> 0:23:39.218
932
+ While if you have seen the words infrequently,
933
+ it is good to split it into parts so it can
934
+
935
+ 0:23:39.218 --> 0:23:39.593
936
+ do.
937
+
938
+ 0:23:39.779 --> 0:23:47.729
939
+ So there is some way of doing it, but linguists
940
+ would say this is not a morphological analysis.
941
+
942
+ 0:23:47.729 --> 0:23:53.837
943
+ That is true, but we are splitting words into
944
+ parts if they are not seen.
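A hedged sketch of the idea behind byte pair encoding (the algorithm itself is covered in a later lecture; the toy corpus below is made up): count adjacent symbol pairs over the vocabulary, merge the most frequent pair, and repeat, so frequent words end up as single units while rare words stay split into smaller, better-observed pieces.

```python
from collections import Counter

def learn_bpe(word_freqs, num_merges):
    """Toy BPE: word_freqs maps a word (as a tuple of symbols) to its count."""
    vocab = dict(word_freqs)
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for word, freq in vocab.items():
            for a, b in zip(word, word[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)  # most frequent adjacent pair
        merges.append(best)
        new_vocab = {}
        for word, freq in vocab.items():
            merged, i = [], 0
            while i < len(word):
                if i + 1 < len(word) and (word[i], word[i + 1]) == best:
                    merged.append(word[i] + word[i + 1])
                    i += 2
                else:
                    merged.append(word[i])
                    i += 1
            new_vocab[tuple(merged)] = freq
        vocab = new_vocab
    return merges

# Made-up toy corpus: frequent words get merged into single symbols quickly.
corpus = {tuple("low"): 5, tuple("lower"): 2, tuple("newest"): 6, tuple("widest"): 3}
print(learn_bpe(corpus, 4))
```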
945
+
946
+ 0:23:59.699 --> 0:24:06.324
947
+ Yes, so another important thing about words
948
+ are the part-of-speech tags.
949
+
950
+ 0:24:06.324 --> 0:24:14.881
951
+ These are the common ones: noun, verb, adjective,
952
+ adverb, determiner, pronoun, preposition, and
953
+
954
+ 0:24:14.881 --> 0:24:16.077
955
+ conjunction.
956
+
957
+ 0:24:16.077 --> 0:24:26.880
958
+ There are some more: They are not the same
959
+ in all language, but for example there is this
960
+
961
+ 0:24:26.880 --> 0:24:38.104
962
+ universal grammar which tries to do this type
963
+ of part of speech text for many languages.
964
+
965
+ 0:24:38.258 --> 0:24:42.018
966
+ And then, of course, it's helping you for
967
+ generalization.
968
+
969
+ 0:24:42.018 --> 0:24:48.373
970
+ There are some language deals with verbs and
971
+ nouns, especially if you look at sentence structure.
972
+
973
+ 0:24:48.688 --> 0:24:55.332
974
+ And so if you know the part of speech tag
975
+ you can easily generalize and do get these
976
+
977
+ 0:24:55.332 --> 0:24:58.459
978
+ rules or apply these rules as you know.
979
+
980
+ 0:24:58.459 --> 0:25:02.680
981
+ The verb in English is always at the second
982
+ position.
983
+
984
+ 0:25:03.043 --> 0:25:10.084
985
+ So you know how to deal with verbs independently
986
+ of which words you are now really looking at.
987
+
988
+ 0:25:12.272 --> 0:25:18.551
989
+ And that again can be done is ambiguous.
990
+
991
+ 0:25:18.598 --> 0:25:27.171
992
+ So there are some words which can have several
993
+ part-of-speech tags.
994
+
995
+ 0:25:27.171 --> 0:25:38.686
996
+ Example are the word can, for example, which
997
+ can be the can of beans or can do something.
998
+
999
+ 0:25:38.959 --> 0:25:46.021
1000
+ Often is also in English related work.
1001
+
1002
+ 0:25:46.021 --> 0:25:55.256
1003
+ Access can be to excess or to access to something.
1004
+
1005
+ 0:25:56.836 --> 0:26:02.877
1006
+ Most words have only one single part of speech
1007
+ tag, but they are some where it's a bit more
1008
+
1009
+ 0:26:02.877 --> 0:26:03.731
1010
+ challenging.
1011
+
1012
+ 0:26:03.731 --> 0:26:09.640
1013
+ The nice thing is the ones which are in big
1014
+ are often more words, which occur more often,
1015
+
1016
+ 0:26:09.640 --> 0:26:12.858
1017
+ while for really ware words it's not that often.
1018
+
1019
+ 0:26:13.473 --> 0:26:23.159
1020
+ If you look at these classes you can distinguish
1021
+ open classes where new words can happen so
1022
+
1023
+ 0:26:23.159 --> 0:26:25.790
1024
+ we can invent new nouns.
1025
+
1026
+ 0:26:26.926 --> 0:26:31.461
1027
+ But then there are the close classes which
1028
+ I think are determined or pronoun.
1029
+
1030
+ 0:26:31.461 --> 0:26:35.414
1031
+ For example, it's not that you can easily
1032
+ develop your new pronoun.
1033
+
1034
+ 0:26:35.414 --> 0:26:38.901
1035
+ So there is a fixed list of pronouns and we
1036
+ are using that.
1037
+
1038
+ 0:26:38.901 --> 0:26:44.075
1039
+ So it's not like that or tomorrow there is
1040
+ something happening and then people are using
1041
+
1042
+ 0:26:44.075 --> 0:26:44.482
1043
+ a new.
1044
+
1045
+ 0:26:45.085 --> 0:26:52.426
1046
+ Pronouns or new conjunctions, so it's like 'and',
1047
+ because it's not that you normally invent a
1048
+
1049
+ 0:26:52.426 --> 0:26:52.834
1050
+ new.
1051
+
1052
+ 0:27:00.120 --> 0:27:03.391
1053
+ And additional to part of speech text.
1054
+
1055
+ 0:27:03.391 --> 0:27:09.012
1056
+ Then some of these part of speech texts have
1057
+ different properties.
1058
+
1059
+ 0:27:09.389 --> 0:27:21.813
1060
+ So, for example, for nouns and adjectives
1061
+ we can have a singular plural: In other languages,
1062
+
1063
+ 0:27:21.813 --> 0:27:29.351
1064
+ there is a duel so that a word is not only
1065
+ like a single or in plural, but also like a
1066
+
1067
+ 0:27:29.351 --> 0:27:31.257
1068
+ duel if it's meaning.
1069
+
1070
+ 0:27:31.631 --> 0:27:36.246
1071
+ You have the gender and masculine feminine
1072
+ neutre we know.
1073
+
1074
+ 0:27:36.246 --> 0:27:43.912
1075
+ In other language there is animated and inanimated
1076
+ and you have the cases like in German you have
1077
+
1078
+ 0:27:43.912 --> 0:27:46.884
1079
+ no maternative guinetive acquisitive.
1080
+
1081
+ 0:27:47.467 --> 0:27:57.201
1082
+ So here and then in other languages you also
1083
+ have Latin with the upper teeth.
1084
+
1085
+ 0:27:57.497 --> 0:28:03.729
1086
+ So there's like more, it's just like yeah,
1087
+ and there you have no one to one correspondence,
1088
+
1089
+ 0:28:03.729 --> 0:28:09.961
1090
+ so it can be that there are some cases which
1091
+ are only in the one language and do not happen
1092
+
1093
+ 0:28:09.961 --> 0:28:11.519
1094
+ in the other language.
1095
+
1096
+ 0:28:13.473 --> 0:28:20.373
1097
+ For verbs we have tenses of course like walk
1098
+ is walking walked have walked head walked will
1099
+
1100
+ 0:28:20.373 --> 0:28:21.560
1101
+ walk and so on.
1102
+
1103
+ 0:28:21.560 --> 0:28:28.015
1104
+ Interestingly for example in Japanese this
1105
+ can also happen for adjectives though there
1106
+
1107
+ 0:28:28.015 --> 0:28:32.987
1108
+ is a difference between something is white
1109
+ or something was white.
1110
+
1111
+ 0:28:35.635 --> 0:28:41.496
1112
+ There is this continuous thing which should
1113
+ not really have that commonly in German and
1114
+
1115
+ 0:28:41.496 --> 0:28:47.423
1116
+ I guess that's if you're German and learning
1117
+ English that's something like she sings and
1118
+
1119
+ 0:28:47.423 --> 0:28:53.350
1120
+ she is singing and of course we can express
1121
+ that but it's not commonly used and normally
1122
+
1123
+ 0:28:53.350 --> 0:28:55.281
1124
+ we're not doing this aspect.
1125
+
1126
+ 0:28:55.455 --> 0:28:57.240
1127
+ Also about tenses.
1128
+
1129
+ 0:28:57.240 --> 0:29:05.505
1130
+ If you use pasts in English you will also
1131
+ use past tenses in German, so we have similar
1132
+
1133
+ 0:29:05.505 --> 0:29:09.263
1134
+ tenses, but the use might be different.
1135
+
1136
+ 0:29:14.214 --> 0:29:20.710
1137
+ There is uncertainty like the mood in there
1138
+ indicative.
1139
+
1140
+ 0:29:20.710 --> 0:29:26.742
1141
+ If he were here, there's voices active and
1142
+ passive.
1143
+
1144
+ 0:29:27.607 --> 0:29:34.024
1145
+ That you know, that is like both in German
1146
+ and English there, but there is something in
1147
+
1148
+ 0:29:34.024 --> 0:29:35.628
1149
+ the Middle and Greek.
1150
+
1151
+ 0:29:35.628 --> 0:29:42.555
1152
+ I get myself taught, so there is other phenomens
1153
+ than which might only happen in one language.
1154
+
1155
+ 0:29:42.762 --> 0:29:50.101
1156
+ This is, like yeah, the different syntactic
1157
+ structures that you can have in the language,
1158
+
1159
+ 0:29:50.101 --> 0:29:57.361
1160
+ and where there's the two things, so it might
1161
+ be that some only are in some language, others
1162
+
1163
+ 0:29:57.361 --> 0:29:58.376
1164
+ don't exist.
1165
+
1166
+ 0:29:58.358 --> 0:30:05.219
1167
+ And on the other hand there is also matching,
1168
+ so it might be that in some situations you
1169
+
1170
+ 0:30:05.219 --> 0:30:07.224
1171
+ use different structures.
1172
+
1173
+ 0:30:10.730 --> 0:30:13.759
1174
+ The next would be then about semantics.
1175
+
1176
+ 0:30:13.759 --> 0:30:16.712
1177
+ Do you have any questions before that?
1178
+
1179
+ 0:30:19.819 --> 0:30:31.326
1180
+ I'll just continue, but if something is unclear
1181
+ beside the structure, we typically have more
1182
+
1183
+ 0:30:31.326 --> 0:30:39.863
1184
+ ambiguities, so it can be that words itself
1185
+ have different meanings.
1186
+
1187
+ 0:30:40.200 --> 0:30:48.115
1188
+ And we are typically talking about polysemy
1189
+ and homonymy, where polysemy means that a word
1190
+
1191
+ 0:30:48.115 --> 0:30:50.637
1192
+ can have different meanings.
1193
+
1194
+ 0:30:50.690 --> 0:30:58.464
1195
+ So if you have the English word interest,
1196
+ it can be that you are interested in something.
1197
+
1198
+ 0:30:58.598 --> 0:31:07.051
1199
+ Or it can be like the interest rate financial,
1200
+ but it is somehow related because if you are
1201
+
1202
+ 0:31:07.051 --> 0:31:11.002
1203
+ getting some interest rates there is some.
1204
+
1205
+ 0:31:11.531 --> 0:31:18.158
1206
+ But there is also homonymy, where they
1207
+ really are not related.
1208
+
1209
+ 0:31:18.458 --> 0:31:24.086
1210
+ So you can and can doesn't really have anything
1211
+ in common, so it's really very different.
1212
+
1213
+ 0:31:24.324 --> 0:31:29.527
1214
+ And of course that's not completely clear
1215
+ so there is not a clear definition so for example
1216
+
1217
+ 0:31:29.527 --> 0:31:34.730
1218
+ for the bank it can be that you say it's related
1219
+ but it can also be other can argue that so
1220
+
1221
+ 0:31:34.730 --> 0:31:39.876
1222
+ there are some clear things which is interest
1223
+ there are some which is vague and then there
1224
+
1225
+ 0:31:39.876 --> 0:31:43.439
1226
+ are some where it's very clear again that there
1227
+ are different.
1228
+
1229
+ 0:31:45.065 --> 0:31:49.994
1230
+ And in order to translate them, of course,
1231
+ we might need the context to disambiguate.
1232
+
1233
+ 0:31:49.994 --> 0:31:54.981
1234
+ That's typically where we can disambiguate,
1235
+ and that's not only for lexical semantics,
1236
+
1237
+ 0:31:54.981 --> 0:32:00.198
1238
+ that's generally very often that if you want
1239
+ to disambiguate, context can be very helpful.
1240
+
1241
+ 0:32:00.198 --> 0:32:03.981
1242
+ So in which sentence and which general knowledge
1243
+ who is speaking?
1244
+
1245
+ 0:32:04.944 --> 0:32:09.867
1246
+ You can do that externally by some disambiguation
1247
+ task.
1248
+
1249
+ 0:32:09.867 --> 0:32:14.702
1250
+ Machine translation system will also do it
1251
+ internally.
1252
+
1253
+ 0:32:16.156 --> 0:32:21.485
1254
+ And sometimes you're lucky and you don't need
1255
+ to do it because you just have the same ambiguity
1256
+
1257
+ 0:32:21.485 --> 0:32:23.651
1258
+ in the source and the target language.
1259
+
1260
+ 0:32:23.651 --> 0:32:26.815
1261
+ And then it doesn't matter if you think about
1262
+ the mouse.
1263
+
1264
+ 0:32:26.815 --> 0:32:31.812
1265
+ As I said, you don't really need to know if
1266
+ it's a computer mouse or the living mouse you
1267
+
1268
+ 0:32:31.812 --> 0:32:36.031
1269
+ translate from German to English because it
1270
+ has exactly the same ambiguity.
1271
+
1272
+ 0:32:40.400 --> 0:32:46.764
1273
+ There's also relations between words like
1274
+ synonyms, antonyms, hyponyms, like the is-a
1275
+ relation and the part-of relation, like door and house.
1276
+ Big and small are antonyms, and a synonym is a word
1277
+ which means something similar.
1278
+
1279
+ 0:32:50.019 --> 0:32:55.569
1280
+ Big small is an antonym and synonym is like
1281
+ which needs something similar.
1282
+
1283
+ 0:32:56.396 --> 0:33:03.252
1284
+ There are resources which try to express all
1285
+ this linguistic information, like WordNet
1286
+ or GermaNet, where you have a graph with words
1287
+ 0:33:03.252 --> 0:33:10.107
1288
+ or German net where you have a graph with words
1289
+ and how they are related to each other.
1290
+
1291
+ 0:33:11.131 --> 0:33:12.602
1292
+ Which can be helpful.
1293
+
1294
+ 0:33:12.602 --> 0:33:18.690
1295
+ Typically these things were more used in tasks
1296
+ where there is fewer data, so there's a lot
1297
+
1298
+ 0:33:18.690 --> 0:33:24.510
1299
+ of tasks in NLP where you have very limited
1300
+ data because you really need to hand align
1301
+
1302
+ 0:33:24.510 --> 0:33:24.911
1303
+ that.
1304
+
1305
+ 0:33:25.125 --> 0:33:28.024
1306
+ Machine translation has a big advantage.
1307
+
1308
+ 0:33:28.024 --> 0:33:31.842
1309
+ There's naturally a lot of text translated
1310
+ out there.
1311
+
1312
+ 0:33:32.212 --> 0:33:39.519
1313
+ Typically in machine translation we have compared
1314
+ to other tasks significantly amount of data.
1315
+
1316
+ 0:33:39.519 --> 0:33:46.212
1317
+ People have looked into integrating wordnet
1318
+ or things like that, but it is rarely used
1319
+
1320
+ 0:33:46.212 --> 0:33:49.366
1321
+ in like commercial systems or something.
1322
+
1323
+ 0:33:52.692 --> 0:33:55.626
1324
+ So this was based on the words.
1325
+
1326
+ 0:33:55.626 --> 0:34:03.877
1327
+ We have morphology, syntax, and semantics,
1328
+ and then of course it makes sense to also look
1329
+
1330
+ 0:34:03.877 --> 0:34:06.169
1331
+ at the bigger structure.
1332
+
1333
+ 0:34:06.169 --> 0:34:08.920
1334
+ That means information about.
1335
+
1336
+ 0:34:08.948 --> 0:34:17.822
1337
+ Of course, we don't have a really morphology
1338
+ there because morphology about the structure
1339
+
1340
+ 0:34:17.822 --> 0:34:26.104
1341
+ of words, but we have syntax on the sentence
1342
+ level and the semantic representation.
1343
+
1344
+ 0:34:28.548 --> 0:34:35.637
1345
+ When we are thinking about the sentence structure,
1346
+ then the sentence is, of course, first a sequence
1347
+
1348
+ 0:34:35.637 --> 0:34:37.742
1349
+ of words terminated by a dot.
1350
+
1351
+ 0:34:37.742 --> 0:34:42.515
1352
+ Jane bought the house and we can say something
1353
+ about the structure.
1354
+
1355
+ 0:34:42.515 --> 0:34:47.077
1356
+ It's typically its subject work and then one
1357
+ or several objects.
1358
+
1359
+ 0:34:47.367 --> 0:34:51.996
1360
+ And the number of objects, for example, is
1361
+ then determined by the word.
1362
+
1363
+ 0:34:52.232 --> 0:34:54.317
1364
+ It's Called the Valency.
1365
+
1366
+ 0:34:54.354 --> 0:35:01.410
1367
+ So you have intransitive verbs which don't
1368
+ get any object, it's just to sleep.
1369
+
1370
+ 0:35:02.622 --> 0:35:05.912
1371
+ For example, there is no object sleep beds.
1372
+
1373
+ 0:35:05.912 --> 0:35:14.857
1374
+ You cannot say that: And there are transitive
1375
+ verbs where you have to put one or more objects,
1376
+
1377
+ 0:35:14.857 --> 0:35:16.221
1378
+ and you always.
1379
+
1380
+ 0:35:16.636 --> 0:35:19.248
1381
+ Sentence is not correct if you don't put the
1382
+ object.
1383
+
1384
+ 0:35:19.599 --> 0:35:33.909
1385
+ So if you have to buy something you have to
1386
+ say bought this or give someone something then.
1387
+
1388
+ 0:35:34.194 --> 0:35:40.683
1389
+ Here you see a bit that may be interesting
1390
+ the relation between word order and morphology.
1391
+
1392
+ 0:35:40.683 --> 0:35:47.243
1393
+ Of course it's not that strong, but for example
1394
+ in English you always have to first say who
1395
+
1396
+ 0:35:47.243 --> 0:35:49.453
1397
+ you gave it and what you gave.
1398
+
1399
+ 0:35:49.453 --> 0:35:53.304
1400
+ So the structure is very clear and cannot
1401
+ be changed.
1402
+
1403
+ 0:35:54.154 --> 0:36:00.801
1404
+ German, for example, has a possibility of
1405
+ determining what you gave and whom you gave
1406
+
1407
+ 0:36:00.801 --> 0:36:07.913
1408
+ it because there is a morphology and you can
1409
+ do what you gave a different form than to whom
1410
+
1411
+ 0:36:07.913 --> 0:36:08.685
1412
+ you gave.
1413
+
1414
+ 0:36:11.691 --> 0:36:18.477
1415
+ And that is a general tendency that if you
1416
+ have morphology then typically the word order
1417
+
1418
+ 0:36:18.477 --> 0:36:25.262
1419
+ is more free and possible, while in English
1420
+ you cannot express these information through
1421
+
1422
+ 0:36:25.262 --> 0:36:26.482
1423
+ the morphology.
1424
+
1425
+ 0:36:26.706 --> 0:36:30.238
1426
+ You typically have to express them through
1427
+ the word order.
1428
+
1429
+ 0:36:30.238 --> 0:36:32.872
1430
+ It's not as free, but it's more restricted.
1431
+
1432
+ 0:36:35.015 --> 0:36:40.060
1433
+ Yeah, the first part is typically the noun
1434
+ phrase, the subject, and that can not only
1435
+
1436
+ 0:36:40.060 --> 0:36:43.521
1437
+ be a single noun, but of course it can be a
1438
+ longer phrase.
1439
+
1440
+ 0:36:43.521 --> 0:36:48.860
1441
+ So if you have Jane the woman, it can be Jane,
1442
+ it can be the woman, it can a woman, it can
1443
+
1444
+ 0:36:48.860 --> 0:36:52.791
1445
+ be the young woman or the young woman who lives
1446
+ across the street.
1447
+
1448
+ 0:36:53.073 --> 0:36:56.890
1449
+ All of these are the subjects, so this can
1450
+ be already very, very long.
1451
+
1452
+ 0:36:57.257 --> 0:36:58.921
1453
+ And they also put this.
1454
+
1455
+ 0:36:58.921 --> 0:37:05.092
1456
+ The verb is on the second position in a bit
1457
+ more complicated way because if you have now
1458
+
1459
+ 0:37:05.092 --> 0:37:11.262
1460
+ the young woman who lives across the street
1461
+ runs to somewhere or so then yeah runs is at
1462
+
1463
+ 0:37:11.262 --> 0:37:16.185
1464
+ the second position in this tree but the first
1465
+ position is quite long.
1466
+
1467
+ 0:37:16.476 --> 0:37:19.277
1468
+ And so it's not just counting okay.
1469
+
1470
+ 0:37:19.277 --> 0:37:22.700
1471
+ The second word is always is always a word.
1472
+
1473
+ 0:37:26.306 --> 0:37:32.681
1474
+ Additional to these simple things, there's
1475
+ more complex stuff.
1476
+
1477
+ 0:37:32.681 --> 0:37:43.104
1478
+ Jane bought the house from Jim without hesitation,
1479
+ or Jane bought the house in the pushed neighborhood
1480
+
1481
+ 0:37:43.104 --> 0:37:44.925
1482
+ across the river.
1483
+
1484
+ 0:37:45.145 --> 0:37:51.694
1485
+ And these often lead to additional ambiguities
1486
+ because it's not always completely clear to
1487
+
1488
+ 0:37:51.694 --> 0:37:53.565
1489
+ which this prepositional.
1490
+
1491
+ 0:37:54.054 --> 0:37:59.076
1492
+ So that we'll see and you have, of course,
1493
+ subclasses and so on.
1494
+
1495
+ 0:38:01.061 --> 0:38:09.926
1496
+ And then there is a theory behind it which
1497
+ was very important for rule based machine translation
1498
+
1499
+ 0:38:09.926 --> 0:38:14.314
1500
+ because that's exactly what you're doing there.
1501
+
1502
+ 0:38:14.314 --> 0:38:18.609
1503
+ You would take the sentence, do the syntactic.
1504
+
1505
+ 0:38:18.979 --> 0:38:28.432
1506
+ So that we can have this constituents which
1507
+ like describe the basic parts of the language.
1508
+
1509
+ 0:38:28.468 --> 0:38:35.268
1510
+ And we can create the sentence structure as
1511
+ a context free grammar, which you hopefully
1512
+
1513
+ 0:38:35.268 --> 0:38:42.223
1514
+ remember from basic computer science, which
1515
+ is a pair of non terminals, terminal symbols,
1516
+
1517
+ 0:38:42.223 --> 0:38:44.001
1518
+ production rules, and.
1519
+
1520
+ 0:38:43.943 --> 0:38:50.218
1521
+ And the star symbol, and you can then describe
1522
+ a sentence by this phrase structure grammar:
1523
+
1524
+ 0:38:51.751 --> 0:38:59.628
1525
+ So a simple example would be something like
1526
+ that: you have a lexicon, Jane is a noun, Frays
1527
+
1528
+ 0:38:59.628 --> 0:39:02.367
1529
+ is a noun, Telescope is a noun.
1530
+
1531
+ 0:39:02.782 --> 0:39:10.318
1532
+ And then you have these production rules sentences:
1533
+ a noun phrase in the web phrase.
1534
+
1535
+ 0:39:10.318 --> 0:39:18.918
1536
+ The noun phrase can either be a determinized
1537
+ noun or it can be a noun phrase and a propositional
1538
+
1539
+ 0:39:18.918 --> 0:39:19.628
1540
+ phrase.
1541
+
1542
+ 0:39:19.919 --> 0:39:25.569
1543
+ Or a prepositional phrase, and a prepositional
1544
+ phrase is a preposition and a noun phrase.
1545
+
1546
+ 0:39:26.426 --> 0:39:27.622
1547
+ We're looking at this.
1548
+
1549
+ 0:39:27.622 --> 0:39:30.482
1550
+ What is the valency of the word we're describing
1551
+ here?
1552
+
1553
+ 0:39:33.513 --> 0:39:36.330
1554
+ How many objects would in this case the world
1555
+ have?
1556
+
1557
+ 0:39:46.706 --> 0:39:48.810
1558
+ We're looking at the verb phrase.
1559
+ The verb phrase is a verb and a noun phrase,
1560
+ so one object here, so this would be a
1561
+ valency of one.
1562
+ so one object here, so this would be for a
1563
+
1564
+ 0:39:54.358 --> 0:39:55.378
1565
+ balance of one.
1566
+
1567
+ 0:39:55.378 --> 0:40:00.925
1568
+ If you have intransitive verbs, it would be
1569
+ verb phrases, just a word, and if you have
1570
+
1571
+ 0:40:00.925 --> 0:40:03.667
1572
+ two, it would be noun phrase, noun phrase.
1573
+
1574
+ 0:40:08.088 --> 0:40:15.348
1575
+ And yeah, then the, the, the challenge or
1576
+ what you have to do is like this: Given a natural
1577
+
1578
+ 0:40:15.348 --> 0:40:23.657
1579
+ language sentence, you want to parse it to
1580
+ get this type of parse tree, like from programming languages
1581
+
1582
+ 0:40:23.657 --> 0:40:30.198
1583
+ where you also need to parse the code in order
1584
+ to get the representation.
1585
+
1586
+ 0:40:30.330 --> 0:40:39.356
1587
+ However, there is one challenge if you parse
1588
+ natural language compared to computer language.
1589
+
1590
+ 0:40:43.823 --> 0:40:56.209
1591
+ So there are different ways of how you can
1592
+ express things and there are different parse trees
1593
+
1594
+ 0:40:56.209 --> 0:41:00.156
1595
+ belonging to the same input.
1596
+
1597
+ 0:41:00.740 --> 0:41:05.241
1598
+ So if you have 'Jane buys a house', that's
1599
+ an easy example.
1600
+
1601
+ 0:41:05.241 --> 0:41:07.491
1602
+ So you do the lexicon look up.
1603
+
1604
+ 0:41:07.491 --> 0:41:13.806
1605
+ Jane can be a noun phrase, a bias is a verb,
1606
+ a is a determiner, and a house is a noun.
1607
+
1608
+ 0:41:15.215 --> 0:41:18.098
1609
+ And then you can now use the grammar rules
1610
+ of here.
1611
+
1612
+ 0:41:18.098 --> 0:41:19.594
1613
+ There is no rule for that.
1614
+
1615
+ 0:41:20.080 --> 0:41:23.564
1616
+ Here we have no rules, but here we have a
1617
+ rule.
1618
+
1619
+ 0:41:23.564 --> 0:41:27.920
1620
+ A noun is a noun phrase, so we have mapped
1621
+ that to a noun phrase.
1622
+
1623
+ 0:41:28.268 --> 0:41:34.012
1624
+ Then we can map this to the verb phrase.
1625
+
1626
+ 0:41:34.012 --> 0:41:47.510
1627
+ We have a verb plus noun phrase to verb phrase, and
1628
+ then we can map this to a sentence representation.
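The same bottom-up derivation can be written down with a small context-free grammar; the sketch below uses NLTK's toy CFG tools, which is an assumption made for illustration, not what the lecture used:

```python
import nltk

# The lexicon and production rules described above, as a context-free grammar.
grammar = nltk.CFG.fromstring("""
S   -> NP VP
NP  -> DET N | 'Jane'
VP  -> V NP
DET -> 'a'
N   -> 'house'
V   -> 'buys'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse("Jane buys a house".split()):
    print(tree)
# (S (NP Jane) (VP (V buys) (NP (DET a) (N house))))
```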
1629
+
1630
+ 0:41:49.069 --> 0:41:53.042
1631
+ We can have that even more complex.
1632
+
1633
+ 0:41:53.042 --> 0:42:01.431
1634
+ The woman who won the lottery yesterday bought
1635
+ the house across the street.
1636
+
1637
+ 0:42:01.431 --> 0:42:05.515
1638
+ The structure gets more complicated.
1639
+
1640
+ 0:42:05.685 --> 0:42:12.103
1641
+ You now see that the word phrase is at the
1642
+ second position, but the noun phrase is quite.
1643
+
1644
+ 0:42:12.052 --> 0:42:18.655
1645
+ Quite big in here and the p p phrases, it's
1646
+ sometimes difficult where to put them because
1647
+
1648
+ 0:42:18.655 --> 0:42:25.038
1649
+ they can be put to the noun phrase, but in
1650
+ other sentences they can also be put to the
1651
+
1652
+ 0:42:25.038 --> 0:42:25.919
1653
+ web phrase.
1654
+
1655
+ 0:42:36.496 --> 0:42:38.250
1656
+ Yeah.
1657
+
1658
+ 0:42:43.883 --> 0:42:50.321
1659
+ Yes, so then either it can have two tags,
1660
+ noun or noun phrase, or you can have the extra
1661
+
1662
+ 0:42:50.321 --> 0:42:50.755
1663
+ rule.
1664
+
1665
+ 0:42:50.755 --> 0:42:57.409
1666
+ The noun phrase can not only be a determiner
1667
+ in the noun, but it can also be a noun phrase.
1668
+
1669
+ 0:42:57.717 --> 0:43:04.360
1670
+ Then of course either you introduce additional
1671
+ rules when what is possible or the problem
1672
+
1673
+ 0:43:04.360 --> 0:43:11.446
1674
+ that you produce parses which are not correct,
1675
+ and then you have to add some type of probability
1676
+
1677
+ 0:43:11.446 --> 0:43:13.587
1678
+ which type is more probable.
1679
+
1680
+ 0:43:16.876 --> 0:43:23.280
1681
+ But of course some things also can't really
1682
+ model easily with this type of cheese.
1683
+
1684
+ 0:43:23.923 --> 0:43:32.095
1685
+ There, for example, the agreement is not straightforward
1686
+ to do so that in subject and work you can check
1687
+
1688
+ 0:43:32.095 --> 0:43:38.866
1689
+ that the person, the agreement, the number
1690
+ in person, the number agreement is correct,
1691
+
1692
+ 0:43:38.866 --> 0:43:41.279
1693
+ but if it's a singular object.
1694
+
1695
+ 0:43:41.561 --> 0:43:44.191
1696
+ A singular verb, it's also a singular.
1697
+
1698
+ 0:43:44.604 --> 0:43:49.242
1699
+ Non-subject, and if it's a plural subject,
1700
+ it's a plural work.
1701
+
1702
+ 0:43:49.489 --> 0:43:56.519
1703
+ Things like that are yeah, the agreement in
1704
+ determining action driven now, so they also
1705
+
1706
+ 0:43:56.519 --> 0:43:57.717
1707
+ have to agree.
1708
+
1709
+ 0:43:57.877 --> 0:44:05.549
1710
+ Things like that cannot be easily done with
1711
+ this type of grammar or this subcategorization
1712
+
1713
+ 0:44:05.549 --> 0:44:13.221
1714
+ that you check whether the verb is transitive
1715
+ or intransitive, and that Jane sleeps is OK,
1716
+
1717
+ 0:44:13.221 --> 0:44:16.340
1718
+ but Jane sleeps the house is not OK.
1719
+
1720
+ 0:44:16.436 --> 0:44:21.073
1721
+ And Jane Walterhouse is okay, but Jane Walterhouse
1722
+ is not okay.
1723
+
1724
+ 0:44:23.183 --> 0:44:29.285
1725
+ Furthermore, this long range dependency might
1726
+ be difficult and which word orders are allowed
1727
+
1728
+ 0:44:29.285 --> 0:44:31.056
1729
+ and which are not allowed.
1730
+
1731
+ 0:44:31.571 --> 0:44:40.011
1732
+ This is also not direct, so you can say 'Maria
1733
+ gibt dem Mann das Buch', 'Dem Mann gibt Maria das
1734
+ Buch', 'Das Buch gibt Maria dem Mann', aber 'Maria
1735
+ dem Mann gibt das Buch' is somewhat odd.
1736
+ bourg, das bourg give Maria, de man aber Maria,
1737
+ de man give des bourg is some.
1738
+
1739
+ 0:44:47.227 --> 0:44:55.191
1740
+ One yeah, which one from this one is possible
1741
+ and not is sometimes not possible to model,
1742
+
1743
+ 0:44:55.191 --> 0:44:56.164
1744
+ is simple.
1745
+
1746
+ 0:44:56.876 --> 0:45:05.842
1747
+ Therefore, people have done more complex stuff
1748
+ like this unification grammar and tried to
1749
+
1750
+ 0:45:05.842 --> 0:45:09.328
1751
+ model both the categories of verb.
1752
+
1753
+ 0:45:09.529 --> 0:45:13.367
1754
+ The agreement has to be that it's person and
1755
+ single.
1756
+
1757
+ 0:45:13.367 --> 0:45:20.028
1758
+ You're joining that so you're annotating this
1759
+ thing with more information and then you have
1760
+
1761
+ 0:45:20.028 --> 0:45:25.097
1762
+ more complex synthetic structures in order
1763
+ to model also these types.
1764
+
1765
+ 0:45:28.948 --> 0:45:33.137
1766
+ Yeah, why is this difficult?
1767
+
1768
+ 0:45:33.873 --> 0:45:39.783
1769
+ We have different ambiguities and that makes
1770
+ it different, so words have different part
1771
+
1772
+ 0:45:39.783 --> 0:45:43.610
1773
+ of speech tags, and if you have 'time flies like
1774
+ an arrow'.
1775
+
1776
+ 0:45:43.583 --> 0:45:53.554
1777
+ It can mean that sometimes the animal L look
1778
+ like an arrow and or it can mean that the time
1779
+
1780
+ 0:45:53.554 --> 0:45:59.948
1781
+ is flying very fast is going away very fast
1782
+ like an arrow.
1783
+
1784
+ 0:46:00.220 --> 0:46:10.473
1785
+ And if you want to do a pastry, these two
1786
+ meanings have a different part of speech text,
1787
+
1788
+ 0:46:10.473 --> 0:46:13.008
1789
+ so flies is the verb.
1790
+
1791
+ 0:46:13.373 --> 0:46:17.999
1792
+ And of course that is a different semantic,
1793
+ and so that is very different.
1794
+
1795
+ 0:46:19.499 --> 0:46:23.361
1796
+ And otherwise a structural.
1797
+
1798
+ 0:46:23.243 --> 0:46:32.419
1799
+ Ambiguity so that like some part of the sentence
1800
+ can have different rules, so the famous thing
1801
+
1802
+ 0:46:32.419 --> 0:46:34.350
1803
+ is this attachment.
1804
+
1805
+ 0:46:34.514 --> 0:46:39.724
1806
+ So the cop saw the burglar with the binoculars.
1807
+
1808
+ 0:46:39.724 --> 0:46:48.038
1809
+ Then with a binocular can be attached to saw
1810
+ or it can be attached to the burglar.
1811
+
1812
+ 0:46:48.448 --> 0:46:59.897
1813
+ And so in the first two it's more probable
1814
+ that he saw the thief, and not that the thief
1815
+ had the binoculars.
1816
+ 0:46:59.897 --> 0:47:01.570
1817
+ has the one.
1818
+
1819
+ 0:47:01.982 --> 0:47:13.356
1820
+ And this, of course, makes things difficult
1821
+ while parsing and doing structure implicitly
1822
+
1823
+ 0:47:13.356 --> 0:47:16.424
1824
+ defining the semantics.
1825
+
1826
+ 0:47:20.120 --> 0:47:29.736
1827
+ Therefore, we would then go directly to semantics,
1828
+ but maybe some questions about spintax and
1829
+
1830
+ 0:47:29.736 --> 0:47:31.373
1831
+ how that works.
1832
+
1833
+ 0:47:33.113 --> 0:47:46.647
1834
+ Then we'll do a bit more about semantics,
1835
+ so now we only describe the structure of the
1836
+
1837
+ 0:47:46.647 --> 0:47:48.203
1838
+ sentence.
1839
+
1840
+ 0:47:48.408 --> 0:47:55.584
1841
+ And for the meaning of the sentence we typically
1842
+ have the compositionality of meaning.
1843
+
1844
+ 0:47:55.584 --> 0:48:03.091
1845
+ The meaning of the full sentence is determined
1846
+ by the meaning of the individual words, and
1847
+
1848
+ 0:48:03.091 --> 0:48:06.308
1849
+ they together form the meaning of the.
1850
+
1851
+ 0:48:06.686 --> 0:48:17.936
1852
+ For words that is partly true but not always
1853
+ mean for things like rainbow, jointly rain
1854
+
1855
+ 0:48:17.936 --> 0:48:19.086
1856
+ and bow.
1857
+
1858
+ 0:48:19.319 --> 0:48:26.020
1859
+ But this is not always a case, while for sentences
1860
+ typically that is happening because you can't
1861
+
1862
+ 0:48:26.020 --> 0:48:30.579
1863
+ directly determine the full meaning, but you
1864
+ split it into parts.
1865
+
1866
+ 0:48:30.590 --> 0:48:36.164
1867
+ Sometimes only in some parts like kick the
1868
+ bucket the expression.
1869
+
1870
+ 0:48:36.164 --> 0:48:43.596
1871
+ Of course you cannot get the meaning of kick
1872
+ the bucket by looking at the individual or
1873
+
1874
+ 0:48:43.596 --> 0:48:46.130
1875
+ in German abyss in its grass.
1876
+
1877
+ 0:48:47.207 --> 0:48:53.763
1878
+ You cannot get that he died by looking at
1879
+ the individual words of Bis ins grass, but
1880
+
1881
+ 0:48:53.763 --> 0:48:54.611
1882
+ they have.
1883
+
1884
+ 0:48:55.195 --> 0:49:10.264
1885
+ And there are different ways of describing
1886
+ that some people have tried that more commonly
1887
+
1888
+ 0:49:10.264 --> 0:49:13.781
1889
+ used for some tasks.
1890
+
1891
+ 0:49:14.654 --> 0:49:20.073
1892
+ Will come to so the first thing would be something
1893
+ like first order logic.
1894
+
1895
+ 0:49:20.073 --> 0:49:27.297
1896
+ If you have Peter loves Jane then you have
1897
+ this meaning and you're having the end of representation
1898
+
1899
+ 0:49:27.297 --> 0:49:33.005
1900
+ that you have a love property between Peter
1901
+ and Jane and you try to construct.
1902
+
1903
+ 0:49:32.953 --> 0:49:40.606
1904
+ That you will see this a lot more complex
1905
+ than only doing syntax, when you are also
1906
+
1907
+ 0:49:40.606 --> 0:49:43.650
1908
+ doing this type of representation.
1909
+
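As a rough illustration of this idea (a toy sketch in Python, not the exact formalism used in the lecture): the sentence "Peter loves Jane" can be mapped to a predicate-argument structure like loves(Peter, Jane).

```python
# Illustrative sketch only (assumed toy representation, not the lecture's formalism):
# map a simple subject-verb-object sentence to a predicate-argument structure,
# roughly in the spirit of first-order logic.
from collections import namedtuple

Predicate = namedtuple("Predicate", ["name", "args"])

def simple_svo_semantics(sentence: str) -> Predicate:
    """Toy semantic construction: assumes the sentence is exactly 'Subject Verb Object'."""
    subject, verb, obj = sentence.split()
    return Predicate(name=verb.lower(), args=(subject, obj))

print(simple_svo_semantics("Peter loves Jane"))
# Predicate(name='loves', args=('Peter', 'Jane'))   i.e. loves(Peter, Jane)
```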
1910
+ 0:49:44.164 --> 0:49:47.761
1911
+ The other thing is to try to do frame semantics.
1912
+
1913
+ 0:49:47.867 --> 0:49:55.094
1914
+ That means that you try to represent the knowledge
1915
+ about the world and you have these ah frames.
1916
+
1917
+ 0:49:55.094 --> 0:49:58.372
1918
+ For example, you might have a frame to buy.
1919
+
1920
+ 0:49:58.418 --> 0:50:05.030
1921
+ And the meaning is that you have a commercial
1922
+ transaction.
1923
+
1924
+ 0:50:05.030 --> 0:50:08.840
1925
+ You have a person who is selling.
1926
+
1927
+ 0:50:08.969 --> 0:50:10.725
1928
+ You Have a Person Who's Buying.
1929
+
1930
+ 0:50:11.411 --> 0:50:16.123
1931
+ You have something that is priced, you might
1932
+ have a price, and so on.
1933
+
1934
+ 0:50:17.237 --> 0:50:22.698
1935
+ And then what you are doing in semantic parsing
1936
+ with frame semantics you first try to determine.
1937
+
1938
+ 0:50:22.902 --> 0:50:30.494
1939
+ Which frames are happening in the sentence,
1940
+ so if it's something with buying you
1941
+
1942
+ 0:50:30.494 --> 0:50:33.025
1943
+ would try to first identify.
1944
+
1945
+ 0:50:33.025 --> 0:50:40.704
1946
+ Oh, here we have the frame 'buy', which does
1947
+ not always have to be indicated by the verb
1948
+
1949
+ 0:50:40.704 --> 0:50:42.449
1950
+ 'sell' or other ways.
1951
+
1952
+ 0:50:42.582 --> 0:50:52.515
1953
+ And then you try to find out which elements
1954
+ of this frame are in the sentence and try
1955
+
1956
+ 0:50:52.515 --> 0:50:54.228
1957
+ to align them.
1958
+
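A minimal sketch of what such a frame and its elements could look like in code; the frame and role names below are assumptions loosely inspired by FrameNet-style commerce frames, not the lecture's exact inventory.

```python
# Minimal sketch, assuming a FrameNet-style frame; names are illustrative only.
commerce_buy_frame = {
    "name": "Commerce_buy",
    "roles": ["Buyer", "Seller", "Goods", "Money"],
    # words that can evoke the frame (not only the verb 'buy')
    "trigger_words": {"buy", "buys", "bought", "sell", "sells", "sold", "purchase"},
}

def evoked_frames(tokens, frames):
    """Return the frames whose trigger words appear in the tokenized sentence."""
    return [f["name"] for f in frames if set(tokens) & f["trigger_words"]]

tokens = "Peter bought a book from Jane".lower().split()
print(evoked_frames(tokens, [commerce_buy_frame]))  # ['Commerce_buy']
```

After the frame is identified, the second step described above is to align sentence spans to the roles, e.g. Buyer = Peter, Seller = Jane, Goods = a book.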
1959
+ 0:50:56.856 --> 0:51:01.121
1960
+ Yeah, you have, for example, to buy and sell.
1961
+
1962
+ 0:51:01.121 --> 0:51:07.239
1963
+ If you have a model that has frames, they
1964
+ have the same elements.
1965
+
1966
+ 0:51:09.829 --> 0:51:15.018
1967
+ In addition, you then also have
1968
+ phenomena beyond the sentence level.
1969
+
1970
+ 0:51:15.018 --> 0:51:20.088
1971
+ We're coming to this later because it's a
1972
+ special challenge for machine translation.
1973
+
1974
+ 0:51:20.088 --> 0:51:22.295
1975
+ There is, for example, coreference.
1976
+
1977
+ 0:51:22.295 --> 0:51:27.186
1978
+ That means if you first mention it, it's like
1979
+ the President of the United States.
1980
+
1981
+ 0:51:27.467 --> 0:51:30.107
1982
+ And later you would refer to him maybe as
1983
+ he.
1984
+
1985
+ 0:51:30.510 --> 0:51:36.966
1986
+ And that is especially challenging in machine
1987
+ translation because you're not always using
1988
+
1989
+ 0:51:36.966 --> 0:51:38.114
1990
+ the same thing.
1991
+
1992
+ 0:51:38.114 --> 0:51:44.355
1993
+ Of course, for the president, it's he and
1994
+ 'er' in German, but for other things it might
1995
+
1996
+ 0:51:44.355 --> 0:51:49.521
1997
+ be different depending on the gender in languages
1998
+ that you refer to it.
1999
+
2000
+ 0:51:55.435 --> 0:52:03.866
2001
+ So much for the background and the next, we
2002
+ want to look based on the knowledge we have
2003
+
2004
+ 0:52:03.866 --> 0:52:04.345
2005
+ now.
2006
+
2007
+ 0:52:04.345 --> 0:52:10.285
2008
+ Why is machine translation difficult before
2009
+ we have any more?
2010
+
2011
+ 0:52:16.316 --> 0:52:22.471
2012
+ The first type of problem is what we refer
2013
+ to as translation divergences.
2014
+
2015
+ 0:52:22.471 --> 0:52:30.588
2016
+ That means that we have the same information
2017
+ in source and target, but the problem is that
2018
+
2019
+ 0:52:30.588 --> 0:52:33.442
2020
+ they are expressed differently.
2021
+
2022
+ 0:52:33.713 --> 0:52:42.222
2023
+ So it is not expressed the same way, and we cannot
2024
+ translate these things as easily; we have to do
2025
+
2026
+ 0:52:42.222 --> 0:52:44.924
2027
+ something a bit more complex.
2028
+
2029
+ 0:52:45.325 --> 0:52:51.324
2030
+ So an example is, if it's only about structure: in
2031
+ English, 'the delicious soup'.
2032
+
2033
+ 0:52:51.324 --> 0:52:59.141
2034
+ The adjective is before the noun, while in
2035
+ Spanish you have to put it after the noun,
2036
+
2037
+ 0:52:59.141 --> 0:53:02.413
2038
+ and so you have to change the word order.
2039
+
2040
+ 0:53:02.983 --> 0:53:10.281
2041
+ So there are different ways of divergence,
2042
+ so there can be structural divergence, which
2043
+
2044
+ 0:53:10.281 --> 0:53:10.613
2045
+ is.
2046
+
2047
+ 0:53:10.550 --> 0:53:16.121
2048
+ The word order, so that the order is different,
2049
+ so in German we have that especially in the
2050
+
2051
+ 0:53:16.121 --> 0:53:19.451
2052
+ subclause, while in English in the
2053
+ subclause
2054
+
2055
+ 0:53:19.451 --> 0:53:24.718
2056
+ The verb is also at the second position, in
2057
+ German it's at the end, and so you have to
2058
+
2059
+ 0:53:24.718 --> 0:53:25.506
2060
+ move it all
2061
+
2062
+ 0:53:25.465 --> 0:53:27.222
2063
+ over.
2064
+
2065
+ 0:53:27.487 --> 0:53:32.978
2066
+ It can be that it's a completely different
2067
+ grammatical role.
2068
+
2069
+ 0:53:33.253 --> 0:53:35.080
2070
+ So,.
2071
+
2072
+ 0:53:35.595 --> 0:53:37.458
2073
+ You have 'you like her'.
2074
+
2075
+ 0:53:38.238 --> 0:53:41.472
2076
+ And eh in in.
2077
+
2078
+ 0:53:41.261 --> 0:53:47.708
2079
+ English. In Spanish it's 'ella te gusta', which
2080
+ means 'she pleases you', so now she is no longer the object
2081
+
2082
+ 0:53:47.708 --> 0:53:54.509
2083
+ but she is the subject here, and you are now in the dative,
2084
+ and then 'pleases' rather than 'like', so you really
2085
+
2086
+ 0:53:54.509 --> 0:53:58.689
2087
+ use a different sentence structure and you
2088
+ have to change.
2089
+
2090
+ 0:53:59.139 --> 0:54:03.624
2091
+ Can also be the head switch.
2092
+
2093
+ 0:54:03.624 --> 0:54:09.501
2094
+ In English you say the baby just ate.
2095
+
2096
+ 0:54:09.501 --> 0:54:16.771
2097
+ In Spanish you literally say the baby finishes eating.
2098
+
2099
+ 0:54:16.997 --> 0:54:20.803
2100
+ So the eating is no longer the verb, but the finishing
2101
+ is the verb.
2102
+
2103
+ 0:54:21.241 --> 0:54:30.859
2104
+ So you have to learn so you cannot always
2105
+ have the same structures in your input and
2106
+
2107
+ 0:54:30.859 --> 0:54:31.764
2108
+ output.
2109
+
2110
+ 0:54:36.856 --> 0:54:42.318
2111
+ Lexical things like to swim across or to cross
2112
+ swimming.
2113
+
2114
+ 0:54:43.243 --> 0:54:57.397
2115
+ You have categorical like an adjective gets
2116
+ into a noun, so you have a little bread to
2117
+
2118
+ 0:54:57.397 --> 0:55:00.162
2119
+ make a decision.
2120
+
2121
+ 0:55:00.480 --> 0:55:15.427
2122
+ That is the one challenge and the even bigger
2123
+ challenge is referred to as translation mismatches.
2124
+
2125
+ 0:55:17.017 --> 0:55:19.301
2126
+ That can be their lexical mismatch.
2127
+
2128
+ 0:55:19.301 --> 0:55:21.395
2129
+ That's the fish we talked about.
2130
+
2131
+ 0:55:21.395 --> 0:55:27.169
2132
+ If it's like the, the fish you eat or the
2133
+ fish which is living; these are two different words
2134
+
2135
+ 0:55:27.169 --> 0:55:27.931
2136
+ in Spanish.
2137
+
2138
+ 0:55:28.108 --> 0:55:34.334
2139
+ And then that's partly sometimes even not
2140
+ known, so even the human might not be able
2141
+
2142
+ 0:55:34.334 --> 0:55:34.627
2143
+ to.
2144
+
2145
+ 0:55:34.774 --> 0:55:40.242
2146
+ Infer that you maybe need to see the context
2147
+ you maybe need to have the sentences around,
2148
+
2149
+ 0:55:40.242 --> 0:55:45.770
2150
+ so one problem is that at least traditional
2151
+ machine translation works on a sentence level,
2152
+
2153
+ 0:55:45.770 --> 0:55:51.663
2154
+ so we take each sentence and translate it independent
2155
+ of everything else, but that's, of course,
2156
+
2157
+ 0:55:51.663 --> 0:55:52.453
2158
+ not correct.
2159
+
2160
+ 0:55:52.532 --> 0:55:59.901
2161
+ Will look into some ways of looking at and
2162
+ doing document-based machine translation, but.
2163
+
2164
+ 0:56:00.380 --> 0:56:06.793
2165
+ Then gender information might be a problem,
2166
+ so in English it's player and you don't know
2167
+
2168
+ 0:56:06.793 --> 0:56:10.139
2169
+ if it's Spieler Spielerin or if it's not known.
2170
+
2171
+ 0:56:10.330 --> 0:56:15.770
2172
+ But in the English, if you now generate German,
2173
+ you should know is the reader.
2174
+
2175
+ 0:56:15.770 --> 0:56:21.830
2176
+ Does he know the gender or does he not know
2177
+ the gender and then generate the right one?
2178
+
2179
+ 0:56:22.082 --> 0:56:38.333
2180
+ So just imagine a commentator if he's talking
2181
+ about the player and you can see if it's male
2182
+
2183
+ 0:56:38.333 --> 0:56:40.276
2184
+ or female.
2185
+
2186
+ 0:56:40.540 --> 0:56:47.801
2187
+ So in general the problem is that if you
2188
+ have less information and you need more information
2189
+
2190
+ 0:56:47.801 --> 0:56:51.928
2191
+ in your target, this translation doesn't really
2192
+ work.
2193
+
2194
+ 0:56:55.175 --> 0:56:59.180
2195
+ Another problem is we just talked about the
2196
+ the.
2197
+
2198
+ 0:56:59.119 --> 0:57:01.429
2199
+ The co reference.
2200
+
2201
+ 0:57:01.641 --> 0:57:08.818
2202
+ So if you refer to an object and that can
2203
+ be across sentence boundaries then you have
2204
+
2205
+ 0:57:08.818 --> 0:57:14.492
2206
+ to use the right pronoun and you cannot just
2207
+ translate the pronoun.
2208
+
2209
+ 0:57:14.492 --> 0:57:18.581
2210
+ If the baby does not thrive on raw milk boil
2211
+ it.
2212
+
2213
+ 0:57:19.079 --> 0:57:28.279
2214
+ And if you are now using it and just take
2215
+ the typical translation, it will be: And That
2216
+
2217
+ 0:57:28.279 --> 0:57:31.065
2218
+ Will Be Ah Wrong.
2219
+
2220
+ 0:57:31.291 --> 0:57:35.784
2221
+ No, that will be even right because it is
2222
+ 'das Baby'.
2223
+
2224
+ 0:57:35.784 --> 0:57:42.650
2225
+ Yes, but I mean, you have to determine that
2226
+ and it might be wrong at some point.
2227
+
2228
+ 0:57:42.650 --> 0:57:48.753
2229
+ So getting this this um yeah, it will be wrong
2230
+ yes, that is right yeah.
2231
+
2232
+ 0:57:48.908 --> 0:57:55.469
2233
+ Because in English both are baby and milk,
2234
+ and baby are both referred to it, so if you
2235
+
2236
+ 0:57:55.469 --> 0:58:02.180
2237
+ do S it will be to the first one referred to,
2238
+ so it's correct, but in German it will be
2239
+
2240
+ 0:58:02.180 --> 0:58:06.101
2241
+ 'es', and so if you translate it as 'es' it will
2242
+ be baby.
2243
+
2244
+ 0:58:06.546 --> 0:58:13.808
2245
+ But you have to use 'sie' because milk is feminine,
2246
+ although that is really very uncommon because
2247
+
2248
+ 0:58:13.808 --> 0:58:18.037
2249
+ maybe the milk is an object and so it should
2250
+ be more.
2251
+
2252
+ 0:58:18.358 --> 0:58:25.176
2253
+ Of course, I agree there might be a situation
2254
+ which is a bit created and not a common thing,
2255
+
2256
+ 0:58:25.176 --> 0:58:29.062
2257
+ but you can see that these things are not that
2258
+ easy.
2259
+
2260
+ 0:58:29.069 --> 0:58:31.779
2261
+ Another example is this: Dr.
2262
+
2263
+ 0:58:31.779 --> 0:58:37.855
2264
+ McLean often brings his dog Champion to visit
2265
+ with his patients.
2266
+
2267
+ 0:58:37.855 --> 0:58:41.594
2268
+ He loves to give big wet sloppy kisses.
2269
+
2270
+ 0:58:42.122 --> 0:58:58.371
2271
+ And there, of course, it's also important
2272
+ if he refers to the dog or to the doctor.
2273
+
2274
+ 0:58:59.779 --> 0:59:11.260
2275
+ Another example of challenging is that we
2276
+ don't have a fixed language and that was referred
2277
+
2278
+ 0:59:11.260 --> 0:59:16.501
2279
+ to morphology and we can build new words.
2280
+
2281
+ 0:59:16.496 --> 0:59:23.787
2282
+ So we can in all languages build new words
2283
+ by just concatenating parts, like 'Brexit',
2284
+
2285
+ 0:59:23.787 --> 0:59:30.570
2286
+ some things like: And then, of course, also
2287
+ words don't exist in languages, don't exist
2288
+
2289
+ 0:59:30.570 --> 0:59:31.578
2290
+ in isolations.
2291
+
2292
+ 0:59:32.012 --> 0:59:41.591
2293
+ In German you can now use the word download
2294
+ somewhere and you can also use a morphological
2295
+
2296
+ 0:59:41.591 --> 0:59:43.570
2297
+ operation on that.
2298
+
2299
+ 0:59:43.570 --> 0:59:48.152
2300
+ I guess there is even not the correct word.
2301
+
2302
+ 0:59:48.508 --> 0:59:55.575
2303
+ But so you have to deal with these things,
2304
+ and yeah, in social media.
2305
+
2306
+ 0:59:55.996 --> 1:00:00.215
2307
+ This word is maybe most of you have forgotten
2308
+ already.
2309
+
2310
+ 1:00:00.215 --> 1:00:02.517
2311
+ This was ten years ago or so.
2312
+
2313
+ 1:00:02.517 --> 1:00:08.885
2314
+ I don't know there was a volcano in Iceland
2315
+ which stopped Europeans flying around.
2316
+
2317
+ 1:00:09.929 --> 1:00:14.706
2318
+ So there is always new words coming up and
2319
+ you have to deal with.
2320
+
2321
+ 1:00:18.278 --> 1:00:24.041
2322
+ Yeah, one last thing, so some of these examples
2323
+ we have seen are a bit artificial.
2324
+
2325
+ 1:00:24.041 --> 1:00:30.429
2326
+ So one example what is very common with machine
2327
+ translation doesn't really work is this box
2328
+
2329
+ 1:00:30.429 --> 1:00:31.540
2330
+ was in the pen.
2331
+
2332
+ 1:00:32.192 --> 1:00:36.887
2333
+ And maybe you would be surprised, at least
2334
+ when read it.
2335
+
2336
+ 1:00:36.887 --> 1:00:39.441
2337
+ How can a box be inside a pen?
2338
+
2339
+ 1:00:40.320 --> 1:00:44.175
2340
+ Does anybody have a solution for that while
2341
+ the sentence is still correct?
2342
+
2343
+ 1:00:47.367 --> 1:00:51.692
2344
+ Maybe it's directly clear for you, maybe your
2345
+ English was aside, yeah.
2346
+
2347
+ 1:00:54.654 --> 1:01:07.377
2348
+ Yes, like at a farm or for small children,
2349
+ and that is also called a pen or a pen on a
2350
+
2351
+ 1:01:07.377 --> 1:01:08.254
2352
+ farm.
2353
+
2354
+ 1:01:08.368 --> 1:01:12.056
2355
+ And then this is, and so you can mean okay.
2356
+
2357
+ 1:01:12.056 --> 1:01:16.079
2358
+ To infer these two meanings is quite difficult.
2359
+
2360
+ 1:01:16.436 --> 1:01:23.620
2361
+ But at least when I saw it, I wasn't completely
2362
+ convinced because it's maybe not the sentence
2363
+
2364
+ 1:01:23.620 --> 1:01:29.505
2365
+ you're using in your daily life, and some of
2366
+ these constructions seem to be.
2367
+
2368
+ 1:01:29.509 --> 1:01:35.155
2369
+ They are very good in showing where the problem
2370
+ is, but the question is, does it really imply
2371
+
2372
+ 1:01:35.155 --> 1:01:35.995
2373
+ in real life?
2374
+
2375
+ 1:01:35.996 --> 1:01:42.349
2376
+ And therefore here some examples also that
2377
+ we had here with a lecture translator that
2378
+
2379
+ 1:01:42.349 --> 1:01:43.605
2380
+ really occurred.
2381
+
2382
+ 1:01:43.605 --> 1:01:49.663
2383
+ They maybe looked simple, but you will see
2384
+ that some of them still are happening.
2385
+
2386
+ 1:01:50.050 --> 1:01:53.948
2387
+ And they are partly about splitting words,
2388
+ and then they are happening.
2389
+
2390
+ 1:01:54.294 --> 1:01:56.816
2391
+ So Um.
2392
+
2393
+ 1:01:56.596 --> 1:02:03.087
2394
+ We had a text about the numeral system in
2395
+ German, the 'Zahlensystem', which got split
2396
+
2397
+ 1:02:03.087 --> 1:02:07.041
2398
+ into sub parts because otherwise we can't translate.
2399
+
2400
+ 1:02:07.367 --> 1:02:14.927
2401
+ And then he did only a proximate match and
2402
+ was talking about the binary payment system
2403
+
2404
+ 1:02:14.927 --> 1:02:23.270
2405
+ because the payment system was a lot more common
2406
+ in the training data than the 'Zahlensystem'.
2407
+
2408
+ 1:02:23.823 --> 1:02:29.900
2409
+ And so there you see like rare words, which
2410
+ don't occur that often.
2411
+
2412
+ 1:02:29.900 --> 1:02:38.211
2413
+ They are very challenging to deal with because
2414
+ we are good at inferring that sometimes, but
2415
+
2416
+ 1:02:38.211 --> 1:02:41.250
2417
+ for others that's very difficult.
2418
+
2419
+ 1:02:44.344 --> 1:02:49.605
2420
+ Another challenge is that, of course, the
2421
+ context is very difficult.
2422
+
2423
+ 1:02:50.010 --> 1:02:56.448
2424
+ This is also an example a bit older from also
2425
+ the lecture translators we were translating
2426
+
2427
+ 1:02:56.448 --> 1:03:01.813
2428
+ in a math lecture, and he was always talking
2429
+ about the omens of the numbers.
2430
+
2431
+ 1:03:02.322 --> 1:03:11.063
2432
+ Which doesn't make any sense at all, but the
2433
+ German word 'Vorzeichen' can of course mean the
2434
+
2435
+ 1:03:11.063 --> 1:03:12.408
2436
+ sign and the omen.
2437
+
2438
+ 1:03:12.732 --> 1:03:22.703
2439
+ And if you do not have the right domain knowledge
2440
+ encoded in there, it might not use the right domain
2441
+
2442
+ 1:03:22.703 --> 1:03:23.869
2443
+ knowledge.
2444
+
2445
+ 1:03:25.705 --> 1:03:31.205
2446
+ A more recent version of that is like here
2447
+ from a paper where it's about translating.
2448
+
2449
+ 1:03:31.205 --> 1:03:36.833
2450
+ We had this pivot based translation where
2451
+ you translate maybe to English and then to another language
2452
+
2453
+ 1:03:36.833 --> 1:03:39.583
2454
+ because you have not enough training data.
2455
+
2456
+ 1:03:40.880 --> 1:03:48.051
2457
+ And we did that from Dutch to German; I guess
2458
+ you can get it even if you don't understand Dutch, if you speak
2459
+
2460
+ 1:03:48.051 --> 1:03:48.710
2461
+ German.
2462
+
2463
+ 1:03:48.908 --> 1:03:56.939
2464
+ So we have this Dutch 'voorbeeld geven', which means
2465
+ to give an example.
2466
+
2467
+ 1:03:56.939 --> 1:04:05.417
2468
+ It's correctly translated as 'setting an example'. However,
2469
+ if we then translate to German, he didn't
2470
+
2471
+ 1:04:05.417 --> 1:04:11.524
2472
+ get the full context, and in German you normally
2473
+ don't set an example, but you give an example,
2474
+
2475
+ 1:04:11.524 --> 1:04:16.740
2476
+ and so yes, going through another language
2477
+ you introduce additional errors there.
2478
+
2479
+ 1:04:19.919 --> 1:04:27.568
2480
+ Good, so much for this; are there more questions
2481
+ about why this is difficult?
2482
+
2483
+ 1:04:30.730 --> 1:04:35.606
2484
+ Then we'll start with this one.
2485
+
2486
+ 1:04:35.606 --> 1:04:44.596
2487
+ I have to leave a bit early today in a quarter
2488
+ of an hour.
2489
+
2490
+ 1:04:44.904 --> 1:04:58.403
2491
+ If you look at linguistic approaches to
2492
+ machine translation, they are typically described
2493
+
2494
+ 1:04:58.403 --> 1:05:03.599
2495
+ by: So we can do a direct translation, so you
2496
+ take the source language.
2497
+
2498
+ 1:05:03.599 --> 1:05:09.452
2499
+ You do not apply a lot of the analysis we were
2500
+ discussing today about syntax representation,
2501
+
2502
+ 1:05:09.452 --> 1:05:11.096
2503
+ semantic representation.
2504
+
2505
+ 1:05:11.551 --> 1:05:14.678
2506
+ But you directly translate to your target
2507
+ text.
2508
+
2509
+ 1:05:14.678 --> 1:05:16.241
2510
+ That's here the direct.
2511
+
2512
+ 1:05:16.516 --> 1:05:19.285
2513
+ Then there is a transfer based approach.
2514
+
2515
+ 1:05:19.285 --> 1:05:23.811
2516
+ Then you transfer everything over and you
2517
+ do the text translation.
2518
+
2519
+ 1:05:24.064 --> 1:05:28.354
2520
+ And you can do that at two levels, more at
2521
+ the syntax level.
2522
+
2523
+ 1:05:28.354 --> 1:05:34.683
2524
+ That means you only do syntactic analysis
2525
+ like you run a parser or so, or at the semantic
2526
+
2527
+ 1:05:34.683 --> 1:05:37.848
2528
+ level where you do semantic parsing with frames.
2529
+
2530
+ 1:05:38.638 --> 1:05:51.489
2531
+ Then there is an interlingua based approach
2532
+ where you don't do any transfer anymore, but
2533
+
2534
+ 1:05:51.489 --> 1:05:55.099
2535
+ you only do an analysis.
2536
+
2537
+ 1:05:57.437 --> 1:06:02.790
2538
+ So how does now the direct transfer, the direct
2539
+ translation?
2540
+
2541
+ 1:06:03.043 --> 1:06:07.031
2542
+ Look like it's one of the earliest approaches.
2543
+
2544
+ 1:06:07.327 --> 1:06:18.485
2545
+ So you do maybe some morphological analysis,
2546
+ but not a lot, and then you do this bilingual
2547
+
2548
+ 1:06:18.485 --> 1:06:20.202
2549
+ word mapping.
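A toy sketch of such a direct, word-by-word translation with a bilingual dictionary (the dictionary entries below are illustrative assumptions, not a real lexicon); it also shows why this breaks down as soon as word order has to change.

```python
# Toy direct translation: little analysis, just bilingual word-by-word mapping.
# Dictionary entries are illustrative assumptions, not a real lexicon.
bilingual_dict = {"a": "una", "delicious": "deliciosa", "soup": "sopa"}

def direct_translate(sentence: str) -> str:
    # keep unknown words as-is; no reordering, no agreement handling
    return " ".join(bilingual_dict.get(tok, tok) for tok in sentence.lower().split())

print(direct_translate("a delicious soup"))
# -> "una deliciosa sopa": every word is translated, but the adjective stays in
#    front of the noun, so the required reordering to "una sopa deliciosa" is missing.
```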
2550
+
2551
+ 1:06:20.540 --> 1:06:25.067
2552
+ You might do some here in generations.
2553
+
2554
+ 1:06:25.067 --> 1:06:32.148
2555
+ These two things are not really big, but you
2556
+ are working on.
2557
+
2558
+ 1:06:32.672 --> 1:06:39.237
2559
+ And of course this might be a first easy solution
2560
+ about all the challenges we have seen that
2561
+
2562
+ 1:06:39.237 --> 1:06:41.214
2563
+ the structure is different.
2564
+
2565
+ 1:06:41.214 --> 1:06:45.449
2566
+ That you have to reorder, look at the agreement,
2567
+ then work.
2568
+
2569
+ 1:06:45.449 --> 1:06:47.638
2570
+ That's why the first approach.
2571
+
2572
+ 1:06:47.827 --> 1:06:54.618
2573
+ So if we have different word order, structural
2574
+ shifts or idiomatic expressions that doesn't
2575
+
2576
+ 1:06:54.618 --> 1:06:55.208
2577
+ really work.
2578
+
2579
+ 1:06:57.797 --> 1:07:05.034
2580
+ Then there are these rule based approaches
2581
+ which were more commonly used.
2582
+
2583
+ 1:07:05.034 --> 1:07:15.249
2584
+ They might still be used somewhere: I mean, most commonly
2585
+ neural networks are used now, but I wouldn't
2586
+
2587
+ 1:07:15.249 --> 1:07:19.254
2588
+ be sure there is no such system out there.
2589
+
2590
+ 1:07:19.719 --> 1:07:25.936
2591
+ And in this transfer based approach we have
2592
+ these steps there nicely visualized in the.
2593
+
2594
+ 1:07:26.406 --> 1:07:32.397
2595
+ Triangle, so we have the analysis of the source
2596
+ sentence where we then get some type of abstract
2597
+
2598
+ 1:07:32.397 --> 1:07:33.416
2599
+ representation.
2600
+
2601
+ 1:07:33.693 --> 1:07:40.010
2602
+ Then we are doing the transfer of the representation
2603
+ of the source sentence into the representation
2604
+
2605
+ 1:07:40.010 --> 1:07:40.263
2606
+ of.
2607
+
2608
+ 1:07:40.580 --> 1:07:46.754
2609
+ And then we have the generation where we take
2610
+ this abstract representation and do then the
2611
+
2612
+ 1:07:46.754 --> 1:07:47.772
2613
+ surface forms.
2614
+
2615
+ 1:07:47.772 --> 1:07:54.217
2616
+ For example, it might be that there is no
2617
+ morphological variants in the abstract representation
2618
+
2619
+ 1:07:54.217 --> 1:07:56.524
2620
+ and we have to do this agreement.
2621
+
2622
+ 1:07:56.656 --> 1:08:00.077
2623
+ Which components do you need for that?
2624
+
2625
+ 1:08:01.061 --> 1:08:08.854
2626
+ You need monolingual source and target lexicon
2627
+ and the corresponding grammars in order to
2628
+
2629
+ 1:08:08.854 --> 1:08:12.318
2630
+ do both the analysis and the generation.
2631
+
2632
+ 1:08:12.412 --> 1:08:18.584
2633
+ Then you need the bilingual dictionary in
2634
+ order to do the lexical translation and the
2635
+
2636
+ 1:08:18.584 --> 1:08:25.116
2637
+ bilingual transfer rules in order to transfer
2638
+ the grammar, for example in German, into the
2639
+
2640
+ 1:08:25.116 --> 1:08:28.920
2641
+ grammar in English, and that enables you to
2642
+ do that.
2643
+
2644
+ 1:08:29.269 --> 1:08:32.579
2645
+ So an example is is something like this here.
2646
+
2647
+ 1:08:32.579 --> 1:08:38.193
2648
+ So if you're doing a syntactic transfer it
2649
+ means you're starting with 'John eats
2650
+
2651
+ 1:08:38.193 --> 1:08:38.408
2652
+ an
2653
+
2654
+ 1:08:38.408 --> 1:08:43.014
2655
+ apple'. You do the analysis, then you have this
2656
+ type of graph here.
2657
+
2658
+ 1:08:43.014 --> 1:08:48.340
2659
+ Therefore you need your monolingual lexicon
2660
+ and your monolingual grammar.
2661
+
2662
+ 1:08:48.748 --> 1:08:59.113
2663
+ Then you're doing the transfer where you're
2664
+ transferring this representation into this
2665
+
2666
+ 1:08:59.113 --> 1:09:01.020
2667
+ representation.
2668
+
2669
+ 1:09:01.681 --> 1:09:05.965
2670
+ So how could this type of translation then
2671
+ look like?
2672
+
2673
+ 1:09:07.607 --> 1:09:08.276
2674
+ Style.
2675
+
2676
+ 1:09:08.276 --> 1:09:14.389
2677
+ We have the example of a delicious soup and
2678
+ 'una sopa deliciosa'.
2679
+
2680
+ 1:09:14.894 --> 1:09:22.173
2681
+ This is your source language tree and this
2682
+ is your target language tree and then the rules
2683
+
2684
+ 1:09:22.173 --> 1:09:26.092
2685
+ that you need are these ones to do the transfer.
2686
+
2687
+ 1:09:26.092 --> 1:09:31.211
2688
+ So if you have a noun phrase that also goes
2689
+ to the noun phrase.
2690
+
2691
+ 1:09:31.691 --> 1:09:44.609
2692
+ You see here that the switch is happening,
2693
+ so the second position is here at the first
2694
+
2695
+ 1:09:44.609 --> 1:09:46.094
2696
+ position.
2697
+
2698
+ 1:09:46.146 --> 1:09:52.669
2699
+ Then you have the translation of the determiner
2700
+ and of the words, so the dictionary entries.
2701
+
2702
+ 1:09:53.053 --> 1:10:07.752
2703
+ And with these types of rules you can then
2704
+ do these mappings and do the transfer between
2705
+
2706
+ 1:10:07.752 --> 1:10:11.056
2707
+ the representation.
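A hedged sketch of how such transfer rules could be applied to the 'a delicious soup' example; the tree format, rule and dictionary below are assumptions for illustration, not the exact rules from the slide.

```python
# Sketch of syntactic transfer for NP -> DET ADJ N  =>  NP -> DET N ADJ.
# The tree format, rule and dictionary are illustrative assumptions.
dictionary = {"a": "una", "delicious": "deliciosa", "soup": "sopa"}

def transfer_np(tree):
    """tree = ('NP', [('DET', 'a'), ('ADJ', 'delicious'), ('N', 'soup')])"""
    label, children = tree
    # structural transfer: move the adjective behind the noun inside the noun phrase
    reordered = [c for c in children if c[0] != "ADJ"] + [c for c in children if c[0] == "ADJ"]
    # lexical transfer: dictionary lookup on the leaves
    translated = [(pos, dictionary.get(word, word)) for pos, word in reordered]
    return (label, translated)

src = ("NP", [("DET", "a"), ("ADJ", "delicious"), ("N", "soup")])
print(transfer_np(src))
# ('NP', [('DET', 'una'), ('N', 'sopa'), ('ADJ', 'deliciosa')])  -> "una sopa deliciosa"
```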
2708
+
2709
+ 1:10:25.705 --> 1:10:32.505
2710
+ Think it more depends on the amount of expertise
2711
+ you have in representing them.
2712
+
2713
+ 1:10:32.505 --> 1:10:35.480
2714
+ The rules will get more difficult.
2715
+
2716
+ 1:10:36.136 --> 1:10:42.445
2717
+ For example, these rule based were, so I think
2718
+ it more depends on how difficult the structure
2719
+
2720
+ 1:10:42.445 --> 1:10:42.713
2721
+ is.
2722
+
2723
+ 1:10:42.713 --> 1:10:48.619
2724
+ So for German generating German they were
2725
+ quite long, quite successful because modeling
2726
+
2727
+ 1:10:48.619 --> 1:10:52.579
2728
+ all the German phenomena which are in there
2729
+ was difficult.
2730
+
2731
+ 1:10:52.953 --> 1:10:56.786
2732
+ And that can be done there, and it wasn't
2733
+ easy to learn that just from data.
2734
+
2735
+ 1:10:59.019 --> 1:11:07.716
2736
+ Think even if you think about Chinese and
2737
+ English or so, if you have the trees there
2738
+
2739
+ 1:11:07.716 --> 1:11:10.172
2740
+ is quite some rule and.
2741
+
2742
+ 1:11:15.775 --> 1:11:23.370
2743
+ Another thing is you can also try to do something
2744
+ like that on the semantic, which means this
2745
+
2746
+ 1:11:23.370 --> 1:11:24.905
2747
+ gets more complex.
2748
+
2749
+ 1:11:25.645 --> 1:11:31.047
2750
+ This gets maybe a bit easier because this
2751
+ representation, the semantic representation
2752
+
2753
+ 1:11:31.047 --> 1:11:36.198
2754
+ between languages, are more similar and therefore
2755
+ this gets more difficult again.
2756
+
2757
+ 1:11:36.496 --> 1:11:45.869
2758
+ So typically if you go higher in your triangle
2759
+ this is more work while this is less work.
2760
+
2761
+ 1:11:49.729 --> 1:11:56.023
2762
+ So it can be then, for example, like in the gustar example,
2763
+ we have again that the order changes.
2764
+
2765
+ 1:11:56.023 --> 1:12:02.182
2766
+ So you see the transfer rule for like is that
2767
+ the first argument is here and the second is
2768
+
2769
+ 1:12:02.182 --> 1:12:06.514
2770
+ there, while on the gustar side here
2771
+ the second argument.
2772
+
2773
+ 1:12:06.466 --> 1:12:11.232
2774
+ It is in the first position and the first
2775
+ argument is in the second position.
2776
+
2777
+ 1:12:11.511 --> 1:12:14.061
2778
+ So that you do yeah, and also there you're
2779
+ ordering,.
2780
+
2781
+ 1:12:14.354 --> 1:12:20.767
2782
+ From the principle it is more like you have
2783
+ a different type of formalism of representing
2784
+
2785
+ 1:12:20.767 --> 1:12:27.038
2786
+ your sentence and therefore you need to do
2787
+ more on one side and less on the other side.
2788
+
2789
+ 1:12:32.852 --> 1:12:42.365
2790
+ Then so in general transfer based approaches
2791
+ are you have to first select how to represent
2792
+
2793
+ 1:12:42.365 --> 1:12:44.769
2794
+ a syntactic structure.
2795
+
2796
+ 1:12:45.165 --> 1:12:55.147
2797
+ There are these various abstraction levels
2798
+ and then you have the three components: The
2799
+
2800
+ 1:12:55.147 --> 1:13:04.652
2801
+ disadvantage is that on the one hand you need
2802
+ normally a lot of experts monolingual experts
2803
+
2804
+ 1:13:04.652 --> 1:13:08.371
2805
+ who analyze how to do the transfer.
2806
+
2807
+ 1:13:08.868 --> 1:13:18.860
2808
+ And if you're doing a new language, you have
2809
+ to do analysis, transfer and generation, and the
2810
+
2811
+ 1:13:18.860 --> 1:13:19.970
2812
+ transfer.
2813
+
2814
+ 1:13:20.400 --> 1:13:27.074
2815
+ So if you need one language, add one language
2816
+ in existing systems, of course you have to
2817
+
2818
+ 1:13:27.074 --> 1:13:29.624
2819
+ do transfer to all the languages.
2820
+
2821
+ 1:13:32.752 --> 1:13:39.297
2822
+ Therefore, the other idea which people were
2823
+ interested in is the interlingua based machine
2824
+
2825
+ 1:13:39.297 --> 1:13:40.232
2826
+ translation.
2827
+
2828
+ 1:13:40.560 --> 1:13:47.321
2829
+ Where the idea is that we have this intermediate
2830
+ language with this abstract language independent
2831
+
2832
+ 1:13:47.321 --> 1:13:53.530
2833
+ representation and so the important thing is
2834
+ it's language independent so it's really the
2835
+
2836
+ 1:13:53.530 --> 1:13:59.188
2837
+ same for all language and it's a pure meaning
2838
+ and there is no ambiguity in there.
2839
+
2840
+ 1:14:00.100 --> 1:14:05.833
2841
+ That allows this nice translation without
2842
+ transfer, so you just do an analysis into your
2843
+
2844
+ 1:14:05.833 --> 1:14:11.695
2845
+ representation, and there afterwards you do
2846
+ the generation into the other target language.
2847
+
2848
+ 1:14:13.293 --> 1:14:16.953
2849
+ And that of course makes especially multilingual.
2850
+
2851
+ 1:14:16.953 --> 1:14:19.150
2852
+ It's like somehow is a dream.
2853
+
2854
+ 1:14:19.150 --> 1:14:25.519
2855
+ If you want to add a language you just need
2856
+ to add one analysis tool and one generation
2857
+
2858
+ 1:14:25.519 --> 1:14:25.959
2859
+ tool.
2860
+
2861
+ 1:14:29.249 --> 1:14:32.279
2862
+ Which is not the case in the other scenario.
2863
+
2864
+ 1:14:33.193 --> 1:14:40.547
2865
+ However, the big challenge is in this case
2866
+ the interlingua based representation because
2867
+
2868
+ 1:14:40.547 --> 1:14:47.651
2869
+ you need to represent all different types of
2870
+ knowledge in there in order to do that.
2871
+
2872
+ 1:14:47.807 --> 1:14:54.371
2873
+ And also like world knowledge, so something
2874
+ like an apple is a fruit and property is a
2875
+
2876
+ 1:14:54.371 --> 1:14:57.993
2877
+ fruit, so they are eatable and stuff like that.
2878
+
2879
+ 1:14:58.578 --> 1:15:06.286
2880
+ So that is why this is typically always only
2881
+ done for small amounts of data.
2882
+
2883
+ 1:15:06.326 --> 1:15:13.106
2884
+ So what people have done for special applications
2885
+ like hotel reservation people have looked into
2886
+
2887
+ 1:15:13.106 --> 1:15:18.348
2888
+ that, but they have typically not done it for
2889
+ any possibility of doing it.
2890
+
2891
+ 1:15:18.718 --> 1:15:31.640
2892
+ So the disadvantage is you need to represent
2893
+ all the world knowledge in your interlingua.
2894
+
2895
+ 1:15:32.092 --> 1:15:40.198
2896
+ And that is not possible at the moment or
2897
+ never was possible so far.
2898
+
2899
+ 1:15:40.198 --> 1:15:47.364
2900
+ Typically they were for small domains for
2901
+ hotel reservation.
2902
+
2903
+ 1:15:51.431 --> 1:15:57.926
2904
+ But of course this idea of doing that and
2905
+ that's why some people are interested in is
2906
+
2907
+ 1:15:57.926 --> 1:16:04.950
2908
+ like if you now do a neural system where you
2909
+ learn the representation in your neural network
2910
+
2911
+ 1:16:04.950 --> 1:16:07.442
2912
+ is that some type of artificial.
2913
+
2914
+ 1:16:08.848 --> 1:16:09.620
2915
+ Interlingua.
2916
+
2917
+ 1:16:09.620 --> 1:16:15.025
2918
+ However, what we at least found out until
2919
+ now is that there's often very language specific
2920
+
2921
+ 1:16:15.025 --> 1:16:15.975
2922
+ information in.
2923
+
2924
+ 1:16:16.196 --> 1:16:19.648
2925
+ And they might be important and essential.
2926
+
2927
+ 1:16:19.648 --> 1:16:26.552
2928
+ You don't have all the information in your
2929
+ input, so you typically can't do resolving
2930
+
2931
+ 1:16:26.552 --> 1:16:32.412
2932
+ all ambiguities inside there because you might
2933
+ not have all information.
2934
+
2935
+ 1:16:32.652 --> 1:16:37.870
2936
+ So in English you don't know if it's a living
2937
+ fish or the fish which you're eating, and if
2938
+
2939
+ 1:16:37.870 --> 1:16:43.087
2940
+ you're translating to Germany you also don't
2941
+ have to resolve this problem because you have
2942
+
2943
+ 1:16:43.087 --> 1:16:45.610
2944
+ the same ambiguity in your target language.
2945
+
2946
+ 1:16:45.610 --> 1:16:50.828
2947
+ So why would you put in the effort of finding
2948
+ out if it's the one fish or the other fish if it's
2949
+
2950
+ 1:16:50.828 --> 1:16:52.089
2951
+ not necessary at all?
2952
+
2953
+ 1:16:54.774 --> 1:16:59.509
2954
+ Yeah Yeah.
2955
+
2956
+ 1:17:05.585 --> 1:17:15.019
2957
+ The semantic transfer is not the same for
2958
+ both languages, so you still represent the
2959
+
2960
+ 1:17:15.019 --> 1:17:17.127
2961
+ semantics per language.
2962
+
2963
+ 1:17:17.377 --> 1:17:23.685
2964
+ So you have the like semantic representation
2965
+ in the gustar example, but that's not the same semantic
2966
+
2967
+ 1:17:23.685 --> 1:17:28.134
2968
+ representation for both languages, and that's
2969
+ the main difference.
2970
+
2971
+ 1:17:35.515 --> 1:17:44.707
2972
+ Okay, then these are the most important things
2973
+ for today: what is language and how rule
2974
+
2975
+ 1:17:44.707 --> 1:17:46.205
2976
+ based systems work.
2977
+
2978
+ 1:17:46.926 --> 1:17:59.337
2979
+ And if there is no more questions thank you
2980
+ for joining, we have today a bit of a shorter
2981
+
2982
+ 1:17:59.337 --> 1:18:00.578
2983
+ lecture.
2984
+
demo_data/lectures/Lecture-02-20.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0ac15772e9e528ff3f7fb957401be410fcdf4a4ad54542e96916fe654443eb3
3
+ size 111655016
demo_data/lectures/Lecture-03-25.04.2023/English.vtt ADDED
@@ -0,0 +1,3102 @@
1
+ WEBVTT
2
+
3
+ 0:00:02.822 --> 0:00:07.880
4
+ We look into more linguistic approaches.
5
+
6
+ 0:00:07.880 --> 0:00:14.912
7
+ We can do machine translation in a more traditional
8
+ way.
9
+
10
+ 0:00:14.912 --> 0:00:21.224
11
+ It should be: Translation should be generated
12
+ this way.
13
+
14
+ 0:00:21.224 --> 0:00:27.933
15
+ We can analyze first the source sentence, what
16
+ is the meaning or the syntax.
17
+
18
+ 0:00:27.933 --> 0:00:35.185
19
+ Then we transfer this information to the target
20
+ side and then we then generate.
21
+
22
+ 0:00:36.556 --> 0:00:42.341
23
+ And this was the strong and common used approach
24
+ for yeah several years.
25
+
26
+ 0:00:44.024 --> 0:00:50.839
27
+ However, we saw already at the beginning there
28
+ some challenges with that: Language is very
29
+
30
+ 0:00:50.839 --> 0:00:57.232
31
+ ambiguous and it's often very difficult to really
32
+ get hand-coded rules.
33
+
34
+ 0:00:57.232 --> 0:01:05.336
35
+ What are the different meanings and we have
36
+ to do that also with a living language so new
37
+
38
+ 0:01:05.336 --> 0:01:06.596
39
+ things occur.
40
+
41
+ 0:01:07.007 --> 0:01:09.308
42
+ And that's why people look into.
43
+
44
+ 0:01:09.308 --> 0:01:13.282
45
+ Can we maybe do it differently and use machine
46
+ learning?
47
+
48
+ 0:01:13.333 --> 0:01:24.849
49
+ So we are no longer giving rules of how to
50
+ do it, but we just give examples and the system learns from them.
51
+
52
+ 0:01:25.045 --> 0:01:34.836
53
+ And one important thing then is these examples:
54
+ how can we learn how to translate one sentence?
55
+
56
+ 0:01:35.635 --> 0:01:42.516
57
+ And therefore these yeah, the data is now
58
+ really a very important issue.
59
+
60
+ 0:01:42.582 --> 0:01:50.021
61
+ And that is what we want to look into today.
62
+
63
+ 0:01:50.021 --> 0:01:58.783
64
+ What type of data do we use for machine translation?
65
+
66
+ 0:01:59.019 --> 0:02:08.674
67
+ So the idea in preprocessing is always: Can
68
+ we make the task somehow a bit easier so that
69
+
70
+ 0:02:08.674 --> 0:02:13.180
71
+ the MT system will be in a way better?
72
+
73
+ 0:02:13.493 --> 0:02:28.309
74
+ So one example could be if it has problems
75
+ dealing with numbers because they are occurring.
76
+
77
+ 0:02:28.648 --> 0:02:35.479
78
+ Or think about so one problem which still
79
+ might be is there in some systems think about
80
+
81
+ 0:02:35.479 --> 0:02:36.333
82
+ different.
83
+
84
+ 0:02:36.656 --> 0:02:44.897
85
+ So a system might learn that of course if
86
+ there's a German over in English there should.
87
+
88
+ 0:02:45.365 --> 0:02:52.270
89
+ However, if it's in parallel text, it will see
90
+ that in German there is often km, and in English
91
+
92
+ 0:02:52.270 --> 0:02:54.107
93
+ typically various miles.
94
+
95
+ 0:02:54.594 --> 0:03:00.607
96
+ Might just translate three hundred and fifty
97
+ five miles into three hundred and fifty-five
98
+
99
+ 0:03:00.607 --> 0:03:04.348
100
+ kilometers, which of course is not right, and
101
+ so forth.
102
+
103
+ 0:03:04.348 --> 0:03:06.953
104
+ It might make things to look into the.
105
+
106
+ 0:03:07.067 --> 0:03:13.072
107
+ Therefore, first step when you build your
108
+ machine translation system is normally to look
109
+
110
+ 0:03:13.072 --> 0:03:19.077
111
+ at the data, to check it, to see if there is
112
+ anything happening which you should address
113
+
114
+ 0:03:19.077 --> 0:03:19.887
115
+ beforehand.
116
+
117
+ 0:03:20.360 --> 0:03:29.152
118
+ And then the second part is how do you represent
119
+ no works machine learning normally?
120
+
121
+ 0:03:29.109 --> 0:03:35.404
122
+ So the question is how do we get out from
123
+ the words into numbers and I've seen some of
124
+
125
+ 0:03:35.404 --> 0:03:35.766
126
+ you?
127
+
128
+ 0:03:35.766 --> 0:03:42.568
129
+ For example, in advance there we have introduced
130
+ to an algorithm which we also shortly repeat
131
+
132
+ 0:03:42.568 --> 0:03:43.075
133
+ today.
134
+
135
+ 0:03:43.303 --> 0:03:53.842
136
+ The subword unit approach which was first
137
+ introduced in machine translation and now used
138
+
139
+ 0:03:53.842 --> 0:04:05.271
140
+ for other tasks in order to represent words: Now you've learned
141
+ about morphology, so you know that maybe in
142
+
143
+ 0:04:05.271 --> 0:04:09.270
144
+ English it's not that important.
145
+
146
+ 0:04:09.429 --> 0:04:22.485
147
+ In German you have all these different word
148
+ forms, and you have to learn an independent representation for each.
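As a rough sketch of the subword idea mentioned here (a simplified illustration of byte-pair-encoding-style splitting with an assumed toy vocabulary, not the exact algorithm or vocabulary from the lecture):

```python
# Simplified illustration of subword splitting with a fixed, assumed subword vocabulary.
# Real BPE learns the vocabulary from corpus statistics; this only shows the effect.
subword_vocab = {"spiel", "er", "in", "n"}

def greedy_subword_split(word: str, vocab) -> list[str]:
    """Greedily split a word into the longest known subwords (toy version)."""
    pieces, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):          # try longest piece first
            if word[i:j] in vocab or j == i + 1:   # fall back to single characters
                pieces.append(word[i:j])
                i = j
                break
    return pieces

print(greedy_subword_split("spielerinnen", subword_vocab))
# ['spiel', 'er', 'in', 'n', 'e', 'n'] -- different forms of 'Spieler' share pieces
```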
149
+
150
+ 0:04:24.024 --> 0:04:26.031
151
+ And then, of course, they are more extreme.
152
+
153
+ 0:04:27.807 --> 0:04:34.387
154
+ So how are we doing?
155
+
156
+ 0:04:34.975 --> 0:04:37.099
157
+ Machine translation.
158
+
159
+ 0:04:37.099 --> 0:04:46.202
160
+ So hopefully you remember we had these approaches
161
+ to machine translation, the rule based.
162
+
163
+ 0:04:46.202 --> 0:04:52.473
164
+ We had a big block of corpus-based machine
165
+ translation which.
166
+
167
+ 0:04:52.492 --> 0:05:00.443
168
+ We will on Thursday have an overview of statistical
169
+ models and then afterwards concentrate on the.
170
+
171
+ 0:05:00.680 --> 0:05:08.828
172
+ Both of them are corpus based machine translation
173
+ and therefore it's really essential, and while
174
+
175
+ 0:05:08.828 --> 0:05:16.640
176
+ we are typically training a machine translation
177
+ system is what we refer to as parallel data.
178
+
179
+ 0:05:16.957 --> 0:05:22.395
180
+ We talk a lot about parallel corpora or parallel data,
181
+ and what I mean there is something which you
182
+
183
+ 0:05:22.395 --> 0:05:28.257
184
+ might know from the Rosetta Stone or something
185
+ like that, so it's typically you have one sentence
186
+
187
+ 0:05:28.257 --> 0:05:33.273
188
+ in the one language, and then you have aligned
189
+ to it one sentence in the target language.
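To make the format concrete, a minimal sketch of what such sentence-aligned parallel data looks like in practice (the sentence pairs below are invented for illustration):

```python
# Minimal sketch: a parallel corpus is just a list of aligned sentence pairs.
# The example pairs are invented for illustration.
parallel_corpus = [
    ("Das ist ein Haus.", "This is a house."),
    ("Wie geht es dir?", "How are you?"),
]

for source_sentence, target_sentence in parallel_corpus:
    print(f"{source_sentence}\t{target_sentence}")
```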
190
+
191
+ 0:05:33.833 --> 0:05:38.261
192
+ And this is how we train all our alignments.
193
+
194
+ 0:05:38.261 --> 0:05:43.181
195
+ We'll see today that of course we might not
196
+ have.
197
+
198
+ 0:05:43.723 --> 0:05:51.279
199
+ However, this is relatively easy to create,
200
+ at least for high-quality data.
201
+
202
+ 0:05:51.279 --> 0:06:00.933
203
+ We look into data crawling, so that means how
204
+ we can automatically create this parallel data
205
+
206
+ 0:06:00.933 --> 0:06:02.927
207
+ from the Internet.
208
+
209
+ 0:06:04.144 --> 0:06:13.850
210
+ It's not so difficult to learn these alignments
211
+ if we have some type of dictionary, so which
212
+
213
+ 0:06:13.850 --> 0:06:16.981
214
+ sentence is aligned to which.
215
+
216
+ 0:06:18.718 --> 0:06:25.069
217
+ What would, of course, be a lot more difficult
218
+ is really the word alignment, and that's also
219
+
220
+ 0:06:25.069 --> 0:06:27.476
221
+ often no longer that good possible.
222
+
223
+ 0:06:27.476 --> 0:06:33.360
224
+ We do that automatically in some yes for symbols,
225
+ but it's definitely more challenging.
226
+
227
+ 0:06:33.733 --> 0:06:40.691
228
+ For sentence alignment, of course, it's still
229
+ not always perfect, so there might be that
230
+
231
+ 0:06:40.691 --> 0:06:46.085
232
+ there is two German sentences and one English
233
+ sentence or the other.
234
+
235
+ 0:06:46.085 --> 0:06:53.511
236
+ So there's not always perfect alignment, but
237
+ if you look at text, it still works relatively well.
238
+
239
+ 0:06:54.014 --> 0:07:03.862
240
+ If we have that then we can build a machine
241
+ learning model which tries to map ignition
242
+
243
+ 0:07:03.862 --> 0:07:06.239
244
+ sentences somewhere.
245
+
246
+ 0:07:06.626 --> 0:07:15.932
247
+ So this is the idea of behind statistical
248
+ machine translation and machine translation.
249
+
250
+ 0:07:15.932 --> 0:07:27.098
251
+ The difference is: Statistical machine translation
252
+ is typically a whole box of different models
253
+
254
+ 0:07:27.098 --> 0:07:30.205
255
+ which try to evaluate the.
256
+
257
+ 0:07:30.510 --> 0:07:42.798
258
+ In neural machine translation, it's all one
259
+ large neural network where we use the source sentence as
260
+
261
+ 0:07:42.798 --> 0:07:43.667
262
+ input.
263
+
264
+ 0:07:44.584 --> 0:07:50.971
265
+ And then we can train it by having exactly
266
+ this mapping from our parallel data.
267
+
268
+ 0:07:54.214 --> 0:08:02.964
269
+ So what we want today to look at today is
270
+ we want to first look at general text data.
271
+
272
+ 0:08:03.083 --> 0:08:06.250
273
+ So what is text data?
274
+
275
+ 0:08:06.250 --> 0:08:09.850
276
+ What text data is there?
277
+
278
+ 0:08:09.850 --> 0:08:18.202
279
+ Why is it challenging so that we have large
280
+ vocabularies?
281
+
282
+ 0:08:18.378 --> 0:08:22.003
283
+ It's so that you always have words which you
284
+ haven't seen.
285
+
286
+ 0:08:22.142 --> 0:08:29.053
287
+ If you increase your corpus size, normally
288
+ you will also increase your vocabulary so you
289
+
290
+ 0:08:29.053 --> 0:08:30.744
291
+ always find new words.
292
+
293
+ 0:08:31.811 --> 0:08:39.738
294
+ Then based on that we'll look into pre-processing.
295
+
296
+ 0:08:39.738 --> 0:08:45.333
297
+ So how can we pre-process our data?
298
+
299
+ 0:08:45.333 --> 0:08:46.421
300
+ Maybe.
301
+
302
+ 0:08:46.526 --> 0:08:54.788
303
+ This is a lot about tokenization, for example,
304
+ which we heard is not so challenging in European
305
+
306
+ 0:08:54.788 --> 0:09:02.534
307
+ languages but still important, but might be
308
+ really difficult in Asian languages where you
309
+
310
+ 0:09:02.534 --> 0:09:05.030
311
+ don't have space separation.
312
+
313
+ 0:09:05.986 --> 0:09:12.161
314
+ And this preprocessing typically tries to
315
+ deal with the extreme cases where you have
316
+
317
+ 0:09:12.161 --> 0:09:13.105
318
+ seen things.
319
+
320
+ 0:09:13.353 --> 0:09:25.091
321
+ If you have seen your words three one hundred
322
+ times, it doesn't really matter if you have
323
+
324
+ 0:09:25.091 --> 0:09:31.221
325
+ seen them with them without punctuation or
326
+ so.
327
+
328
+ 0:09:31.651 --> 0:09:38.578
329
+ And then we look into word representation,
330
+ so what is the best way to represent a word?
331
+
332
+ 0:09:38.578 --> 0:09:45.584
333
+ And finally, we look into the other type of
334
+ data we really need for machine translation.
335
+
336
+ 0:09:45.725 --> 0:09:56.842
337
+ So in first we can use for many tasks, and
338
+ later we can also use purely monolingual data
339
+
340
+ 0:09:56.842 --> 0:10:00.465
341
+ to make machine translation.
342
+
343
+ 0:10:00.660 --> 0:10:03.187
344
+ So then the traditional approach was that
345
+ it was easier.
346
+
347
+ 0:10:03.483 --> 0:10:08.697
348
+ We have this type of language model which
349
+ we can train only on the target data to make
350
+
351
+ 0:10:08.697 --> 0:10:12.173
352
+ the text more fluent in neural machine translation
353
+ model.
354
+
355
+ 0:10:12.173 --> 0:10:18.106
356
+ It's partly a bit more complicated to integrate
357
+ this data but still it's very important especially
358
+
359
+ 0:10:18.106 --> 0:10:22.362
360
+ if you think about lower issue languages where
361
+ you have very few data.
362
+
363
+ 0:10:23.603 --> 0:10:26.999
364
+ It's harder to get parallel data than you
365
+ get monolingual data.
366
+
367
+ 0:10:27.347 --> 0:10:33.821
368
+ Because monolingual data you just have out
369
+ there not huge amounts for some languages,
370
+
371
+ 0:10:33.821 --> 0:10:38.113
372
+ but definitely the amount of data is always
373
+ significant.
374
+
375
+ 0:10:40.940 --> 0:10:50.454
376
+ When we talk about data, it's also of course
377
+ important how we use it for machine learning.
378
+
379
+ 0:10:50.530 --> 0:11:05.867
380
+ And that you hopefully learn in some prior
381
+ class, so typically we separate our data into
382
+
383
+ 0:11:05.867 --> 0:11:17.848
384
+ three chunks: So this is really by far the
385
+ largest, and this grows with the data we get.
386
+
387
+ 0:11:17.848 --> 0:11:21.387
388
+ Today we get here millions.
389
+
390
+ 0:11:22.222 --> 0:11:27.320
391
+ Then we have our validation data and that
392
+ is to train some type of parameters.
393
+
394
+ 0:11:27.320 --> 0:11:33.129
395
+ So not only you have some things to configure
396
+ and you don't know what is the right value,
397
+
398
+ 0:11:33.129 --> 0:11:39.067
399
+ so what you can do is train a model and change
400
+ these a bit and try to find the best ones on
401
+
402
+ 0:11:39.067 --> 0:11:40.164
403
+ your validation.
404
+
405
+ 0:11:40.700 --> 0:11:48.531
406
+ For a statistical model, for example data
407
+ in what you want to use if you have several
408
+
409
+ 0:11:48.531 --> 0:11:54.664
410
+ models: You know how to combine it, so how
411
+ much focus should you put on the different
412
+
413
+ 0:11:54.664 --> 0:11:55.186
414
+ models?
415
+
416
+ 0:11:55.186 --> 0:11:59.301
417
+ And if it's like twenty models, so it's only
418
+ twenty per meter.
419
+
420
+ 0:11:59.301 --> 0:12:02.828
421
+ It's not that much, so that is still bigly
422
+ estimated.
423
+
424
+ 0:12:03.183 --> 0:12:18.964
425
+ In your model there's often a question how
426
+ long should train the model before you have
427
+
428
+ 0:12:18.964 --> 0:12:21.322
429
+ overfitting.
430
+
431
+ 0:12:22.902 --> 0:12:28.679
432
+ And then you have your test data, which is
433
+ finally where you report on your test.
434
+
435
+ 0:12:29.009 --> 0:12:33.663
436
+ And therefore it's also important that from
437
+ time to time you get new test data because
438
+
439
+ 0:12:33.663 --> 0:12:38.423
440
+ if you're always through your experiments you
441
+ test on it and then you do new experiments
442
+
443
+ 0:12:38.423 --> 0:12:43.452
444
+ and tests again at some point you have tested
445
+ so many on it that you do some type of training
446
+
447
+ 0:12:43.452 --> 0:12:48.373
448
+ on your test data again because you just select
449
+ the things which is at the end best on your
450
+
451
+ 0:12:48.373 --> 0:12:48.962
452
+ test data.
453
+
454
+ 0:12:49.009 --> 0:12:54.755
455
+ It's important to get a new test data from
456
+ time to time, for example in important evaluation
457
+
458
+ 0:12:54.755 --> 0:12:58.340
459
+ campaigns for machine translation and speech
460
+ translation.
461
+
462
+ 0:12:58.618 --> 0:13:07.459
463
+ There is like every year there should do tests
464
+ that create it so we can see if the model really
465
+
466
+ 0:13:07.459 --> 0:13:09.761
467
+ gets better on new data.
468
+
469
+ 0:13:10.951 --> 0:13:19.629
470
+ And of course it is important that this is
471
+ a representative of the use case you are interested.
472
+
473
+ 0:13:19.879 --> 0:13:36.511
474
+ So if you're building a system for translating
475
+ websites, this should be on websites.
476
+
477
+ 0:13:36.816 --> 0:13:39.356
478
+ So normally a system is good on some tasks.
479
+
480
+ 0:13:40.780 --> 0:13:48.596
481
+ I would solve everything and then your test
482
+ data should be out of everything because if
483
+
484
+ 0:13:48.596 --> 0:13:54.102
485
+ you only have a very small subset you know
486
+ it's good on this.
487
+
488
+ 0:13:54.394 --> 0:14:02.714
489
+ Therefore, the selection of your test data
490
+ is really important in order to ensure that
491
+
492
+ 0:14:02.714 --> 0:14:05.200
493
+ the MP system in the end.
494
+
495
+ 0:14:05.525 --> 0:14:12.646
496
+ Is the greatest system ever you have evaluated
497
+ on translating Bible.
498
+
499
+ 0:14:12.646 --> 0:14:21.830
500
+ The use case is to translate some Twitter
501
+ data and you can imagine the performance might
502
+
503
+ 0:14:21.830 --> 0:14:22.965
504
+ be really.
505
+
506
+ 0:14:23.803 --> 0:14:25.471
507
+ And privately.
508
+
509
+ 0:14:25.471 --> 0:14:35.478
510
+ Of course, in honor to have this and realistic
511
+ evaluation, it's important that there's no
512
+
513
+ 0:14:35.478 --> 0:14:39.370
514
+ overlap between this data because.
515
+
516
+ 0:14:39.799 --> 0:14:51.615
517
+ Because the danger might be is learning by
518
+ heart how to translate the sentences from your
519
+
520
+ 0:14:51.615 --> 0:14:53.584
521
+ training data.
522
+
523
+ 0:14:54.194 --> 0:15:04.430
524
+ That the test data is really different from
525
+ your training data.
526
+
527
+ 0:15:04.430 --> 0:15:16.811
528
+ Therefore, it's important to: So what type
529
+ of data we have?
530
+
531
+ 0:15:16.811 --> 0:15:24.966
532
+ There's a lot of different text data and the
533
+ nice thing is with digitalization.
534
+
535
+ 0:15:25.345 --> 0:15:31.785
536
+ You might think there's a large amount with
537
+ books, but to be honest books and printed things
538
+
539
+ 0:15:31.785 --> 0:15:35.524
540
+ that's by now a minor percentage of the data
541
+ we have.
542
+
543
+ 0:15:35.815 --> 0:15:39.947
544
+ There's like so much data created every day
545
+ on the Internet.
546
+
547
+ 0:15:39.980 --> 0:15:46.223
548
+ With social media and all the other types.
549
+
550
+ 0:15:46.223 --> 0:15:56.821
551
+ This of course is the largest amount of data,
552
+ more of colloquial language.
553
+
554
+ 0:15:56.856 --> 0:16:02.609
555
+ It might be more noisy and harder to process,
556
+ so there is a whole area on how to deal with
557
+
558
+ 0:16:02.609 --> 0:16:04.948
559
+ more social media and outdoor stuff.
560
+
561
+ 0:16:07.347 --> 0:16:20.702
562
+ What type of data is there if you think about
563
+ parallel data: news data, official sites?
564
+
565
+ 0:16:20.900 --> 0:16:26.629
566
+ So the first parallel corpora were things
567
+ like the European Parliament or like some news
568
+
569
+ 0:16:26.629 --> 0:16:27.069
570
+ sites.
571
+
572
+ 0:16:27.227 --> 0:16:32.888
573
+ Nowadays there's quite a large amount of data
574
+ crawled from the Internet, but of course if
575
+
576
+ 0:16:32.888 --> 0:16:38.613
577
+ you crawl parallel data from the Internet,
578
+ a lot of the data is also like company websites
579
+
580
+ 0:16:38.613 --> 0:16:41.884
581
+ or so which gets translated into several languages.
582
+
583
+ 0:16:45.365 --> 0:17:00.613
584
+ Then, of course, there are different levels
585
+ of text and we have to look at what level we
586
+
587
+ 0:17:00.613 --> 0:17:05.118
588
+ want to process our data.
589
+
590
+ 0:17:05.885 --> 0:17:16.140
591
+ It normally doesn't make sense to work
592
+ on full sentences because a lot of sentences
593
+
594
+ 0:17:16.140 --> 0:17:22.899
595
+ have never been seen and you always create
596
+ new sentences.
597
+
598
+ 0:17:23.283 --> 0:17:37.421
599
+ So typically what we take as our basic unit is words, or
600
+ something between words and letters, and that
601
+
602
+ 0:17:37.421 --> 0:17:40.033
603
+ is an essential decision.
604
+
605
+ 0:17:40.400 --> 0:17:47.873
606
+ So we need some of these atomic blocks or
607
+ basic blocks which we can't make any smaller.
608
+
609
+ 0:17:48.128 --> 0:17:55.987
610
+ So if we're building a sentence, for example,
611
+ you can build it out of something and you can
612
+
613
+ 0:17:55.987 --> 0:17:57.268
614
+ either decide.
615
+
616
+ 0:17:57.268 --> 0:18:01.967
617
+ For example, you take words and you split them
618
+ further.
619
+
620
+ 0:18:03.683 --> 0:18:10.178
621
+ Then, of course, the nice thing is not too
622
+ small and therefore building larger things
623
+
624
+ 0:18:10.178 --> 0:18:11.386
625
+ like sentences.
626
+
627
+ 0:18:11.831 --> 0:18:16.690
628
+ So you only have to take your vocabulary and
629
+ put it somewhere together to get your full
630
+
631
+ 0:18:16.690 --> 0:18:17.132
632
+ center.
633
+
634
+ 0:18:19.659 --> 0:18:27.670
635
+ However, if it's too large, these blocks don't
636
+ occur often enough, and you have more blocks
637
+
638
+ 0:18:27.670 --> 0:18:28.715
639
+ that occur only rarely.
640
+
641
+ 0:18:29.249 --> 0:18:34.400
642
+ And that's why, yeah, we can work with blocks
643
+ that are smaller, like subword blocks.
644
+
645
+ 0:18:34.714 --> 0:18:38.183
646
+ Work with neural models.
647
+
648
+ 0:18:38.183 --> 0:18:50.533
649
+ Then you can work on letters so you have a
650
+ system which tries to understand the sentence
651
+
652
+ 0:18:50.533 --> 0:18:53.031
653
+ letter by letter.
654
+
655
+ 0:18:53.313 --> 0:18:57.608
656
+ But that is a design decision which you have
657
+ to take at some point.
658
+
659
+ 0:18:57.608 --> 0:19:03.292
660
+ On which level do you want to split your text
661
+ and what are the basic blocks that you are
662
+
663
+ 0:19:03.292 --> 0:19:04.176
664
+ working with?
665
+
666
+ 0:19:04.176 --> 0:19:06.955
667
+ And that's something we'll look into today.
668
+
669
+ 0:19:06.955 --> 0:19:08.471
670
+ What possibilities are?
671
+
672
+ 0:19:12.572 --> 0:19:14.189
673
+ Any question.
674
+
675
+ 0:19:17.998 --> 0:19:24.456
676
+ Then let's look a bit on what type of data
677
+ there is and how much data there is to process.
678
+
679
+ 0:19:24.824 --> 0:19:34.006
680
+ Is that nowadays, at least for pure text,
681
+ it's no longer for some language.
682
+
683
+ 0:19:34.006 --> 0:19:38.959
684
+ There is so much data we cannot process.
685
+
686
+ 0:19:39.479 --> 0:19:49.384
687
+ That is only true for some languages, but
688
+ there is also interest in other languages and
689
+
690
+ 0:19:49.384 --> 0:19:50.622
691
+ important.
692
+
693
+ 0:19:50.810 --> 0:20:01.483
694
+ So if you want to build a system for Sweden
695
+ or for some dialect in other countries, then
696
+
697
+ 0:20:01.483 --> 0:20:02.802
698
+ of course.
699
+
700
+ 0:20:03.103 --> 0:20:06.888
701
+ Otherwise you have this huge amount here.
702
+
703
+ 0:20:06.888 --> 0:20:11.515
704
+ We are often no longer talking about gigabytes
705
+ or more.
706
+
707
+ 0:20:11.891 --> 0:20:35.788
708
+ The general information that is produced every
709
+ year is: And this is like all the information
710
+
711
+ 0:20:35.788 --> 0:20:40.661
712
+ that are available in the, so there are really.
713
+
714
+ 0:20:41.001 --> 0:20:44.129
715
+ We look at machine translation.
716
+
717
+ 0:20:44.129 --> 0:20:53.027
718
+ We can see these numbers are really like more
719
+ than ten years old, but we see this increase
720
+
721
+ 0:20:53.027 --> 0:20:58.796
722
+ in one billion words we had at that time for
723
+ English data.
724
+
725
+ 0:20:59.019 --> 0:21:01.955
726
+ Then I wore like new shuffle on Google Maps
727
+ and stuff.
728
+
729
+ 0:21:02.382 --> 0:21:05.003
730
+ For this one you could train your system on.
731
+
732
+ 0:21:05.805 --> 0:21:20.457
733
+ And the interesting thing is this one billion
734
+ words is more than any human typically speaks.
735
+
736
+ 0:21:21.001 --> 0:21:25.892
737
+ So these systems they see by now like a magnitude
738
+ of more data.
739
+
740
+ 0:21:25.892 --> 0:21:32.465
741
+ We know I think are a magnitude higher of
742
+ more data than a human has ever seen in his
743
+
744
+ 0:21:32.465 --> 0:21:33.229
745
+ lifetime.
746
+
747
+ 0:21:35.175 --> 0:21:41.808
748
+ And that is maybe the interesting thing why
749
+ it still doesn't work on it because you see
750
+
751
+ 0:21:41.808 --> 0:21:42.637
752
+ they seem.
753
+
754
+ 0:21:43.103 --> 0:21:48.745
755
+ So we are seeing a really impressive result,
756
+ but in most cases it's not that they're really
757
+
758
+ 0:21:48.745 --> 0:21:49.911
759
+ better than human.
760
+
761
+ 0:21:50.170 --> 0:21:56.852
762
+ However, they really have seen more data than
763
+ any human ever has seen in this lifetime.
764
+
765
+ 0:21:57.197 --> 0:22:01.468
766
+ They can just process so much data, so.
767
+
768
+ 0:22:01.501 --> 0:22:08.425
769
+ The question is, can we make them more efficient
770
+ so that they can learn similarly well without
771
+
772
+ 0:22:08.425 --> 0:22:09.592
773
+ that much data?
774
+
775
+ 0:22:09.592 --> 0:22:16.443
776
+ And that is essential if we now go to low-resource
777
+ languages where we might never get that much
778
+
779
+ 0:22:16.443 --> 0:22:21.254
780
+ data, and we should be also able to achieve
781
+ a reasonable performance.
782
+
783
+ 0:22:23.303 --> 0:22:32.399
784
+ On the other hand, this of course links also
785
+ to one topic which we will cover later: If
786
+
787
+ 0:22:32.399 --> 0:22:37.965
788
+ you think about this, it's really important
789
+ that your algorithms are also very efficient
790
+
791
+ 0:22:37.965 --> 0:22:41.280
792
+ in order to process that much data both in
793
+ training.
794
+
795
+ 0:22:41.280 --> 0:22:46.408
796
+ If you have more data, you want to process
797
+ more data so you can make use of that.
798
+
799
+ 0:22:46.466 --> 0:22:54.499
800
+ On the other hand, if more and more data is
801
+ processed, more and more people will use machine
802
+
803
+ 0:22:54.499 --> 0:23:06.816
804
+ translation to generate translations, and it
805
+ will be important to: And there is yeah, there
806
+
807
+ 0:23:06.816 --> 0:23:07.257
808
+ is.
809
+
810
+ 0:23:07.607 --> 0:23:10.610
811
+ More.
812
+
813
+ 0:23:10.170 --> 0:23:17.262
814
+ More data generated every day, we hear just
815
+ some general numbers on how much data there
816
+
817
+ 0:23:17.262 --> 0:23:17.584
818
+ is.
819
+
820
+ 0:23:17.584 --> 0:23:24.595
821
+ It says that a lot of the data we produce
822
+ at least at the moment is text rich, so text
823
+
824
+ 0:23:24.595 --> 0:23:26.046
825
+ that is produced.
826
+
827
+ 0:23:26.026 --> 0:23:29.748
828
+ That is very important to either wise.
829
+
830
+ 0:23:29.748 --> 0:23:33.949
831
+ We can use it as training data in some way.
832
+
833
+ 0:23:33.873 --> 0:23:40.836
834
+ That we want to translate some of that because
835
+ it might not be published in all the languages,
836
+
837
+ 0:23:40.836 --> 0:23:46.039
838
+ and step with the need for machine translation
839
+ is even more important.
840
+
841
+ 0:23:47.907 --> 0:23:51.547
842
+ So what are the challenges with this?
843
+
844
+ 0:23:51.831 --> 0:24:01.360
845
+ So first of all that seems to be very good
846
+ news, so there is more and more data, so we
847
+
848
+ 0:24:01.360 --> 0:24:10.780
849
+ can just wait for three years and have more
850
+ data, and then our system will be better.
851
+
852
+ 0:24:11.011 --> 0:24:22.629
853
+ If you see in competitions, the system performance
854
+ increases.
855
+
856
+ 0:24:24.004 --> 0:24:27.190
857
+ See that here are three different systems.
858
+
859
+ 0:24:27.190 --> 0:24:34.008
860
+ BLEU score is a metric to measure how good an
861
+ MT system is, and we'll talk about evaluation
862
+
863
+ 0:24:34.008 --> 0:24:40.974
864
+ in the next week, so you'll learn how to evaluate
865
+ machine translation, and there is also a practical session.
866
+
867
+ 0:24:41.581 --> 0:24:45.219
868
+ And so.
869
+
870
+ 0:24:44.784 --> 0:24:50.960
871
+ This shows you that this is like how much
872
+ data of the training data you have five percent.
873
+
874
+ 0:24:50.960 --> 0:24:56.117
875
+ You're significantly worse than if you're
876
+ forty percent and eighty percent.
877
+
878
+ 0:24:56.117 --> 0:25:02.021
879
+ You're getting better and you're seeing two
880
+ between this curve, which maybe not really
881
+
882
+ 0:25:02.021 --> 0:25:02.971
883
+ flattens out.
884
+
885
+ 0:25:02.971 --> 0:25:03.311
886
+ But.
887
+
888
+ 0:25:03.263 --> 0:25:07.525
889
+ Of course, the gains you get are normally
890
+ smaller and smaller.
891
+
892
+ 0:25:07.525 --> 0:25:09.216
893
+ The more data you have,.
894
+
895
+ 0:25:09.549 --> 0:25:21.432
896
+ If your improvements are unnormally better,
897
+ if you add the same thing or even double your
898
+
899
+ 0:25:21.432 --> 0:25:25.657
900
+ data late, of course more data.
901
+
902
+ 0:25:26.526 --> 0:25:34.955
903
+ However, you see the clear tendency if you
904
+ need to improve your system.
905
+
906
+ 0:25:34.955 --> 0:25:38.935
907
+ This is possible by just getting more data.
908
+
909
+ 0:25:39.039 --> 0:25:41.110
910
+ But it's not all about data.
911
+
912
+ 0:25:41.110 --> 0:25:45.396
913
+ It can also be the domain of the data that
914
+ you're building the system for.
915
+
916
+ 0:25:45.865 --> 0:25:55.668
917
+ So this was a test on machine translation
918
+ system on translating genome data.
919
+
920
+ 0:25:55.668 --> 0:26:02.669
921
+ We have the like SAI said he's working on
922
+ translating.
923
+
924
+ 0:26:02.862 --> 0:26:06.868
925
+ Here you see the performance given in BLEU score.
926
+
927
+ 0:26:06.868 --> 0:26:12.569
928
+ You see one system which only was trained
929
+ on genome data and it only has.
930
+
931
+ 0:26:12.812 --> 0:26:17.742
932
+ That's very, very few for machine translation.
933
+
934
+ 0:26:18.438 --> 0:26:23.927
935
+ And to compare that to a system which was
936
+ generally trained on used translation data.
937
+
938
+ 0:26:24.104 --> 0:26:34.177
939
+ With four point five million sentences so
940
+ roughly one hundred times as much data you
941
+
942
+ 0:26:34.177 --> 0:26:40.458
943
+ still see that this system doesn't really work
944
+ well.
945
+
946
+ 0:26:40.820 --> 0:26:50.575
947
+ So you see it's not only about data, it's
948
+ also that the data has to somewhat fit to the
949
+
950
+ 0:26:50.575 --> 0:26:51.462
951
+ domain.
952
+
953
+ 0:26:51.831 --> 0:26:58.069
954
+ The more general data you get that you have
955
+ covered up all domains.
956
+
957
+ 0:26:58.418 --> 0:27:07.906
958
+ But that's very difficult and especially for
959
+ more specific domains.
960
+
961
+ 0:27:07.906 --> 0:27:16.696
962
+ It can be really important to get data which
963
+ fits your domain.
964
+
965
+ 0:27:16.716 --> 0:27:18.520
966
+ Maybe if you can do some very much broccoli
967
+ or something like that, maybe if you.
968
+
969
+ 0:27:18.598 --> 0:27:22.341
970
+ To say okay, concentrate this as you like
971
+ for being at better.
972
+
973
+ 0:27:24.564 --> 0:27:28.201
974
+ It's not that easy to prompt it.
975
+
976
+ 0:27:28.201 --> 0:27:35.807
977
+ You can do the prompting in the more traditional
978
+ way of fine tuning.
979
+
980
+ 0:27:35.807 --> 0:27:44.514
981
+ Then, of course, if you select UIV later combine
982
+ this one, you can get better.
983
+
984
+ 0:27:44.904 --> 0:27:52.675
985
+ But it will always be that this type of similar
986
+ data is much more important than the general.
987
+
988
+ 0:27:52.912 --> 0:28:00.705
989
+ So of course it can make the lower system
990
+ a lot better if you search for similar data
991
+
992
+ 0:28:00.705 --> 0:28:01.612
993
+ and find.
994
+
995
+ 0:28:02.122 --> 0:28:08.190
996
+ Will have a lecture on domain adaptation where
997
+ it's exactly the idea how you can make systems
998
+
999
+ 0:28:08.190 --> 0:28:13.935
1000
+ in these situations better so you can adapt
1001
+ it to this data but then you still need this
1002
+
1003
+ 0:28:13.935 --> 0:28:14.839
1004
+ type of data.
1005
+
1006
+ 0:28:15.335 --> 0:28:21.590
1007
+ And in prompting it might work if you have
1008
+ seen it in your data so it can make the system
1009
+
1010
+ 0:28:21.590 --> 0:28:25.134
1011
+ aware and tell it focus more in this type of
1012
+ data.
1013
+
1014
+ 0:28:25.465 --> 0:28:30.684
1015
+ But if you haven't had enough of the really
1016
+ specific good matching data, I think it will
1017
+
1018
+ 0:28:30.684 --> 0:28:31.681
1019
+ always not work.
1020
+
1021
+ 0:28:31.681 --> 0:28:37.077
1022
+ So you need to have this type of data and
1023
+ therefore it's important not only to have general
1024
+
1025
+ 0:28:37.077 --> 0:28:42.120
1026
+ data but also data, at least in your overall
1027
+ system, which really fits to the domain.
1028
+
1029
+ 0:28:45.966 --> 0:28:53.298
1030
+ And then the second thing, of course, is you
1031
+ need to have data that has good quality.
1032
+
1033
+ 0:28:53.693 --> 0:29:00.170
1034
+ In the early stages it might be good to have
1035
+ all the data but later it's especially important
1036
+
1037
+ 0:29:00.170 --> 0:29:06.577
1038
+ that you have somehow good quality and so that
1039
+ you're learning what you really want to learn
1040
+
1041
+ 0:29:06.577 --> 0:29:09.057
1042
+ and not learning some great things.
1043
+
1044
+ 0:29:10.370 --> 0:29:21.551
1045
+ We talked about this with the kilometers and
1046
+ miles, so if you just take in some type of
1047
+
1048
+ 0:29:21.551 --> 0:29:26.253
1049
+ data and don't look at the quality,.
1050
+
1051
+ 0:29:26.766 --> 0:29:30.875
1052
+ But of course, the question here is what is
1053
+ good quality data?
1054
+
1055
+ 0:29:31.331 --> 0:29:35.054
1056
+ It is not yet that easy to define what is
1057
+ a good quality data.
1058
+
1059
+ 0:29:36.096 --> 0:29:43.961
1060
+ That doesn't mean it has to be what people generally
1061
+ assume as high quality text or so, like written
1062
+
1063
+ 0:29:43.961 --> 0:29:47.814
1064
+ by a Nobel Prize winner or something like that.
1065
+
1066
+ 0:29:47.814 --> 0:29:54.074
1067
+ This is not what we mean by this quality,
1068
+ but again the most important again.
1069
+
1070
+ 0:29:54.354 --> 0:30:09.181
1071
+ So if you have Twitter data, high quality
1072
+ data doesn't mean you have now some novels.
1073
+
1074
+ 0:30:09.309 --> 0:30:12.875
1075
+ Test data, but it should also be represented
1076
+ similarly.
1077
+
1078
+ 0:30:12.875 --> 0:30:18.480
1079
+ Don't have, for example, quality definitely
1080
+ as it should be really translating yourself
1081
+
1082
+ 0:30:18.480 --> 0:30:18.862
1083
+ into.
1084
+
1085
+ 0:30:19.199 --> 0:30:25.556
1086
+ So especially if you crawl data you would
1087
+ often have that it's not a direct translation.
1088
+
1089
+ 0:30:25.805 --> 0:30:28.436
1090
+ So then, of course, this is not high quality
1091
+ training data.
1092
+
1093
+ 0:30:29.449 --> 0:30:39.974
1094
+ But in generally that's a very difficult thing
1095
+ to, and it's very difficult to design what
1096
+
1097
+ 0:30:39.974 --> 0:30:41.378
1098
+ is reading.
1099
+
1100
+ 0:30:41.982 --> 0:30:48.333
1101
+ And of course one metric is always: the quality
1102
+ of your data is good if your machine translation gets better.
1103
+
1104
+ 0:30:48.648 --> 0:30:50.719
1105
+ So that is like the indirect.
1106
+
1107
+ 0:30:50.991 --> 0:30:52.447
1108
+ Well, what can we measure?
1109
+
1110
+ 0:30:52.447 --> 0:30:57.210
1111
+ Of course, it's difficult to always try a
1112
+ lot of things and evaluate either of them,
1113
+
1114
+ 0:30:57.210 --> 0:30:59.396
1115
+ build a full MT system and then check.
1116
+
1117
+ 0:30:59.396 --> 0:31:00.852
1118
+ Oh, was this a good idea?
1119
+
1120
+ 0:31:00.852 --> 0:31:01.357
1121
+ I mean,.
1122
+
1123
+ 0:31:01.581 --> 0:31:19.055
1124
+ Say you have two tokenizers which split sentences
1125
+ into words, and you wonder which one you really want to apply.
1126
+
1127
+ 0:31:19.179 --> 0:31:21.652
1128
+ Now you could maybe argue or your idea could
1129
+ be.
1130
+
1131
+ 0:31:21.841 --> 0:31:30.186
1132
+ Just take it there very fast and then get
1133
+ the result, but the problem is there is not
1134
+
1135
+ 0:31:30.186 --> 0:31:31.448
1136
+ always this.
1137
+
1138
+ 0:31:31.531 --> 0:31:36.269
1139
+ One thing that works very well for small data.
1140
+
1141
+ 0:31:36.269 --> 0:31:43.123
1142
+ It's not for sure that the same effect will
1143
+ happen at large scale.
1144
+
1145
+ 0:31:43.223 --> 0:31:50.395
1146
+ This idea really improves on very low resource
1147
+ data if only train on hundred words.
1148
+
1149
+ 0:31:51.271 --> 0:31:58.357
1150
+ But if you use it for a large data set, it
1151
+ doesn't really matter and all your ideas not.
1152
+
1153
+ 0:31:58.598 --> 0:32:01.172
1154
+ So that is also a typical thing.
1155
+
1156
+ 0:32:01.172 --> 0:32:05.383
1157
+ This quality issue is more and more important
1158
+ if you.
1159
+
1160
+ 0:32:06.026 --> 0:32:16.459
1161
+ By one motivation which generally you should
1162
+ have, you want to represent your data in having
1163
+
1164
+ 0:32:16.459 --> 0:32:17.469
1165
+ as many.
1166
+
1167
+ 0:32:17.677 --> 0:32:21.805
1168
+ Why is this the case any idea?
1169
+
1170
+ 0:32:21.805 --> 0:32:33.389
1171
+ Why this could be a motivation that we try
1172
+ to represent the data in a way that we have
1173
+
1174
+ 0:32:33.389 --> 0:32:34.587
1175
+ as many.
1176
+
1177
+ 0:32:38.338 --> 0:32:50.501
1178
+ We also want to learn about the fun text because
1179
+ maybe sometimes some grows in the fun text.
1180
+
1181
+ 0:32:52.612 --> 0:32:54.020
1182
+ The context is here.
1183
+
1184
+ 0:32:54.020 --> 0:32:56.432
1185
+ It's more about the learning first.
1186
+
1187
+ 0:32:56.432 --> 0:33:00.990
1188
+ You can generally learn better if you've seen
1189
+ something more often.
1190
+
1191
+ 0:33:00.990 --> 0:33:06.553
1192
+ So if you have seen an event only once, it's
1193
+ really hard to learn about the event.
1194
+
1195
+ 0:33:07.107 --> 0:33:15.057
1196
+ If you have seen an event a hundred times
1197
+ your bearing estimating which and maybe that
1198
+
1199
+ 0:33:15.057 --> 0:33:18.529
1200
+ is the context, then you can use the.
1201
+
1202
+ 0:33:18.778 --> 0:33:21.331
1203
+ So, for example, if you here have the word
1204
+ towels.
1205
+
1206
+ 0:33:21.761 --> 0:33:28.440
1207
+ If you would just take the data normally you
1208
+ would directly process the data.
1209
+
1210
+ 0:33:28.440 --> 0:33:32.893
1211
+ In the upper case you would have the house with
1212
+ the dot.
1213
+
1214
+ 0:33:32.893 --> 0:33:40.085
1215
+ That's a different word than the house this
1216
+ way and then the house with the comma.
1217
+
1218
+ 0:33:40.520 --> 0:33:48.365
1219
+ So you want to learn how this translates into
1220
+ house, but you translate an upper case.
1221
+
1222
+ 0:33:48.365 --> 0:33:50.281
1223
+ How this translates.
1224
+
1225
+ 0:33:50.610 --> 0:33:59.445
1226
+ You were learning how to translate into house
1227
+ and house, so you have to learn four different
1228
+
1229
+ 0:33:59.445 --> 0:34:00.205
1230
+ things.
1231
+
1232
+ 0:34:00.205 --> 0:34:06.000
1233
+ Instead, we really want to learn that house
1234
+ gets into house.
1235
+
1236
+ 0:34:06.366 --> 0:34:18.796
1237
+ And then imagine if it would be even a beak,
1238
+ it might be like here a house would be into.
1239
+
1240
+ 0:34:18.678 --> 0:34:22.089
1241
+ Good-bye Then.
1242
+
1243
+ 0:34:22.202 --> 0:34:29.512
1244
+ If it's an upper case then I always have to
1245
+ translate it into a boiler while it's a lower
1246
+
1247
+ 0:34:29.512 --> 0:34:34.955
1248
+ case that is translated into house and that's
1249
+ of course not right.
1250
+
1251
+ 0:34:34.955 --> 0:34:39.260
1252
+ We have to use the context to decide what
1253
+ is better.
1254
+
1255
+ 0:34:39.679 --> 0:34:47.086
1256
+ If you have seen an event several times then
1257
+ you are better able to learn your model and
1258
+
1259
+ 0:34:47.086 --> 0:34:51.414
1260
+ that doesn't matter what type of learning you
1261
+ have.
1262
+
1263
+ 0:34:52.392 --> 0:34:58.981
1264
+ I shouldn't say all but for most of these
1265
+ models it's always better to have like seen
1266
+
1267
+ 0:34:58.981 --> 0:35:00.897
1268
+ an event war more often.
1269
+
1270
+ 0:35:00.920 --> 0:35:11.483
1271
+ Therefore, if you preprocess the data, you
1272
+ should ask the question how you can represent the data
1273
+
1274
+ 0:35:11.483 --> 0:35:14.212
1275
+ in order to have seen.
1276
+
1277
+ 0:35:14.514 --> 0:35:17.885
1278
+ Of course you should not remove that information.
1279
+
1280
+ 0:35:18.078 --> 0:35:25.519
1281
+ So you could now, of course, just lowercase
1282
+ everything.
1283
+
1284
+ 0:35:25.519 --> 0:35:30.303
1285
+ Then you've seen things more often.
1286
+
1287
+ 0:35:30.710 --> 0:35:38.443
1288
+ And that might be an issue because in the
1289
+ final application you want to have real text
1290
+
1291
+ 0:35:38.443 --> 0:35:38.887
1292
+ and.
1293
+
1294
+ 0:35:40.440 --> 0:35:44.003
1295
+ And finally, even it's more important than
1296
+ it's consistent.
1297
+
1298
+ 0:35:44.965 --> 0:35:52.630
1299
+ So this is a problem where, for example, things aren't
1300
+ consistent.
1301
+
1302
+ 0:35:52.630 --> 0:35:58.762
1303
+ So 'I am' is written together as 'I'm' in the training
1304
+ data.
1305
+
1306
+ 0:35:58.762 --> 0:36:04.512
1307
+ And if it's written differently in the test data, you have a mismatch.
1308
+
1309
+ 0:36:04.824 --> 0:36:14.612
1310
+ Therefore, most important is to generate preprocessing
1311
+ and represent your data that is most consistent
1312
+
1313
+ 0:36:14.612 --> 0:36:18.413
1314
+ because it's easier to map how similar.
1315
+
1316
+ 0:36:18.758 --> 0:36:26.588
1317
+ If your text is represented very, very differently
1318
+ then your data will be badly be translated.
1319
+
1320
+ 0:36:26.666 --> 0:36:30.664
1321
+ So we once had the case.
1322
+
1323
+ 0:36:30.664 --> 0:36:40.420
1324
+ For example, there is some data who wrote
1325
+ it, but in German.
1326
+
1327
+ 0:36:40.900 --> 0:36:44.187
1328
+ And if you read it as a human you see it.
1329
+
1330
+ 0:36:44.187 --> 0:36:49.507
1331
+ It's even hard to get the difference because
1332
+ it looks very similar.
1333
+
1334
+ 0:36:50.130 --> 0:37:02.997
1335
+ If you use it for a machine translation system,
1336
+ it would not be able to translate anything
1337
+
1338
+ 0:37:02.997 --> 0:37:08.229
1339
+ of it because it's a different word.
1340
+
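The "looks identical but is a different word" problem often comes down to different character encodings of the same glyph. A small sketch of one way to guard against it, using Python's standard `unicodedata` module; the example strings are made up:

```python
import unicodedata

a = "H\u00e4user"    # precomposed "ä"
b = "Ha\u0308user"   # "a" followed by a combining diaeresis: looks the same on screen
print(a == b)                                   # False: different byte sequences
print(unicodedata.normalize("NFC", a) ==
      unicodedata.normalize("NFC", b))          # True after Unicode normalization
```

Running such a normalization over all training and test data is one cheap way to make the representation more consistent.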
1341
+ 0:37:09.990 --> 0:37:17.736
1342
+ And especially on the other hand you should
1343
+ of course not significantly change the training
1344
+
1345
+ 0:37:17.736 --> 0:37:18.968
1346
+ data thereby.
1347
+
1348
+ 0:37:18.968 --> 0:37:27.155
1349
+ For example, removing case information because
1350
+ if your task is to generate case information.
1351
+
1352
+ 0:37:31.191 --> 0:37:41.081
1353
+ One thing which is a good point to look into
1354
+ in order to see the difficulty of your data
1355
+
1356
+ 0:37:41.081 --> 0:37:42.711
1357
+ is to compare.
1358
+
1359
+ 0:37:43.103 --> 0:37:45.583
1360
+ There are types and tokens.
1361
+
1362
+ 0:37:45.583 --> 0:37:57.983
1363
+ By types we mean the number of unique words in the
1364
+ corpus, so your vocabulary; the tokens are the running words.
1365
+
1366
+ 0:37:58.298 --> 0:38:08.628
1367
+ And then you can look at the type token ratio
1368
+ that means a number of types per token.
1369
+
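A minimal sketch of the quantity just defined, in Python; the toy sentence is only for illustration:

```python
from collections import Counter

def type_token_ratio(tokens):
    """Number of distinct words (types) divided by the number of running words (tokens)."""
    counts = Counter(tokens)
    return len(counts) / len(tokens)

text = "the house and the dog and the cat".split()
print(len(set(text)), len(text), type_token_ratio(text))  # 5 types, 8 tokens, 0.625
```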
1370
+ 0:38:15.815 --> 0:38:22.381
1371
+ You have fewer types than tokens because every
1372
+ word appears at least once in the corpus, but most
1373
+
1374
+ 0:38:22.381 --> 0:38:27.081
1375
+ of them will occur more often until this number
1376
+ is bigger, so.
1377
+
1378
+ 0:38:27.667 --> 0:38:30.548
1379
+ And of course this changes if you have more
1380
+ data.
1381
+
1382
+ 0:38:31.191 --> 0:38:38.103
1383
+ Here is an example from an English Wikipedia.
1384
+
1385
+ 0:38:38.103 --> 0:38:45.015
1386
+ That means each word in average occurs times.
1387
+
1388
+ 0:38:45.425 --> 0:38:47.058
1389
+ Of course there's a big difference.
1390
+
1391
+ 0:38:47.058 --> 0:38:51.323
1392
+ There will be some words which occur one hundred
1393
+ times, but therefore most of the words occur
1394
+
1395
+ 0:38:51.323 --> 0:38:51.777
1396
+ only one.
1397
+
1398
+ 0:38:52.252 --> 0:38:55.165
1399
+ However, you see this ratio goes down.
1400
+
1401
+ 0:38:55.165 --> 0:39:01.812
1402
+ That's a good thing, so you have seen each
1403
+ word more often and therefore your model gets
1404
+
1405
+ 0:39:01.812 --> 0:39:03.156
1406
+ typically better.
1407
+
1408
+ 0:39:03.156 --> 0:39:08.683
1409
+ However, the problem is we always have a lot
1410
+ of words which we have seen.
1411
+
1412
+ 0:39:09.749 --> 0:39:15.111
1413
+ Even here there will be a bunch of words which
1414
+ you have only seen once.
1415
+
1416
+ 0:39:15.111 --> 0:39:20.472
1417
+ However, this can give you an indication about
1418
+ the quality of the data.
1419
+
1420
+ 0:39:20.472 --> 0:39:27.323
1421
+ So you should always, of course, try to achieve
1422
+ data where you have a very low type-to-token
1423
+
1424
+ 0:39:27.323 --> 0:39:28.142
1425
+ ratio.
1426
+
1427
+ 0:39:28.808 --> 0:39:39.108
1428
+ For example, if you compare Simple Wikipedia and
1429
+ normal Wikipedia, what would be your expectation?
1430
+
1431
+ 0:39:41.861 --> 0:39:49.842
1432
+ Yes, that's exactly, but however it's surprisingly
1433
+ only a little bit lower, but you see that it's
1434
+
1435
+ 0:39:49.842 --> 0:39:57.579
1436
+ lower, so we are using less words to express
1437
+ the same thing, and therefore the task to produce
1438
+
1439
+ 0:39:57.579 --> 0:39:59.941
1440
+ this text is also easier.
1441
+
1442
+ 0:40:01.221 --> 0:40:07.702
1443
+ However, as how many words are there, there
1444
+ is no clear definition.
1445
+
1446
+ 0:40:07.787 --> 0:40:19.915
1447
+ So there will be always more words, especially
1448
+ depending on your dataset, how many different
1449
+
1450
+ 0:40:19.915 --> 0:40:22.132
1451
+ words there are.
1452
+
1453
+ 0:40:22.482 --> 0:40:30.027
1454
+ So if you have million tweets where around
1455
+ fifty million tokens and you have six hundred
1456
+
1457
+ 0:40:30.027 --> 0:40:30.875
1458
+ thousand.
1459
+
1460
+ 0:40:31.251 --> 0:40:40.299
1461
+ If you have times this money teen tweeds you
1462
+ also have significantly more tokens but also.
1463
+
1464
+ 0:40:40.660 --> 0:40:58.590
1465
+ So especially in things like the social media,
1466
+ of course, there's always different types of
1467
+
1468
+ 0:40:58.590 --> 0:40:59.954
1469
+ words.
1470
+
1471
+ 0:41:00.040 --> 0:41:04.028
1472
+ Another example from not social media is here.
1473
+
1474
+ 0:41:04.264 --> 0:41:18.360
1475
+ So yeah, there is a small data set, Switchboard-like
1476
+ phone conversations, two million tokens, and
1477
+
1478
+ 0:41:18.360 --> 0:41:22.697
1479
+ only twenty thousand words.
1480
+
1481
+ 0:41:23.883 --> 0:41:37.221
1482
+ If you think about Shakespeare, it has even
1483
+ fewer tokens, significantly less than a million,
1484
+
1485
+ 0:41:37.221 --> 0:41:40.006
1486
+ but the number of.
1487
+
1488
+ 0:41:40.060 --> 0:41:48.781
1489
+ On the other hand, there is this Google N-gram
1490
+ corpus which has tokens and there is always
1491
+
1492
+ 0:41:48.781 --> 0:41:50.506
1493
+ new words coming.
1494
+
1495
+ 0:41:50.991 --> 0:41:52.841
1496
+ Is English.
1497
+
1498
+ 0:41:52.841 --> 0:42:08.103
1499
+ The nice thing about English is that the vocabulary
1500
+ is relatively small, too small, but relatively
1501
+
1502
+ 0:42:08.103 --> 0:42:09.183
1503
+ small.
1504
+
1505
+ 0:42:09.409 --> 0:42:14.224
1506
+ So here you see the Ted Corpus here.
1507
+
1508
+ 0:42:15.555 --> 0:42:18.144
1509
+ You all know TED lectures.
1510
+
1511
+ 0:42:18.144 --> 0:42:26.429
1512
+ They are transcribed, translated, not a source
1513
+ for us, especially small crocus.
1514
+
1515
+ 0:42:26.846 --> 0:42:32.702
1516
+ You can do a lot of experiments with that
1517
+ and you see that the corpus size is relatively
1518
+
1519
+ 0:42:32.702 --> 0:42:36.782
1520
+ similar so we have around four million tokens
1521
+ in this corpus.
1522
+
1523
+ 0:42:36.957 --> 0:42:44.464
1524
+ However, if you look at the vocabulary, English
1525
+ has half as many words in their different words
1526
+
1527
+ 0:42:44.464 --> 0:42:47.045
1528
+ as German and Dutch and Italian.
1529
+
1530
+ 0:42:47.527 --> 0:42:56.260
1531
+ So this is one influence from positional works
1532
+ like which are more frequent in German, the
1533
+
1534
+ 0:42:56.260 --> 0:43:02.978
1535
+ more important since we have all these different
1536
+ morphological forms.
1537
+
1538
+ 0:43:03.263 --> 0:43:08.170
1539
+ There all leads to new words and they need
1540
+ to be somewhat expressed in there.
1541
+
1542
+ 0:43:11.531 --> 0:43:20.278
1543
+ So to deal with this, the question is how
1544
+ can we normalize the text in order to make
1545
+
1546
+ 0:43:20.278 --> 0:43:22.028
1547
+ the text easier?
1548
+
1549
+ 0:43:22.028 --> 0:43:25.424
1550
+ Can we simplify the task easier?
1551
+
1552
+ 0:43:25.424 --> 0:43:29.231
1553
+ But we need to keep all information.
1554
+
1555
+ 0:43:29.409 --> 0:43:32.239
1556
+ So an example where not all information skipped.
1557
+
1558
+ 0:43:32.239 --> 0:43:35.012
1559
+ Of course you make the task easier if you
1560
+ just.
1561
+
1562
+ 0:43:35.275 --> 0:43:41.141
1563
+ You don't have to deal with different cases.
1564
+
1565
+ 0:43:41.141 --> 0:43:42.836
1566
+ It's easier.
1567
+
1568
+ 0:43:42.836 --> 0:43:52.482
1569
+ However, information gets lost and you might
1570
+ need to generate the target.
1571
+
1572
+ 0:43:52.832 --> 0:44:00.153
1573
+ So the question is always: How can we on the
1574
+ one hand simplify the task but keep all the
1575
+
1576
+ 0:44:00.153 --> 0:44:01.223
1577
+ information?
1578
+
1579
+ 0:44:01.441 --> 0:44:06.639
1580
+ Say necessary because it depends on the task.
1581
+
1582
+ 0:44:06.639 --> 0:44:11.724
1583
+ For some tasks you might find to remove the.
1584
+
1585
+ 0:44:14.194 --> 0:44:23.463
1586
+ So the steps they were typically doing are
1587
+ that you segment the words in a running
1588
+
1589
+ 0:44:23.463 --> 0:44:30.696
1590
+ text, so you can normalize word forms and segmentation
1591
+ into sentences.
1592
+
1593
+ 0:44:30.696 --> 0:44:33.955
1594
+ Also, if you have not a single.
1595
+
1596
+ 0:44:33.933 --> 0:44:38.739
1597
+ If this is not a redundancy point to segments,
1598
+ the text is also into segments.
1599
+
1600
+ 0:44:39.779 --> 0:44:52.609
1601
+ So what are we doing there for European language
1602
+ segmentation into words?
1603
+
1604
+ 0:44:52.609 --> 0:44:57.290
1605
+ It's not that complicated.
1606
+
1607
+ 0:44:57.277 --> 0:45:06.001
1608
+ You have to somehow handle the joint words
1609
+ and by handling joint words the most important.
1610
+
1611
+ 0:45:06.526 --> 0:45:11.331
1612
+ So in most systems it really doesn't matter
1613
+ much.
1614
+
1615
+ 0:45:11.331 --> 0:45:16.712
1616
+ If you write, I'm together as one word or
1617
+ as two words.
1618
+
1619
+ 0:45:17.197 --> 0:45:23.511
1620
+ The nice thing about 'I'm' is maybe that this is
1621
+ so often that it doesn't matter if you do both
1622
+
1623
+ 0:45:23.511 --> 0:45:26.560
1624
+ if they both occur often enough.
1625
+
1626
+ 0:45:26.560 --> 0:45:32.802
1627
+ But you'll have some of these cases where
1628
+ they don't occur there often, so you should
1629
+
1630
+ 0:45:32.802 --> 0:45:35.487
1631
+ have more as consistent as possible.
1632
+
1633
+ 0:45:36.796 --> 0:45:41.662
1634
+ But of course things can get more complicated.
1635
+
1636
+ 0:45:41.662 --> 0:45:48.598
1637
+ If you have 'Finland's capital', do you want to
1638
+ split the 's or not?
1639
+
1640
+ 0:45:48.598 --> 0:45:53.256
1641
+ And 'isn't': do you split it, or do you even write it out?
1642
+
1643
+ 0:45:53.433 --> 0:46:00.468
1644
+ And what about like things with hyphens in
1645
+ the middle and so on?
1646
+
1647
+ 0:46:00.540 --> 0:46:07.729
1648
+ So there is not everything is very easy, but
1649
+ is generally possible to somewhat keep as.
1650
+
1651
+ 0:46:11.791 --> 0:46:25.725
1652
+ Sometimes the most challenging and traditional
1653
+ systems were compounds, or how to deal with
1654
+
1655
+ 0:46:25.725 --> 0:46:28.481
1656
+ things like this.
1657
+
1658
+ 0:46:28.668 --> 0:46:32.154
1659
+ The nice thing is, as said, will come to the
1660
+ later.
1661
+
1662
+ 0:46:32.154 --> 0:46:34.501
1663
+ Nowadays we typically use subword.
1664
+
1665
+ 0:46:35.255 --> 0:46:42.261
1666
+ units, so we don't have to deal with this in
1667
+ the preprocessing directly, but in the subword
1668
+
1669
+ 0:46:42.261 --> 0:46:47.804
1670
+ splitting we're doing it, and then we can learn
1671
+ how to best split these.
1672
+
1673
+ 0:46:52.392 --> 0:46:56.974
1674
+ Things Get More Complicated.
1675
+
1676
+ 0:46:56.977 --> 0:46:59.934
1677
+ About non European languages.
1678
+
1679
+ 0:46:59.934 --> 0:47:08.707
1680
+ Because in non European languages, not all
1681
+ of them, there is no space between the words.
1682
+
1683
+ 0:47:09.029 --> 0:47:18.752
1684
+ Nowadays you can also download word segmentation
1685
+ models where you put in the full sentence and
1686
+
1687
+ 0:47:18.752 --> 0:47:22.744
1688
+ then it's getting splitted into parts.
1689
+
1690
+ 0:47:22.963 --> 0:47:31.814
1691
+ And then, of course, it's even that you have
1692
+ different writing systems, sometimes in Japanese.
1693
+
1694
+ 0:47:31.814 --> 0:47:40.385
1695
+ For example, they have these katakana, hiragana
1696
+ and kanji symbols in there, and you have to
1697
+
1698
+ 0:47:40.385 --> 0:47:42.435
1699
+ some idea with these.
1700
+
1701
+ 0:47:49.669 --> 0:47:54.560
1702
+ Then the next thing is we can do some
1703
+ normalization.
1704
+
1705
+ 0:47:54.874 --> 0:48:00.376
1706
+ So the idea is that you map several words
1707
+ onto the same.
1708
+
1709
+ 0:48:00.460 --> 0:48:07.877
1710
+ And that is task dependent, and the idea is
1711
+ to define something like equivalence classes so
1712
+
1713
+ 0:48:07.877 --> 0:48:15.546
1714
+ that words, which have the same meaning where
1715
+ it's not in order to have the difference, to
1716
+
1717
+ 0:48:15.546 --> 0:48:19.423
1718
+ map onto the same thing in order to make the.
1719
+
1720
+ 0:48:19.679 --> 0:48:27.023
1721
+ The most important thing there is about casing,
1722
+ and then there is something like sometimes
1723
+
1724
+ 0:48:27.023 --> 0:48:27.508
1725
+ word.
1726
+
1727
+ 0:48:28.048 --> 0:48:37.063
1728
+ For casing you can do two things, and it
1729
+ depends on the task.
1730
+
1731
+ 0:48:37.063 --> 0:48:44.769
1732
+ You can lowercase everything, maybe some exceptions.
1733
+
1734
+ 0:48:45.045 --> 0:48:47.831
1735
+ For the target side, it's
1736
+ normally not done.
1737
+
1738
+ 0:48:48.188 --> 0:48:51.020
1739
+ Why is it not done?
1740
+
1741
+ 0:48:51.020 --> 0:48:56.542
1742
+ Why should you only do it for the source side?
1743
+
1744
+ 0:48:56.542 --> 0:49:07.729
1745
+ Yes, so you have to generate correct text
1746
+ instead of lower case and uppercase.
1747
+
1748
+ 0:49:08.848 --> 0:49:16.370
1749
+ Nowadays we always do true casing on both
1750
+ sides, also on the source side, that means you
1751
+
1752
+ 0:49:16.370 --> 0:49:17.610
1753
+ keep the case.
1754
+
1755
+ 0:49:17.610 --> 0:49:24.966
1756
+ The only thing where people try to work on
1757
+ or sometimes do that is that at the beginning
1758
+
1759
+ 0:49:24.966 --> 0:49:25.628
1760
+ of the.
1761
+
1762
+ 0:49:25.825 --> 0:49:31.115
1763
+ For words like this, this is not that important
1764
+ because you will have seen otherwise a lot
1765
+
1766
+ 0:49:31.115 --> 0:49:31.696
1767
+ of times.
1768
+
1769
+ 0:49:31.696 --> 0:49:36.928
1770
+ But if you know have rare words, which you
1771
+ only have seen maybe three times, and you have
1772
+
1773
+ 0:49:36.928 --> 0:49:42.334
1774
+ only seen in the middle of the sentence, and
1775
+ now it occurs at the beginning of the sentence,
1776
+
1777
+ 0:49:42.334 --> 0:49:45.763
1778
+ which is upper case, then you don't know how
1779
+ to deal with.
1780
+
1781
+ 0:49:46.146 --> 0:49:50.983
1782
+ So then it might be good to do a true casing.
1783
+
1784
+ 0:49:50.983 --> 0:49:56.241
1785
+ That means you recase each word on the beginning.
1786
+
1787
+ 0:49:56.576 --> 0:49:59.830
1788
+ The only question, of course, is how do you
1789
+ recase it?
1790
+
1791
+ 0:49:59.830 --> 0:50:01.961
1792
+ So what case would you always know?
1793
+
1794
+ 0:50:02.162 --> 0:50:18.918
1795
+ Word of the senders, or do you have a better
1796
+ solution, especially not English, maybe German.
1797
+
1798
+ 0:50:18.918 --> 0:50:20.000
1799
+ It's.
1800
+
1801
+ 0:50:25.966 --> 0:50:36.648
1802
+ The fancy solution would be to count hope
1803
+ and decide based on this, the unfancy running
1804
+
1805
+ 0:50:36.648 --> 0:50:43.147
1806
+ would: Think it's not really good because most
1807
+ of the cane boards are lower paced.
1808
+
1809
+ 0:50:43.683 --> 0:50:53.657
1810
+ That is one idea to count and definitely better
1811
+ because as a word more often occurs upper case.
1812
+
1813
+ 0:50:53.653 --> 0:50:57.934
1814
+ Otherwise you only have a lower case at the
1815
+ beginning where you have again.
1816
+
1817
+ 0:50:58.338 --> 0:51:03.269
1818
+ Haven't gained anything, you can make it even
1819
+ a bit better when counting.
1820
+
1821
+ 0:51:03.269 --> 0:51:09.134
1822
+ You're ignoring the first position so that
1823
+ you don't count the word beginning and yeah,
1824
+
1825
+ 0:51:09.134 --> 0:51:12.999
1826
+ that's typically how it's done to do this type
1827
+ of casing.
1828
+
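A minimal sketch of the counting heuristic just described, assuming sentences are already tokenized into lists of words; skipping the sentence-initial token keeps the counts from being biased by capitalization at sentence starts:

```python
from collections import Counter, defaultdict

def learn_truecase_model(sentences):
    # Count surface forms of each word, ignoring the first token of every sentence.
    counts = defaultdict(Counter)
    for sent in sentences:
        for token in sent[1:]:
            counts[token.lower()][token] += 1
    return {w: c.most_common(1)[0][0] for w, c in counts.items()}

def truecase_first_word(sentence, model):
    # Recase only the sentence-initial token to its most frequent casing.
    first = sentence[0]
    sentence[0] = model.get(first.lower(), first)
    return sentence
```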
1829
+ 0:51:13.273 --> 0:51:23.907
1830
+ And that's the easy thing you can't even use
1831
+ like then bygram teachers who work pairs.
1832
+
1833
+ 0:51:23.907 --> 0:51:29.651
1834
+ There's very few words which occur more often.
1835
+
1836
+ 0:51:29.970 --> 0:51:33.163
1837
+ It's OK to have them boast because you can
1838
+ otherwise learn it.
1839
+
1840
+ 0:51:36.376 --> 0:51:52.305
1841
+ Another thing about these classes is to use
1842
+ word classes that were partly done, for example,
1843
+
1844
+ 0:51:52.305 --> 0:51:55.046
1845
+ and more often.
1846
+
1847
+ 0:51:55.375 --> 0:51:57.214
1848
+ Ten Thousand One Hundred Books.
1849
+
1850
+ 0:51:57.597 --> 0:52:07.397
1851
+ And then for an system that might not be important
1852
+ you can do something at number books.
1853
+
1854
+ 0:52:07.847 --> 0:52:16.450
1855
+ However, you see here already that it's not
1856
+ that easy because if you have one book you
1857
+
1858
+ 0:52:16.450 --> 0:52:19.318
1859
+ don't have to do with a pro.
1860
+
1861
+ 0:52:20.020 --> 0:52:21.669
1862
+ Always be careful.
1863
+
1864
+ 0:52:21.669 --> 0:52:28.094
1865
+ It's very fast to ignore some exceptions and
1866
+ make more things worse than better.
1867
+
1868
+ 0:52:28.488 --> 0:52:37.879
1869
+ So it's always difficult to decide when to
1870
+ do this and when to better not do it and keep
1871
+
1872
+ 0:52:37.879 --> 0:52:38.724
1873
+ things.
1874
+
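A sketch of the number-class idea, keeping the caveat above in mind: only digit sequences are replaced, so spelled-out words like "one" stay untouched. The "@num" token is just an illustrative placeholder name:

```python
import re

NUM = re.compile(r"\b\d+\b")

def normalize_numbers(text):
    # Replace digit sequences with a class token; "one" stays untouched,
    # which sidesteps the singular/plural exception mentioned above.
    return NUM.sub("@num", text)

print(normalize_numbers("He sold 10100 books and one book."))
# -> "He sold @num books and one book."
```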
1875
+ 0:52:43.483 --> 0:52:56.202
1876
+ Then the next step is sentence segmentation,
1877
+ so we are typically working on sentences.
1878
+
1879
+ 0:52:56.476 --> 0:53:11.633
1880
+ However, with dots things are a bit more complicated,
1881
+ so you may need to do a bit more.
1882
+
1883
+ 0:53:11.731 --> 0:53:20.111
1884
+ You can even have some type of classifier
1885
+ with features by then generally.
1886
+
1887
+ 0:53:20.500 --> 0:53:30.731
1888
+ Is not too complicated, so you can have different
1889
+ types of classifiers to do that, but in generally.
1890
+
1891
+ 0:53:30.650 --> 0:53:32.537
1892
+ I Didn't Know It.
1893
+
1894
+ 0:53:33.393 --> 0:53:35.583
1895
+ It's not a super complicated task.
1896
+
1897
+ 0:53:35.583 --> 0:53:39.461
1898
+ There are nowadays also a lot of libraries
1899
+ which you can use.
1900
+
1901
+ 0:53:39.699 --> 0:53:45.714
1902
+ To do that normally if you're doing the normalization
1903
+ beforehand that can be done there so you only
1904
+
1905
+ 0:53:45.714 --> 0:53:51.126
1906
+ split up the dot if it's like the sentence
1907
+ boundary and otherwise you keep it to the word
1908
+
1909
+ 0:53:51.126 --> 0:53:54.194
1910
+ so you can do that a bit jointly with the segmentation.
1911
+
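A rough sketch of such a sentence splitter in Python, assuming a toy abbreviation list; real libraries ship much longer lists and trained classifiers:

```python
import re

ABBREVIATIONS = {"dr.", "e.g.", "etc.", "prof."}   # toy list, for illustration only

def split_sentences(text):
    # Split after ., ! or ? followed by whitespace and an uppercase letter,
    # unless the dot belongs to a known abbreviation.
    sentences, start = [], 0
    for match in re.finditer(r"[.!?]\s+(?=[A-Z])", text):
        candidate = text[start:match.end()].strip()
        if candidate.split()[-1].lower() in ABBREVIATIONS:
            continue                                # dot is not a sentence boundary
        sentences.append(candidate)
        start = match.end()
    if text[start:].strip():
        sentences.append(text[start:].strip())
    return sentences

print(split_sentences("Dr. Smith arrived. He was late."))
# -> ['Dr. Smith arrived.', 'He was late.']
```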
1912
+ 0:53:54.634 --> 0:54:06.017
1913
+ It's something to think about to care because
1914
+ it's where errors happen.
1915
+
1916
+ 0:54:06.017 --> 0:54:14.712
1917
+ However, on the one end you can still do it
1918
+ very well.
1919
+
1920
+ 0:54:14.834 --> 0:54:19.740
1921
+ You will never get data which is perfectly
1922
+ clean and where everything is great.
1923
+
1924
+ 0:54:20.340 --> 0:54:31.020
1925
+ There's just too much data and it will never
1926
+ happen, so therefore it's important to be aware
1927
+
1928
+ 0:54:31.020 --> 0:54:35.269
1929
+ of that during the full development.
1930
+
1931
+ 0:54:37.237 --> 0:54:42.369
1932
+ And one last thing about the preprocessing,
1933
+ we'll get into the representation.
1934
+
1935
+ 0:54:42.369 --> 0:54:47.046
1936
+ If you're working on that, you'll get a friend
1937
+ with regular expression.
1938
+
1939
+ 0:54:47.046 --> 0:54:50.034
1940
+ That's not only how you do all this matching.
1941
+
1942
+ 0:54:50.430 --> 0:55:03.811
1943
+ And if you look into the scripts of how to
1944
+ deal with punctuation marks and stuff like
1945
+
1946
+ 0:55:03.811 --> 0:55:04.900
1947
+ that,.
1948
+
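A small example of the kind of regular-expression tokenization such scripts do; the pattern is a simplification, and how it treats contractions like "I'm" is exactly the design decision discussed earlier:

```python
import re

# Split off punctuation as separate tokens while keeping dotted abbreviations
# such as "e.g." and decimal numbers intact; a rough sketch only.
TOKEN = re.compile(r"\w+(?:\.\w+)*\.?|[^\w\s]")

def tokenize(text):
    return TOKEN.findall(text)

print(tokenize("I'm going home, aren't you?"))
# -> ['I', "'", 'm', 'going', 'home', ',', 'aren', "'", 't', 'you', '?']
```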
1949
+ 0:55:11.011 --> 0:55:19.025
1950
+ So if we have now the data of our next step
1951
+ to build, the system is to represent our words.
1952
+
1953
+ 0:55:19.639 --> 0:55:27.650
1954
+ Before we start with this, any more questions
1955
+ about preprocessing.
1956
+
1957
+ 0:55:27.650 --> 0:55:32.672
1958
+ While we work on the pure text, I'm sure.
1959
+
1960
+ 0:55:33.453 --> 0:55:40.852
1961
+ The idea is again to make things more simple
1962
+ because if you think about the production mark
1963
+
1964
+ 0:55:40.852 --> 0:55:48.252
1965
+ at the beginning of a sentence, it might be
1966
+ that you haven't seen the word or, for example,
1967
+
1968
+ 0:55:48.252 --> 0:55:49.619
1969
+ think of titles.
1970
+
1971
+ 0:55:49.619 --> 0:55:56.153
1972
+ In newspaper articles there's: So you then
1973
+ have seen the word now in the title before,
1974
+
1975
+ 0:55:56.153 --> 0:55:58.425
1976
+ and the text you have never seen.
1977
+
1978
+ 0:55:58.898 --> 0:56:03.147
1979
+ But there is always the decision.
1980
+
1981
+ 0:56:03.123 --> 0:56:09.097
1982
+ Do I gain more because I've seen things more
1983
+ often or do I lose because now I remove information
1984
+
1985
+ 0:56:09.097 --> 0:56:11.252
1986
+ which helps me to the same degree?
1987
+
1988
+ 0:56:11.571 --> 0:56:21.771
1989
+ Because if we, for example, do that in German
1990
+ and remove the case, this might be an important
1991
+
1992
+ 0:56:21.771 --> 0:56:22.531
1993
+ issue.
1994
+
1995
+ 0:56:22.842 --> 0:56:30.648
1996
+ So there is not the perfect solution, but
1997
+ generally you can get some arrows to make things
1998
+
1999
+ 0:56:30.648 --> 0:56:32.277
2000
+ look more similar.
2001
+
2002
+ 0:56:35.295 --> 0:56:43.275
2003
+ What you can do about products like the state
2004
+ of the area or the trends that are more or
2005
+
2006
+ 0:56:43.275 --> 0:56:43.813
2007
+ less.
2008
+
2009
+ 0:56:44.944 --> 0:56:50.193
2010
+ It matters even less because models get more
2011
+ powerful, so it's not that important, but be
2012
+
2013
+ 0:56:50.193 --> 0:56:51.136
2014
+ careful partly.
2015
+
2016
+ 0:56:51.136 --> 0:56:56.326
2017
+ It's also the evaluation thing because these
2018
+ things which are problematic are happening
2019
+
2020
+ 0:56:56.326 --> 0:56:57.092
2021
+ very rarely.
2022
+
2023
+ 0:56:57.092 --> 0:57:00.159
2024
+ If you take average performance, it doesn't
2025
+ matter.
2026
+
2027
+ 0:57:00.340 --> 0:57:06.715
2028
+ However, in between it's doing the stupid
2029
+ mistakes that don't count on average, but they
2030
+
2031
+ 0:57:06.715 --> 0:57:08.219
2032
+ are not really good.
2033
+
2034
+ 0:57:09.089 --> 0:57:15.118
2035
+ Done you do some type of tokenization?
2036
+
2037
+ 0:57:15.118 --> 0:57:19.911
2038
+ You can do true casing or not.
2039
+
2040
+ 0:57:19.911 --> 0:57:28.723
2041
+ Some people nowadays don't do it, but that's
2042
+ still done.
2043
+
2044
+ 0:57:28.948 --> 0:57:34.441
2045
+ Then it depends a bit on the type
2046
+ of domain.
2047
+
2048
+ 0:57:34.441 --> 0:57:37.437
2049
+ Again we have so translation.
2050
+
2051
+ 0:57:37.717 --> 0:57:46.031
2052
+ So in the text sometimes there is mark in
2053
+ the menu, later the shortcut.
2054
+
2055
+ 0:57:46.031 --> 0:57:49.957
2056
+ This letter is used for shortcut.
2057
+
2058
+ 0:57:49.957 --> 0:57:57.232
2059
+ You cannot mistake the word because it's no
2060
+ longer a file but.
2061
+
2062
+ 0:57:58.018 --> 0:58:09.037
2063
+ Then you cannot deal with it, so then it might
2064
+ make sense to remove this.
2065
+
2066
+ 0:58:12.032 --> 0:58:17.437
2067
+ Now the next step is how to match words into
2068
+ numbers.
2069
+
2070
+ 0:58:17.437 --> 0:58:22.142
2071
+ Machine learning models only deal with numbers.
2072
+
2073
+ 0:58:22.342 --> 0:58:27.091
2074
+ The first idea is to use words as our basic
2075
+ components.
2076
+
2077
+ 0:58:27.247 --> 0:58:40.695
2078
+ And then you have a large vocabulary where
2079
+ each word gets mapped to an index.
2080
+
2081
+ 0:58:40.900 --> 0:58:49.059
2082
+ So your sentence 'go home' is now a sequence of numbers, and that is
2083
+ your input.
2084
+
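A minimal word-to-index vocabulary along the lines of the "go home" example above; reserving index 0 for unknown words anticipates the problem discussed next:

```python
UNK = 0

def build_vocab(corpus_tokens):
    vocab = {"<unk>": UNK}
    for token in corpus_tokens:
        vocab.setdefault(token, len(vocab))
    return vocab

vocab = build_vocab("go home now go home".split())
sentence = [vocab.get(w, UNK) for w in "go home please".split()]
print(vocab)      # {'<unk>': 0, 'go': 1, 'home': 2, 'now': 3}
print(sentence)   # [1, 2, 0]  -- "please" was never seen, so it maps to <unk>
```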
2085
+ 0:58:52.052 --> 0:59:00.811
2086
+ So the nice thing is you have very short sequences
2087
+ so that you can deal with them.
2088
+
2089
+ 0:59:00.811 --> 0:59:01.867
2090
+ However,.
2091
+
2092
+ 0:59:01.982 --> 0:59:11.086
2093
+ So you have not really understood how words
2094
+ are processed.
2095
+
2096
+ 0:59:11.086 --> 0:59:16.951
2097
+ Why is this or can that be a problem?
2098
+
2099
+ 0:59:17.497 --> 0:59:20.741
2100
+ And there is an easy solution to deal with
2101
+ unknown words.
2102
+
2103
+ 0:59:20.741 --> 0:59:22.698
2104
+ You just have one token, which is.
2105
+
2106
+ 0:59:23.123 --> 0:59:25.906
2107
+ Every word that is rare or missing in your training
2108
+ data, you deal with this way.
2109
+
2110
+ 0:59:26.206 --> 0:59:34.938
2111
+ That's working a bit for some problems, but
2112
+ in general it's not good because you know nothing
2113
+
2114
+ 0:59:34.938 --> 0:59:35.588
2115
+ about.
2116
+
2117
+ 0:59:35.895 --> 0:59:38.770
2118
+ Can at least deal with this and maybe map
2119
+ it.
2120
+
2121
+ 0:59:38.770 --> 0:59:44.269
2122
+ So an easy solution in machine translation
2123
+ is always if it's an unknown word or we just
2124
+
2125
+ 0:59:44.269 --> 0:59:49.642
2126
+ copy it to the target side because unknown
2127
+ words are often named entities and in many
2128
+
2129
+ 0:59:49.642 --> 0:59:52.454
2130
+ languages the good solution is just to keep.
2131
+
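A toy sketch of this copy-through trick; real systems use attention-based alignment to decide which source word an unknown corresponds to, while this sketch naively assumes the same position:

```python
def copy_unknowns(source_tokens, translated_tokens):
    # Wherever the system emitted the <unk> placeholder, copy the "aligned"
    # source word instead (named entities often survive translation unchanged).
    output = []
    for i, token in enumerate(translated_tokens):
        if token == "<unk>" and i < len(source_tokens):
            output.append(source_tokens[i])
        else:
            output.append(token)
    return output
```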
2132
+ 0:59:53.013 --> 1:00:01.203
2133
+ So that is somehow a trick, trick, but yeah,
2134
+ that's of course not a good thing.
2135
+
2136
+ 1:00:01.821 --> 1:00:08.959
2137
+ It's also a problem if you deal with full
2138
+ words is that you have very few examples for
2139
+
2140
+ 1:00:08.959 --> 1:00:09.451
2141
+ some.
2142
+
2143
+ 1:00:09.949 --> 1:00:17.696
2144
+ And of course if you've seen a word once you
2145
+ can, someone may be translated, but we will
2146
+
2147
+ 1:00:17.696 --> 1:00:24.050
2148
+ learn that in your networks you represent words
2149
+ with continuous vectors.
2150
+
2151
+ 1:00:24.264 --> 1:00:26.591
2152
+ You have seen them two, three or four times.
2153
+
2154
+ 1:00:26.591 --> 1:00:31.246
2155
+ It is not really well learned, and you are
2156
+ typically doing most Arabs and words with your
2157
+
2158
+ 1:00:31.246 --> 1:00:31.763
2159
+ crow rap.
2160
+
2161
+ 1:00:33.053 --> 1:00:40.543
2162
+ And yeah, you cannot deal with things which
2163
+ are inside the word.
2164
+
2165
+ 1:00:40.543 --> 1:00:50.303
2166
+ So if you know that 'house' is index one hundred
2167
+ and twelve and you now see 'houses', you have
2168
+
2169
+ 1:00:50.303 --> 1:00:51.324
2170
+ no idea.
2171
+
2172
+ 1:00:51.931 --> 1:00:55.533
2173
+ Of course, not really convenient, so humans
2174
+ are better.
2175
+
2176
+ 1:00:55.533 --> 1:00:58.042
2177
+ They can use the internal information.
2178
+
2179
+ 1:00:58.498 --> 1:01:04.080
2180
+ So if we have houses you'll know that it's
2181
+ like the plural form of house.
2182
+
2183
+ 1:01:05.285 --> 1:01:16.829
2184
+ And for the ones who weren't in advance, ay,
2185
+ you have this night worth here and guess.
2186
+
2187
+ 1:01:16.716 --> 1:01:20.454
2188
+ Don't know the meaning of these words.
2189
+
2190
+ 1:01:20.454 --> 1:01:25.821
2191
+ However, all of you will know is the fear
2192
+ of something.
2193
+
2194
+ 1:01:26.686 --> 1:01:39.437
2195
+ From the ending, the phobia phobia is always
2196
+ the fear of something, but you don't know how.
2197
+
2198
+ 1:01:39.879 --> 1:01:46.618
2199
+ So we can split words into some parts that
2200
+ is helpful to deal with.
2201
+
2202
+ 1:01:46.618 --> 1:01:49.888
2203
+ This, for example, is a fear of.
2204
+
2205
+ 1:01:50.450 --> 1:02:04.022
2206
+ It's not very important, it doesn't happen
2207
+ very often, but yeah, it's also not important
2208
+
2209
+ 1:02:04.022 --> 1:02:10.374
2210
+ for understanding that you know everything.
2211
+
2212
+ 1:02:15.115 --> 1:02:18.791
2213
+ So what can we do instead?
2214
+
2215
+ 1:02:18.791 --> 1:02:29.685
2216
+ One thing which we could do instead is to
2217
+ represent words by the other extreme.
2218
+
2219
+ 1:02:29.949 --> 1:02:42.900
2220
+ So you really do like if you have a person's
2221
+ eye and a and age, then you need a space symbol.
2222
+
2223
+ 1:02:43.203 --> 1:02:55.875
2224
+ So you have now a representation for each
2225
+ character that enables you to implicitly learn
2226
+
2227
+ 1:02:55.875 --> 1:03:01.143
2228
+ morphology because words which have.
2229
+
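A minimal sketch of such a character-level representation; the "▁" space marker is one common convention, used here only for illustration:

```python
def to_characters(sentence):
    # Character-level representation: every letter is a symbol, plus an
    # explicit space symbol so word boundaries are not lost.
    return ["▁" if ch == " " else ch for ch in sentence]

print(to_characters("go home"))
# -> ['g', 'o', '▁', 'h', 'o', 'm', 'e']
```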
2230
+ 1:03:01.541 --> 1:03:05.517
2231
+ And you can then deal with unknown words.
2232
+
2233
+ 1:03:05.517 --> 1:03:10.344
2234
+ There's still not everything you can process,
2235
+ but.
2236
+
2237
+ 1:03:11.851 --> 1:03:16.953
2238
+ So if you would go on character level what might
2239
+ still be a problem?
2240
+
2241
+ 1:03:18.598 --> 1:03:24.007
2242
+ So all characters which you haven't seen,
2243
+ but that's nowadays a little bit more often
2244
+
2245
+ 1:03:24.007 --> 1:03:25.140
2246
+ with new emojis.
2247
+
2248
+ 1:03:25.140 --> 1:03:26.020
2249
+ You couldn't.
2250
+
2251
+ 1:03:26.020 --> 1:03:31.366
2252
+ It could also be that you have translated
2253
+ from Germany and German, and then there is
2254
+
2255
+ 1:03:31.366 --> 1:03:35.077
2256
+ a Japanese character or Chinese that you cannot
2257
+ translate.
2258
+
2259
+ 1:03:35.435 --> 1:03:43.938
2260
+ But most of the time all characters that occur
2261
+ have been seen, so that somewhat works very well.
2262
+
2263
+ 1:03:44.464 --> 1:03:58.681
2264
+ This is first a nice thing, so you have a
2265
+ very small vocabulary size, so one big part
2266
+
2267
+ 1:03:58.681 --> 1:04:01.987
2268
+ of the calculation.
2269
+
2270
+ 1:04:02.222 --> 1:04:11.960
2271
+ Neural networks is the calculation of the
2272
+ vocabulary size, so if you are efficient there
2273
+
2274
+ 1:04:11.960 --> 1:04:13.382
2275
+ it's better.
2276
+
2277
+ 1:04:14.914 --> 1:04:26.998
2278
+ On the other hand, the problem is you have
2279
+ now very long sequences, so if you think about
2280
+
2281
+ 1:04:26.998 --> 1:04:29.985
2282
+ this before you have.
2283
+
2284
+ 1:04:30.410 --> 1:04:43.535
2285
+ Your computation often depends on your input
2286
+ size, and not only linearly but quadratically or
2287
+
2288
+ 1:04:43.535 --> 1:04:44.410
2289
+ more.
2290
+
2291
+ 1:04:44.504 --> 1:04:49.832
2292
+ And of course it might also be that you just
2293
+ generally make things more complicated than
2294
+
2295
+ 1:04:49.832 --> 1:04:50.910
2296
+ they were before.
2297
+
2298
+ 1:04:50.951 --> 1:04:58.679
2299
+ We said before make things easy, but now if
2300
+ we really have to analyze each character independently,
2301
+
2302
+ 1:04:58.679 --> 1:05:05.003
2303
+ we cannot directly learn that university is
2304
+ the same, but we have to learn that.
2305
+
2306
+ 1:05:05.185 --> 1:05:12.179
2307
+ Is beginning and then there is an I and then
2308
+ there is an E and then all this together means
2309
+
2310
+ 1:05:12.179 --> 1:05:17.273
2311
+ university but another combination of these
2312
+ letters is a completely different word.
2313
+
2314
+ 1:05:17.677 --> 1:05:24.135
2315
+ So of course you make everything here a lot
2316
+ more complicated than you have on word basis.
2317
+
2318
+ 1:05:24.744 --> 1:05:32.543
2319
+ Character based models work very well in conditions
2320
+ with few data because you have seen the words
2321
+
2322
+ 1:05:32.543 --> 1:05:33.578
2323
+ very rarely.
2324
+
2325
+ 1:05:33.578 --> 1:05:38.751
2326
+ That's not good for learning, but you have seen all
2327
+ letters more often.
2328
+
2329
+ 1:05:38.751 --> 1:05:44.083
2330
+ So if you have scenarios with very few data
2331
+ this is like one good option.
2332
+
2333
+ 1:05:46.446 --> 1:05:59.668
2334
+ The other idea is to split now not doing the
2335
+ extreme, so either taking full words or taking
2336
+
2337
+ 1:05:59.668 --> 1:06:06.573
2338
+ only characters, but doing something in between.
2339
+
2340
+ 1:06:07.327 --> 1:06:12.909
2341
+ And one of these ideas has been done for a
2342
+ long time.
2343
+
2344
+ 1:06:12.909 --> 1:06:17.560
2345
+ It's called compound splitting, but we only.
2346
+
2347
+ 1:06:17.477 --> 1:06:18.424
2348
+ Bounce them.
2349
+
2350
+ 1:06:18.424 --> 1:06:24.831
2351
+ You see that 'Baum' and 'Stamm' occur very often,
2352
+ then maybe more often than 'Baumstamm'.
2353
+
2354
+ 1:06:24.831 --> 1:06:28.180
2355
+ Then you split it into 'Baum' and 'Stamm' and you use
2356
+ it.
2357
+
2358
+ 1:06:29.509 --> 1:06:44.165
2359
+ But it's even not so easy it will learn wrong
2360
+ splits so we did that in all the systems and
2361
+
2362
+ 1:06:44.165 --> 1:06:47.708
2363
+ there is a word Asia.
2364
+
2365
+ 1:06:48.288 --> 1:06:56.137
2366
+ And the business, of course, is not a really
2367
+ good way of dealing it because it is non-semantic.
2368
+
2369
+ 1:06:56.676 --> 1:07:05.869
2370
+ The good thing is we didn't really care that
2371
+ much about it because the system wasn't learned
2372
+
2373
+ 1:07:05.869 --> 1:07:09.428
2374
+ if you have Asia and Tish together.
2375
+
2376
+ 1:07:09.729 --> 1:07:17.452
2377
+ So you can of course learn all that the compound
2378
+ split doesn't really help you to get a deeper
2379
+
2380
+ 1:07:17.452 --> 1:07:18.658
2381
+ understanding.
2382
+
2383
+ 1:07:21.661 --> 1:07:23.364
2384
+ The Thing of Course.
2385
+
2386
+ 1:07:23.943 --> 1:07:30.475
2387
+ Yeah, there was one paper where this doesn't
2388
+ work like they report, but it's called Burning
2389
+
2390
+ 1:07:30.475 --> 1:07:30.972
2391
+ Ducks.
2392
+
2393
+ 1:07:30.972 --> 1:07:37.503
2394
+ I think because it was like if you had German
2395
+ NS Branter, you could split it in NS Branter,
2396
+
2397
+ 1:07:37.503 --> 1:07:43.254
2398
+ and sometimes you have to add an E to make
2399
+ the compounds that was Enter Branter.
2400
+
2401
+ 1:07:43.583 --> 1:07:48.515
2402
+ So it translated Esperanto into burning duck.
2403
+
2404
+ 1:07:48.888 --> 1:07:56.127
2405
+ So of course you can introduce there some
2406
+ type of additional errors, but in general
2407
+
2408
+ 1:07:56.127 --> 1:07:57.221
2409
+ it's a good.
2410
+
2411
+ 1:07:57.617 --> 1:08:03.306
2412
+ Of course there is a trade off between vocabulary
2413
+ size so you want to have a lower vocabulary
2414
+
2415
+ 1:08:03.306 --> 1:08:08.812
2416
+ size so you've seen everything more often but
2417
+ the length of the sequence should not be too
2418
+
2419
+ 1:08:08.812 --> 1:08:13.654
2420
+ long because if you split more often you get
2421
+ fewer different types but you have longer sequences.
2422
+
2423
+ 1:08:16.896 --> 1:08:25.281
2424
+ The motivation of the advantage of compared
2425
+ to Character based models is that you can directly
2426
+
2427
+ 1:08:25.281 --> 1:08:33.489
2428
+ learn the representation for words that occur
2429
+ very often while still being able to represent
2430
+
2431
+ 1:08:33.489 --> 1:08:35.783
2432
+ words that are rare by splitting them into parts.
2433
+
2434
+ 1:08:36.176 --> 1:08:42.973
2435
+ And while first this was only done for compounds,
2436
+ nowadays there's an algorithm which really
2437
+
2438
+ 1:08:42.973 --> 1:08:49.405
2439
+ tries to do it on everything and there are
2440
+ different ways, to be honest: compound splitting
2441
+
2442
+ 1:08:49.405 --> 1:08:50.209
2443
+ and so on.
2444
+
2445
+ 1:08:50.209 --> 1:08:56.129
2446
+ But the most successful one which is commonly
2447
+ used is based on data compression.
2448
+
2449
+ 1:08:56.476 --> 1:08:59.246
2450
+ And there the idea is okay.
2451
+
2452
+ 1:08:59.246 --> 1:09:06.765
2453
+ Can we find an encoding so that parts are
2454
+ compressed in the most efficient?
2455
+
2456
+ 1:09:07.027 --> 1:09:22.917
2457
+ And the compression algorithm is called the
2458
+ byte pair encoding, and this is also then used
2459
+
2460
+ 1:09:22.917 --> 1:09:25.625
2461
+ for splitting.
2462
+
2463
+ 1:09:26.346 --> 1:09:39.164
2464
+ And the idea is we recursively represent the
2465
+ most frequent pair of bytes by a new byte.
2466
+
2467
+ 1:09:39.819 --> 1:09:51.926
2468
+ For language, you now split first all your
2469
+ words into letters, and then you look at what
2470
+
2471
+ 1:09:51.926 --> 1:09:59.593
2472
+ is the most frequent bigrams of which two letters
2473
+ occur.
2474
+
2475
+ 1:10:00.040 --> 1:10:04.896
2476
+ And then you replace and repeat until you
2477
+ have a fixed vocabulary.
2478
+
2479
+ 1:10:04.985 --> 1:10:08.031
2480
+ So that's a nice thing.
2481
+
2482
+ 1:10:08.031 --> 1:10:16.663
2483
+ Now you can predefine your vocabulary as want
2484
+ to represent my text.
2485
+
2486
+ 1:10:16.936 --> 1:10:28.486
2487
+ And then you can represent any text
2488
+ with these symbols, and of course the shorter
2489
+
2490
+ 1:10:28.486 --> 1:10:30.517
2491
+ your text will be.
2492
+
2493
+ 1:10:32.772 --> 1:10:36.543
2494
+ So the original idea was something like that.
2495
+
2496
+ 1:10:36.543 --> 1:10:39.411
2497
+ We have to sequence A, B, A, B, C.
2498
+
2499
+ 1:10:39.411 --> 1:10:45.149
2500
+ For example, a common biogram is A, B, so
2501
+ you can face A, B, B, I, D.
2502
+
2503
+ 1:10:45.149 --> 1:10:46.788
2504
+ Then the text gets.
2505
+
2506
+ 1:10:48.108 --> 1:10:53.615
2507
+ Then you can make to and then you have eating
2508
+ beet and so on, so this is then your text.
2509
+
2510
+ 1:10:54.514 --> 1:11:00.691
2511
+ Similarly, we can do it now for tanking.
2512
+
2513
+ 1:11:01.761 --> 1:11:05.436
2514
+ Let's assume you have these sentences.
2515
+
2516
+ 1:11:05.436 --> 1:11:11.185
2517
+ I go, he goes, she goes, so your vocabulary
2518
+ is go, goes, he.
2519
+
2520
+ 1:11:11.851 --> 1:11:30.849
2521
+ And the first thing you're doing is split
2522
+ your corpus into single characters.
2523
+
2524
+ 1:11:30.810 --> 1:11:34.692
2525
+ So thereby you can split words again like
2526
+ split senses into words.
2527
+
2528
+ 1:11:34.692 --> 1:11:38.980
2529
+ Because now you only have characters, you
2530
+ don't know the word boundaries.
2531
+
2532
+ 1:11:38.980 --> 1:11:44.194
2533
+ You introduce the word boundaries by having
2534
+ a special symbol at the end of each word, and
2535
+
2536
+ 1:11:44.194 --> 1:11:46.222
2537
+ then you know this symbol happens.
2538
+
2539
+ 1:11:46.222 --> 1:11:48.366
2540
+ I can split it and have it in a new.
2541
+
2542
+ 1:11:48.708 --> 1:11:55.245
2543
+ So you have the corpus I go, he goes, and
2544
+ she goes, and then you have now here the sequences
2545
+
2546
+ 1:11:55.245 --> 1:11:56.229
2547
+ of Character.
2548
+
2549
+ 1:11:56.229 --> 1:12:02.625
2550
+ So then the Character based per presentation,
2551
+ and now you calculate the bigram statistics.
2552
+
2553
+ 1:12:02.625 --> 1:12:08.458
2554
+ So I and the end-of-word occur one time, G
2555
+ and O occur together three times, and so on.
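To make the counting step just described concrete, here is a minimal Python sketch (an illustration, not the lecturer's code) that splits the toy corpus "I go, he goes, she goes" into characters with an end-of-word marker and counts the bigram statistics; the marker symbol "</w>" is only an assumed choice.

```python
from collections import Counter

# Toy corpus from the lecture: "I go", "he goes", "she goes"
corpus = ["i", "go", "he", "goes", "she", "goes"]

# Split every word into characters and append an end-of-word marker
# ("</w>" is an assumed symbol; any unused marker works).
words = [list(w) + ["</w>"] for w in corpus]

# Count how often each adjacent pair of symbols (bigram) occurs.
pair_counts = Counter()
for symbols in words:
    for pair in zip(symbols, symbols[1:]):
        pair_counts[pair] += 1

print(pair_counts.most_common(3))  # ('g', 'o') occurs three times here
```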
2556
+
2557
+ 1:12:09.189 --> 1:12:18.732
2558
+ And these are all the others, and now you
2559
+ look, which is the most common happening.
2560
+
2561
+ 1:12:19.119 --> 1:12:26.046
2562
+ So then you have known the rules.
2563
+
2564
+ 1:12:26.046 --> 1:12:39.235
2565
+ If and have them together you have these new
2566
+ words: Now is no longer two symbols, but it's
2567
+
2568
+ 1:12:39.235 --> 1:12:41.738
2569
+ one single symbol because if you join that.
2570
+
2571
+ 1:12:42.402 --> 1:12:51.175
2572
+ And then you have here now the new number
2573
+ of biceps, steel and wood, and and so on.
2574
+
2575
+ 1:12:52.092 --> 1:13:01.753
2576
+ In small examples now you have a lot of rules
2577
+ which occur the same number of times.
2578
+
2579
+ 1:13:01.753 --> 1:13:09.561
2580
+ In reality that is happening sometimes but
2581
+ not that often.
2582
+
2583
+ 1:13:10.370 --> 1:13:21.240
2584
+ You add the end of words to him, and so this
2585
+ way you go on until you have your vocabulary.
2586
+
2587
+ 1:13:21.601 --> 1:13:38.242
2588
+ And your vocabulary is in these rules, so
2589
+ people speak about the vocabulary of the rules.
2590
+
2591
+ 1:13:38.658 --> 1:13:43.637
2592
+ And these are the rules, and if you now have
2593
+ a different sentence, something like they tell.
2594
+
2595
+ 1:13:44.184 --> 1:13:53.600
2596
+ Then your final output looks like something
2597
+ like that.
2598
+
2599
+ 1:13:53.600 --> 1:13:59.250
2600
+ These two words represent by by.
2601
+
2602
+ 1:14:00.940 --> 1:14:06.398
2603
+ And that is your algorithm.
2604
+
2605
+ 1:14:06.398 --> 1:14:18.873
2606
+ Now you can represent any type of text with
2607
+ a fixed vocabulary.
2608
+
2609
+ 1:14:20.400 --> 1:14:23.593
2610
+ So think that's defined in the beginning.
2611
+
2612
+ 1:14:23.593 --> 1:14:27.243
2613
+ Fill how many egos have won and that has spent.
2614
+
2615
+ 1:14:28.408 --> 1:14:35.253
2616
+ It's nearly correct that it writes a number
2617
+ of characters.
2618
+
2619
+ 1:14:35.253 --> 1:14:38.734
2620
+ It can be that in additional.
2621
+
2622
+ 1:14:38.878 --> 1:14:49.162
2623
+ So on the one end all three of the right side
2624
+ of the rules can occur, and then additionally
2625
+
2626
+ 1:14:49.162 --> 1:14:49.721
2627
+ all.
2628
+
2629
+ 1:14:49.809 --> 1:14:55.851
2630
+ In reality it can even happen that
2631
+ your vocabulary is smaller, because it might
2632
+
2633
+ 1:14:55.851 --> 1:15:01.960
2634
+ happen that like for example go never occurs
2635
+ singular at the end but you always like merge
2636
+
2637
+ 1:15:01.960 --> 1:15:06.793
2638
+ all occurrences so there are not all right
2639
+ sides really happen because.
2640
+
2641
+ 1:15:06.746 --> 1:15:11.269
2642
+ This rule is never only applied, but afterwards
2643
+ another rule is also applied.
2644
+
2645
+ 1:15:11.531 --> 1:15:15.621
2646
+ So it's more an upper bound on your vocabulary
2647
+ size than the exact size.
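As a rough sketch of the whole merge-learning loop described above (assuming the usual byte pair encoding recipe, not any particular toolkit), the following repeats the pair counting and merging until a chosen number of merge rules has been learned; `num_merges` is a stand-in for the predefined vocabulary budget.

```python
from collections import Counter

def pair_counts(words):
    # words maps a tuple of symbols to its corpus frequency
    counts = Counter()
    for symbols, freq in words.items():
        for pair in zip(symbols, symbols[1:]):
            counts[pair] += freq
    return counts

def learn_bpe(corpus_words, num_merges):
    # Start from characters plus an end-of-word marker.
    words = Counter(tuple(w) + ("</w>",) for w in corpus_words)
    rules = []
    for _ in range(num_merges):
        counts = pair_counts(words)
        if not counts:
            break
        best = max(counts, key=counts.get)      # most frequent pair
        rules.append(best)
        merged = Counter()
        for symbols, freq in words.items():     # apply the merge everywhere
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    out.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            merged[tuple(out)] += freq
        words = merged
    return rules

print(learn_bpe(["i", "go", "he", "goes", "she", "goes"], num_merges=5))
# The first learned rule merges ('g', 'o'), as in the lecture example.
```

Applying the rules to new text then just means splitting it into characters and replaying the merges in the same order.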
2648
+
2649
+ 1:15:20.480 --> 1:15:29.014
2650
+ Then we come to the last part, which is about
2651
+ parallel data, but we have some questions beforehand.
2652
+
2653
+ 1:15:36.436 --> 1:15:38.824
2654
+ So what is parallel data?
2655
+
2656
+ 1:15:38.824 --> 1:15:47.368
2657
+ So as we said, for machine translation it is really,
2658
+ really important that we are dealing with parallel
2659
+
2660
+ 1:15:47.368 --> 1:15:52.054
2661
+ data, that means we have aligned input and
2662
+ output.
2663
+
2664
+ 1:15:52.054 --> 1:15:54.626
2665
+ You have this type of data.
2666
+
2667
+ 1:15:55.015 --> 1:16:01.773
2668
+ However, in machine translation we have one
2669
+ very big advantage that is somewhat naturally
2670
+
2671
+ 1:16:01.773 --> 1:16:07.255
2672
+ occurring, so you have a lot of parallel data
2673
+ which you can summar gaps.
2674
+
2675
+ 1:16:07.255 --> 1:16:13.788
2676
+ In many NLP tasks you need to manually annotate
2677
+ your data and generate the aligned data.
2678
+
2679
+ 1:16:14.414 --> 1:16:22.540
2680
+ We have to manually create translations, and
2681
+ of course that is very expensive, but it's
2682
+
2683
+ 1:16:22.540 --> 1:16:29.281
2684
+ really expensive to pay for like one million
2685
+ sentences to be translated.
2686
+
2687
+ 1:16:29.889 --> 1:16:36.952
2688
+ The nice thing is that in there is data normally
2689
+ available because other people have done machine
2690
+
2691
+ 1:16:36.952 --> 1:16:37.889
2692
+ translation.
2693
+
2694
+ 1:16:40.120 --> 1:16:44.672
2695
+ So there is this data and of course process
2696
+ it.
2697
+
2698
+ 1:16:44.672 --> 1:16:51.406
2699
+ We'll have a full lecture on how to deal with
2700
+ more complex situations.
2701
+
2702
+ 1:16:52.032 --> 1:16:56.645
2703
+ The idea is really you don't do really much
2704
+ human work.
2705
+
2706
+ 1:16:56.645 --> 1:17:02.825
2707
+ You really just start the caller with some
2708
+ initials, start pages and then.
2709
+
2710
+ 1:17:03.203 --> 1:17:07.953
2711
+ But a lot of high-quality parallel data is really
2712
+ targeted on some scenarios.
2713
+
2714
+ 1:17:07.953 --> 1:17:13.987
2715
+ So, for example, think of the European Parliament
2716
+ as one website where you can easily extract
2717
+
2718
+ 1:17:13.987 --> 1:17:17.581
2719
+ these information from and there you have a
2720
+ large data.
2721
+
2722
+ 1:17:17.937 --> 1:17:22.500
2723
+ Or like we have the TED data, which is also
2724
+ you can get from the TED website.
2725
+
2726
+ 1:17:23.783 --> 1:17:33.555
2727
+ So in general a parallel corpus is a collection
2728
+ of texts with translations into one or several languages.
2729
+
2730
+ 1:17:34.134 --> 1:17:42.269
2731
+ And this data is important because there is
2732
+ no general empty normally, but you work secured.
2733
+
2734
+ 1:17:42.222 --> 1:17:46.732
2735
+ It works especially good if your training
2736
+ and test conditions are similar.
2737
+
2738
+ 1:17:46.732 --> 1:17:50.460
2739
+ So if the topic is similar, the style of modality
2740
+ is similar.
2741
+
2742
+ 1:17:50.460 --> 1:17:55.391
2743
+ So if you want to translate speech, it's often
2744
+ better to train all to own speech.
2745
+
2746
+ 1:17:55.391 --> 1:17:58.818
2747
+ If you want to translate text, it's better
2748
+ to translate.
2749
+
2750
+ 1:17:59.379 --> 1:18:08.457
2751
+ And there is a lot of these data available
2752
+ nowadays for common languages.
2753
+
2754
+ 1:18:08.457 --> 1:18:12.014
2755
+ You normally can start with.
2756
+
2757
+ 1:18:12.252 --> 1:18:15.298
2758
+ It's really available.
2759
+
2760
+ 1:18:15.298 --> 1:18:27.350
2761
+ For example, Opus is a big website collecting
2762
+ different types of parallel corpus where you
2763
+
2764
+ 1:18:27.350 --> 1:18:29.601
2765
+ can select them.
2766
+
2767
+ 1:18:29.529 --> 1:18:33.276
2768
+ You have this document alignment will come
2769
+ to that later.
2770
+
2771
+ 1:18:33.553 --> 1:18:39.248
2772
+ There are things like comparable data where
2773
+ you have not full sentences but only some parts
2774
+
2775
+ 1:18:39.248 --> 1:18:40.062
2776
+ of parallel.
2777
+
2778
+ 1:18:40.220 --> 1:18:48.700
2779
+ But now first let's assume we have easy tasks
2780
+ like European Parliament where we have the speech
2781
+
2782
+ 1:18:48.700 --> 1:18:55.485
2783
+ in German and the speech in English and you
2784
+ need to generate parallel data.
2785
+
2786
+ 1:18:55.485 --> 1:18:59.949
2787
+ That means you have to align the source sentences.
2788
+
2789
+ 1:19:00.000 --> 1:19:01.573
2790
+ And doing this right.
2791
+
2792
+ 1:19:05.905 --> 1:19:08.435
2793
+ How can we do that?
2794
+
2795
+ 1:19:08.435 --> 1:19:19.315
2796
+ And that is what people refer to as sentence
2797
+ alignment, so we have parallel documents in
2798
+
2799
+ 1:19:19.315 --> 1:19:20.707
2800
+ languages.
2801
+
2802
+ 1:19:22.602 --> 1:19:32.076
2803
+ This is so you cannot normally do that word
2804
+ by word because there is no direct correlation
2805
+
2806
+ 1:19:32.076 --> 1:19:34.158
2807
+ between, but it is.
2808
+
2809
+ 1:19:34.074 --> 1:19:39.837
2810
+ Relatively possible to do it on sentence level,
2811
+ it will not be perfect, so you sometimes have
2812
+
2813
+ 1:19:39.837 --> 1:19:42.535
2814
+ two sentences in English and one in German.
2815
+
2816
+ 1:19:42.535 --> 1:19:47.992
2817
+ German like to have these long sentences with
2818
+ sub clauses and so on, so there you can do
2819
+
2820
+ 1:19:47.992 --> 1:19:51.733
2821
+ it, but with long sentences it might not be
2822
+ really possible.
2823
+
2824
+ 1:19:55.015 --> 1:19:59.454
2825
+ And for some we saw that sentence markers aren't
2826
+ there, so it's more complicated.
2827
+
2828
+ 1:19:59.819 --> 1:20:10.090
2829
+ So how can we formalize this sentence alignment
2830
+ problem?
2831
+
2832
+ 1:20:10.090 --> 1:20:16.756
2833
+ So we have a set of source sentences.
2834
+
2835
+ 1:20:17.377 --> 1:20:22.167
2836
+ And machine translation relatively often.
2837
+
2838
+ 1:20:22.167 --> 1:20:32.317
2839
+ Sometimes source sentences nowadays are and,
2840
+ but traditionally it was and because people
2841
+
2842
+ 1:20:32.317 --> 1:20:34.027
2843
+ started using.
2844
+
2845
+ 1:20:34.594 --> 1:20:45.625
2846
+ And then the idea is to find this alignment
2847
+ where we have alignment.
2848
+
2849
+ 1:20:46.306 --> 1:20:50.421
2850
+ And of course you want these sequences to
2851
+ be as short as possible.
2852
+
2853
+ 1:20:50.421 --> 1:20:56.400
2854
+ Of course an easy solution is here all my
2855
+ source sentences and here all my target sentences.
2856
+
2857
+ 1:20:56.756 --> 1:21:07.558
2858
+ So want to have short sequences there, typically
2859
+ one sentence or maximum two or three sentences,
2860
+
2861
+ 1:21:07.558 --> 1:21:09.340
2862
+ so that really.
2863
+
2864
+ 1:21:13.913 --> 1:21:21.479
2865
+ Then there is different ways of restriction
2866
+ to this type of alignment, so first of all
2867
+
2868
+ 1:21:21.479 --> 1:21:29.131
2869
+ it should be a monotone alignment, so that
2870
+ means that each segment on the source should
2871
+
2872
+ 1:21:29.131 --> 1:21:31.218
2873
+ start after each other.
2874
+
2875
+ 1:21:31.431 --> 1:21:36.428
2876
+ So we assume that in document there's really
2877
+ a monotone and it's going the same way in source.
2878
+
2879
+ 1:21:36.957 --> 1:21:41.965
2880
+ Course for a very free translation that might
2881
+ not be valid anymore.
2882
+
2883
+ 1:21:41.965 --> 1:21:49.331
2884
+ But this algorithm, the first one, the Church
2885
+ and Gale algorithm, is more for translations
2886
+
2887
+ 1:21:49.331 --> 1:21:51.025
2888
+ which are very direct.
2889
+
2890
+ 1:21:51.025 --> 1:21:54.708
2891
+ So each segment should be like coming after
2892
+ each.
2893
+
2894
+ 1:21:55.115 --> 1:22:04.117
2895
+ Then we want to translate the full sequence,
2896
+ and of course each segment should start before
2897
+
2898
+ 1:22:04.117 --> 1:22:04.802
2899
+ it is.
2900
+
2901
+ 1:22:05.525 --> 1:22:22.654
2902
+ And then you want to have something like that,
2903
+ but you have to alignments or alignments.
2904
+
2905
+ 1:22:25.525 --> 1:22:41.851
2906
+ The alignment types are: You then, of course,
2907
+ sometimes have insertions and deletions where there
2908
+
2909
+ 1:22:41.851 --> 1:22:43.858
2910
+ is some information added.
2911
+
2912
+ 1:22:44.224 --> 1:22:50.412
2913
+ Hand be, for example, explanation, so it can
2914
+ be that some term is known in the one language
2915
+
2916
+ 1:22:50.412 --> 1:22:51.018
2917
+ but not.
2918
+
2919
+ 1:22:51.111 --> 1:22:53.724
2920
+ Think of things like Deutschland ticket.
2921
+
2922
+ 1:22:53.724 --> 1:22:58.187
2923
+ In Germany everybody will by now know what
2924
+ the Deutschland ticket is.
2925
+
2926
+ 1:22:58.187 --> 1:23:03.797
2927
+ But if you translate it to English it might
2928
+ be important to explain it and other things
2929
+
2930
+ 1:23:03.797 --> 1:23:04.116
2931
+ are.
2932
+
2933
+ 1:23:04.116 --> 1:23:09.853
2934
+ So sometimes you have to explain things and
2935
+ then you have more sentences with insertions.
2936
+
2937
+ 1:23:10.410 --> 1:23:15.956
2938
+ Then you have two to one and one to two alignment,
2939
+ and that is, for example, in Germany you have
2940
+
2941
+ 1:23:15.956 --> 1:23:19.616
2942
+ a lot of sub-clauses that are then expressed
2943
+ by two sentences.
2944
+
2945
+ 1:23:20.580 --> 1:23:37.725
2946
+ Of course, it might be more complex, but typically
2947
+ to make it simple and only allow for this type
2948
+
2949
+ 1:23:37.725 --> 1:23:40.174
2950
+ of alignment.
2951
+
2952
+ 1:23:41.301 --> 1:23:56.588
2953
+ Then it is about finding the alignment and
2954
+ that is, we try to score where we just take
2955
+
2956
+ 1:23:56.588 --> 1:23:59.575
2957
+ a general score.
2958
+
2959
+ 1:24:00.000 --> 1:24:04.011
2960
+ That is true like gala algorithms and the
2961
+ matching of one segment.
2962
+
2963
+ 1:24:04.011 --> 1:24:09.279
2964
+ If you have one segment now so this is one
2965
+ of the global things so the global alignment
2966
+
2967
+ 1:24:09.279 --> 1:24:13.828
2968
+ is as good as the product of all single steps
2969
+ and then you have two scores.
2970
+
2971
+ 1:24:13.828 --> 1:24:18.558
2972
+ First of all you say one to one alignments
2973
+ are much better than all the others.
2974
+
2975
+ 1:24:19.059 --> 1:24:26.884
2976
+ And then you have a lexical similarity, which
2977
+ is, for example, based on an initial dictionary
2978
+
2979
+ 1:24:26.884 --> 1:24:30.713
2980
+ which counts how many dictionary entries are.
2981
+
2982
+ 1:24:31.091 --> 1:24:35.407
2983
+ So this is a very simple algorithm.
2984
+
2985
+ 1:24:35.407 --> 1:24:41.881
2986
+ Typically violates like your first step and
2987
+ you want.
2988
+
2989
+ 1:24:43.303 --> 1:24:54.454
2990
+ And that is like with this one you can get
2991
+ an initial one you can have better parallel
2992
+
2993
+ 1:24:54.454 --> 1:24:55.223
2994
+ data.
2995
+
2996
+ 1:24:55.675 --> 1:25:02.369
2997
+ No, it is an optimization problem and you
2998
+ are now based on the scores you can calculate
2999
+
3000
+ 1:25:02.369 --> 1:25:07.541
3001
+ for each possible alignment and score and then
3002
+ select the best one.
3003
+
3004
+ 1:25:07.541 --> 1:25:14.386
3005
+ Of course, you won't try all possibilities
3006
+ out but you can do a good search and then find
3007
+
3008
+ 1:25:14.386 --> 1:25:15.451
3009
+ the best one.
3010
+
3011
+ 1:25:15.815 --> 1:25:18.726
3012
+ Can typically be automatically.
3013
+
3014
+ 1:25:18.726 --> 1:25:25.456
3015
+ Of course, you should do some checks like
3016
+ aligning sentences as possible.
3017
+
3018
+ 1:25:26.766 --> 1:25:32.043
3019
+ A bill like typically for training data is
3020
+ done this way.
3021
+
3022
+ 1:25:32.043 --> 1:25:35.045
3023
+ Maybe if you have test data you.
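A minimal sketch of how such a monotone sentence alignment can be searched with dynamic programming is given below; the `score` function is a placeholder for the Gale and Church style scores discussed above (a prior over alignment types plus a length- or dictionary-based similarity), and the step types correspond to the 1-1, insertion, deletion, 1-2 and 2-1 cases.

```python
import math

def align(src, tgt, score):
    """Monotone sentence alignment by dynamic programming (a sketch).

    score(src_chunk, tgt_chunk) should return a log-score for aligning the
    two chunks; it is an assumed callback, not part of the lecture slides.
    """
    steps = [(1, 1), (1, 0), (0, 1), (1, 2), (2, 1)]  # allowed alignment types
    I, J = len(src), len(tgt)
    best = {(0, 0): 0.0}   # best total score for each pair of prefixes (i, j)
    back = {}
    for i in range(I + 1):
        for j in range(J + 1):
            if (i, j) not in best:
                continue
            for di, dj in steps:
                ni, nj = i + di, j + dj
                if ni > I or nj > J:
                    continue
                s = best[(i, j)] + score(src[i:ni], tgt[j:nj])
                if s > best.get((ni, nj), -math.inf):
                    best[(ni, nj)] = s
                    back[(ni, nj)] = (i, j)
    # Trace back the best monotone segmentation from (I, J).
    path, node = [], (I, J)
    while node != (0, 0):
        prev = back[node]
        path.append((src[prev[0]:node[0]], tgt[prev[1]:node[1]]))
        node = prev
    return list(reversed(path))
```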
3024
+
3025
+ 1:25:40.000 --> 1:25:47.323
3026
+ Sorry, I'm a bit late because originally wanted
3027
+ to do a quiz at the end.
3028
+
3029
+ 1:25:47.323 --> 1:25:49.129
3030
+ Can we go a quiz?
3031
+
3032
+ 1:25:49.429 --> 1:25:51.833
3033
+ We'll do it somewhere else.
3034
+
3035
+ 1:25:51.833 --> 1:25:56.813
3036
+ We had a bachelor project about making quiz
3037
+ for lectures.
3038
+
3039
+ 1:25:56.813 --> 1:25:59.217
3040
+ And I still want to try it.
3041
+
3042
+ 1:25:59.217 --> 1:26:04.197
3043
+ So let's see I hope in some other lecture
3044
+ we can do that.
3045
+
3046
+ 1:26:04.197 --> 1:26:09.435
3047
+ Then we can at the end of the lecture do
3048
+ some quiz about.
3049
+
3050
+ 1:26:09.609 --> 1:26:13.081
3051
+ All We Can Do Is Is the Practical Thing Let's
3052
+ See.
3053
+
3054
+ 1:26:13.533 --> 1:26:24.719
3055
+ And: Today, so what you should remember is
3056
+ what is parallel data and how we can.
3057
+
3058
+ 1:26:25.045 --> 1:26:29.553
3059
+ Create parallel data like how to generally
3060
+ process data.
3061
+
3062
+ 1:26:29.553 --> 1:26:36.435
3063
+ What you think about data is really important
3064
+ if you build systems and different ways.
3065
+
3066
+ 1:26:36.696 --> 1:26:46.857
3067
+ The three main options: using full words, working directly
3068
+ on character level, or using subword units.
3069
+
3070
+ 1:26:47.687 --> 1:26:49.634
3071
+ Is there any question?
3072
+
3073
+ 1:26:52.192 --> 1:26:57.768
3074
+ Yes, this alignment thing, is it like dynamic
3075
+ time warping?
3076
+
3077
+ 1:27:00.000 --> 1:27:05.761
3078
+ It's not directly using dynamic time warping,
3079
+ but the idea is similar and you can use all
3080
+
3081
+ 1:27:05.761 --> 1:27:11.771
3082
+ this type of similar algorithms, which is the
3083
+ main thing which is the question of the difficulty
3084
+
3085
+ 1:27:11.771 --> 1:27:14.807
3086
+ is to define me at your your loss function
3087
+ here.
3088
+
3089
+ 1:27:14.807 --> 1:27:16.418
3090
+ What is a good alignment?
3091
+
3092
+ 1:27:16.736 --> 1:27:24.115
3093
+ But as you do not have a time walk on, you
3094
+ have a monotone alignment in there, and you
3095
+
3096
+ 1:27:24.115 --> 1:27:26.150
3097
+ cannot have reordering.
3098
+
3099
+ 1:27:30.770 --> 1:27:40.121
3100
+ There then thanks a lot and on first day we
3101
+ will then start with or discuss.
3102
+
demo_data/lectures/Lecture-03-25.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b241226dacb56a88fcbccaecb2639c3b5765fbea6f60e4758715c6941fbc512
3
+ size 117644511
demo_data/lectures/Lecture-04-27.04.2023/English.vtt ADDED
@@ -0,0 +1,2919 @@
1
+ WEBVTT
2
+
3
+ 0:00:03.663 --> 0:00:07.970
4
+ Okay, then I should switch back to English,
5
+ sorry,.
6
+
7
+ 0:00:08.528 --> 0:00:18.970
8
+ So welcome to today's lecture in the cross
9
+ machine translation and today we're planning
10
+
11
+ 0:00:18.970 --> 0:00:20.038
12
+ to talk.
13
+
14
+ 0:00:20.880 --> 0:00:31.845
15
+ Which will be without our summary of power
16
+ translation was done from around till.
17
+
18
+ 0:00:32.872 --> 0:00:38.471
19
+ Fourteen, so this was an approach which was
20
+ quite long.
21
+
22
+ 0:00:38.471 --> 0:00:47.070
23
+ It was the first approach where at the end
24
+ the quality was really so good that it was
25
+
26
+ 0:00:47.070 --> 0:00:49.969
27
+ used as a commercial system.
28
+
29
+ 0:00:49.990 --> 0:00:56.482
30
+ Or something like that, so the first systems
31
+ there was using the statistical machine translation.
32
+
33
+ 0:00:57.937 --> 0:01:02.706
34
+ So when I came into the field this was the
35
+ main part of the lecture, so there would be
36
+
37
+ 0:01:02.706 --> 0:01:07.912
38
+ not be one lecture, but in more detail than
39
+ half of the full course would be about statistical
40
+
41
+ 0:01:07.912 --> 0:01:09.063
42
+ machine translation.
43
+
44
+ 0:01:09.369 --> 0:01:23.381
45
+ So what we try to do today is like get the
46
+ most important things, which think our part
47
+
48
+ 0:01:23.381 --> 0:01:27.408
49
+ is still very important.
50
+
51
+ 0:01:27.267 --> 0:01:31.196
52
+ Four State of the Art Box.
53
+
54
+ 0:01:31.952 --> 0:01:45.240
55
+ Then we'll have the presentation about how
56
+ to evaluate the other part of the machine translation.
57
+
58
+ 0:01:45.505 --> 0:01:58.396
59
+ The other important thing is the language
60
+ modeling part will explain later how they combine.
61
+
62
+ 0:01:59.539 --> 0:02:04.563
63
+ Shortly mentioned this one already.
64
+
65
+ 0:02:04.824 --> 0:02:06.025
66
+ On Tuesday.
67
+
68
+ 0:02:06.246 --> 0:02:21.849
69
+ So in a lot of these explanations, how we
70
+ model translation process, it might be surprising:
71
+
72
+ 0:02:22.082 --> 0:02:27.905
73
+ Later some people say it's for four eight words
74
+ traditionally came because the first models
75
+
76
+ 0:02:27.905 --> 0:02:32.715
77
+ which you'll discuss here also when they are
78
+ referred to as the IVM models.
79
+
80
+ 0:02:32.832 --> 0:02:40.043
81
+ They were trained on French to English translation
82
+ directions and that's why they started using
83
+
84
+ 0:02:40.043 --> 0:02:44.399
85
+ F and E and then this was done for the next
86
+ twenty years.
87
+
88
+ 0:02:44.664 --> 0:02:52.316
89
+ So while we are trying to wait, the source
90
+ words is: We have a big eye, typically the
91
+
92
+ 0:02:52.316 --> 0:03:02.701
93
+ lengths of the sewer sentence in small eye,
94
+ the position, and similarly in the target and
95
+
96
+ 0:03:02.701 --> 0:03:05.240
97
+ the lengths of small.
98
+
99
+ 0:03:05.485 --> 0:03:13.248
100
+ Things will get a bit complicated in this
101
+ way because it is not always clear what is
102
+
103
+ 0:03:13.248 --> 0:03:13.704
104
+ the.
105
+
106
+ 0:03:14.014 --> 0:03:21.962
107
+ See that there is this noisy channel model
108
+ which switches the direction in your model,
109
+
110
+ 0:03:21.962 --> 0:03:25.616
111
+ but in the application it's the target.
112
+
113
+ 0:03:26.006 --> 0:03:37.077
114
+ So that is why if you especially read these
115
+ papers, it might sometimes be a bit disturbing.
116
+
117
+ 0:03:37.437 --> 0:03:40.209
118
+ Try to keep it here always.
119
+
120
+ 0:03:40.209 --> 0:03:48.427
121
+ The source is, and even if we use a model
122
+ where it's inverse, we'll keep this way.
123
+
124
+ 0:03:48.468 --> 0:03:55.138
125
+ Don't get disturbed by that, and I think it's
126
+ possible to understand all that without this
127
+
128
+ 0:03:55.138 --> 0:03:55.944
129
+ confusion.
130
+
131
+ 0:03:55.944 --> 0:04:01.734
132
+ But in some of the papers you might get confused
133
+ because they switched to the.
134
+
135
+ 0:04:04.944 --> 0:04:17.138
136
+ In general, in statistics and machine translation,
137
+ the goal is how we do translation.
138
+
139
+ 0:04:17.377 --> 0:04:25.562
140
+ But first we are seeing all our possible target
141
+ sentences as possible translations.
142
+
143
+ 0:04:26.726 --> 0:04:37.495
144
+ And we are assigning some probability to the
145
+ combination, so we are modeling.
146
+
147
+ 0:04:39.359 --> 0:04:49.746
148
+ And then we are doing a search over all possible
149
+ things or at least theoretically, and we are
150
+
151
+ 0:04:49.746 --> 0:04:56.486
152
+ trying to find the translation with the highest
153
+ probability.
154
+
155
+ 0:04:56.936 --> 0:05:05.116
156
+ And this general idea is also true for neuromachine
157
+ translation.
158
+
159
+ 0:05:05.116 --> 0:05:07.633
160
+ They differ in how.
161
+
162
+ 0:05:08.088 --> 0:05:10.801
163
+ So these were then of course the two big challenges.
164
+
165
+ 0:05:11.171 --> 0:05:17.414
166
+ On the one hand, how can we estimate this
167
+ probability?
168
+
169
+ 0:05:17.414 --> 0:05:21.615
170
+ How is the translation of the other?
171
+
172
+ 0:05:22.262 --> 0:05:32.412
173
+ The other challenge is the search, so we cannot,
174
+ of course, say we want to find the most probable
175
+
176
+ 0:05:32.412 --> 0:05:33.759
177
+ translation.
178
+
179
+ 0:05:33.759 --> 0:05:42.045
180
+ We cannot go over all possible English sentences
181
+ and calculate the probability.
182
+
183
+ 0:05:43.103 --> 0:05:45.004
184
+ So,.
185
+
186
+ 0:05:45.165 --> 0:05:53.423
187
+ What we have to do there is some are doing
188
+ intelligent search and look for the ones and
189
+
190
+ 0:05:53.423 --> 0:05:54.268
191
+ compare.
192
+
193
+ 0:05:54.734 --> 0:05:57.384
194
+ That will be done.
195
+
196
+ 0:05:57.384 --> 0:06:07.006
197
+ This process of finding them is called the
198
+ decoding process because.
199
+
200
+ 0:06:07.247 --> 0:06:09.015
201
+ They will be covered well later.
202
+
203
+ 0:06:09.015 --> 0:06:11.104
204
+ Today we will concentrate on the mile.
205
+
206
+ 0:06:11.451 --> 0:06:23.566
207
+ The model is trained using data, so in the
208
+ first step we're having data, we're somehow
209
+
210
+ 0:06:23.566 --> 0:06:30.529
211
+ having a definition of what the model looks
212
+ like.
213
+
214
+ 0:06:34.034 --> 0:06:42.913
215
+ And in statistical machine translation the
216
+ common model is behind.
217
+
218
+ 0:06:42.913 --> 0:06:46.358
219
+ That is what is referred.
220
+
221
+ 0:06:46.786 --> 0:06:55.475
222
+ And this is motivated by the initial idea
223
+ from Shannon.
224
+
225
+ 0:06:55.475 --> 0:07:02.457
226
+ We have this that you can think of decoding.
227
+
228
+ 0:07:02.722 --> 0:07:10.472
229
+ So think of it as we have this text in maybe
230
+ German.
231
+
232
+ 0:07:10.472 --> 0:07:21.147
233
+ Originally it was an English text, but somebody
234
+ used some nice decoding.
235
+
236
+ 0:07:21.021 --> 0:07:28.579
237
+ Task is to decipher it again, this crazy cyborg
238
+ expressing things in German, and to decipher
239
+
240
+ 0:07:28.579 --> 0:07:31.993
241
+ the meaning again and doing that between.
242
+
243
+ 0:07:32.452 --> 0:07:35.735
244
+ And that is the idea about this noisy channel
245
+ when it.
246
+
247
+ 0:07:36.236 --> 0:07:47.209
248
+ It goes through some type of channel which
249
+ adds noise to the source and then you receive
250
+
251
+ 0:07:47.209 --> 0:07:48.811
252
+ the message.
253
+
254
+ 0:07:49.429 --> 0:08:00.190
255
+ And then the idea is, can we now construct
256
+ the original message out of these messages
257
+
258
+ 0:08:00.190 --> 0:08:05.070
259
+ by modeling some of the channels here?
260
+
261
+ 0:08:06.726 --> 0:08:15.797
262
+ There you know to see a bit the surface of
263
+ the source message with English.
264
+
265
+ 0:08:15.797 --> 0:08:22.361
266
+ It went through some channel and received
267
+ the message.
268
+
269
+ 0:08:22.682 --> 0:08:31.381
270
+ If you're not looking at machine translation,
271
+ your source language is English.
272
+
273
+ 0:08:31.671 --> 0:08:44.388
274
+ Here you see now a bit of this where the confusion
275
+ starts while English as a target language is
276
+
277
+ 0:08:44.388 --> 0:08:47.700
278
+ also the source message.
279
+
280
+ 0:08:47.927 --> 0:08:48.674
281
+ You can see.
282
+
283
+ 0:08:48.674 --> 0:08:51.488
284
+ There is also a mathematics of how we model
285
+ the.
286
+
287
+ 0:08:52.592 --> 0:08:56.888
288
+ It's a noisy channel model from a mathematic
289
+ point of view.
290
+
291
+ 0:08:56.997 --> 0:09:00.245
292
+ So this is again our general formula.
293
+
294
+ 0:09:00.245 --> 0:09:08.623
295
+ We are looking for the most probable translation
296
+ and that is the translation that has the highest
297
+
298
+ 0:09:08.623 --> 0:09:09.735
299
+ probability.
300
+
301
+ 0:09:09.809 --> 0:09:19.467
302
+ We are not interested in the probability itself,
303
+ but we are interesting in this target sentence
304
+
305
+ 0:09:19.467 --> 0:09:22.082
306
+ E where this probability.
307
+
308
+ 0:09:23.483 --> 0:09:33.479
309
+ And: Therefore, we can use them twice definition
310
+ of conditional probability and using the base
311
+
312
+ 0:09:33.479 --> 0:09:42.712
313
+ rules, so this probability equals the probability
314
+ of f giving any kind of probability of e divided
315
+
316
+ 0:09:42.712 --> 0:09:44.858
317
+ by the probability of.
318
+
319
+ 0:09:45.525 --> 0:09:48.218
320
+ Now see mathematically this confusion.
321
+
322
+ 0:09:48.218 --> 0:09:54.983
323
+ Originally we are interested in the probability
324
+ of the target sentence given the search sentence.
325
+
326
+ 0:09:55.295 --> 0:10:00.742
327
+ And if we are modeling things now, we are
328
+ looking here at the inverse direction, so the
329
+
330
+ 0:10:00.742 --> 0:10:06.499
331
+ probability of F given E to the probability
332
+ of the source sentence given the target sentence
333
+
334
+ 0:10:06.499 --> 0:10:10.832
335
+ is the probability of the target sentence divided
336
+ by the probability.
337
+
338
+ 0:10:13.033 --> 0:10:15.353
339
+ Why are we doing this?
340
+
341
+ 0:10:15.353 --> 0:10:24.333
342
+ Maybe I mean, of course, once it's motivated
343
+ by our model, that we were saying this type
344
+
345
+ 0:10:24.333 --> 0:10:27.058
346
+ of how we are modeling it.
347
+
348
+ 0:10:27.058 --> 0:10:30.791
349
+ The other interesting thing is that.
350
+
351
+ 0:10:31.231 --> 0:10:40.019
352
+ So we are looking at this probability up there,
353
+ which we had before we formulate that we can
354
+
355
+ 0:10:40.019 --> 0:10:40.775
356
+ remove.
357
+
358
+ 0:10:41.181 --> 0:10:46.164
359
+ If we are searching for the highest translation,
360
+ this is fixed.
361
+
362
+ 0:10:46.164 --> 0:10:47.800
363
+ This doesn't change.
364
+
365
+ 0:10:47.800 --> 0:10:52.550
366
+ We have an input, the source sentence, and
367
+ we cannot change.
368
+
369
+ 0:10:52.812 --> 0:11:02.780
370
+ Is always the same, so we can ignore it in
371
+ the ACMAX because the lower one is exactly
372
+
373
+ 0:11:02.780 --> 0:11:03.939
374
+ the same.
375
+
376
+ 0:11:04.344 --> 0:11:06.683
377
+ And then we have p o f.
378
+
379
+ 0:11:06.606 --> 0:11:13.177
380
+ E times P of E and that is so we are modeling
381
+ the translation process on the one hand with
382
+
383
+ 0:11:13.177 --> 0:11:19.748
384
+ the translation model which models how probable
385
+ is the sentence F given E and on the other
386
+
387
+ 0:11:19.748 --> 0:11:25.958
388
+ hand with the language model which models only
389
+ how probable is this English sentence.
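Written out, the decomposition just described looks roughly like this, keeping the lecture's convention of F for the source sentence and E for the target sentence:

```latex
\hat{E} = \operatorname*{argmax}_{E} P(E \mid F)
        = \operatorname*{argmax}_{E} \frac{P(F \mid E)\,P(E)}{P(F)}
        = \operatorname*{argmax}_{E} \underbrace{P(F \mid E)}_{\text{translation model}} \; \underbrace{P(E)}_{\text{language model}}
```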
390
+
391
+ 0:11:26.586 --> 0:11:39.366
392
+ That somebody wrote this language or translation
393
+ point of view, this is about fluency.
394
+
395
+ 0:11:40.200 --> 0:11:44.416
396
+ You should have in German, for example, agreement.
397
+
398
+ 0:11:44.416 --> 0:11:50.863
399
+ If the agreement is not right, that's properly
400
+ not said by anybody in German.
401
+
402
+ 0:11:50.863 --> 0:11:58.220
403
+ Nobody would say that's Schönest's house because
404
+ it's not according to the German rules.
405
+
406
+ 0:11:58.598 --> 0:12:02.302
407
+ So this can be modeled by the language model.
408
+
409
+ 0:12:02.542 --> 0:12:09.855
410
+ And you have the translation model which models
411
+ housings get translated between the.
412
+
413
+ 0:12:10.910 --> 0:12:18.775
414
+ And here you see again our confusion again,
415
+ and now here put the translation model: Wage
416
+
417
+ 0:12:18.775 --> 0:12:24.360
418
+ is a big income counterintuitive because the
419
+ probability of a sewer sentence giving the
420
+
421
+ 0:12:24.360 --> 0:12:24.868
422
+ target.
423
+
424
+ 0:12:26.306 --> 0:12:35.094
425
+ We have to do that for the Bayes formula, but in
426
+ the following slides I'll talk again about.
427
+
428
+ 0:12:35.535 --> 0:12:45.414
429
+ Because yeah, that's more intuitive that you
430
+ model the translation of the target sentence
431
+
432
+ 0:12:45.414 --> 0:12:48.377
433
+ given the source sentence.
434
+
435
+ 0:12:50.930 --> 0:12:55.668
436
+ And this is what we want to talk about today.
437
+
438
+ 0:12:55.668 --> 0:13:01.023
439
+ We later talk about language models how to
440
+ do that.
441
+
442
+ 0:13:00.940 --> 0:13:04.493
443
+ And maybe also how to combine them.
444
+
445
+ 0:13:04.493 --> 0:13:13.080
446
+ But the focus on today would be how can we
447
+ model this probability to how to generate a
448
+
449
+ 0:13:13.080 --> 0:13:16.535
450
+ translation from source to target?
451
+
452
+ 0:13:19.960 --> 0:13:24.263
453
+ How can we do that and the easiest thing?
454
+
455
+ 0:13:24.263 --> 0:13:33.588
456
+ Maybe if you think about statistics, you count
457
+ how many examples you have, how many target
458
+
459
+ 0:13:33.588 --> 0:13:39.121
460
+ sentences go occur, and that gives you an estimation.
461
+
462
+ 0:13:40.160 --> 0:13:51.632
463
+ However, like in another model that is not
464
+ possible because most sentences you will never
465
+
466
+ 0:13:51.632 --> 0:13:52.780
467
+ see, so.
468
+
469
+ 0:13:53.333 --> 0:14:06.924
470
+ So what we have to do is break up the translation
471
+ process into smaller models and model each
472
+
473
+ 0:14:06.924 --> 0:14:09.555
474
+ of the decisions.
475
+
476
+ 0:14:09.970 --> 0:14:26.300
477
+ So this simple solution with how you throw
478
+ a dice is like you have a and that gives you
479
+
480
+ 0:14:26.300 --> 0:14:29.454
481
+ the probability.
482
+
483
+ 0:14:29.449 --> 0:14:40.439
484
+ But here's the principle because each event
485
+ is so rare that most of them never have helped.
486
+
487
+ 0:14:43.063 --> 0:14:48.164
488
+ Although it might be that in all your training
489
+ data you have never seen this title of set.
490
+
491
+ 0:14:49.589 --> 0:14:52.388
492
+ How can we do that?
493
+
494
+ 0:14:52.388 --> 0:15:04.845
495
+ We look in statistical machine translation
496
+ into two different models, a generative model
497
+
498
+ 0:15:04.845 --> 0:15:05.825
499
+ where.
500
+
501
+ 0:15:06.166 --> 0:15:11.736
502
+ So the idea was to really model model like
503
+ each individual translation between words.
504
+
505
+ 0:15:12.052 --> 0:15:22.598
506
+ So you break down the translation of a full
507
+ sentence into the translation of each individual's
508
+
509
+ 0:15:22.598 --> 0:15:23.264
510
+ word.
511
+
512
+ 0:15:23.264 --> 0:15:31.922
513
+ So you say if you have the black cat, if you
514
+ translate it, the full sentence.
515
+
516
+ 0:15:32.932 --> 0:15:38.797
517
+ Of course, this has some challenges, any ideas
518
+ where this type of model could be very challenging.
519
+
520
+ 0:15:40.240 --> 0:15:47.396
521
+ Vocabularies and videos: Yes, we're going
522
+ to be able to play in the very color.
523
+
524
+ 0:15:47.867 --> 0:15:51.592
525
+ Yes, but you could at least use a bit of the
526
+ context around it.
527
+
528
+ 0:15:51.592 --> 0:15:55.491
529
+ It will not only depend on the word, but it's
530
+ already challenging.
531
+
532
+ 0:15:55.491 --> 0:15:59.157
533
+ You make things very hard, so that's definitely
534
+ one challenge.
535
+
536
+ 0:16:00.500 --> 0:16:07.085
537
+ One other, what did you talk about that we
538
+ just don't want to say?
539
+
540
+ 0:16:08.348 --> 0:16:11.483
541
+ Yes, they are challenging.
542
+
543
+ 0:16:11.483 --> 0:16:21.817
544
+ You have to do something like words, but the
545
+ problem is that you might introduce errors.
546
+
547
+ 0:16:21.841 --> 0:16:23.298
548
+ Later and makes things very comfortable.
549
+
550
+ 0:16:25.265 --> 0:16:28.153
551
+ Wrong splitting is the worst things that are
552
+ very complicated.
553
+
554
+ 0:16:32.032 --> 0:16:35.580
555
+ Saints, for example, and also maybe Japanese
556
+ medicine.
557
+
558
+ 0:16:35.735 --> 0:16:41.203
559
+ In German, yes, especially like these are
560
+ all right.
561
+
562
+ 0:16:41.203 --> 0:16:46.981
563
+ The first thing is maybe the one which is
564
+ most obvious.
565
+
566
+ 0:16:46.981 --> 0:16:49.972
567
+ It is raining cats and dogs.
568
+
569
+ 0:16:51.631 --> 0:17:01.837
570
+ To German, the cat doesn't translate this
571
+ whole chunk into something because there is
572
+
573
+ 0:17:01.837 --> 0:17:03.261
574
+ not really.
575
+
576
+ 0:17:03.403 --> 0:17:08.610
577
+ Mean, of course, in generally there is this
578
+ type of alignment, so there is a correspondence
579
+
580
+ 0:17:08.610 --> 0:17:11.439
581
+ between words in English and the words in German.
582
+
583
+ 0:17:11.439 --> 0:17:16.363
584
+ However, that's not true for all sentences,
585
+ so in some sentences you cannot really say
586
+
587
+ 0:17:16.363 --> 0:17:18.174
588
+ this word translates into that.
589
+
590
+ 0:17:18.498 --> 0:17:21.583
591
+ But you can only let more locate this whole
592
+ phrase.
593
+
594
+ 0:17:21.583 --> 0:17:23.482
595
+ This model into something else.
596
+
597
+ 0:17:23.563 --> 0:17:30.970
598
+ If you think about the don't in English, the
599
+ do is not really clearly where should that
600
+
601
+ 0:17:30.970 --> 0:17:31.895
602
+ be allied.
603
+
604
+ 0:17:32.712 --> 0:17:39.079
605
+ Then for a long time the most successful approach
606
+ was this phrase based translation model where
607
+
608
+ 0:17:39.079 --> 0:17:45.511
609
+ the idea is your block is not a single word
610
+ but a longer phrase if you try to build translations
611
+
612
+ 0:17:45.511 --> 0:17:46.572
613
+ based on these.
614
+
615
+ 0:17:48.768 --> 0:17:54.105
616
+ But let's start with a word based and what
617
+ you need.
618
+
619
+ 0:17:54.105 --> 0:18:03.470
620
+ There is two main knowledge sources, so on
621
+ the one hand we have a lexicon where we translate
622
+
623
+ 0:18:03.470 --> 0:18:05.786
624
+ possible translations.
625
+
626
+ 0:18:06.166 --> 0:18:16.084
627
+ The main difference between the lexicon and
628
+ statistical machine translation and lexicon
629
+
630
+ 0:18:16.084 --> 0:18:17.550
631
+ as you know.
632
+
633
+ 0:18:17.837 --> 0:18:23.590
634
+ Traditional lexicon: You know how word is
635
+ translated and mainly it's giving you two or
636
+
637
+ 0:18:23.590 --> 0:18:26.367
638
+ three examples with any example sentence.
639
+
640
+ 0:18:26.367 --> 0:18:30.136
641
+ So in this context it gets translated like
642
+ that henceon.
643
+
644
+ 0:18:30.570 --> 0:18:38.822
645
+ In order to model that and work with probabilities
646
+ what we need in a machine translation is these:
647
+
648
+ 0:18:39.099 --> 0:18:47.962
649
+ So if we have the German word Wagen, it sends
650
+ me out with a probability of zero point five.
651
+
652
+ 0:18:47.962 --> 0:18:51.545
653
+ Maybe it's translated into a vehicle.
654
+
655
+ 0:18:52.792 --> 0:18:58.876
656
+ And of course this is not easy to be created
657
+ by a shoveman.
658
+
659
+ 0:18:58.876 --> 0:19:07.960
660
+ If ask you and give probabilities for how
661
+ probable this vehicle is, there might: So how
662
+
663
+ 0:19:07.960 --> 0:19:12.848
664
+ we are doing is again that the lexicon is automatically
665
+ will be created from a corpus.
666
+
667
+ 0:19:13.333 --> 0:19:18.754
668
+ And we're just counting here, so we count
669
+ how often does it work, how often does it co
670
+
671
+ 0:19:18.754 --> 0:19:24.425
672
+ occur with vehicle, and then we're taking the
673
+ ratio and saying in the house of time on the
674
+
675
+ 0:19:24.425 --> 0:19:26.481
676
+ English side there was vehicles.
677
+
678
+ 0:19:26.481 --> 0:19:31.840
679
+ There was a probability of vehicles given
680
+ back, and there's something like zero point
681
+
682
+ 0:19:31.840 --> 0:19:32.214
683
+ five.
684
+
685
+ 0:19:33.793 --> 0:19:46.669
686
+ That we need another concept, and that is
687
+ this concept of alignment, and now you can
688
+
689
+ 0:19:46.669 --> 0:19:47.578
690
+ have.
691
+
692
+ 0:19:47.667 --> 0:19:53.113
693
+ Since this is quite complicated, the alignment
694
+ in general can be complex.
695
+
696
+ 0:19:53.113 --> 0:19:55.689
697
+ It can be that it's not only like.
698
+
699
+ 0:19:55.895 --> 0:20:04.283
700
+ It can be that two words of a surrender target
701
+ sign and it's also imbiguous.
702
+
703
+ 0:20:04.283 --> 0:20:13.761
704
+ It can be that you say all these two words
705
+ only are aligned together and our words are
706
+
707
+ 0:20:13.761 --> 0:20:15.504
708
+ aligned or not.
709
+
710
+ 0:20:15.875 --> 0:20:21.581
711
+ Is should the do be aligned to the knot in
712
+ German?
713
+
714
+ 0:20:21.581 --> 0:20:29.301
715
+ It's only there because in German it's not,
716
+ so it should be aligned.
717
+
718
+ 0:20:30.510 --> 0:20:39.736
719
+ However, typically it's formalized and it's
720
+ formalized by a function from the target language.
721
+
722
+ 0:20:40.180 --> 0:20:44.051
723
+ And that is to make these models get easier
724
+ and clearer.
725
+
726
+ 0:20:44.304 --> 0:20:49.860
727
+ That means what means does it mean that you
728
+ have a fence that means that each.
729
+
730
+ 0:20:49.809 --> 0:20:58.700
731
+ A sewer's word gives target word and the alliance
732
+ to only one source word because the function
733
+
734
+ 0:20:58.700 --> 0:21:00.384
735
+ is also directly.
736
+
737
+ 0:21:00.384 --> 0:21:05.999
738
+ However, a source word can be hit or like
739
+ by signal target.
740
+
741
+ 0:21:06.286 --> 0:21:11.332
742
+ So you are allowing for one to many alignments,
743
+ but not for many to one alignment.
744
+
745
+ 0:21:11.831 --> 0:21:17.848
746
+ That is a bit of a challenge because you assume
747
+ a lightning should be symmetrical.
748
+
749
+ 0:21:17.848 --> 0:21:24.372
750
+ So if you look at a parallel sentence, it
751
+ should not matter if you look at it from German
752
+
753
+ 0:21:24.372 --> 0:21:26.764
754
+ to English or English to German.
755
+
756
+ 0:21:26.764 --> 0:21:34.352
757
+ So however, it makes these models: Yea possible
758
+ and we'll like to see yea for the phrase bass
759
+
760
+ 0:21:34.352 --> 0:21:36.545
761
+ until we need these alignments.
762
+
763
+ 0:21:36.836 --> 0:21:41.423
764
+ So this alignment was the most important of
765
+ the world based models.
766
+
767
+ 0:21:41.423 --> 0:21:47.763
768
+ For the next twenty years you need the world
769
+ based models to generate this type of alignment,
770
+
771
+ 0:21:47.763 --> 0:21:50.798
772
+ which is then the first step for the phrase.
773
+
774
+ 0:21:51.931 --> 0:21:59.642
775
+ Approach, and there you can then combine them
776
+ again like both directions into one we'll see.
777
+
778
+ 0:22:00.280 --> 0:22:06.850
779
+ This alignment is very important and allows
780
+ us to do this type of separation.
781
+
782
+ 0:22:08.308 --> 0:22:15.786
783
+ And yet the most commonly used word based
784
+ models are these models referred to as IBM
785
+
786
+ 0:22:15.786 --> 0:22:25.422
787
+ models, and there is a sequence of them with
788
+ great names: And they were like yeah very commonly
789
+
790
+ 0:22:25.422 --> 0:22:26.050
791
+ used.
792
+
793
+ 0:22:26.246 --> 0:22:31.719
794
+ We'll mainly focus on the simple one here
795
+ and look how this works and then not do all
796
+
797
+ 0:22:31.719 --> 0:22:34.138
798
+ the details about the further models.
799
+
800
+ 0:22:34.138 --> 0:22:38.084
801
+ The interesting thing is also that all of
802
+ them are important.
803
+
804
+ 0:22:38.084 --> 0:22:43.366
805
+ So if you want to train this alignment what
806
+ you normally do is train an IBM model one.
807
+
808
+ 0:22:43.743 --> 0:22:50.940
809
+ Then you take that as your initialization
810
+ to then train the IBM model too and so on.
811
+
812
+ 0:22:50.940 --> 0:22:53.734
813
+ The motivation for that is yeah.
814
+
815
+ 0:22:53.734 --> 0:23:00.462
816
+ The first model gives you: Is so simple that
817
+ you can even find a global optimum, so it gives
818
+
819
+ 0:23:00.462 --> 0:23:06.403
820
+ you a good starting point for the next one
821
+ where the optimization in finding the right
822
+
823
+ 0:23:06.403 --> 0:23:12.344
824
+ model is more difficult and therefore like
825
+ the defore technique was to make your model
826
+
827
+ 0:23:12.344 --> 0:23:13.641
828
+ step by step more.
829
+
830
+ 0:23:15.195 --> 0:23:27.333
831
+ In these models we are breaking down the probability
832
+ into smaller steps and then we can define:
833
+
834
+ 0:23:27.367 --> 0:23:38.981
835
+ You see it's not a bit different, so it's not
836
+ the curability and one specific alignment given.
837
+
838
+ 0:23:39.299 --> 0:23:42.729
839
+ We'll let us learn how we can then go from
840
+ one alignment to the full set.
841
+
842
+ 0:23:43.203 --> 0:23:52.889
843
+ The probability of target sentences and one
844
+ alignment between the source and target sentences
845
+
846
+ 0:23:52.889 --> 0:23:56.599
847
+ alignment is this type of function.
848
+
849
+ 0:23:57.057 --> 0:24:14.347
850
+ That every word is aligned in order to ensure
851
+ that every word is aligned.
852
+
853
+ 0:24:15.835 --> 0:24:28.148
854
+ So first of all you do some epsilon, the epsilon
855
+ is just a normalization factor that everything
856
+
857
+ 0:24:28.148 --> 0:24:31.739
858
+ is somehow to inferability.
859
+
860
+ 0:24:31.631 --> 0:24:37.539
861
+ Of source sentences plus one to the power
862
+ of the length of the targets.
863
+
864
+ 0:24:37.937 --> 0:24:50.987
865
+ And this is somehow the probability of this
866
+ alignment.
867
+
868
+ 0:24:51.131 --> 0:24:53.224
869
+ So is this alignment probable or not?
870
+
871
+ 0:24:53.224 --> 0:24:55.373
872
+ Of course you can have some intuition.
873
+
874
+ 0:24:55.373 --> 0:24:58.403
875
+ So if there's a lot of crossing, it may be
876
+ not a good.
877
+
878
+ 0:24:58.403 --> 0:25:03.196
879
+ If all of the words align to the same one
880
+ might be not a good alignment, but generally
881
+
882
+ 0:25:03.196 --> 0:25:06.501
883
+ it's difficult to really describe what is a
884
+ good alignment.
885
+
886
+ 0:25:07.067 --> 0:25:11.482
887
+ Say for the first model that's the most simple
888
+ thing.
889
+
890
+ 0:25:11.482 --> 0:25:18.760
891
+ What can be the most simple thing if you think
892
+ about giving a probability to some event?
893
+
894
+ 0:25:21.401 --> 0:25:25.973
895
+ Yes exactly, so just take the uniform distribution.
896
+
897
+ 0:25:25.973 --> 0:25:33.534
898
+ If we don't really know the best thing of
899
+ modeling is all equally probable, of course
900
+
901
+ 0:25:33.534 --> 0:25:38.105
902
+ that is not true, but it's giving you a good
903
+ study.
904
+
905
+ 0:25:38.618 --> 0:25:44.519
906
+ And so this one is just a number of all possible
907
+ alignments for this sentence.
908
+
909
+ 0:25:44.644 --> 0:25:53.096
910
+ So how many alignments are possible, so the
911
+ first target word can be allied to all sources
912
+
913
+ 0:25:53.096 --> 0:25:53.746
914
+ words.
915
+
916
+ 0:25:54.234 --> 0:26:09.743
917
+ The second one can also be aligned to all
918
+ source work, and the third one also to source.
919
+
920
+ 0:26:10.850 --> 0:26:13.678
921
+ This is the number of alignments.
922
+
923
+ 0:26:13.678 --> 0:26:19.002
924
+ The second part is to model the probability
925
+ of the translation.
926
+
927
+ 0:26:19.439 --> 0:26:31.596
928
+ And there it's not nice to have this function,
929
+ so now we are making the product over all target.
930
+
931
+ 0:26:31.911 --> 0:26:40.068
932
+ And we are making a very strong independence
933
+ assumption because in these models we normally
934
+
935
+ 0:26:40.068 --> 0:26:45.715
936
+ assume the translation probability of one word
937
+ is independent.
938
+
939
+ 0:26:46.126 --> 0:26:49.800
940
+ So how you translate and visit it is independent
941
+ of all the other parts.
942
+
943
+ 0:26:50.290 --> 0:26:52.907
944
+ That is very strong and very bad.
945
+
946
+ 0:26:52.907 --> 0:26:55.294
947
+ Yeah, you should do it better.
948
+
949
+ 0:26:55.294 --> 0:27:00.452
950
+ We know that it's wrong because how you translate
951
+ this depends on.
952
+
953
+ 0:27:00.452 --> 0:27:05.302
954
+ However, it's a first easy solution and again
955
+ a good starting point.
956
+
957
+ 0:27:05.966 --> 0:27:14.237
958
+ So what you do is that you take a product
959
+ of all words and take a translation probability
960
+
961
+ 0:27:14.237 --> 0:27:15.707
962
+ on this target.
963
+
964
+ 0:27:16.076 --> 0:27:23.901
965
+ And because we know that there is always one
966
+ source word aligned to it.
967
+
968
+ 0:27:24.344 --> 0:27:37.409
969
+ If the probability of visits in the zoo doesn't
970
+ really work, the good here I'm again.
971
+
972
+ 0:27:38.098 --> 0:27:51.943
973
+ So most only we have it here, so the probability
974
+ is epsilon divided by (I plus one) to the power of J.
975
+
976
+ 0:27:53.913 --> 0:27:58.401
977
+ And then there is somewhere in the last one.
978
+
979
+ 0:27:58.401 --> 0:28:04.484
980
+ There is an error, E and F are switched, so it is the
981
+ other way around.
982
+
983
+ 0:28:04.985 --> 0:28:07.511
984
+ Then you have your translation model.
985
+
986
+ 0:28:07.511 --> 0:28:12.498
987
+ Hopefully, let's assume you have your model
988
+ trained, so that's only assigning probabilities.
989
+
990
+ 0:28:12.953 --> 0:28:25.466
991
+ And then this sentence has the probability
992
+ of generating I visit a friend given that you
993
+
994
+ 0:28:25.466 --> 0:28:31.371
995
+ have the source sentence "ich besuche einen Freund".
996
+
997
+ 0:28:32.012 --> 0:28:34.498
998
+ Times ten to the power of minus five.
999
+
1000
+ 0:28:35.155 --> 0:28:36.098
1001
+ So this is your model.
1002
+
1003
+ 0:28:36.098 --> 0:28:37.738
1004
+ This is how you're applying your model.
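To make this concrete, here is a minimal sketch of the IBM Model 1 score P(e, a | f), assuming a toy lexicon with invented probabilities and the example sentence pair from above; it is an illustration, not the lecturer's code:

```python
# Minimal sketch: P(e, a | f) = epsilon / (I+1)^J * prod_j t(e_j | f_{a_j}).
# The lexicon probabilities below are invented for illustration.
t = {
    ("I", "ich"): 0.8, ("visit", "besuche"): 0.8,
    ("a", "einen"): 0.7, ("friend", "Freund"): 0.8,
}

def ibm1_joint(e, f, alignment, epsilon=1.0):
    # alignment[j] gives the index of the source word that e[j] is aligned to
    prob = epsilon / (len(f) + 1) ** len(e)        # uniform alignment prior
    for j, i in enumerate(alignment):
        prob *= t.get((e[j], f[i]), 1e-6)          # lexical translation probability
    return prob

f = ["ich", "besuche", "einen", "Freund"]
e = ["I", "visit", "a", "friend"]
print(ibm1_joint(e, f, [0, 1, 2, 3]))              # about 5.7e-4 with these numbers
```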
1005
+
1006
+ 0:28:39.479 --> 0:28:44.220
1007
+ As you said, it's the most simple model: you
1008
+ assume that all word translations are.
1009
+
1010
+ 0:28:44.204 --> 0:28:46.540
1011
+ Independent of each other.
1012
+
1013
+ 0:28:46.540 --> 0:28:54.069
1014
+ You assume that all alignments are equally
1015
+ important, and then the only thing you need
1016
+
1017
+ 0:28:54.069 --> 0:29:00.126
1018
+ for this type of model is to have this lexicon
1019
+ in order to calculate.
1020
+
1021
+ 0:29:00.940 --> 0:29:04.560
1022
+ And that is, of course, now the training process.
1023
+
1024
+ 0:29:04.560 --> 0:29:08.180
1025
+ The question is how do we get this type of
1026
+ lexicon?
1027
+
1028
+ 0:29:09.609 --> 0:29:15.461
1029
+ But before we look into the training, do you
1030
+ have any questions about the model itself?
1031
+
1032
+ 0:29:21.101 --> 0:29:26.816
1033
+ The problem in training is that we have incomplete
1034
+ data.
1035
+
1036
+ 0:29:26.816 --> 0:29:32.432
1037
+ So if you want to count, I mean said you want
1038
+ to count.
1039
+
1040
+ 0:29:33.073 --> 0:29:39.348
1041
+ However, if you don't have the alignment,
1042
+ on the other hand, if you would have a lexicon
1043
+
1044
+ 0:29:39.348 --> 0:29:44.495
1045
+ you could maybe generate the alignment, which
1046
+ is the most probable word.
1047
+
1048
+ 0:29:45.225 --> 0:29:55.667
1049
+ And this is the very common problem that you
1050
+ have this type of incomplete data where you
1051
+
1052
+ 0:29:55.667 --> 0:29:59.656
1053
+ have not one type of information.
1054
+
1055
+ 0:30:00.120 --> 0:30:08.767
1056
+ And you can model this by considering the
1057
+ alignment as your hidden variable and then
1058
+
1059
+ 0:30:08.767 --> 0:30:17.619
1060
+ you can use the expectation maximization algorithm
1061
+ in order to generate the alignment.
1062
+
1063
+ 0:30:17.577 --> 0:30:26.801
1064
+ So the nice thing is that you only need your
1065
+ parallel data, which is aligned on sentence
1066
+
1067
+ 0:30:26.801 --> 0:30:29.392
1068
+ level, but you normally.
1069
+
1070
+ 0:30:29.389 --> 0:30:33.720
1071
+ Is just a lot of work we saw last time.
1072
+
1073
+ 0:30:33.720 --> 0:30:39.567
1074
+ Typically what you have is this type of corpus
1075
+ where.
1076
+
1077
+ 0:30:41.561 --> 0:30:50.364
1078
+ And yeah, the EM algorithm sounds very fancy.
1079
+
1080
+ 0:30:50.364 --> 0:30:58.605
1081
+ However, again look at a little high level.
1082
+
1083
+ 0:30:58.838 --> 0:31:05.841
1084
+ So you're initializing a model by uniform
1085
+ distribution.
1086
+
1087
+ 0:31:05.841 --> 0:31:14.719
1088
+ You're just saying, for the lexicon, that all
1089
+ words are equally probable.
1090
+
1091
+ 0:31:15.215 --> 0:31:23.872
1092
+ And then you apply your model to the data,
1093
+ and that is your expectation step.
1094
+
1095
+ 0:31:23.872 --> 0:31:30.421
1096
+ So given this initial lexicon, we are now
1097
+ calculating the.
1098
+
1099
+ 0:31:30.951 --> 0:31:36.043
1100
+ So we can now take all our parallel sentences,
1101
+ and of course ought to check what is the most
1102
+
1103
+ 0:31:36.043 --> 0:31:36.591
1104
+ probable.
1105
+
1106
+ 0:31:38.338 --> 0:31:49.851
1107
+ And then, of course, at the beginning maybe
1108
+ 'house' is most often aligned.
1109
+
1110
+ 0:31:50.350 --> 0:31:58.105
1111
+ Once we have done this expectation step, we
1112
+ can next do the maximization step and based
1113
+
1114
+ 0:31:58.105 --> 0:32:06.036
1115
+ on this guest alignment, which we have, we
1116
+ can now learn better translation probabilities
1117
+
1118
+ 0:32:06.036 --> 0:32:09.297
1119
+ by just counting how often do words.
1120
+
1121
+ 0:32:09.829 --> 0:32:22.289
1122
+ And then you iterate these steps: We can make
1123
+ this whole process even more stable by not only taking
1124
+
1125
+ 0:32:22.289 --> 0:32:26.366
1126
+ the most probable alignment.
1127
+
1128
+ 0:32:26.346 --> 0:32:36.839
1129
+ Second step, but in contrast we calculate
1130
+ for all possible alignments the alignment probability
1131
+
1132
+ 0:32:36.839 --> 0:32:40.009
1133
+ and weigh the co-occurrences.
1134
+
1135
+ 0:32:40.000 --> 0:32:41.593
1136
+ by how probable things are.
1137
+
1138
+ 0:32:42.942 --> 0:32:49.249
1139
+ Why could that be very challenging if we do
1140
+ it in general and really calculate all probabilities
1141
+
1142
+ 0:32:49.249 --> 0:32:49.834
1143
+ for all?
1144
+
1145
+ 0:32:53.673 --> 0:32:55.905
1146
+ How many alignments are there for a sentence?
1147
+
1148
+ 0:32:58.498 --> 0:33:03.344
1149
+ Yes there, we just saw that in the formula
1150
+ if you remember.
1151
+
1152
+ 0:33:03.984 --> 0:33:12.336
1153
+ This was the formula so it's exponential in
1154
+ the lengths of the target sentence.
1155
+
1156
+ 0:33:12.336 --> 0:33:15.259
1157
+ It would calculate all the.
1158
+
1159
+ 0:33:15.415 --> 0:33:18.500
1160
+ Be very inefficient and not really possible.
1161
+
1162
+ 0:33:18.500 --> 0:33:25.424
1163
+ The nice thing is we can again use some type
1164
+ of dynamic programming, so then we can do this
1165
+
1166
+ 0:33:25.424 --> 0:33:27.983
1167
+ without really calculating all of it.
1168
+
1169
+ 0:33:28.948 --> 0:33:40.791
1170
+ We have the next five slides or so with the
1171
+ most equations in the whole lecture, so don't
1172
+
1173
+ 0:33:40.791 --> 0:33:41.713
1174
+ worry.
1175
+
1176
+ 0:33:42.902 --> 0:34:01.427
1177
+ So we said we have first the expectation step, where
1178
+ it is about calculating the alignment.
1179
+
1180
+ 0:34:02.022 --> 0:34:20.253
1181
+ And we can do this with our initial definition
1182
+ of because this formula.
1183
+
1184
+ 0:34:20.160 --> 0:34:25.392
1185
+ So we can define this as P of E and A given F divided by
1186
+ P of E given F.
1187
+
1188
+ 0:34:25.905 --> 0:34:30.562
1189
+ This is just the normal definition of a conditional
1190
+ probability.
1191
+
1192
+ 0:34:31.231 --> 0:34:37.937
1193
+ And what we then need to assume a meter calculate
1194
+ is P of E given.
1195
+
1196
+ 0:34:37.937 --> 0:34:41.441
1197
+ P of E given F is still again quite
1198
+
1199
+ 0:34:41.982 --> 0:34:56.554
1200
+ simple: The probability of the source sentence
1201
+ given the target sentence is quite intuitive.
1202
+
1203
+ 0:34:57.637 --> 0:35:15.047
1204
+ So let's just calculate how to calculate the
1205
+ probability of a event.
1206
+
1207
+ 0:35:15.215 --> 0:35:21.258
1208
+ So in here we can then put in our original
1209
+ form in our soils.
1210
+
1211
+ 0:35:21.201 --> 0:35:28.023
1212
+ There are some of the possible alignments
1213
+ of the first word, and so until the sum of
1214
+
1215
+ 0:35:28.023 --> 0:35:30.030
1216
+ all possible alignments.
1217
+
1218
+ 0:35:29.990 --> 0:35:41.590
1219
+ And then we have the probability here of the
1220
+ alignment type, this product of translation.
1221
+
1222
+ 0:35:42.562 --> 0:35:58.857
1223
+ Now this one is independent of the alignment,
1224
+ so we can put it to the front here.
1225
+
1226
+ 0:35:58.959 --> 0:36:03.537
1227
+ And now this is where dynamic programming
1228
+ works in.
1229
+
1230
+ 0:36:03.537 --> 0:36:08.556
1231
+ We can change that and make thereby things
1232
+ a lot easier.
1233
+
1234
+ 0:36:08.668 --> 0:36:21.783
1235
+ Can reform it like this just as a product
1236
+ over all target positions, and then it's the
1237
+
1238
+ 0:36:21.783 --> 0:36:26.456
1239
+ sum over all source positions.
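The rearrangement can be checked numerically; a small sketch with an arbitrary t-table (the identity is just the distributive law):

```python
from itertools import product
import math

# t_table[j][i] stands for t(e_j | f_i); the values are arbitrary.
t_table = [[0.1, 0.6, 0.3],
           [0.7, 0.2, 0.1],
           [0.2, 0.2, 0.6]]
J, I = len(t_table), len(t_table[0])

# Sum over all I^J alignments of the product of lexical probabilities.
exhaustive = 0.0
for a in product(range(I), repeat=J):
    p = 1.0
    for j, i in enumerate(a):
        p *= t_table[j][i]
    exhaustive += p

# Product over target positions of the sum over source positions.
factored = math.prod(sum(row) for row in t_table)

assert abs(exhaustive - factored) < 1e-12
print(exhaustive, factored)
```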
1240
+
1241
+ 0:36:27.127 --> 0:36:36.454
1242
+ Maybe at least the intuition why this is equal
1243
+ is a lot easier if you look into it as graphic.
1244
+
1245
+ 0:36:36.816 --> 0:36:39.041
1246
+ So what we have here is the table.
1247
+
1248
+ 0:36:39.041 --> 0:36:42.345
1249
+ We have the target position and the source
1250
+ position.
1251
+
1252
+ 0:36:42.862 --> 0:37:03.643
1253
+ And we have to sum up all possible passes
1254
+ through that: The nice thing is that each of
1255
+
1256
+ 0:37:03.643 --> 0:37:07.127
1257
+ these passes these probabilities are independent
1258
+ of each.
1259
+
1260
+ 0:37:07.607 --> 0:37:19.678
1261
+ In order to get the sum of all passes through
1262
+ this table you can use dynamic programming
1263
+
1264
+ 0:37:19.678 --> 0:37:27.002
1265
+ and then say oh this probability is exactly
1266
+ the same.
1267
+
1268
+ 0:37:26.886 --> 0:37:34.618
1269
+ as the sum of this column times the sum
1270
+ of this column, times the sum of this column.
1271
+
1272
+ 0:37:35.255 --> 0:37:41.823
1273
+ That is the same as if you go through all
1274
+ possible passes here and multiply always the
1275
+
1276
+ 0:37:41.823 --> 0:37:42.577
1277
+ elements.
1278
+
1279
+ 0:37:43.923 --> 0:37:54.227
1280
+ And that is a simplification because now we
1281
+ only have quadratic numbers and we don't have
1282
+
1283
+ 0:37:54.227 --> 0:37:55.029
1284
+ to go.
1285
+
1286
+ 0:37:55.355 --> 0:38:12.315
1287
+ Similar to guess you may be seen the same
1288
+ type of algorithm for what is it?
1289
+
1290
+ 0:38:14.314 --> 0:38:19.926
1291
+ Yeah, well yeah, so that is the saying.
1292
+
1293
+ 0:38:19.926 --> 0:38:31.431
1294
+ But yeah, I think graphically this is seeable
1295
+ if you don't know exactly the math.
1296
+
1297
+ 0:38:32.472 --> 0:38:49.786
1298
+ Now put these both together, so if you really
1299
+ want to take a piece of paper and put these two formulas
1300
+
1301
+ 0:38:49.786 --> 0:38:51.750
1302
+ together,.
1303
+
1304
+ 0:38:51.611 --> 0:38:56.661
1305
+ Eliminated and Then You Get Your Final Formula.
1306
+
1307
+ 0:38:56.716 --> 0:39:01.148
1308
+ And that somehow really makes now really intuitively
1309
+ again sense.
1310
+
1311
+ 0:39:01.401 --> 0:39:08.301
1312
+ So the probability of an alignment is the
1313
+ product over all target words, and then it's
1314
+
1315
+ 0:39:08.301 --> 0:39:15.124
1316
+ the probability of to translate a word into
1317
+ the word that is aligned to, divided by the sum
1318
+
1319
+ 0:39:15.124 --> 0:39:17.915
1320
+ of the other words in the sentence.
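Written out, the formula being described appears to be the standard Model 1 alignment posterior (with j running over target positions and i over source positions, including the null word):

```latex
P(a \mid e, f) = \prod_{j=1}^{J} \frac{t(e_j \mid f_{a_j})}{\sum_{i=0}^{I} t(e_j \mid f_i)}
```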
1321
+
1322
+ 0:39:18.678 --> 0:39:31.773
1323
+ If you look at this again, it makes real sense.
1324
+
1325
+ 0:39:31.891 --> 0:39:43.872
1326
+ So you're looking at how probable it is to
1327
+ translate compared to all the other words.
1328
+
1329
+ 0:39:43.872 --> 0:39:45.404
1330
+ So you're.
1331
+
1332
+ 0:39:45.865 --> 0:39:48.543
1333
+ So and that gives you the alignment probability.
1334
+
1335
+ 0:39:48.768 --> 0:39:54.949
1336
+ Somehow it's not only that it's mathematically
1337
+ correct if you look at it this way, it's somehow
1338
+
1339
+ 0:39:54.949 --> 0:39:55.785
1340
+ intuitively.
1341
+
1342
+ 0:39:55.785 --> 0:39:58.682
1343
+ So if you would say how good is it to align?
1344
+
1345
+ 0:39:58.638 --> 0:40:04.562
1346
+ We had to zoo him to visit, or yet it should
1347
+ depend on how good this is the translation
1348
+
1349
+ 0:40:04.562 --> 0:40:10.620
1350
+ probability compared to how good are the other
1351
+ words in the sentence, and how probable is
1352
+
1353
+ 0:40:10.620 --> 0:40:12.639
1354
+ it that I align them to them.
1355
+
1356
+ 0:40:15.655 --> 0:40:26.131
1357
+ Then you have the expectations that the next
1358
+ thing is now the maximization step, so we have
1359
+
1360
+ 0:40:26.131 --> 0:40:30.344
1361
+ now the probability of an alignment.
1362
+
1363
+ 0:40:31.451 --> 0:40:37.099
1364
+ Intuitively, that means how often are words
1365
+ aligned to each other giving this alignment
1366
+
1367
+ 0:40:37.099 --> 0:40:39.281
1368
+ or more in a perverse definition?
1369
+
1370
+ 0:40:39.281 --> 0:40:43.581
1371
+ What is the expectation value that they are
1372
+ aligned to each other?
1373
+
1374
+ 0:40:43.581 --> 0:40:49.613
1375
+ So if there's a lot of alignments with high probability
1376
+ that they're aligned to each other, then.
1377
+
1378
+ 0:40:50.050 --> 0:41:07.501
1379
+ So the count of E and F, given our parallel
1380
+ data is a sum of all possible alignments.
1381
+
1382
+ 0:41:07.968 --> 0:41:14.262
1383
+ That is, this count, and you don't do just
1384
+ count with absolute numbers, but you count
1385
+
1386
+ 0:41:14.262 --> 0:41:14.847
1387
+ always.
1388
+
1389
+ 0:41:15.815 --> 0:41:26.519
1390
+ And to make that translation probability is
1391
+ that you have to normalize it, of course, through:
1392
+
1393
+ 0:41:27.487 --> 0:41:30.584
1394
+ And that's then the whole model.
1395
+
1396
+ 0:41:31.111 --> 0:41:39.512
1397
+ It looks now maybe a bit mathematically complex.
1398
+
1399
+ 0:41:39.512 --> 0:41:47.398
1400
+ The whole training process is described here.
1401
+
1402
+ 0:41:47.627 --> 0:41:53.809
1403
+ So you really, really just have to collect
1404
+ these counts and later normalize that.
1405
+
1406
+ 0:41:54.134 --> 0:42:03.812
1407
+ So repeating that until convergence we have
1408
+ said, the EM algorithm is iterated again and again.
1409
+
1410
+ 0:42:04.204 --> 0:42:15.152
1411
+ Equally, then you go over all sentence pairs
1412
+ and all of words and calculate the translation.
1413
+
1414
+ 0:42:15.355 --> 0:42:17.983
1415
+ And then you go once again over.
1416
+
1417
+ 0:42:17.983 --> 0:42:22.522
1418
+ It counted this count, count given, and totally
1419
+ e-given.
1420
+
1421
+ 0:42:22.702 --> 0:42:35.316
1422
+ Initially how probable is the E translated
1423
+ to something else, and you normalize your translation
1424
+
1425
+ 0:42:35.316 --> 0:42:37.267
1426
+ probabilities.
1427
+
1428
+ 0:42:38.538 --> 0:42:45.761
1429
+ So this is an old training process for this
1430
+ type of model.
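As a sketch of this training loop (IBM Model 1 EM) on an assumed toy corpus -- the lecture's exact example words are only partly recoverable from the transcript:

```python
from collections import defaultdict

# Assumed toy corpus of (source, target) sentence pairs.
corpus = [("das Haus".split(), "the house".split()),
          ("das Buch".split(), "the book".split()),
          ("ein Buch".split(), "a book".split())]

src_vocab = {f for fs, _ in corpus for f in fs}
t = defaultdict(lambda: 1.0 / len(src_vocab))   # uniform initialization of t(e|f)

for _ in range(10):                              # repeat until convergence
    count = defaultdict(float)                   # expected counts c(e, f)
    total = defaultdict(float)                   # expected counts c(f)
    for fs, es in corpus:
        for e in es:
            z = sum(t[(e, f)] for f in fs)       # normalization for this target word
            for f in fs:
                count[(e, f)] += t[(e, f)] / z   # weighted (soft) co-occurrence
                total[f] += t[(e, f)] / z
    for (e, f), c in count.items():              # maximization: renormalize counts
        t[(e, f)] = c / total[f]

print(round(t[("house", "Haus")], 2), round(t[("the", "das")], 2))
```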
1431
+
1432
+ 0:42:46.166 --> 0:43:00.575
1433
+ How that then works is shown here a bit, so
1434
+ we have a very simple corpus.
1435
+
1436
+ 0:43:01.221 --> 0:43:12.522
1437
+ And as we said, you initialize your translation
1438
+ with yes or possible translations, so dusk
1439
+
1440
+ 0:43:12.522 --> 0:43:16.620
1441
+ can be aligned to the bookhouse.
1442
+
1443
+ 0:43:16.997 --> 0:43:25.867
1444
+ And the other ones are missing because only
1445
+ a curse with and book, and then the others
1446
+
1447
+ 0:43:25.867 --> 0:43:26.988
1448
+ will soon.
1449
+
1450
+ 0:43:27.127 --> 0:43:34.316
1451
+ In the initial way your vocabulary is four
1452
+ words, so the initial probabilities are all:
1453
+
1454
+ 0:43:34.794 --> 0:43:50.947
1455
+ And then if you iterate you see that the things
1456
+ which occur often and then get alignments get
1457
+
1458
+ 0:43:50.947 --> 0:43:53.525
1459
+ more and more.
1460
+
1461
+ 0:43:55.615 --> 0:44:01.506
1462
+ In reality, of course, you won't get like
1463
+ zero alignments, but you would normally get
1464
+
1465
+ 0:44:01.506 --> 0:44:02.671
1466
+ there sometimes.
1467
+
1468
+ 0:44:03.203 --> 0:44:05.534
1469
+ But as the probability increases.
1470
+
1471
+ 0:44:05.785 --> 0:44:17.181
1472
+ The training process is also guaranteed that
1473
+ the probability of your training data is always
1474
+
1475
+ 0:44:17.181 --> 0:44:20.122
1476
+ increased in iteration.
1477
+
1478
+ 0:44:21.421 --> 0:44:27.958
1479
+ You see that the model tries to model your
1480
+ training data and give you at least good models.
1481
+
1482
+ 0:44:30.130 --> 0:44:37.765
1483
+ Okay, are there any more questions to the
1484
+ training of these type of word-based models?
1485
+
1486
+ 0:44:38.838 --> 0:44:54.790
1487
+ Initially there are like four words on the source
1488
+ side, so it's just one fourth for the equal distribution.
1489
+
1490
+ 0:44:55.215 --> 0:45:01.888
1491
+ So each target word, the probability of the
1492
+ target word, is at four target words, so the
1493
+
1494
+ 0:45:01.888 --> 0:45:03.538
1495
+ uniform distribution.
1496
+
1497
+ 0:45:07.807 --> 0:45:14.430
1498
+ However, there is problems with this initial
1499
+ order and we have this already mentioned at
1500
+
1501
+ 0:45:14.430 --> 0:45:15.547
1502
+ the beginning.
1503
+
1504
+ 0:45:15.547 --> 0:45:21.872
1505
+ There is for example things that yeah you
1506
+ want to allow for reordering but there are
1507
+
1508
+ 0:45:21.872 --> 0:45:27.081
1509
+ definitely some alignments which should be
1510
+ more probable than others.
1511
+
1512
+ 0:45:27.347 --> 0:45:42.333
1513
+ So a friend visit should have a lower probability
1514
+ than visit a friend.
1515
+
1516
+ 0:45:42.302 --> 0:45:50.233
1517
+ It's not always monotone, there is some
1518
+ reordering happening, but if you just mix it
1519
+
1520
+ 0:45:50.233 --> 0:45:51.782
1521
+ crazy, it's not.
1522
+
1523
+ 0:45:52.252 --> 0:46:11.014
1524
+ You have things like one-to-many alignments
1525
+ and they are not really models.
1526
+
1527
+ 0:46:11.491 --> 0:46:17.066
1528
+ But it shouldn't be that you align one word
1529
+ to all the others, and that is, you don't want
1530
+
1531
+ 0:46:17.066 --> 0:46:18.659
1532
+ this type of probability.
1533
+
1534
+ 0:46:19.199 --> 0:46:27.879
1535
+ You don't want to align to null, so there's
1536
+ nothing about that and how to deal with other
1537
+
1538
+ 0:46:27.879 --> 0:46:30.386
1539
+ words on the source side.
1540
+
1541
+ 0:46:32.272 --> 0:46:45.074
1542
+ And therefore this was only like the initial
1543
+ model in there.
1544
+
1545
+ 0:46:45.325 --> 0:46:47.639
1546
+ Models, which we saw.
1547
+
1548
+ 0:46:47.639 --> 0:46:57.001
1549
+ They only model the translation probability,
1550
+ so how probable is it to translate one word
1551
+
1552
+ 0:46:57.001 --> 0:46:58.263
1553
+ to another?
1554
+
1555
+ 0:46:58.678 --> 0:47:05.915
1556
+ What you could then add is the absolute position.
1557
+
1558
+ 0:47:05.915 --> 0:47:16.481
1559
+ Yeah, the second word should more probable
1560
+ align to the second position.
1561
+
1562
+ 0:47:17.557 --> 0:47:22.767
1563
+ We add a fertility model that means one word
1564
+ is mostly translated into one word.
1565
+
1566
+ 0:47:23.523 --> 0:47:29.257
1567
+ For example, we saw it there that should be
1568
+ translated into two words, but most words should
1569
+
1570
+ 0:47:29.257 --> 0:47:32.463
1571
+ be one to one, and it's even modeled for each
1572
+ word.
1573
+
1574
+ 0:47:32.463 --> 0:47:37.889
1575
+ So for each source word, how probable is it
1576
+ that it is translated to one, two, three or
1577
+
1578
+ 0:47:37.889 --> 0:47:38.259
1579
+ more?
1580
+
1581
+ 0:47:40.620 --> 0:47:50.291
1582
+ Then IBM Model 4 adds relative positions,
1583
+ so it's asks: Maybe instead of modeling, how
1584
+
1585
+ 0:47:50.291 --> 0:47:55.433
1586
+ probable is it that you translate from position
1587
+ five to position twenty five?
1588
+
1589
+ 0:47:55.433 --> 0:48:01.367
1590
+ It's not a very good way, but in a relative
1591
+ position instead of what you try to model it.
1592
+
1593
+ 0:48:01.321 --> 0:48:06.472
1594
+ How probable is it that you are jumping three
1595
+ steps forward or three steps back?
1596
+
1597
+ 0:48:07.287 --> 0:48:15.285
1598
+ However, this makes sense more complex because
1599
+ what is a jump forward and a jump backward
1600
+
1601
+ 0:48:15.285 --> 0:48:16.885
1602
+ is not that easy.
1603
+
1604
+ 0:48:18.318 --> 0:48:30.423
1605
+ You want to have a model that describes reality,
1606
+ so every sentence that is not possible should
1607
+
1608
+ 0:48:30.423 --> 0:48:37.304
1609
+ have the probability zero because that cannot
1610
+ happen.
1611
+
1612
+ 0:48:37.837 --> 0:48:48.037
1613
+ However, with this type of IBM model four
1614
+ this has a positive probability, so it makes
1615
+
1616
+ 0:48:48.037 --> 0:48:54.251
1617
+ a sentence more complex and you can easily
1618
+ check it.
1619
+
1620
+ 0:48:57.457 --> 0:49:09.547
1621
+ So these models were the first models which
1622
+ tried to directly model and where they are
1623
+
1624
+ 0:49:09.547 --> 0:49:14.132
1625
+ the first to do the translation.
1626
+
1627
+ 0:49:14.414 --> 0:49:19.605
1628
+ So in all of these models, the probability
1629
+ of a word translating into another word is
1630
+
1631
+ 0:49:19.605 --> 0:49:25.339
1632
+ always independent of all the other translations,
1633
+ and that is a challenge because we know that
1634
+
1635
+ 0:49:25.339 --> 0:49:26.486
1636
+ this is not right.
1637
+
1638
+ 0:49:26.967 --> 0:49:32.342
1639
+ And therefore we will come now to then the
1640
+ phrase-based translation models.
1641
+
1642
+ 0:49:35.215 --> 0:49:42.057
1643
+ However, this word alignment is the very important
1644
+ concept which was used in phrase based.
1645
+
1646
+ 0:49:42.162 --> 0:49:50.559
1647
+ Even when people use phrase based, they first
1648
+ would always train a word based model not to
1649
+
1650
+ 0:49:50.559 --> 0:49:56.188
1651
+ get the real model but only to get this type
1652
+ of alignment.
1653
+
1654
+ 0:49:57.497 --> 0:50:01.343
1655
+ What was the main idea of a phrase based machine
1656
+ translation?
1657
+
1658
+ 0:50:03.223 --> 0:50:08.898
1659
+ It's not only that things got mathematically
1660
+ a lot more simple here because you don't try
1661
+
1662
+ 0:50:08.898 --> 0:50:13.628
1663
+ to express the whole translation process, but
1664
+ it's a discriminative model.
1665
+
1666
+ 0:50:13.628 --> 0:50:19.871
1667
+ So what you only try to model is this translation
1668
+ probability or is this translation more probable
1669
+
1670
+ 0:50:19.871 --> 0:50:20.943
1671
+ than some other.
1672
+
1673
+ 0:50:24.664 --> 0:50:28.542
1674
+ The main idea is that the basic units
1675
+ are the phrases.
1676
+
1677
+ 0:50:28.542 --> 0:50:31.500
1678
+ That's why it's called phrase-based translation.
1679
+
1680
+ 0:50:31.500 --> 0:50:35.444
1681
+ You have to be aware that these are not linguistic
1682
+ phrases.
1683
+
1684
+ 0:50:35.444 --> 0:50:39.124
1685
+ I guess you have some intuition about what
1686
+ is a phrase.
1687
+
1688
+ 0:50:39.399 --> 0:50:45.547
1689
+ You would express as a phrase.
1690
+
1691
+ 0:50:45.547 --> 0:50:58.836
1692
+ However, you wouldn't say that is a very good
1693
+ phrase because it's.
1694
+
1695
+ 0:50:59.339 --> 0:51:06.529
1696
+ However, in this machine learning-based motivated
1697
+ thing, phrases are just indicative.
1698
+
1699
+ 0:51:07.127 --> 0:51:08.832
1700
+ So it can be any split.
1701
+
1702
+ 0:51:08.832 --> 0:51:12.455
1703
+ We don't consider linguistically motivated
1704
+ or not.
1705
+
1706
+ 0:51:12.455 --> 0:51:15.226
1707
+ It can be any sequence of consecutive words.
1708
+
1709
+ 0:51:15.335 --> 0:51:16.842
1710
+ That's the Only Important Thing.
1711
+
1712
+ 0:51:16.977 --> 0:51:25.955
1713
+ The phrase is always a thing of consecutive
1714
+ words, and the motivation behind that is getting
1715
+
1716
+ 0:51:25.955 --> 0:51:27.403
1717
+ computational.
1718
+
1719
+ 0:51:27.387 --> 0:51:35.912
1720
+ People have looked into how you can also discontinuous
1721
+ phrases, which might be very helpful if you
1722
+
1723
+ 0:51:35.912 --> 0:51:38.237
1724
+ think about German harbor.
1725
+
1726
+ 0:51:38.237 --> 0:51:40.046
1727
+ Has this one phrase?
1728
+
1729
+ 0:51:40.000 --> 0:51:47.068
1730
+ There's two phrases, although there's many
1731
+ things in between, but in order to make things
1732
+
1733
+ 0:51:47.068 --> 0:51:52.330
1734
+ still possible and runnable, it's always
1735
+ like consecutive words.
1736
+
1737
+ 0:51:53.313 --> 0:52:05.450
1738
+ The nice thing is that on the one hand you
1739
+ don't need this word to word correspondence
1740
+
1741
+ 0:52:05.450 --> 0:52:06.706
1742
+ anymore.
1743
+
1744
+ 0:52:06.906 --> 0:52:17.088
1745
+ You now need to invent some type of alignment
1746
+ that in this case doesn't really make sense.
1747
+
1748
+ 0:52:17.417 --> 0:52:21.710
1749
+ So you can just learn okay, you have this
1750
+ phrase and this phrase and their translation.
1751
+
1752
+ 0:52:22.862 --> 0:52:25.989
1753
+ Secondly, we can add a bit of context into
1754
+ that.
1755
+
1756
+ 0:52:26.946 --> 0:52:43.782
1757
+ You're saying, for example, of Ultimate Customs
1758
+ and of My Shift.
1759
+
1760
+ 0:52:44.404 --> 0:52:51.443
1761
+ And this was difficult to model and work based
1762
+ models because they always model the translation.
1763
+
1764
+ 0:52:52.232 --> 0:52:57.877
1765
+ Here you can have phrases where you have more
1766
+ context and just jointly translate the phrases,
1767
+
1768
+ 0:52:57.877 --> 0:53:03.703
1769
+ and if you then have seen all by the question
1770
+ as a phrase you can directly use that to generate.
1771
+
1772
+ 0:53:08.468 --> 0:53:19.781
1773
+ Okay, before we go into how to do that, then
1774
+ we start, so the start is when we start with
1775
+
1776
+ 0:53:19.781 --> 0:53:21.667
1777
+ the alignment.
1778
+
1779
+ 0:53:22.022 --> 0:53:35.846
1780
+ So that is what we get from the word-based
1781
+ model and we are assuming to get the.
1782
+
1783
+ 0:53:36.356 --> 0:53:40.786
1784
+ So that is your starting point.
1785
+
1786
+ 0:53:40.786 --> 0:53:47.846
1787
+ You have a certain sentence and one most probable.
1788
+
1789
+ 0:53:48.989 --> 0:54:11.419
1790
+ The challenge you now have is that these alignments
1791
+ are: On the one hand, a source word like hit
1792
+
1793
+ 0:54:11.419 --> 0:54:19.977
1794
+ several times with one source word can be aligned
1795
+ to several: So in this case you see that for
1796
+
1797
+ 0:54:19.977 --> 0:54:29.594
1798
+ example Bisher is aligned to three words, so
1799
+ this can be the alignment from English to German,
1800
+
1801
+ 0:54:29.594 --> 0:54:32.833
1802
+ but it cannot be the alignment.
1803
+
1804
+ 0:54:33.273 --> 0:54:41.024
1805
+ In order to address for this inconsistency
1806
+ and being able to do that, what you typically
1807
+
1808
+ 0:54:41.024 --> 0:54:49.221
1809
+ then do is: If you have this inconsistency
1810
+ and you get different things in both directions,.
1811
+
1812
+ 0:54:54.774 --> 0:55:01.418
1813
+ In machine translation to do that you just
1814
+ do it in both directions and somehow combine
1815
+
1816
+ 0:55:01.418 --> 0:55:08.363
1817
+ them because both will make errors and the hope
1818
+ is yeah if you know both things you minimize.
1819
+
1820
+ 0:55:08.648 --> 0:55:20.060
1821
+ So you would also do it in the other direction
1822
+ and get a different type of alignment, for example
1823
+
1824
+ 0:55:20.060 --> 0:55:22.822
1825
+ that you now have saw.
1826
+
1827
+ 0:55:23.323 --> 0:55:37.135
1828
+ So in this way you are having two alignments
1829
+ and the question is now how do get one alignment
1830
+
1831
+ 0:55:37.135 --> 0:55:38.605
1832
+ and what?
1833
+
1834
+ 0:55:38.638 --> 0:55:45.828
1835
+ There were a lot of different types of heuristics.
1836
+
1837
+ 0:55:45.828 --> 0:55:55.556
1838
+ They normally start with intersection because
1839
+ you should trust them.
1840
+
1841
+ 0:55:55.996 --> 0:55:59.661
1842
+ And your maximum will could take this, the
1843
+ union thought,.
1844
+
1845
+ 0:55:59.980 --> 0:56:04.679
1846
+ If one of the systems says they are not aligned
1847
+ then maybe you should not align them.
1848
+
1849
+ 0:56:05.986 --> 0:56:12.240
1850
+ The only question they are different is what
1851
+ should I do about things where they don't agree?
1852
+
1853
+ 0:56:12.240 --> 0:56:18.096
1854
+ So where only one of them aligns and then
1855
+ you have heuristics depending on other words
1856
+
1857
+ 0:56:18.096 --> 0:56:22.288
1858
+ around it, you can decide should I align them
1859
+ or should I not.
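A tiny sketch of this combination step, with a made-up pair of alignments: the intersection is what you trust, and the union minus the intersection is what the heuristics have to decide about.

```python
# Alignment points as (source_index, target_index); both sets are invented.
src_to_tgt = {(0, 0), (1, 2), (2, 1), (3, 1)}
tgt_to_src = {(0, 0), (1, 2), (3, 1), (3, 3)}

intersection = src_to_tgt & tgt_to_src      # high-precision points
union = src_to_tgt | tgt_to_src             # upper bound for the final alignment

print(sorted(intersection))
print(sorted(union - intersection))         # points left to the heuristics
```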
1860
+
1861
+ 0:56:24.804 --> 0:56:34.728
1862
+ So that is your first step and then the second
1863
+ step in your model.
1864
+
1865
+ 0:56:34.728 --> 0:56:41.689
1866
+ So now you have one alignment for the process.
1867
+
1868
+ 0:56:42.042 --> 0:56:47.918
1869
+ And the idea is that we will now extract all
1870
+ phrase pairs to combinations of source and
1871
+
1872
+ 0:56:47.918 --> 0:56:51.858
1873
+ target phrases where they are consistent within
1874
+ alignment.
1875
+
1876
+ 0:56:52.152 --> 0:56:57.980
1877
+ The idea is a consistence with an alignment
1878
+ that should be a good example and that we can
1879
+
1880
+ 0:56:57.980 --> 0:56:58.563
1881
+ extract.
1882
+
1883
+ 0:56:59.459 --> 0:57:14.533
1884
+ And there are three conditions where we say
1885
+ an alignment has to be consistent.
1886
+
1887
+ 0:57:14.533 --> 0:57:17.968
1888
+ The first one is.
1889
+
1890
+ 0:57:18.318 --> 0:57:24.774
1891
+ So if you add bisher, then it's in your phrase.
1892
+
1893
+ 0:57:24.774 --> 0:57:32.306
1894
+ All the three words up till and now should
1895
+ be in there.
1896
+
1897
+ 0:57:32.492 --> 0:57:42.328
1898
+ So Bisheret Till would not be a valid phrase
1899
+ pair in this case, but for example Bisheret
1900
+
1901
+ 0:57:42.328 --> 0:57:43.433
1902
+ Till now.
1903
+
1904
+ 0:57:45.525 --> 0:58:04.090
1905
+ Does anybody now have already an idea about
1906
+ the second rule that should be there?
1907
+
1908
+ 0:58:05.325 --> 0:58:10.529
1909
+ Yes, that is exactly the other thing.
1910
+
1911
+ 0:58:10.529 --> 0:58:22.642
1912
+ If a target word is in the phrase pair, there
1913
+ are also: Then there is one very obvious one.
1914
+
1915
+ 0:58:22.642 --> 0:58:28.401
1916
+ If you strike a phrase pair, at least one
1917
+ word in the phrase.
1918
+
1919
+ 0:58:29.069 --> 0:58:32.686
1920
+ And this is a knife with working.
1921
+
1922
+ 0:58:32.686 --> 0:58:40.026
1923
+ However, in reality a captain will select
1924
+ some part of the sentence.
1925
+
1926
+ 0:58:40.380 --> 0:58:47.416
1927
+ You can take any possible combination of source
1928
+ and target words for this part, and that of
1929
+
1930
+ 0:58:47.416 --> 0:58:54.222
1931
+ course is not very helpful because you just
1932
+ have no idea, and therefore it says at least
1933
+
1934
+ 0:58:54.222 --> 0:58:58.735
1935
+ one source word should be aligned to one target word
1936
+ to prevent.
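These three conditions can be written down directly; a sketch with 0-based word positions and an invented example alignment, not the exact extraction code used in practice:

```python
def extract_phrases(n_src, n_tgt, links, max_len=4):
    """links: set of (src_index, tgt_index) word-alignment points."""
    phrases = []
    for s1 in range(n_src):
        for s2 in range(s1, min(s1 + max_len, n_src)):
            for t1 in range(n_tgt):
                for t2 in range(t1, min(t1 + max_len, n_tgt)):
                    # condition 3: at least one alignment point inside the box
                    inside = any(s1 <= s <= s2 and t1 <= t <= t2 for s, t in links)
                    # conditions 1 and 2: no point may cross the box boundary,
                    # i.e. be inside on one side but outside on the other
                    crossing = any((s1 <= s <= s2) != (t1 <= t <= t2) for s, t in links)
                    if inside and not crossing:
                        phrases.append(((s1, s2), (t1, t2)))
    return phrases

# Three source and three target words, aligned 0-0, 1-2, 2-1.
print(extract_phrases(3, 3, {(0, 0), (1, 2), (2, 1)}))
```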
1937
+
1938
+ 0:58:59.399 --> 0:59:09.615
1939
+ But still, it means that if you have normally
1940
+ analyzed words, the more analyzed words you
1941
+
1942
+ 0:59:09.615 --> 0:59:10.183
1943
+ can.
1944
+
1945
+ 0:59:10.630 --> 0:59:13.088
1946
+ That's not true for the very extreme case.
1947
+
1948
+ 0:59:13.088 --> 0:59:17.603
1949
+ If no word is aligned you can extract nothing
1950
+ because you can never fulfill it.
1951
+
1952
+ 0:59:17.603 --> 0:59:23.376
1953
+ However, if only for example one word is aligned
1954
+ then you can align a lot of different possibilities
1955
+
1956
+ 0:59:23.376 --> 0:59:28.977
1957
+ because you can start with this word and then
1958
+ add source words or target words or any combination
1959
+
1960
+ 0:59:28.977 --> 0:59:29.606
1961
+ of source.
1962
+
1963
+ 0:59:30.410 --> 0:59:37.585
1964
+ So there was typically a problem that if you
1965
+ have too few works in light you can really
1966
+
1967
+ 0:59:37.585 --> 0:59:38.319
1968
+ extract.
1969
+
1970
+ 0:59:38.558 --> 0:59:45.787
1971
+ If you think about this already here you can
1972
+ extract very, very many phrase pairs from:
1973
+
1974
+ 0:59:45.845 --> 0:59:55.476
1975
+ So what you can extract is, for example, what
1976
+ we saw up and so on.
1977
+
1978
+ 0:59:55.476 --> 1:00:00.363
1979
+ So all of them will be extracted.
1980
+
1981
+ 1:00:00.400 --> 1:00:08.379
1982
+ In order to limit this you typically have
1983
+ a length limit so you can only extract phrases
1984
+
1985
+ 1:00:08.379 --> 1:00:08.738
1986
+ up.
1987
+
1988
+ 1:00:09.049 --> 1:00:18.328
1989
+ But still there these phrases where you have
1990
+ all these phrases extracted.
1991
+
1992
+ 1:00:18.328 --> 1:00:22.968
1993
+ You have to think about how to deal.
1994
+
1995
+ 1:00:26.366 --> 1:00:34.966
1996
+ Now we have the phrases, so the other question
1997
+ is what is a good phrase pair and not so good.
1998
+
1999
+ 1:00:35.255 --> 1:00:39.933
2000
+ You might be that you sometimes extract one
2001
+ which is explaining this sentence but is not
2002
+
2003
+ 1:00:39.933 --> 1:00:44.769
2004
+ really a good one because there is something
2005
+ ever in there or something special so it might
2006
+
2007
+ 1:00:44.769 --> 1:00:47.239
2008
+ not be a good phrase pair in another situation.
2009
+
2010
+ 1:00:49.629 --> 1:00:59.752
2011
+ And therefore the easiest thing is again just
2012
+ count, and if a phrase pair occurs very often
2013
+
2014
+ 1:00:59.752 --> 1:01:03.273
2015
+ seems to be a good phrase pair.
2016
+
2017
+ 1:01:03.743 --> 1:01:05.185
2018
+ So if we have this one.
2019
+
2020
+ 1:01:05.665 --> 1:01:09.179
2021
+ And if you have the exam up till now,.
2022
+
2023
+ 1:01:09.469 --> 1:01:20.759
2024
+ Then you look how often does up till now to
2025
+ this hair occur?
2026
+
2027
+ 1:01:20.759 --> 1:01:28.533
2028
+ How often does up until now to this hair?
2029
+
2030
+ 1:01:30.090 --> 1:01:36.426
2031
+ So this is one way of yeah describing the
2032
+ quality of the phrase pair.
2033
+
2034
+ 1:01:37.257 --> 1:01:47.456
2035
+ So one difference is now, and that is the
2036
+ advantage of these primitive models.
2037
+
2038
+ 1:01:47.867 --> 1:01:55.442
2039
+ But instead we are trying to have a lot of
2040
+ features describing how good a phrase pair
2041
+
2042
+ 1:01:55.442 --> 1:01:55.786
2043
+ is.
2044
+
2045
+ 1:01:55.786 --> 1:02:04.211
2046
+ One of these features is this one describing:
2047
+ But in this model we'll later see how to combine
2048
+
2049
+ 1:02:04.211 --> 1:02:04.515
2050
+ it.
2051
+
2052
+ 1:02:04.515 --> 1:02:10.987
2053
+ The nice thing is we can invent any other
2054
+ type of features and add that and normally
2055
+
2056
+ 1:02:10.987 --> 1:02:14.870
2057
+ if you have two or three metrics to describe
2058
+ then.
2059
+
2060
+ 1:02:15.435 --> 1:02:18.393
2061
+ And therefore the spray spray sprays.
2062
+
2063
+ 1:02:18.393 --> 1:02:23.220
2064
+ They were not only like evaluated by one type
2065
+ but by several.
2066
+
2067
+ 1:02:23.763 --> 1:02:36.580
2068
+ So this could, for example, have a problem
2069
+ because your target phrase here occurs only
2070
+
2071
+ 1:02:36.580 --> 1:02:37.464
2072
+ once.
2073
+
2074
+ 1:02:38.398 --> 1:02:46.026
2075
+ It will of course only occur with one other
2076
+ source trait, and that probability will be
2077
+
2078
+ 1:02:46.026 --> 1:02:53.040
2079
+ one which might not be a very good estimation
2080
+ because you've only seen it once.
2081
+
2082
+ 1:02:53.533 --> 1:02:58.856
2083
+ Therefore, we use additional ones to better
2084
+ deal with that, and the first thing is we're
2085
+
2086
+ 1:02:58.856 --> 1:02:59.634
2087
+ doing again.
2088
+
2089
+ 1:02:59.634 --> 1:03:01.129
2090
+ Yeah, we know it by now.
2091
+
2092
+ 1:03:01.129 --> 1:03:06.692
2093
+ If you look at it in the one direction, it's
2094
+ helpful to us to look into the other direction.
2095
+
2096
+ 1:03:06.692 --> 1:03:11.297
2097
+ So you take also the inverse probability,
2098
+ so you not only take P of E given
2099
+
2100
+ 1:03:11.297 --> 1:03:11.477
2101
+ G.
2102
+
2103
+ 1:03:11.477 --> 1:03:11.656
2104
+ M.
2105
+
2106
+ 1:03:11.656 --> 1:03:12.972
2107
+ F, but also P of F given E.
2108
+
2109
+ 1:03:13.693 --> 1:03:19.933
2110
+ And then in addition you say maybe for the
2111
+ especially prolonged phrases they occur rarely,
2112
+
2113
+ 1:03:19.933 --> 1:03:25.898
2114
+ and then you have very high probabilities,
2115
+ and that might not be always the right one.
2116
+
2117
+ 1:03:25.898 --> 1:03:32.138
2118
+ So maybe it's good to also look at the word
2119
+ based probabilities to represent how good they
2120
+
2121
+ 1:03:32.138 --> 1:03:32.480
2122
+ are.
2123
+
2124
+ 1:03:32.692 --> 1:03:44.202
2125
+ So in addition you take the word-based probabilities
2126
+ of this phrase pair as an additional model.
2127
+
2128
+ 1:03:44.704 --> 1:03:52.828
2129
+ So then you would have in total four different
2130
+ values describing how good the phrase is.
2131
+
2132
+ 1:03:52.828 --> 1:04:00.952
2133
+ It would be the relative frequencies in
2134
+ both directions and the lexical probabilities.
2135
+
2136
+ 1:04:01.361 --> 1:04:08.515
2137
+ So four values in describing how probable
2138
+ a phrase translation is.
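A sketch of the two relative-frequency scores, computed from extraction counts in both directions (the counts are invented):

```python
from collections import Counter

pair_count = Counter({("bisher", "up till now"): 20,
                      ("bisher", "until now"): 5,
                      ("bis jetzt", "up till now"): 5})
src_total, tgt_total = Counter(), Counter()
for (f, e), c in pair_count.items():
    src_total[f] += c
    tgt_total[e] += c

def phi_e_given_f(f, e):                 # relative frequency, source -> target
    return pair_count[(f, e)] / src_total[f]

def phi_f_given_e(f, e):                 # relative frequency, target -> source
    return pair_count[(f, e)] / tgt_total[e]

print(phi_e_given_f("bisher", "up till now"))   # 20 / 25 = 0.8
print(phi_f_given_e("bisher", "up till now"))   # 20 / 25 = 0.8
```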
2139
+
2140
+ 1:04:11.871 --> 1:04:20.419
2141
+ Then the next challenge is how can we combine
2142
+ these different types of probabilities into
2143
+
2144
+ 1:04:20.419 --> 1:04:23.458
2145
+ a global score saying how good?
2146
+
2147
+ 1:04:24.424 --> 1:04:36.259
2148
+ Model, but before we are doing that give any
2149
+ questions to this phrase extraction and phrase
2150
+
2151
+ 1:04:36.259 --> 1:04:37.546
2152
+ creation.
2153
+
2154
+ 1:04:40.260 --> 1:04:44.961
2155
+ And the motivation for that this was our initial
2156
+ model.
2157
+
2158
+ 1:04:44.961 --> 1:04:52.937
2159
+ If you remember from the beginning of a lecture
2160
+ we had the probability of P of F given E times
2161
+
2162
+ 1:04:52.937 --> 1:04:53.357
2163
+ P of E.
2164
+
2165
+ 1:04:55.155 --> 1:04:57.051
2166
+ Now the problem is here.
2167
+
2168
+ 1:04:57.051 --> 1:04:59.100
2169
+ That is, of course, right.
2170
+
2171
+ 1:04:59.100 --> 1:05:06.231
2172
+ However, we have done a lot of simplification
2173
+ that the translation probability is independent
2174
+
2175
+ 1:05:06.231 --> 1:05:08.204
2176
+ of the other translation.
2177
+
2178
+ 1:05:08.628 --> 1:05:14.609
2179
+ So therefore our estimations of pH give me
2180
+ and pH might not be right, and therefore the
2181
+
2182
+ 1:05:14.609 --> 1:05:16.784
2183
+ combination might not be right.
2184
+
2185
+ 1:05:17.317 --> 1:05:22.499
2186
+ So it can be that, for example, at the edge
2187
+ you have a fluent but not accurate translation.
2188
+
2189
+ 1:05:22.782 --> 1:05:25.909
2190
+ And Then There's Could Be an Easy Way Around
2191
+ It.
2192
+
2193
+ 1:05:26.126 --> 1:05:32.019
2194
+ If it is fluent but not accurate, it might
2195
+ be that we put too much effort on the language
2196
+
2197
+ 1:05:32.019 --> 1:05:36.341
2198
+ model and we are putting too few effort on
2199
+ the translation model.
2200
+
2201
+ 1:05:36.936 --> 1:05:43.016
2202
+ There we can wait a minute so we can do this
2203
+ a bit stronger.
2204
+
2205
+ 1:05:43.016 --> 1:05:46.305
2206
+ This one is more important than.
2207
+
2208
+ 1:05:48.528 --> 1:05:53.511
2209
+ And based on that we can extend this idea
2210
+ to the log-linear model.
2211
+
2212
+ 1:05:53.893 --> 1:06:02.164
2213
+ The log linear model now says all the translation
2214
+ probabilities is just we have.
2215
+
2216
+ 1:06:02.082 --> 1:06:09.230
2217
+ Describing how good this translation process
2218
+ is, these are the features H which depend on
2219
+
2220
+ 1:06:09.230 --> 1:06:09.468
2221
+ E.
2222
+
2223
+ 1:06:09.468 --> 1:06:09.706
2224
+ F.
2225
+
2226
+ 1:06:09.706 --> 1:06:13.280
2227
+ Only one of them, but generally depend on
2228
+ E.
2229
+
2230
+ 1:06:13.280 --> 1:06:13.518
2231
+ E.
2232
+
2233
+ 1:06:13.518 --> 1:06:13.757
2234
+ E.
2235
+
2236
+ 1:06:13.757 --> 1:06:13.995
2237
+ N.
2238
+
2239
+ 1:06:13.995 --> 1:06:14.233
2240
+ F.
2241
+
2242
+ 1:06:14.474 --> 1:06:22.393
2243
+ Each of these features has a weight saying
2244
+ yeah how good does it model it so that if you're
2245
+
2246
+ 1:06:22.393 --> 1:06:29.968
2247
+ asking a lot of people about some opinion it
2248
+ might also be waiting some opinion more so
2249
+
2250
+ 1:06:29.968 --> 1:06:34.100
2251
+ I put more effort on that and he may not be
2252
+ so.
2253
+
2254
+ 1:06:34.314 --> 1:06:39.239
2255
+ If you're saying that it's maybe a good indication,
2256
+ yeah, would trust that much.
2257
+
2258
+ 1:06:39.559 --> 1:06:41.380
2259
+ And exactly you can do that for you too.
2260
+
2261
+ 1:06:41.380 --> 1:06:42.446
2262
+ You can't add no below.
2263
+
2264
+ 1:06:43.423 --> 1:07:01.965
2265
+ It's like depending on how many you want to
2266
+ have and each of the features gives you value.
2267
+
2268
+ 1:07:02.102 --> 1:07:12.655
2269
+ The nice thing is that we can normally ignore
2270
+ because we are not interested in the probability
2271
+
2272
+ 1:07:12.655 --> 1:07:13.544
2273
+ itself.
2274
+
2275
+ 1:07:13.733 --> 1:07:18.640
2276
+ And again, if that's not normalized, that's
2277
+ fine.
2278
+
2279
+ 1:07:18.640 --> 1:07:23.841
2280
+ So if this value is the highest, that's the
2281
+ highest.
2282
+
2283
+ 1:07:26.987 --> 1:07:29.302
2284
+ Can we do that?
2285
+
2286
+ 1:07:29.302 --> 1:07:34.510
2287
+ Let's start with two simple things.
2288
+
2289
+ 1:07:34.510 --> 1:07:39.864
2290
+ Then you have one translation model.
2291
+
2292
+ 1:07:40.000 --> 1:07:43.102
2293
+ Which gives you the P of E given F.
2294
+
2295
+ 1:07:43.383 --> 1:07:49.203
2296
+ It can be typically as a feature it would
2297
+ take the logarithm of this probability, so minus
2298
+
2299
+ 1:07:49.203 --> 1:07:51.478
2300
+ nine point four seven.
2301
+
2302
+ 1:07:51.451 --> 1:07:57.846
2303
+ And the language model which says you how
2304
+ clue in the English side is how you can calculate
2305
+
2306
+ 1:07:57.846 --> 1:07:59.028
2307
+ the probability.
2308
+
2309
+ 1:07:58.979 --> 1:08:03.129
2310
+ In some future lectures we'll give you all
2311
+ superbology.
2312
+
2313
+ 1:08:03.129 --> 1:08:10.465
2314
+ You can feature again the luck of the purbology,
2315
+ then you have minus seven and then give different
2316
+
2317
+ 1:08:10.465 --> 1:08:11.725
2318
+ weights to them.
2319
+
2320
+ 1:08:12.292 --> 1:08:19.243
2321
+ And that means that your probability is one
2322
+ divided by Z to the power of this.
2323
+
2324
+ 1:08:20.840 --> 1:08:38.853
2325
+ You're not really interested in the probability,
2326
+ so you just calculate the score in the exponent.
2327
+
2328
+ 1:08:40.000 --> 1:08:41.668
2329
+ Maximal Maximal I Think.
2330
+
2331
+ 1:08:42.122 --> 1:08:57.445
2332
+ You can, for example, try different translations,
2333
+ calculate all their scores and take in the
2334
+
2335
+ 1:08:57.445 --> 1:09:00.905
2336
+ end the translation.
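A sketch of that selection step: every hypothesis has its feature values (here just a translation-model and a language-model log probability, numbers and weights invented), the score is the weighted sum, and the argmax does not need the normalization constant:

```python
weights = {"tm": 1.0, "lm": 0.6}                      # lambda_m, picked arbitrarily

hypotheses = {
    "I visit a friend":       {"tm": -9.5, "lm": -7.1},
    "I am visiting a friend": {"tm": -11.2, "lm": -6.4},
}

def score(h):
    return sum(weights[m] * value for m, value in h.items())

best = max(hypotheses, key=lambda text: score(hypotheses[text]))
print(best, score(hypotheses[best]))
```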
2337
+
2338
+ 1:09:03.423 --> 1:09:04.661
2339
+ Why to do that.
2340
+
2341
+ 1:09:05.986 --> 1:09:10.698
2342
+ We've done that now for two, but of course
2343
+ you cannot only do it with two.
2344
+
2345
+ 1:09:10.698 --> 1:09:16.352
2346
+ You can do it now with any fixed number, so
2347
+ of course you have to decide in the beginning
2348
+
2349
+ 1:09:16.352 --> 1:09:21.944
2350
+ I want to have ten features or something like
2351
+ that, but you can take all these features.
2352
+
2353
+ 1:09:22.002 --> 1:09:29.378
2354
+ And yeah, based on them, they calculate your
2355
+ model probability or the model score.
2356
+
2357
+ 1:09:31.031 --> 1:09:40.849
2358
+ A big advantage over the initial.
2359
+
2360
+ 1:09:40.580 --> 1:09:45.506
2361
+ A model because now we can add a lot of features
2362
+ and there was diamond machine translation,
2363
+
2364
+ 1:09:45.506 --> 1:09:47.380
2365
+ a statistical machine translation.
2366
+
2367
+ 1:09:47.647 --> 1:09:57.063
2368
+ So how can develop new features, new ways
2369
+ of evaluating them so that can hopefully better
2370
+
2371
+ 1:09:57.063 --> 1:10:00.725
2372
+ describe what is good translation?
2373
+
2374
+ 1:10:01.001 --> 1:10:16.916
2375
+ If you have a new great feature you can calculate
2376
+ these features and then how much better do
2377
+
2378
+ 1:10:16.916 --> 1:10:18.969
2379
+ they model?
2380
+
2381
+ 1:10:21.741 --> 1:10:27.903
2382
+ There is one challenge which haven't touched
2383
+ upon yet.
2384
+
2385
+ 1:10:27.903 --> 1:10:33.505
2386
+ So could you easily build your model if you
2387
+ have.
2388
+
2389
+ 1:10:38.999 --> 1:10:43.016
2390
+ Assumed here something which just gazed, but
2391
+ which might not be that easy.
2392
+
2393
+ 1:10:49.990 --> 1:10:56.333
2394
+ The weight for the translation model is and
2395
+ the weight for the language model is.
2396
+
2397
+ 1:10:56.716 --> 1:11:08.030
2398
+ That's a bit arbitrary, so why should you
2399
+ use this one and guess normally you won't be
2400
+
2401
+ 1:11:08.030 --> 1:11:11.801
2402
+ able to select that by hand?
2403
+
2404
+ 1:11:11.992 --> 1:11:19.123
2405
+ So typically we didn't have like or features
2406
+ in there, but features is very common.
2407
+
2408
+ 1:11:19.779 --> 1:11:21.711
2409
+ So how do you select them?
2410
+
2411
+ 1:11:21.711 --> 1:11:24.645
2412
+ There was a second part of the training.
2413
+
2414
+ 1:11:24.645 --> 1:11:27.507
2415
+ These models were trained in two steps.
2416
+
2417
+ 1:11:27.507 --> 1:11:32.302
2418
+ On the one hand, we had the training of the
2419
+ individual components.
2420
+
2421
+ 1:11:32.302 --> 1:11:38.169
2422
+ We saw that now how to build the phrase based
2423
+ system, how to extract the phrases.
2424
+
2425
+ 1:11:38.738 --> 1:11:46.223
2426
+ But then if you have these different components
2427
+ you need a second training to learn the optimal.
2428
+
2429
+ 1:11:46.926 --> 1:11:51.158
2430
+ And typically this is referred to as the tuning
2431
+ of the system.
2432
+
2433
+ 1:11:51.431 --> 1:12:07.030
2434
+ So now if you have different types of models
2435
+ describing what a good translation is you need
2436
+
2437
+ 1:12:07.030 --> 1:12:10.760
2438
+ to find good weights.
2439
+
2440
+ 1:12:12.312 --> 1:12:14.315
2441
+ So how can you do it?
2442
+
2443
+ 1:12:14.315 --> 1:12:20.871
2444
+ The easiest thing is, of course, you can just
2445
+ try different things out.
2446
+
2447
+ 1:12:21.121 --> 1:12:27.496
2448
+ You can then always select the best
2449
+ hypothesis.
2450
+
2451
+ 1:12:27.496 --> 1:12:38.089
2452
+ You can evaluate it with some metrics saying:
2453
+ You can score all your outputs, always select
2454
+
2455
+ 1:12:38.089 --> 1:12:42.543
2456
+ the best one and then get this translation.
2457
+
2458
+ 1:12:42.983 --> 1:12:45.930
2459
+ And you can do that for a lot of different
2460
+ possible combinations.
2461
+
2462
+ 1:12:47.067 --> 1:12:59.179
2463
+ However, the challenge is the complexity,
2464
+ so if you have only parameters and each of
2465
+
2466
+ 1:12:59.179 --> 1:13:04.166
2467
+ them has values you try for, then.
2468
+
2469
+ 1:13:04.804 --> 1:13:16.895
2470
+ We won't be able to try all of these possible
2471
+ combinations, so what we have to do is some
2472
+
2473
+ 1:13:16.895 --> 1:13:19.313
2474
+ more intelligent.
2475
+
2476
+ 1:13:20.540 --> 1:13:34.027
2477
+ And what has been done there in machine translation
2478
+ is referred to as a minimum error rate training.
2479
+
2480
+ 1:13:34.534 --> 1:13:41.743
2481
+ Whole surge is a very intuitive one, so have
2482
+ all these different parameters, so how do.
2483
+
2484
+ 1:13:42.522 --> 1:13:44.358
2485
+ And the idea is okay.
2486
+
2487
+ 1:13:44.358 --> 1:13:52.121
2488
+ I start with an initial guess and then I optimize
2489
+ one single parameter that's always easier.
2490
+
2491
+ 1:13:52.121 --> 1:13:54.041
2492
+ That's some or linear.
2493
+
2494
+ 1:13:54.041 --> 1:13:58.882
2495
+ So you're searching the best value for the
2496
+ one parameter.
2497
+
2498
+ 1:13:59.759 --> 1:14:04.130
2499
+ Often visualized with a San Francisco map.
2500
+
2501
+ 1:14:04.130 --> 1:14:13.786
2502
+ Just imagine if you want to go to the highest
2503
+ spot in San Francisco, you're standing somewhere
2504
+
2505
+ 1:14:13.786 --> 1:14:14.395
2506
+ here.
2507
+
2508
+ 1:14:14.574 --> 1:14:21.220
2509
+ You are switching your dimensions so you are
2510
+ going in this direction again finding.
2511
+
2512
+ 1:14:21.661 --> 1:14:33.804
2513
+ Now you're on a different street and this
2514
+ one is not a different one so you go in here
2515
+
2516
+ 1:14:33.804 --> 1:14:36.736
2517
+ so you can iterate.
2518
+
2519
+ 1:14:36.977 --> 1:14:56.368
2520
+ The one thing of course is find a local optimum,
2521
+ especially if you start in two different positions.
2522
+
2523
+ 1:14:56.536 --> 1:15:10.030
2524
+ So yeah, there is a heuristic in there, so
2525
+ typically it's done again if you land in different
2526
+
2527
+ 1:15:10.030 --> 1:15:16.059
2528
+ positions with different starting points.
2529
+
2530
+ 1:15:16.516 --> 1:15:29.585
2531
+ What is different or what is like the addition
2532
+ of minimum error rate training compared to the standard?
2533
+
2534
+ 1:15:29.729 --> 1:15:37.806
2535
+ So the question is, like we said, you can
2536
+ now evaluate different values for one parameter.
2537
+
2538
+ 1:15:38.918 --> 1:15:42.857
2539
+ And the question is: Which values should you
2540
+ try out for one parameters?
2541
+
2542
+ 1:15:42.857 --> 1:15:47.281
2543
+ Should you just do zero point one, zero point
2544
+ two, zero point three, or anything?
2545
+
2546
+ 1:15:49.029 --> 1:16:03.880
2547
+ If you change only one parameter then you
2548
+ can define the score of translation as a linear
2549
+
2550
+ 1:16:03.880 --> 1:16:05.530
2551
+ function.
2552
+
2553
+ 1:16:05.945 --> 1:16:17.258
2554
+ That this is the one that possesses, and yet
2555
+ if you change the parameter, the score of this.
2556
+
2557
+ 1:16:17.397 --> 1:16:26.506
2558
+ It may depend so your score is there because
2559
+ the rest you don't change your feature value.
2560
+
2561
+ 1:16:26.826 --> 1:16:30.100
2562
+ And the feature value is therefore the steepness
2563
+ of the curve.
2564
+
2565
+ 1:16:30.750 --> 1:16:38.887
2566
+ And now look at different possible translations.
2567
+
2568
+ 1:16:38.887 --> 1:16:46.692
2569
+ Therefore, how they go up here is differently.
2570
+
2571
+ 1:16:47.247 --> 1:16:59.289
2572
+ So in this case if you look at the minimum
2573
+ score so there should be as minimum.
2574
+
2575
+ 1:17:00.300 --> 1:17:10.642
2576
+ So it's enough to check once here and check
2577
+ once here because if you check here and here.
2578
+
2579
+ 1:17:11.111 --> 1:17:24.941
2580
+ And that is the idea in minimum error rate training
2581
+ when you select different hypotheses.
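A sketch of that observation: with one weight varying, every hypothesis' model score is a line, the best hypothesis can only change where two lines cross, so it is enough to probe one point per interval (all numbers invented; the real procedure would also evaluate the error metric in each interval):

```python
# Each hypothesis: (name, fixed_part, feature_value) so that score = a + b * lam.
hyps = [("hyp1", 2.0, -1.0), ("hyp2", 0.5, 0.5), ("hyp3", 1.0, 0.1)]

def best_at(lam):
    return max(hyps, key=lambda h: h[1] + h[2] * lam)[0]

# Intersection points of all pairs of lines: only there can the best one change.
crossings = sorted((a2 - a1) / (b1 - b2)
                   for i, (_, a1, b1) in enumerate(hyps)
                   for _, a2, b2 in hyps[i + 1:]
                   if b1 != b2)

probes = [crossings[0] - 1.0]
probes += [(x + y) / 2 for x, y in zip(crossings, crossings[1:])]
probes += [crossings[-1] + 1.0]
print([(round(lam, 2), best_at(lam)) for lam in probes])
```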
2582
+
2583
+ 1:17:29.309 --> 1:17:34.378
2584
+ So in yeah, the minimum error rate training
2585
+ is a Powell search.
2586
+
2587
+ 1:17:34.378 --> 1:17:37.453
2588
+ Then we do an intelligent step size.
2589
+
2590
+ 1:17:37.453 --> 1:17:39.364
2591
+ We do random restarts.
2592
+
2593
+ 1:17:39.364 --> 1:17:46.428
2594
+ Then things are still too slow because it
2595
+ might say we would have to decode a lot of
2596
+
2597
+ 1:17:46.428 --> 1:17:47.009
2598
+ times.
2599
+
2600
+ 1:17:46.987 --> 1:17:54.460
2601
+ So what we can do to make things even faster
2602
+ is we are decoding once with the current parameters,
2603
+
2604
+ 1:17:54.460 --> 1:18:01.248
2605
+ but then we are not generating only the most
2606
+ probable translation, but we are generating
2607
+
2608
+ 1:18:01.248 --> 1:18:05.061
2609
+ the most probable ten hundred translations
2610
+ or so.
2611
+
2612
+ 1:18:06.006 --> 1:18:18.338
2613
+ And then we are optimizing our weights by
2614
+ only looking at these one hundred translations
2615
+
2616
+ 1:18:18.338 --> 1:18:23.725
2617
+ and finding the optimal values there.
2618
+
2619
+ 1:18:24.564 --> 1:18:39.284
2620
+ Of course, it might be a problem that at some
2621
+ point you have now good ways to find good translations
2622
+
2623
+ 1:18:39.284 --> 1:18:42.928
2624
+ inside your n-best list.
2625
+
2626
+ 1:18:43.143 --> 1:18:52.357
2627
+ You have to iterate that sometime, but the
2628
+ important thing is you don't have to decode
2629
+
2630
+ 1:18:52.357 --> 1:18:56.382
2631
+ every time you need weights, but you.
2632
+
2633
+ 1:18:57.397 --> 1:19:11.325
2634
+ There is mainly a speed up process in order
2635
+ to make things more, make things even faster.
2636
+
2637
+ 1:19:15.515 --> 1:19:20.160
2638
+ Good Then We'll Finish With.
2639
+
2640
+ 1:19:20.440 --> 1:19:25.289
2641
+ Looking at how do you really calculate the
2642
+ scores and everything?
2643
+
2644
+ 1:19:25.289 --> 1:19:32.121
2645
+ Because what we did look into was a translation
2646
+ of a full sentence doesn't really consist of
2647
+
2648
+ 1:19:32.121 --> 1:19:37.190
2649
+ only one single phrase, but of course you have
2650
+ to combine different.
2651
+
2652
+ 1:19:37.637 --> 1:19:40.855
2653
+ So how does that now really look and how do
2654
+ we have to do?
2655
+
2656
+ 1:19:41.361 --> 1:19:48.252
2657
+ Just think again of the translation we have
2658
+ done before.
2659
+
2660
+ 1:19:48.252 --> 1:19:59.708
2661
+ The sentence must be: What is the probability
2662
+ of translating this one into what we saw after
2663
+
2664
+ 1:19:59.708 --> 1:20:00.301
2665
+ now?
2666
+
2667
+ 1:20:00.301 --> 1:20:03.501
2668
+ We're doing this by using.
2669
+
2670
+ 1:20:03.883 --> 1:20:07.157
2671
+ So we're having the phrase pair.
2672
+
2673
+ 1:20:07.157 --> 1:20:12.911
2674
+ Vasvia is the phrase pair up to now and gazine
2675
+ harm into.
2676
+
2677
+ 1:20:13.233 --> 1:20:18.970
2678
+ In addition, that is important because translation
2679
+ is not monotone.
2680
+
2681
+ 1:20:18.970 --> 1:20:26.311
2682
+ We are not putting phrase pairs in the same
2683
+ order as we are doing it on the source and
2684
+
2685
+ 1:20:26.311 --> 1:20:31.796
2686
+ on the target, but in order to generate the
2687
+ correct translation.
2688
+
2689
+ 1:20:31.771 --> 1:20:34.030
2690
+ So we have to shuffle the phrase pairs.
2691
+
2692
+ 1:20:34.294 --> 1:20:39.747
2693
+ And the blue one is in front on the source
2694
+ side but not in front on the target side.
2695
+
2696
+ 1:20:40.200 --> 1:20:49.709
2697
+ This reordering makes a statistic of the machine
2698
+ translation really complicated because if you
2699
+
2700
+ 1:20:49.709 --> 1:20:53.313
2701
+ would just monotonely do this then.
2702
+
2703
+ 1:20:53.593 --> 1:21:05.288
2704
+ The problem is if you would analyze all possible
2705
+ combinations of reshuffling them, then again.
2706
+
2707
+ 1:21:05.565 --> 1:21:11.508
2708
+ So you again have to use some type of heuristics
2709
+ which shuffle you allow and which you don't
2710
+
2711
+ 1:21:11.508 --> 1:21:11.955
2712
+ allow.
2713
+
2714
+ 1:21:12.472 --> 1:21:27.889
2715
+ That was relatively challenging since, for
2716
+ example, if you think of German you would
2717
+
2718
+ 1:21:27.889 --> 1:21:32.371
2719
+ have to allow very long.
2720
+
2721
+ 1:21:33.033 --> 1:21:52.218
2722
+ But if we have now this, how do we calculate
2723
+ the translation score so the translation score?
2724
+
2725
+ 1:21:52.432 --> 1:21:55.792
2726
+ That's why we sum up the scores at the end.
2727
+
2728
+ 1:21:56.036 --> 1:22:08.524
2729
+ So you said our first feature is the probability
2730
+ of the full sentence.
2731
+
2732
+ 1:22:08.588 --> 1:22:13.932
2733
+ So we say, the translation of each phrase
2734
+ pair is independent of each other, and then
2735
+
2736
+ 1:22:13.932 --> 1:22:19.959
2737
+ we can get the probability of the full sentence
2738
+ as the product of the probabilities of the individual
2739
+
2740
+ 1:22:19.959 --> 1:22:24.246
2741
+ phrase pairs, that is, the probability of each target
2742
+ phrase given its source phrase.
2743
+
2744
+ 1:22:24.664 --> 1:22:29.379
2745
+ Now we can use the laws of logarithms for the calculation.
2746
+
2747
+ 1:22:29.609 --> 1:22:36.563
2748
+ That is, we take the logarithm of the first probability.
2749
+
2750
+ 1:22:36.563 --> 1:22:48.153
2751
+ We'll get our first score, which says the
2752
+ translation model is minus.
2753
+
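A tiny illustration of the step just described: the product of (assumed) independent phrase-pair probabilities turns into a sum of log probabilities, which becomes the translation-model feature score. The probability values below are invented for illustration.

```python
import math

# Hypothetical phrase-pair probabilities for one segmentation of a sentence.
phrase_probs = [0.7, 0.4, 0.9]

# Product of independent phrase translations ...
p_sentence = math.prod(phrase_probs)

# ... equals the exponential of the sum of log probabilities;
# the sum of logs is the translation-model feature score used above.
tm_score = sum(math.log(p) for p in phrase_probs)

print(p_sentence, tm_score)   # ~0.252 and ~-1.38
```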
2754
+ 1:22:49.970 --> 1:22:56.586
2755
+ And that we're not doing only once, but we're
2756
+ exactly doing it with all our translation model.
2757
+
2758
+ 1:22:56.957 --> 1:23:03.705
2759
+ So we said we also have the relative frequency
2760
+ and the inverse directions of the.
2761
+
2762
+ 1:23:03.843 --> 1:23:06.226
2763
+ So in the end you'll have four scores.
2764
+
2765
+ 1:23:06.226 --> 1:23:09.097
2766
+ Here how you combine them is exactly the same.
2767
+
2768
+ 1:23:09.097 --> 1:23:12.824
2769
+ The only thing is how you look them up for
2770
+ each phrase pair.
2771
+
2772
+ 1:23:12.824 --> 1:23:18.139
2773
+ We have said in the beginning we are storing
2774
+ four scores describing how good they are.
2775
+
2776
+ 1:23:19.119 --> 1:23:25.415
2777
+ And these are then the four scores describing
2778
+ how probable the sentence is.
2779
+
2780
+ 1:23:27.427 --> 1:23:31.579
2781
+ Then we can have more scores.
2782
+
2783
+ 1:23:31.579 --> 1:23:37.806
2784
+ For example, we can have a distortion model.
2785
+
2786
+ 1:23:37.806 --> 1:23:41.820
2787
+ How much reordering is done?
2788
+
2789
+ 1:23:41.841 --> 1:23:47.322
2790
+ There were different types of them; we won't
2791
+ go into detail, but just imagine you have now a
2792
+
2793
+ 1:23:47.322 --> 1:23:47.748
2794
+ score.
2795
+
2796
+ 1:23:48.548 --> 1:23:56.651
2797
+ Then you have a language model which is the
2798
+ probability of the sequence "what we saw until now".
2799
+
2800
+ 1:23:56.651 --> 1:24:06.580
2801
+ How we generate this language model probability
2802
+ we will cover later. And there were even more scores.
2803
+
2804
+ 1:24:06.580 --> 1:24:11.841
2805
+ So one, for example, was a phrase count score,
2806
+ which just counts how many phrases you used.
2807
+
2808
+ 1:24:12.072 --> 1:24:19.555
2809
+ In order to learn whether it is better to have more
2810
+ short phrases or to bias towards having fewer
2811
+
2812
+ 1:24:19.555 --> 1:24:20.564
2813
+ and longer.
2814
+
2815
+ 1:24:20.940 --> 1:24:28.885
2816
+ You can easily add this by just counting, so the value
2817
+ here will be the number of phrases used, and such a count tells you
2818
+
2819
+ 1:24:28.885 --> 1:24:32.217
2820
+ typically how good it is to translate with that many phrases.
2821
+
2822
+ 1:24:32.932 --> 1:24:44.887
2823
+ For the language model, the probability normally
2824
+ gets smaller the longer the sequence is, so you need something
2825
+
2826
+ 1:24:44.887 --> 1:24:46.836
2827
+ to counteract that.
2828
+
2829
+ 1:24:47.827 --> 1:24:59.717
2830
+ And then you get your final score by multiplying
2831
+ each of the scores with the weights we had before.
2832
+
2833
+ 1:24:59.619 --> 1:25:07.339
2834
+ Optimization and that gives you a final score
2835
+ maybe of twenty three point seven eight five
2836
+
2837
+ 1:25:07.339 --> 1:25:13.278
2838
+ and then you can do that with several possible
2839
+ translation hypotheses and compare them.
2840
+
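A minimal sketch of the combination just described: each model contributes one (log-domain) feature score, the scores are weighted and summed, and the hypothesis with the best total wins. The feature names, values and placeholder weights below are illustrative assumptions, not the lecture's actual numbers.

```python
# Log-domain feature scores for one candidate translation (invented values).
features = {
    "tm_forward": -1.38,
    "tm_inverse": -2.10,
    "lex_forward": -3.25,
    "lex_inverse": -2.80,
    "language_model": -7.40,
    "distortion": -1.00,
    "phrase_count": 3.0,
    "word_count": 6.0,
}

# Placeholder weights; in practice these come from the n-best-list optimization.
weights = {name: 1.0 for name in features}

final_score = sum(weights[name] * value for name, value in features.items())
print(final_score)   # one number per hypothesis; the decoder keeps the best one
```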
2841
+ 1:25:14.114 --> 1:25:23.949
2842
+ One may be important point here is so the
2843
+ score not only depends on the target side but
2844
+
2845
+ 1:25:23.949 --> 1:25:32.444
2846
+ it also depends on which phrases you have used
2847
+ so you could have generated it in several ways.
2848
+
2849
+ 1:25:32.772 --> 1:25:38.076
2850
+ So you would have the same translation, but
2851
+ you would have a different split into phrases.
2852
+
2853
+ 1:25:38.979 --> 1:25:45.636
2854
+ And this was normally ignored so you would
2855
+ just look at all of them and then select the
2856
+
2857
+ 1:25:45.636 --> 1:25:52.672
2858
+ one which has the highest probability and ignore
2859
+ that this translation could be generated by
2860
+
2861
+ 1:25:52.672 --> 1:25:54.790
2862
+ several splits into phrases.
2863
+
2864
+ 1:25:57.497 --> 1:26:06.097
2865
+ So to summarize what we look into today and
2866
+ what you should hopefully remember is: Statistical
2867
+
2868
+ 1:26:06.097 --> 1:26:11.440
2869
+ models in how to generate machine translation
2870
+ output that were the word based statistical
2871
+
2872
+ 1:26:11.440 --> 1:26:11.915
2873
+ models.
2874
+
2875
+ 1:26:11.915 --> 1:26:16.962
2876
+ There were the IBM models at the beginning and
2877
+ then we have phrase-based MT where
2878
+
2879
+ 1:26:16.962 --> 1:26:22.601
2880
+ it's about building the translation by putting
2881
+ together these blocks of phrases and combining.
2882
+
2883
+ 1:26:23.283 --> 1:26:34.771
2884
+ If you have a model which has several features
2885
+ you can't do that with millions, but with a limited number of features.
2886
+
2887
+ 1:26:34.834 --> 1:26:42.007
2888
+ Then you can combine them with your log-linear
2889
+ model, which allows you to have your variable
2890
+
2891
+ 1:26:42.007 --> 1:26:45.186
2892
+ number of features and easily combine them.
2893
+
2894
+ 1:26:45.365 --> 1:26:47.920
2895
+ Yeah, how much can you trust each of these
2896
+ models?
2897
+
2898
+ 1:26:51.091 --> 1:26:54.584
2899
+ Do you have any further questions for this
2900
+ topic?
2901
+
2902
+ 1:26:58.378 --> 1:27:08.715
2903
+ And there will be on Tuesday a lecture by
2904
+ Tuan about evaluation, and then next Thursday
2905
+
2906
+ 1:27:08.715 --> 1:27:12.710
2907
+ there will be the practical part.
2908
+
2909
+ 1:27:12.993 --> 1:27:21.461
2910
+ So please come to the practical part here, but
2911
+ you can do something yourself if you are not
2912
+
2913
+ 1:27:21.461 --> 1:27:22.317
2914
+ able to.
2915
+
2916
+ 1:27:23.503 --> 1:27:26.848
2917
+ So then please tell us and we'll have to see
2918
+ how we find the difference in this.
2919
+
demo_data/lectures/Lecture-04-27.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8786f0bc34cf397879e95757fe367887c5f5d01d0f388aa98f768203cccc5269
3
+ size 116390723
demo_data/lectures/Lecture-05-02.05.2023/English.vtt ADDED
@@ -0,0 +1,1124 @@
1
+ WEBVTT
2
+
3
+ 0:00:56.957 --> 0:01:10.166
4
+ Today we are going to talk about evaluation,
5
+ like how you can tell how well your translation system works.
6
+
7
+ 0:01:11.251 --> 0:01:23.175
8
+ Today we're going to talk about first some
9
+ introduction about the difficulties and also
10
+
11
+ 0:01:23.175 --> 0:01:27.783
12
+ the dimensions of the evaluation.
13
+
14
+ 0:01:28.248 --> 0:01:32.315
15
+ And the second one is on automatic evaluation.
16
+
17
+ 0:01:32.315 --> 0:01:33.960
18
+ The second one is.
19
+
20
+ 0:01:33.893 --> 0:01:40.952
21
+ Would be less human effort costly, but it
22
+ probably is not really as perfect.
23
+
24
+ 0:01:42.702 --> 0:02:01.262
25
+ So on machine translation evaluation, so the
26
+ goal is to measure the quality of translation.
27
+
28
+ 0:02:03.003 --> 0:02:06.949
29
+ We need machine translation evaluation.
30
+
31
+ 0:02:06.949 --> 0:02:14.152
32
+ The first thing is for application scenarios
33
+ and whether it is reliable.
34
+
35
+ 0:02:14.674 --> 0:02:22.911
36
+ Second thing is to guide our research because
37
+ given symmetrics we will be able to find out
38
+
39
+ 0:02:22.911 --> 0:02:30.875
40
+ which improvement direction is valuable for
41
+ our machine translation system and the last
42
+
43
+ 0:02:30.875 --> 0:02:34.224
44
+ thing is for our system development.
45
+
46
+ 0:02:36.116 --> 0:02:42.926
47
+ So now we will come to some difficulties on
48
+ evaluation.
49
+
50
+ 0:02:42.926 --> 0:02:50.952
51
+ The first thing is ambiguity because usually
52
+ for one sentence it.
53
+
54
+ 0:02:51.431 --> 0:03:04.031
55
+ Here you can see that, for example, we have
56
+ the correct reference.
57
+
58
+ 0:03:05.325 --> 0:03:19.124
59
+ The second difficulty is that small changes
60
+ can be very important.
61
+
62
+ 0:03:20.060 --> 0:03:22.531
63
+ The first difficulty is subjective.
64
+
65
+ 0:03:23.123 --> 0:03:39.266
66
+ So it depends on each person's opinion whether
67
+ translation is correct.
68
+
69
+ 0:03:41.041 --> 0:03:49.393
70
+ The last is that evaluation sometimes is application
71
+ dependent.
72
+
73
+ 0:03:49.393 --> 0:03:54.745
74
+ We're not sure how good it's getting up.
75
+
76
+ 0:03:57.437 --> 0:04:04.502
77
+ The first dimension is human versus automatic
78
+ evaluation, which I definitely talked about
79
+
80
+ 0:04:04.502 --> 0:04:06.151
81
+ in the introduction.
82
+
83
+ 0:04:06.151 --> 0:04:13.373
84
+ The second thing is on granulity, so evaluation
85
+ could be on sentence level, document level,
86
+
87
+ 0:04:13.373 --> 0:04:14.472
88
+ or task base.
89
+
90
+ 0:04:15.375 --> 0:04:28.622
91
+ The last thing is whether the translation
92
+ is correct in order to capture the meaning.
93
+
94
+ 0:04:30.630 --> 0:04:33.769
95
+ So on the first dimensions, human verses are
96
+ automatic.
97
+
98
+ 0:04:34.334 --> 0:04:45.069
99
+ So human evaluation education is the goal
100
+ standard because in the end we give our machine
101
+
102
+ 0:04:45.069 --> 0:04:48.647
103
+ translation system to people.
104
+
105
+ 0:04:49.329 --> 0:04:55.040
106
+ And is also expensive and time consuming for
107
+ people to manually evaluate some systems.
108
+
109
+ 0:04:57.057 --> 0:05:05.575
110
+ For automatic evaluation, it is of course
111
+ tupper and faster, and it would use human reference.
112
+
113
+ 0:05:08.168 --> 0:05:16.971
114
+ The next dimension is on granulity.
115
+
116
+ 0:05:16.971 --> 0:05:25.529
117
+ The first level is sentence based.
118
+
119
+ 0:05:25.885 --> 0:05:33.003
120
+ But this is difficult because if you translate
121
+ a single sentence, it will be difficult to
122
+
123
+ 0:05:33.003 --> 0:05:35.454
124
+ tell whether this translation.
125
+
126
+ 0:05:37.537 --> 0:05:40.633
127
+ The second level is document based.
128
+
129
+ 0:05:40.633 --> 0:05:46.051
130
+ This should be the most commonly used in automatic
131
+ evaluation.
132
+
133
+ 0:05:46.286 --> 0:06:00.750
134
+ This should be like the final bowl of our
135
+ machine translation.
136
+
137
+ 0:06:01.061 --> 0:06:02.315
138
+ And slow in general.
139
+
140
+ 0:06:02.315 --> 0:06:07.753
141
+ We are not sure whether the arrows come from
142
+ the machine translation system itself or some
143
+
144
+ 0:06:07.753 --> 0:06:08.828
145
+ other components.
146
+
147
+ 0:06:11.431 --> 0:06:21.300
148
+ The next dimension is adequacy versus
149
+ fluency, so adequacy is whether the meaning is translated correctly.
150
+
151
+ 0:06:22.642 --> 0:06:25.384
152
+ Can see the example here.
153
+
154
+ 0:06:25.384 --> 0:06:32.237
155
+ In hypothesis different is everything now,
156
+ so basically it just.
157
+
158
+ 0:06:32.852 --> 0:06:36.520
159
+ But then you can see it's not fluent.
160
+
161
+ 0:06:36.520 --> 0:06:38.933
162
+ It sounds kind of weird.
163
+
164
+ 0:06:38.933 --> 0:06:41.442
165
+ Nothing is different now.
166
+
167
+ 0:06:41.442 --> 0:06:43.179
168
+ It sounds fluent.
169
+
170
+ 0:06:46.006 --> 0:06:50.650
171
+ Next we come to error analysis.
172
+
173
+ 0:06:50.650 --> 0:07:02.407
174
+ When we value the system and give a score
175
+ we want to have interpretable results.
176
+
177
+ 0:07:03.083 --> 0:07:07.930
178
+ So usually there would be some test suites first
179
+ in order to detect these errors.
180
+
181
+ 0:07:08.448 --> 0:07:21.077
182
+ And usually they would be like quite specific
183
+ to some specific type of error, for example
184
+
185
+ 0:07:21.077 --> 0:07:23.743
186
+ wrong translation.
187
+
188
+ 0:07:24.344 --> 0:07:32.127
189
+ Or morphological agreement, whether the
190
+ word form is correct.
191
+
192
+ 0:07:32.127 --> 0:07:35.031
193
+ If you have the article.
194
+
195
+ 0:07:37.577 --> 0:07:45.904
196
+ So now we come to human evaluation, which
197
+ is the final goal of machine translation.
198
+
199
+ 0:07:47.287 --> 0:07:50.287
200
+ So why do we perform human evaluation?
201
+
202
+ 0:07:51.011 --> 0:08:00.115
203
+ The first thing is that automatic machine
204
+ translation metrics are not sufficient.
205
+
206
+ 0:08:00.480 --> 0:08:06.725
207
+ Existing automated metrics and are sometimes
208
+ biased.
209
+
210
+ 0:08:06.725 --> 0:08:16.033
211
+ For example, the blue spar, but the blue scar
212
+ will usually try to look at the.
213
+
214
+ 0:08:16.496 --> 0:08:24.018
215
+ So it doesn't take into account some deeper
216
+ meaning like cares about word-to-word matching
217
+
218
+ 0:08:24.018 --> 0:08:26.829
219
+ instead of rephrasing or synonym.
220
+
221
+ 0:08:27.587 --> 0:08:34.881
222
+ And bias, as in that metrics like that would
223
+ usually depend a lot on the goal standard reference
224
+
225
+ 0:08:34.881 --> 0:08:41.948
226
+ given from some human, and that person could
227
+ have some specific type or language preferences,
228
+
229
+ 0:08:41.948 --> 0:08:43.979
230
+ and then the metric would.
231
+
232
+ 0:08:47.147 --> 0:08:55.422
233
+ The next thing is that automatic metrics don't
234
+ provide sufficient insights for error analysis.
235
+
236
+ 0:08:57.317 --> 0:09:04.096
237
+ Different types of errors would have different
238
+ implications depending on the underlying task.
239
+
240
+ 0:09:04.644 --> 0:09:09.895
241
+ So, for example, if you use machine translation
242
+ for information with you both,.
243
+
244
+ 0:09:10.470 --> 0:09:20.202
245
+ Then if it makes some error omitting some
246
+ words in translation then it would be very
247
+
248
+ 0:09:20.202 --> 0:09:20.775
249
+ bad.
250
+
251
+ 0:09:21.321 --> 0:09:30.305
252
+ Another example is if you use machine translation
253
+ in chat pop then fluency would be very important
254
+
255
+ 0:09:30.305 --> 0:09:50.253
256
+ because: And we also need human measure in
257
+ order to develop and assess automatic translation
258
+
259
+ 0:09:50.253 --> 0:09:52.324
260
+ evaluation.
261
+
262
+ 0:09:55.455 --> 0:10:01.872
263
+ Okay, so now we will come to the quality measures
264
+ of human evaluation.
265
+
266
+ 0:10:02.402 --> 0:10:05.165
267
+ The first thing is inter-annotator agreement.
268
+
269
+ 0:10:05.825 --> 0:10:25.985
270
+ This is agreement between different annotators.
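A small, self-contained illustration of measuring inter-annotator agreement with Cohen's kappa; the two label sequences below are invented, since the lecture's actual numbers are not recoverable from the transcript.

```python
from collections import Counter

annotator_a = ["good", "bad", "good", "good", "bad", "good"]
annotator_b = ["good", "bad", "bad", "good", "bad", "good"]

n = len(annotator_a)
observed = sum(a == b for a, b in zip(annotator_a, annotator_b)) / n

# Expected agreement by chance, from each annotator's label distribution.
count_a, count_b = Counter(annotator_a), Counter(annotator_b)
labels = set(annotator_a) | set(annotator_b)
expected = sum(count_a[l] * count_b[l] for l in labels) / n ** 2

kappa = (observed - expected) / (1 - expected)
print(observed, expected, kappa)   # 0.83, 0.5, 0.67 for this toy example
```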
271
+
272
+ 0:10:26.126 --> 0:10:31.496
273
+ So as you can see here, this would measure
274
+ the reliability of the other features.
275
+
276
+ 0:10:32.252 --> 0:10:49.440
277
+ And here we have an example of where the pace
278
+ car here is.
279
+
280
+ 0:10:49.849 --> 0:10:57.700
281
+ And this is in contrast to intra-annotator
282
+ agreement, so this is agreement within an annotator.
283
+
284
+ 0:10:58.118 --> 0:11:03.950
285
+ So instead of measuring reliability, here
286
+ it measures consistency of a single annotator.
287
+
288
+ 0:11:04.884 --> 0:11:07.027
289
+ And yep.
290
+
291
+ 0:11:07.027 --> 0:11:22.260
292
+ We also have an example here of the which
293
+ is so which is quite.
294
+
295
+ 0:11:23.263 --> 0:11:42.120
296
+ So now we will come to the main types of human
297
+ assessment: The first thing is direct assessment.
298
+
299
+ 0:11:42.842 --> 0:11:53.826
300
+ The second thing is human ranking of the translation
301
+ at sentence level.
302
+
303
+ 0:11:56.176 --> 0:12:11.087
304
+ So direct assessment given the source and
305
+ translation, and possibly the reference translation.
306
+
307
+ 0:12:12.612 --> 0:12:18.023
308
+ The goal here is to give the scores to evaluate
309
+ performance,adequacy and fluency.
310
+
311
+ 0:12:18.598 --> 0:12:23.619
312
+ The problem here is that we need normalization
313
+ across different judges, different human.
314
+
315
+ 0:12:24.604 --> 0:12:27.043
316
+ And here we have an example.
317
+
318
+ 0:12:27.043 --> 0:12:33.517
319
+ She was treated at the site by an emergency
320
+ doctor and taken to hospital by.
321
+
322
+ 0:12:34.334 --> 0:12:48.444
323
+ The hypothesis here is that she was treated
324
+ on site and emergency medical rescue workers
325
+
326
+ 0:12:48.444 --> 0:12:52.090
327
+ brought to a hospital.
328
+
329
+ 0:12:52.472 --> 0:12:56.267
330
+ Let's say five is best and one is worst.
331
+
332
+ 0:13:00.060 --> 0:13:04.716
333
+ I don't think it's hard because I think there
334
+ should be "brought to a hospital", right?
335
+
336
+ 0:13:05.905 --> 0:13:09.553
337
+ Yes, that is like a crucial error.
338
+
339
+ 0:13:09.553 --> 0:13:19.558
340
+ Yeah, I think I would agree because this sentence
341
+ somehow gives us the idea of what the meaning
342
+
343
+ 0:13:19.558 --> 0:13:21.642
344
+ of the sentence is.
345
+
346
+ 0:13:21.642 --> 0:13:24.768
347
+ But then it lost towards her.
348
+
349
+ 0:13:27.027 --> 0:13:29.298
350
+ The next time of human evaluation is ranking.
351
+
352
+ 0:13:30.810 --> 0:13:38.893
353
+ Which is a great different system according
354
+ to performance like which one is better.
355
+
356
+ 0:13:40.981 --> 0:13:43.914
357
+ So here now we have a second hypothesis.
358
+
359
+ 0:13:43.914 --> 0:13:49.280
360
+ She was hospitalized on the spot and taken
361
+ to hospital by ambulance crews.
362
+
363
+ 0:13:50.630 --> 0:14:01.608
364
+ As you can see here, the second hypothesis
365
+ seems to be more fluent, more smooth.
366
+
367
+ 0:14:01.608 --> 0:14:09.096
368
+ The meaning capture seems to be: So yeah,
369
+ it's difficult to compare different errors
370
+
371
+ 0:14:09.096 --> 0:14:11.143
372
+ in whether which error is more severe.
373
+
374
+ 0:14:13.373 --> 0:14:16.068
375
+ The next type of human evaluation is post
376
+ editing.
377
+
378
+ 0:14:17.817 --> 0:14:29.483
379
+ So we want to measure how much time and effort
380
+ human needs to spend in order to turn it into
381
+
382
+ 0:14:29.483 --> 0:14:32.117
383
+ correct translation.
384
+
385
+ 0:14:32.993 --> 0:14:47.905
386
+ So this area can be measured by time or key
387
+ shop.
388
+
389
+ 0:14:49.649 --> 0:14:52.889
390
+ And the last one is task based evaluation.
391
+
392
+ 0:14:52.889 --> 0:14:56.806
393
+ Here we would want to evaluate the complete
394
+ system.
395
+
396
+ 0:14:56.806 --> 0:15:03.436
397
+ But if you are using the lecture translator
398
+ and you see my lecture in German, the final
399
+
400
+ 0:15:03.436 --> 0:15:05.772
401
+ evaluation here would be like.
402
+
403
+ 0:15:05.772 --> 0:15:08.183
404
+ In the end, can you understand?
405
+
406
+ 0:15:09.769 --> 0:15:15.301
407
+ Their friendship here that we get the overall
408
+ performance, which is our final goal.
409
+
410
+ 0:15:16.816 --> 0:15:25.850
411
+ But the disadvantage here that it could be
412
+ complex and again if the spur is low it might
413
+
414
+ 0:15:25.850 --> 0:15:31.432
415
+ be other problems than the machine translation
416
+ itself.
417
+
418
+ 0:15:33.613 --> 0:15:42.941
419
+ So guess that was about the human evaluation
420
+ part any question so far.
421
+
422
+ 0:15:42.941 --> 0:15:44.255
423
+ Yes, and.
424
+
425
+ 0:16:00.000 --> 0:16:15.655
426
+ Then we will come to our magic matrix here
427
+ to access the quality of the machine translation
428
+
429
+ 0:16:15.655 --> 0:16:26.179
430
+ system by comparing: So the premise here is
431
+ that the more similar translation is to reference,
432
+
433
+ 0:16:26.179 --> 0:16:31.437
434
+ the better and we want some algorithms that
435
+ can approximate.
436
+
437
+ 0:16:34.114 --> 0:16:47.735
438
+ So the most famous measure could be the BLEU
439
+ score, the bilingual evaluation understudy.
440
+
441
+ 0:16:50.930 --> 0:16:56.358
442
+ So if we are given the goal that the more
443
+ similar translation is to the reference, the
444
+
445
+ 0:16:56.358 --> 0:17:01.785
446
+ better I think the most naive way would be
447
+ count the number of people sentenced to the
448
+
449
+ 0:17:01.785 --> 0:17:02.472
450
+ reference.
451
+
452
+ 0:17:02.472 --> 0:17:08.211
453
+ But as you can see, this would be very difficult
454
+ because sentence being exactly the same to
455
+
456
+ 0:17:08.211 --> 0:17:10.332
457
+ the reference would be very rare.
458
+
459
+ 0:17:11.831 --> 0:17:24.222
460
+ You can see the example here in the reference
461
+ and machine translation output.
462
+
463
+ 0:17:24.764 --> 0:17:31.930
464
+ So the idea here is that instead of comparing
465
+ the two whole sentences up, we consider the.
466
+
467
+ 0:17:35.255 --> 0:17:43.333
468
+ Now we can look at an example, so for the
469
+ BLEU score we consider one- up to four-grams.
470
+
471
+ 0:17:44.844 --> 0:17:52.611
472
+ The one ramp of a lap we would have back to
473
+ the future, not at premieres thirty years ago,
474
+
475
+ 0:17:52.611 --> 0:17:59.524
476
+ so it should be like one, two, three, four,
477
+ five, six, seven, eight, so like it.
478
+
479
+ 0:17:59.459 --> 0:18:01.476
480
+ One ram is overlap to the reverence.
481
+
482
+ 0:18:01.921 --> 0:18:03.366
483
+ So you should be over.
484
+
485
+ 0:18:06.666 --> 0:18:08.994
486
+ Is kind of the same.
487
+
488
+ 0:18:08.994 --> 0:18:18.529
489
+ Instead of considering only the word back
490
+ for three, one is to be back to the future.
491
+
492
+ 0:18:19.439 --> 0:18:31.360
493
+ So that is basically the idea of the BLEU
494
+ score, and in the end we calculate the geometric mean.
495
+
496
+ 0:18:32.812 --> 0:18:39.745
497
+ So as you can see here, when we look at the
498
+ A brand overlap you can only look at the machine
499
+
500
+ 0:18:39.745 --> 0:18:40.715
501
+ translation.
502
+
503
+ 0:18:41.041 --> 0:18:55.181
504
+ We only care about how many words in the machine
505
+ translation output appear.
506
+
507
+ 0:18:55.455 --> 0:19:02.370
508
+ So this metric is kind of like a precision
509
+ based and not really recall based.
510
+
511
+ 0:19:04.224 --> 0:19:08.112
512
+ So this would lead to a problem like the example
513
+ here.
514
+
515
+ 0:19:08.112 --> 0:19:14.828
516
+ The reference is back to the future of Premier
517
+ 30 years ago and the machine translation output
518
+
519
+ 0:19:14.828 --> 0:19:16.807
520
+ is only back to the future.
521
+
522
+ 0:19:17.557 --> 0:19:28.722
523
+ The one grab overlap will be formed because
524
+ you can see back to the future is overlap entirely
525
+
526
+ 0:19:28.722 --> 0:19:30.367
527
+ in reference.
528
+
529
+ 0:19:31.231 --> 0:19:38.314
530
+ Is not right because one is the perfect score,
531
+ but this is obviously not a good translation.
532
+
533
+ 0:19:40.120 --> 0:19:47.160
534
+ So in order to tackle this they use something
535
+ called the brevity penalty.
536
+
537
+ 0:19:47.988 --> 0:19:59.910
538
+ So it should be a factor that is multiplied
539
+ to the geometric mean.
540
+
541
+ 0:19:59.910 --> 0:20:04.820
542
+ This form is the length of.
543
+
544
+ 0:20:05.525 --> 0:20:19.901
545
+ So the penalty is e to the power of one minus
546
+ the reference length over the output length.
547
+
548
+ 0:20:21.321 --> 0:20:32.298
549
+ Which is lower than, and if we apply this
550
+ to the example, the BLEU score is going to be
551
+
552
+ 0:20:32.298 --> 0:20:36.462
553
+ which is not a good translation.
554
+
555
+ 0:20:38.999 --> 0:20:42.152
556
+ Yep so any question of this place.
557
+
558
+ 0:20:44.064 --> 0:21:00.947
559
+ Yes exactly that should be a problem as well,
560
+ and it will be mentioned later on.
561
+
562
+ 0:21:00.947 --> 0:21:01.990
563
+ But.
564
+
565
+ 0:21:03.203 --> 0:21:08.239
566
+ Is very sensitive to zero score like that,
567
+ so that is why we usually don't use the blue
568
+
569
+ 0:21:08.239 --> 0:21:13.103
570
+ score sentence level because sentence can be
571
+ short and then there can be no overlap.
572
+
573
+ 0:21:13.103 --> 0:21:16.709
574
+ That is why we usually use it on documents
575
+ as you can imagine.
576
+
577
+ 0:21:16.709 --> 0:21:20.657
578
+ Documents are very long and very little chance
579
+ to have zero overlap.
580
+
581
+ 0:21:23.363 --> 0:21:28.531
582
+ Yeah okay, so the next thing on the BLEU
583
+ score is clipping.
584
+
585
+ 0:21:29.809 --> 0:21:42.925
586
+ So you can see here we have two references,
587
+ the new movie and the new film, and we have
588
+
589
+ 0:21:42.925 --> 0:21:47.396
590
+ a machine translation output.
591
+
592
+ 0:21:47.807 --> 0:21:54.735
593
+ Because the here is also in the reference,
594
+ so yeah two or two books is one, which is:
595
+
596
+ 0:21:56.236 --> 0:22:02.085
597
+ So but then this is not what we want because
598
+ this is just repeating something that appears.
599
+
600
+ 0:22:02.702 --> 0:22:06.058
601
+ So that's why we use clipping.
602
+
603
+ 0:22:06.058 --> 0:22:15.368
604
+ Clipping here is that we consider the mask
605
+ counts in any reference, so as you can see
606
+
607
+ 0:22:15.368 --> 0:22:17.425
608
+ here in reference.
609
+
610
+ 0:22:18.098 --> 0:22:28.833
611
+ So here when we do clipping we will just use
612
+ the maximum opponents in the references.
613
+
614
+ 0:22:29.809 --> 0:22:38.717
615
+ Yeah, just to avoid counting repeated
616
+ words in the translation multiple times.
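A simplified sketch of the BLEU ideas covered in the cues above: clipped n-gram precisions, their geometric mean, and the brevity penalty. This is an illustrative single-sentence, single-reference version, not the exact implementation of standard BLEU tools.

```python
import math
from collections import Counter

def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def clipped_precision(hyp, ref, n):
    hyp_counts = Counter(ngrams(hyp, n))
    ref_counts = Counter(ngrams(ref, n))
    # Clip each hypothesis n-gram count by its maximum count in the reference.
    matched = sum(min(c, ref_counts[g]) for g, c in hyp_counts.items())
    return matched / max(sum(hyp_counts.values()), 1)

def bleu(hyp, ref, max_n=4):
    precisions = [clipped_precision(hyp, ref, n) for n in range(1, max_n + 1)]
    if min(precisions) == 0:      # why scores are usually aggregated over documents
        return 0.0
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_n)
    bp = min(1.0, math.exp(1 - len(ref) / len(hyp)))   # brevity penalty
    return bp * geo_mean

hyp = "back to the future".split()
ref = "back to the future premieres thirty years ago".split()
print(bleu(hyp, ref))   # low despite perfect n-gram precision, due to the penalty
```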
617
+
618
+ 0:22:41.641 --> 0:23:00.599
619
+ It could happen that there is no overlap between
620
+ the machine translation output and reference.
621
+
622
+ 0:23:00.500 --> 0:23:01.917
623
+ Then Everything Is Going To Go To Zero.
624
+
625
+ 0:23:02.402 --> 0:23:07.876
626
+ So that's why for blow score we usually use
627
+ Japanese level score where we arrogate the
628
+
629
+ 0:23:07.876 --> 0:23:08.631
630
+ statistics.
631
+
632
+ 0:23:12.092 --> 0:23:18.589
633
+ Some summary about BLEU: as you can see,
634
+ it matches exact words.
635
+
636
+ 0:23:18.589 --> 0:23:31.751
637
+ It can take several references: It measured
638
+ a depotency by the word precision and if measured
639
+
640
+ 0:23:31.751 --> 0:23:36.656
641
+ the fluency by the gram precision.
642
+
643
+ 0:23:37.437 --> 0:23:47.254
644
+ And as mentioned, it doesn't consider how
645
+ much meaning that is captured in the machine
646
+
647
+ 0:23:47.254 --> 0:23:48.721
648
+ translation.
649
+
650
+ 0:23:49.589 --> 0:23:53.538
651
+ So here they use the brevity penalty to prevent
652
+ short sentences.
653
+
654
+ 0:23:54.654 --> 0:24:04.395
655
+ Will get the spot over the last test set to
656
+ avoid the zero issues.
657
+
658
+ 0:24:04.395 --> 0:24:07.012
659
+ As we mentioned,.
660
+
661
+ 0:24:09.829 --> 0:24:22.387
662
+ Yes, that's mentioned with multiple reference
663
+ translation simultaneously, and it's a precision
664
+
665
+ 0:24:22.387 --> 0:24:24.238
666
+ based matrix.
667
+
668
+ 0:24:24.238 --> 0:24:27.939
669
+ So we are not sure if this.
670
+
671
+ 0:24:29.689 --> 0:24:37.423
672
+ The second thing is that BLEU compensates
673
+ for recall by the brevity penalty, and we
674
+
675
+ 0:24:37.423 --> 0:24:38.667
676
+ are not sure.
677
+
678
+ 0:24:39.659 --> 0:24:50.902
679
+ Matches, so can still improve the similarity
680
+ measure and improve the correlation score to
681
+
682
+ 0:24:50.902 --> 0:24:51.776
683
+ human.
684
+
685
+ 0:24:52.832 --> 0:25:01.673
686
+ The next is that all work will have the same
687
+ importance.
688
+
689
+ 0:25:01.673 --> 0:25:07.101
690
+ What about a scheme for weighting words?
691
+
692
+ 0:25:11.571 --> 0:25:26.862
693
+ And the last witness is that blows for high
694
+ grade order engrams that can confluency dramatically.
695
+
696
+ 0:25:27.547 --> 0:25:32.101
697
+ So the pressure is that can be accounted for
698
+ fluency, and grammatically there's some other.
699
+
700
+ 0:25:35.956 --> 0:25:47.257
701
+ We have some further issues and not created
702
+ equally so we can use stemming or knowledge
703
+
704
+ 0:25:47.257 --> 0:25:48.156
705
+ space.
706
+
707
+ 0:25:50.730 --> 0:26:00.576
708
+ The next way we incorporate information is
709
+ within the metrics.
710
+
711
+ 0:26:01.101 --> 0:26:07.101
712
+ And can be used like a stop list to like somehow
713
+ ignore the non-important words.
714
+
715
+ 0:26:08.688 --> 0:26:12.687
716
+ Text normalization spelling conjugation lower
717
+ case and mix case.
718
+
719
+ 0:26:12.687 --> 0:26:18.592
720
+ The next thing is that for some language like
721
+ Chinese there can be different world segmentation
722
+
723
+ 0:26:18.592 --> 0:26:23.944
724
+ so exact word matching might no longer be a
725
+ good idea so maybe it's ready to cover the
726
+
727
+ 0:26:23.944 --> 0:26:27.388
728
+ score as the character level instead of the
729
+ word level.
730
+
731
+ 0:26:29.209 --> 0:26:33.794
732
+ And the last thing is speech translation.
733
+
734
+ 0:26:33.794 --> 0:26:38.707
735
+ Usually input from speech translation would.
736
+
737
+ 0:26:38.979 --> 0:26:51.399
738
+ And there should be some way to segment into
739
+ sentences so that we can calculate the score
740
+
741
+ 0:26:51.399 --> 0:26:52.090
742
+ and.
743
+
744
+ 0:26:52.953 --> 0:27:01.326
745
+ And the way to solve this is to use some tools
746
+ like mWER segmentation to align the output
747
+
748
+ 0:27:01.326 --> 0:27:01.896
749
+ with.
750
+
751
+ 0:27:06.306 --> 0:27:10.274
752
+ Yes, so I guess that was all about the BLEU
753
+ score any question.
754
+
755
+ 0:27:14.274 --> 0:27:28.292
756
+ Again on automatic metrics we'll talk about
757
+ probably good metrics, strange automatic metrics,
758
+
759
+ 0:27:28.292 --> 0:27:32.021
760
+ use cases on evaluation.
761
+
762
+ 0:27:34.374 --> 0:27:44.763
763
+ How to measure the performance of the matrix,
764
+ so a good matrix would be a.
765
+
766
+ 0:27:49.949 --> 0:28:04.905
767
+ We would want the matrix to be interpretable
768
+ if this is the ranking from a human that somehow
769
+
770
+ 0:28:04.905 --> 0:28:08.247
771
+ can rank the system.
772
+
773
+ 0:28:12.132 --> 0:28:15.819
774
+ We would also want the evaluation metric to
775
+ be sensitive.
776
+
777
+ 0:28:15.819 --> 0:28:21.732
778
+ Like small differences in the machine translation
779
+ can be distinguished, we would not need to
780
+
781
+ 0:28:21.732 --> 0:28:22.686
782
+ be consistent.
783
+
784
+ 0:28:22.686 --> 0:28:28.472
785
+ Like if the same machine translation system
786
+ is used on a similar text, it should reproduce
787
+
788
+ 0:28:28.472 --> 0:28:29.553
789
+ a similar score.
790
+
791
+ 0:28:31.972 --> 0:28:40.050
792
+ Next, we would want the machine translation
793
+ system to be reliable.
794
+
795
+ 0:28:40.050 --> 0:28:42.583
796
+ Machine translation.
797
+
798
+ 0:28:43.223 --> 0:28:52.143
799
+ We want the matrix to be easy to run in general
800
+ and can be applied to multiple different machine.
801
+
802
+ 0:28:55.035 --> 0:29:11.148
803
+ The difficulty of evaluating the metric itself
804
+ is kind of similar to when you evaluate the
805
+
806
+ 0:29:11.148 --> 0:29:13.450
807
+ translation.
808
+
809
+ 0:29:18.638 --> 0:29:23.813
810
+ And here is some components of the automatic
811
+ machine translation matrix.
812
+
813
+ 0:29:23.813 --> 0:29:28.420
814
+ So for the matching matrix the component would
815
+ be the precision.
816
+
817
+ 0:29:28.420 --> 0:29:30.689
818
+ Recall our Levinstein distance.
819
+
820
+ 0:29:30.689 --> 0:29:35.225
821
+ So for the blow sparks you have seen it cares
822
+ mostly about the.
823
+
824
+ 0:29:36.396 --> 0:29:45.613
825
+ And on the features it would be about how
826
+ to measure the matches or character based.
827
+
828
+ 0:29:48.588 --> 0:30:01.304
829
+ Now we will talk about more matrix because
830
+ the blue score is the most common.
831
+
832
+ 0:30:02.082 --> 0:30:10.863
833
+ So it compared the reference and hypothesis
834
+ using edit operations.
835
+
836
+ 0:30:10.863 --> 0:30:14.925
837
+ They count how many insertions, deletions, and substitutions.
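A small sketch of the edit-operation idea mentioned here (as in WER/TER-style metrics): the minimum number of insertions, deletions and substitutions needed to turn the hypothesis into the reference. The example phrases are only loosely adapted from the earlier hospital example, and reordering shifts are not handled.

```python
def edit_distance(hyp, ref):
    d = [[0] * (len(ref) + 1) for _ in range(len(hyp) + 1)]
    for i in range(len(hyp) + 1):
        d[i][0] = i                     # delete everything
    for j in range(len(ref) + 1):
        d[0][j] = j                     # insert everything
    for i in range(1, len(hyp) + 1):
        for j in range(1, len(ref) + 1):
            sub = 0 if hyp[i - 1] == ref[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + sub)    # substitution / match
    return d[-1][-1]

hyp = "she was treated on site".split()
ref = "she was treated at the site".split()
print(edit_distance(hyp, ref))   # number of edits, often normalized by len(ref)
```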
838
+
839
+ 0:30:23.143 --> 0:30:31.968
840
+ We already talked about it beyond what matching
841
+ would care about character based mathematization
842
+
843
+ 0:30:31.968 --> 0:30:34.425
844
+ or linguistic information.
845
+
846
+ 0:30:36.636 --> 0:30:41.502
847
+ The next metric is the meteor metric.
848
+
849
+ 0:30:41.502 --> 0:30:50.978
850
+ This is strong called metric for evaluation
851
+ of translation with explicit.
852
+
853
+ 0:30:51.331 --> 0:31:03.236
854
+ So merely their new idea is that they reintroduce
855
+ recall and combine it with precision as
856
+
857
+ 0:31:03.236 --> 0:31:04.772
858
+ components.
859
+
860
+ 0:31:05.986 --> 0:31:16.700
861
+ The language translation output with each
862
+ reference individually and takes part of the
863
+
864
+ 0:31:16.700 --> 0:31:18.301
865
+ best parent.
866
+
867
+ 0:31:20.940 --> 0:31:27.330
868
+ The next thing is that matching takes into
869
+ counterfection variation by stepping, so it's
870
+
871
+ 0:31:27.330 --> 0:31:28.119
872
+ no longer.
873
+
874
+ 0:31:30.230 --> 0:31:40.165
875
+ When they address fluency, they're a direct
876
+ penalty instead of ink arms so they would care
877
+
878
+ 0:31:40.165 --> 0:31:40.929
879
+ about.
880
+
881
+ 0:31:45.925 --> 0:31:56.287
882
+ The next thing is on two noble metrics, so
883
+ for this metric we want to extract some features.
884
+
885
+ 0:31:56.936 --> 0:32:04.450
886
+ So for example here the nice house is on the
887
+ right and the building is on the right side
888
+
889
+ 0:32:04.450 --> 0:32:12.216
890
+ so we will have to extract some pictures like
891
+ for example here the reference and hypothesis
892
+
893
+ 0:32:12.216 --> 0:32:14.158
894
+ have hypers in common.
895
+
896
+ 0:32:14.714 --> 0:32:19.163
897
+ They have one insertion, two deletions, and
898
+ they have the same verb.
899
+
900
+ 0:32:21.141 --> 0:32:31.530
901
+ So the idea is to use machine translation
902
+ techniques to combine features and this machine
903
+
904
+ 0:32:31.530 --> 0:32:37.532
905
+ translation model will be trained on human
906
+ ranking.
907
+
908
+ 0:32:39.819 --> 0:32:44.788
909
+ A common framework for this is COMET.
910
+
911
+ 0:32:44.684 --> 0:32:48.094
912
+ Which is a narrow model that is used with
913
+ X for.
914
+
915
+ 0:32:48.094 --> 0:32:54.149
916
+ The feature would be created using some prejutant
917
+ model like X, L, M, U, R, A, BO, DA.
918
+
919
+ 0:32:54.149 --> 0:33:00.622
920
+ Here the input would be the source, the reference
921
+ and the hypothesis and then they would try
922
+
923
+ 0:33:00.622 --> 0:33:02.431
924
+ to produce an assessment.
925
+
926
+ 0:33:03.583 --> 0:33:05.428
927
+ Yeah, it's strange to predict human sport.
928
+
929
+ 0:33:06.346 --> 0:33:19.131
930
+ And they also have some additional versions,
931
+ as we train this model in order to tell whether
932
+
933
+ 0:33:19.131 --> 0:33:20.918
934
+ translation.
935
+
936
+ 0:33:21.221 --> 0:33:29.724
937
+ So instead of checking the source and the
938
+ hypothesis as input, they could take only the
939
+
940
+ 0:33:29.724 --> 0:33:38.034
941
+ source and the hypotheses as input and try
942
+ to predict the quality of the translation.
943
+
944
+ 0:33:42.562 --> 0:33:49.836
945
+ So assumptions before machine translation
946
+ systems are often used in larger systems.
947
+
948
+ 0:33:50.430 --> 0:33:57.713
949
+ So the question is how to evaluate the performance
950
+ of the machine translation system in this larger
951
+
952
+ 0:33:57.713 --> 0:34:04.997
953
+ scenario, and an example would be speech translation
954
+ system when you try to translate English audio
955
+
956
+ 0:34:04.997 --> 0:34:05.798
957
+ to German.
958
+
959
+ 0:34:06.506 --> 0:34:13.605
960
+ Then it would usually have two opponents,
961
+ ASR and MT, where ASR is like speech recognition
962
+
963
+ 0:34:13.605 --> 0:34:20.626
964
+ that can describe English audio to English
965
+ text, and then we have the machine translation
966
+
967
+ 0:34:20.626 --> 0:34:24.682
968
+ system that translates English text to German
969
+ text.
970
+
971
+ 0:34:26.967 --> 0:34:33.339
972
+ So in order to have these overall performances
973
+ in this bigger scenario, they are so willing
974
+
975
+ 0:34:33.339 --> 0:34:34.447
976
+ to evaluate it.
977
+
978
+ 0:34:34.447 --> 0:34:41.236
979
+ So the first one is to evaluate the individual
980
+ components like how good is the speech recognizer,
981
+
982
+ 0:34:41.236 --> 0:34:46.916
983
+ how good is the analyzed and generalization
984
+ engines, how good is the synthesizer.
985
+
986
+ 0:34:47.727 --> 0:34:56.905
987
+ The second way is to evaluate translation
988
+ quality from speech input to text output.
989
+
990
+ 0:34:56.905 --> 0:35:00.729
991
+ How good is the final translation?
992
+
993
+ 0:35:02.102 --> 0:35:10.042
994
+ The next thing is to measure the to evaluate
995
+ the architecture effectiveness like: How is
996
+
997
+ 0:35:10.042 --> 0:35:12.325
998
+ the level effects in general?
999
+
1000
+ 0:35:12.325 --> 0:35:19.252
1001
+ The next one is task based evaluation or use
1002
+ a study like we just simply ask the user what
1003
+
1004
+ 0:35:19.252 --> 0:35:24.960
1005
+ is their experience like whether the system
1006
+ works well and how well it is.
1007
+
1008
+ 0:35:27.267 --> 0:35:32.646
1009
+ So here we have an example of the ITF shale
1010
+ test result.
1011
+
1012
+ 0:35:33.153 --> 0:35:38.911
1013
+ So the first block would be the human evaluation
1014
+ like I think they are asked to give a spawl
1015
+
1016
+ 0:35:38.911 --> 0:35:44.917
1017
+ from one to five again where a fight is best
1018
+ and one is worst and the lower one is the blowscore
1019
+
1020
+ 0:35:44.917 --> 0:35:50.490
1021
+ and they find out that the human evaluation
1022
+ is far actually correlated with the blowsfall
1023
+
1024
+ 0:35:50.490 --> 0:35:51.233
1025
+ quite well.
1026
+
1027
+ 0:35:53.193 --> 0:36:02.743
1028
+ Here you can also see that the systems from
1029
+ our university are actually on top many sub-tasts.
1030
+
1031
+ 0:36:05.605 --> 0:36:07.429
1032
+ So Yeah.
1033
+
1034
+ 0:36:08.868 --> 0:36:14.401
1035
+ For this lecture is that machine translation
1036
+ evaluation is difficult.
1037
+
1038
+ 0:36:14.401 --> 0:36:21.671
1039
+ We talk about human versus automatic evaluation
1040
+ that human would be costly, but then is the
1041
+
1042
+ 0:36:21.671 --> 0:36:27.046
1043
+ goal standard automatic evaluation would be
1044
+ a fast and cheaper way.
1045
+
1046
+ 0:36:27.547 --> 0:36:36.441
1047
+ We talk about granulity on sentence level,
1048
+ document level or task level evaluation machine
1049
+
1050
+ 0:36:36.441 --> 0:36:38.395
1051
+ translation system.
1052
+
1053
+ 0:36:39.679 --> 0:36:51.977
1054
+ And we talked about human evaluation versus
1055
+ automatic metrics in details.
1056
+
1057
+ 0:36:54.034 --> 0:36:59.840
1058
+ So we introduced a lot of metric metrics.
1059
+
1060
+ 0:36:59.840 --> 0:37:10.348
1061
+ How do they compare from the quadrating of
1062
+ human assessment so it's better?
1063
+
1064
+ 0:37:12.052 --> 0:37:16.294
1065
+ I don't have the exact score and reference
1066
+ in my head.
1067
+
1068
+ 0:37:16.294 --> 0:37:22.928
1069
+ I would assume that mediators should have
1070
+ a better correlation because here they also
1071
+
1072
+ 0:37:22.928 --> 0:37:30.025
1073
+ consider other aspects like the recall whether
1074
+ the information in the reference is captured
1075
+
1076
+ 0:37:30.025 --> 0:37:31.568
1077
+ in the translation.
1078
+
1079
+ 0:37:32.872 --> 0:37:41.875
1080
+ Like synonyms, so I would assume that mid
1081
+ air is better, but again don't have the reference
1082
+
1083
+ 0:37:41.875 --> 0:37:43.441
1084
+ in my hair, so.
1085
+
1086
+ 0:37:43.903 --> 0:37:49.771
1087
+ But guess the reason people are still using
1088
+ BlueScore is that in most literature, a machine
1089
+
1090
+ 0:37:49.771 --> 0:38:00.823
1091
+ translation system, they report: So now you
1092
+ create a new machine translation system.
1093
+
1094
+ 0:38:00.823 --> 0:38:07.990
1095
+ It might be better to also report the blow.
1096
+
1097
+ 0:38:08.228 --> 0:38:11.472
1098
+ Exactly just slice good, just spread white,
1099
+ and then we're going to go ahead.
1100
+
1101
+ 0:38:12.332 --> 0:38:14.745
1102
+ And don't know what you're doing.
1103
+
1104
+ 0:38:17.457 --> 0:38:18.907
1105
+ I Want to Talk Quickly About.
1106
+
1107
+ 0:38:19.059 --> 0:38:32.902
1108
+ So it is like a language model, so it's kind
1109
+ of the same uses as.
1110
+
1111
+ 0:38:33.053 --> 0:38:39.343
1112
+ So the idea is that we have this layer in
1113
+ order to embed the sauce and the reference
1114
+
1115
+ 0:38:39.343 --> 0:38:39.713
1116
+ and.
1117
+
1118
+ 0:38:40.000 --> 0:38:54.199
1119
+ Into some feature vectors that we can later
1120
+ on use to predict the human sport in the.
1121
+
1122
+ 0:38:58.618 --> 0:39:00.051
1123
+ It If There's Nothing Else.
1124
+
demo_data/lectures/Lecture-05-02.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5014f3570b8db38818ab44ed117dc6d67206c5163b6b87b45df4a2aa426b8222
3
+ size 314238982
demo_data/lectures/Lecture-06-09.05.2023/English.vtt ADDED
@@ -0,0 +1,2970 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.721 --> 0:00:08.584
4
+ Hey, then welcome to today's lecture on language
5
+ modeling.
6
+
7
+ 0:00:09.409 --> 0:00:21.608
8
+ We had now a different view on machine translation,
9
+ which was the evaluation part; it's important
10
+
11
+ 0:00:21.608 --> 0:00:24.249
12
+ to evaluate and see.
13
+
14
+ 0:00:24.664 --> 0:00:33.186
15
+ We want to continue with building the MT system
16
+ and this will be the last part before we are
17
+
18
+ 0:00:33.186 --> 0:00:36.668
19
+ going into a neural step on Thursday.
20
+
21
+ 0:00:37.017 --> 0:00:45.478
22
+ So we had the broader view on statistical
23
+ machine translation and the.
24
+
25
+ 0:00:45.385 --> 0:00:52.977
26
+ Thursday: A week ago we talked about the statistical
27
+ machine translation and mainly the translation
28
+
29
+ 0:00:52.977 --> 0:00:59.355
30
+ model, so how we model how probable is it that
31
+ one word is translated into another.
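As a formula, the noisy-channel view that connects this translation model with the language model introduced below (standard notation, assumed here rather than quoted from the slides):

```latex
% Pick the target sentence y for source x that maximizes
% the translation model times the language model.
\hat{y} = \arg\max_{y} P(y \mid x)
        = \arg\max_{y} \underbrace{P(x \mid y)}_{\text{translation model}}
                       \, \underbrace{P(y)}_{\text{language model}}
```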
32
+
33
+ 0:01:00.800 --> 0:01:15.583
34
+ However, there is another component when doing
35
+ generation tasks in general and machine translation.
36
+
37
+ 0:01:16.016 --> 0:01:23.797
38
+ There are several characteristics which you
39
+ only need to model on the target side in the
40
+
41
+ 0:01:23.797 --> 0:01:31.754
42
+ traditional approach where we talked about
43
+ the generation from more semantic or synthectic
44
+
45
+ 0:01:31.754 --> 0:01:34.902
46
+ representation into the real world.
47
+
48
+ 0:01:35.555 --> 0:01:51.013
49
+ And the challenge is that there's some constructs
50
+ which are only there in the target language.
51
+
52
+ 0:01:52.132 --> 0:01:57.908
53
+ You cannot really get that translation, but
54
+ it's more something that needs to model on
55
+
56
+ 0:01:57.908 --> 0:01:58.704
57
+ the target.
58
+
59
+ 0:01:59.359 --> 0:02:05.742
60
+ And this is done typically by a language model
61
+ and this concept of language model.
62
+
63
+ 0:02:06.326 --> 0:02:11.057
64
+ Guess you can assume nowadays very important.
65
+
66
+ 0:02:11.057 --> 0:02:20.416
67
+ You've read a lot about large language models
68
+ recently and they are all somehow trained or
69
+
70
+ 0:02:20.416 --> 0:02:22.164
71
+ the idea behind.
72
+
73
+ 0:02:25.986 --> 0:02:41.802
74
+ What we'll look at today: we will first look at
75
+ what a language model is, and today's
76
+
77
+ 0:02:41.802 --> 0:02:42.992
78
+ focus.
79
+
80
+ 0:02:43.363 --> 0:02:49.188
81
+ This was the common approach to the language
82
+ model for twenty or thirty years, so a lot
83
+
84
+ 0:02:49.188 --> 0:02:52.101
85
+ of time it was really the state of the art.
86
+
87
+ 0:02:52.101 --> 0:02:58.124
88
+ And people have used that in many applications
89
+ in machine translation and automatic speech
90
+
91
+ 0:02:58.124 --> 0:02:58.985
92
+ recognition.
93
+
94
+ 0:02:59.879 --> 0:03:11.607
95
+ Again you are measuring the performance, but
96
+ this is purely the performance of the language
97
+
98
+ 0:03:11.607 --> 0:03:12.499
99
+ model.
100
+
101
+ 0:03:13.033 --> 0:03:23.137
102
+ And then we will see that the traditional
103
+ language will have a major drawback in how
104
+
105
+ 0:03:23.137 --> 0:03:24.683
106
+ we can deal.
107
+
108
+ 0:03:24.944 --> 0:03:32.422
109
+ So if you model language you will see that
110
+ in most of the sentences and you have not really
111
+
112
+ 0:03:32.422 --> 0:03:39.981
113
+ seen and you're still able to assess if this
114
+ is good language or if this is native language.
115
+
116
+ 0:03:40.620 --> 0:03:45.092
117
+ And this is challenging if you do just like
118
+ parameter estimation.
119
+
120
+ 0:03:45.605 --> 0:03:59.277
121
+ We are using two different techniques to do:
122
+ interpolation, and these are essentially in
123
+
124
+ 0:03:59.277 --> 0:04:01.735
125
+ order to build.
126
+
127
+ 0:04:01.881 --> 0:04:11.941
128
+ It also motivates why things might be easier
129
+ if we are going into neural morals as we will.
130
+
131
+ 0:04:12.312 --> 0:04:18.203
132
+ And at the end we'll talk a bit about some
133
+ additional type of language models which are
134
+
135
+ 0:04:18.203 --> 0:04:18.605
136
+ also.
137
+
138
+ 0:04:20.440 --> 0:04:29.459
139
+ So where our language was used, or how are
140
+ they used in the machine translations?
141
+
142
+ 0:04:30.010 --> 0:04:38.513
143
+ So the idea of a language model is that we
144
+ are modeling what is the fluency of language.
145
+
146
+ 0:04:38.898 --> 0:04:49.381
147
+ So if you have, for example, sentence will,
148
+ then you can estimate that there are some words:
149
+
150
+ 0:04:49.669 --> 0:05:08.929
151
+ For example, the next word is valid, but will
152
+ card's words not?
153
+
154
+ 0:05:09.069 --> 0:05:13.673
155
+ And we can do that.
156
+
157
+ 0:05:13.673 --> 0:05:22.192
158
+ We have seen that the noise channel.
159
+
160
+ 0:05:22.322 --> 0:05:33.991
161
+ That we have seen someone two weeks ago, and
162
+ today we will look into how can we model P
163
+
164
+ 0:05:33.991 --> 0:05:36.909
165
+ of Y or how possible.
166
+
167
+ 0:05:37.177 --> 0:05:44.192
168
+ Now this is completely independent of the
169
+ translation process.
170
+
171
+ 0:05:44.192 --> 0:05:49.761
172
+ How fluent is a sentence and how you can express?
173
+
174
+ 0:05:51.591 --> 0:06:01.699
175
+ And this language model task has one really
176
+ big advantage and assume that is even the big
177
+
178
+ 0:06:01.699 --> 0:06:02.935
179
+ advantage.
180
+
181
+ 0:06:03.663 --> 0:06:16.345
182
+ The big advantage is the data we need to train
183
+ that so normally we are doing supervised learning.
184
+
185
+ 0:06:16.876 --> 0:06:20.206
186
+ So machine translation will talk about.
187
+
188
+ 0:06:20.206 --> 0:06:24.867
189
+ That means we have the source center and target
190
+ center.
191
+
192
+ 0:06:25.005 --> 0:06:27.620
193
+ They need to be aligned.
194
+
195
+ 0:06:27.620 --> 0:06:31.386
196
+ We look into how we can model them.
197
+
198
+ 0:06:31.386 --> 0:06:39.270
199
+ Generally, the problem with this is that:
200
+ Machine translation: You still have the advantage
201
+
202
+ 0:06:39.270 --> 0:06:45.697
203
+ that there's quite huge amounts of this data
204
+ for many languages, not all but many, but other
205
+
206
+ 0:06:45.697 --> 0:06:47.701
207
+ classes even more difficult.
208
+
209
+ 0:06:47.701 --> 0:06:50.879
210
+ There's very few data where you have summary.
211
+
212
+ 0:06:51.871 --> 0:07:02.185
213
+ So the big advantage of language model is
214
+ we're only modeling the centers, so we only
215
+
216
+ 0:07:02.185 --> 0:07:04.103
217
+ need pure text.
218
+
219
+ 0:07:04.584 --> 0:07:11.286
220
+ And pure text, especially since we have the
221
+ Internet face melting large amounts of text.
222
+
223
+ 0:07:11.331 --> 0:07:17.886
224
+ Of course, it's still, it's still maybe only
225
+ for some domains, some type.
226
+
227
+ 0:07:18.198 --> 0:07:23.466
228
+ Want to have data for speech about machine
229
+ translation.
230
+
231
+ 0:07:23.466 --> 0:07:27.040
232
+ Maybe there's only limited data that.
233
+
234
+ 0:07:27.027 --> 0:07:40.030
235
+ There's always and also you go to some more
236
+ exotic languages and then you will have less
237
+
238
+ 0:07:40.030 --> 0:07:40.906
239
+ data.
240
+
241
+ 0:07:41.181 --> 0:07:46.803
242
+ And in language once we can now look, how
243
+ can we make use of these data?
244
+
245
+ 0:07:47.187 --> 0:07:54.326
246
+ And: Nowadays this is often also framed as
247
+ self supervised learning because on the one
248
+
249
+ 0:07:54.326 --> 0:08:00.900
250
+ hand here we'll see it's a type of classification
251
+ task or supervised learning, but we create the
252
+
253
+ 0:08:00.900 --> 0:08:02.730
254
+ labels from the data itself.
255
+
256
+ 0:08:02.742 --> 0:08:13.922
257
+ So it's not that we have this pair of data
258
+ text and labels, but we have only the text.
259
+
260
+ 0:08:15.515 --> 0:08:21.367
261
+ So the question is how can we use this monolingual
262
+ data and how can we train our language model?
263
+
264
+ 0:08:22.302 --> 0:08:35.086
265
+ The main goal is to produce fluent English,
266
+ so we want to somehow model that something
267
+
268
+ 0:08:35.086 --> 0:08:38.024
269
+ is a sentence of a language.
270
+
271
+ 0:08:38.298 --> 0:08:44.897
272
+ So there is no clear separation about semantics
273
+ and syntax, but in this case it is not about
274
+
275
+ 0:08:44.897 --> 0:08:46.317
276
+ a clear separation.
277
+
278
+ 0:08:46.746 --> 0:08:50.751
279
+ So we will monitor them somehow in there.
280
+
281
+ 0:08:50.751 --> 0:08:56.091
282
+ There will be some notion of semantics, some
283
+ notion of.
284
+
285
+ 0:08:56.076 --> 0:09:08.748
286
+ Because you say you want to model how fluent
287
+ or probable is that the native speaker is producing
288
+
289
+ 0:09:08.748 --> 0:09:12.444
290
+ that because of the one in.
291
+
292
+ 0:09:12.512 --> 0:09:17.711
293
+ We are rarely talking like things that are
294
+ semantically wrong, and therefore there is
295
+
296
+ 0:09:17.711 --> 0:09:18.679
297
+ also some type.
298
+
299
+ 0:09:19.399 --> 0:09:24.048
300
+ So, for example, the house is small.
301
+
302
+ 0:09:24.048 --> 0:09:30.455
303
+ It should have a higher probability than the house
304
+ is.
305
+
306
+ 0:09:31.251 --> 0:09:38.112
307
+ Because home and house both mean the same in German,
308
+ they are used differently.
309
+
310
+ 0:09:38.112 --> 0:09:43.234
311
+ For example, it should be more probable that
312
+ the plane.
313
+
314
+ 0:09:44.444 --> 0:09:51.408
315
+ So this is both syntactically correct, but
316
+ semantically not.
317
+
318
+ 0:09:51.408 --> 0:09:58.372
319
+ But still you will see much more often the
320
+ probability that.
321
+
322
+ 0:10:03.883 --> 0:10:14.315
323
+ So more formally, it's about like the language
324
+ should be some type of function, and it gives
325
+
326
+ 0:10:14.315 --> 0:10:18.690
327
+ us the probability that this sentence.
328
+
329
+ 0:10:19.519 --> 0:10:27.312
330
+ Indicating that this is good English or more
331
+ generally English, of course you can do that.
332
+
333
+ 0:10:28.448 --> 0:10:37.609
334
+ And earlier times people have even done try
335
+ to do that deterministic that was especially
336
+
337
+ 0:10:37.609 --> 0:10:40.903
338
+ used for more dialogue systems.
339
+
340
+ 0:10:40.840 --> 0:10:50.660
341
+ You have a very strict syntax so you can only
342
+ use like turn off the, turn off the radio.
343
+
344
+ 0:10:50.690 --> 0:10:56.928
345
+ Something else, but you have a very strict
346
+ deterministic finite state grammar like which
347
+
348
+ 0:10:56.928 --> 0:10:58.107
349
+ type of phrases.
350
+
351
+ 0:10:58.218 --> 0:11:04.791
352
+ The problem of course if we're dealing with
353
+ language is that language is variable, we're
354
+
355
+ 0:11:04.791 --> 0:11:10.183
356
+ not always talking correct sentences, and so
357
+ this type of deterministic.
358
+
359
+ 0:11:10.650 --> 0:11:22.121
360
+ That's why for already many, many years people
361
+ look into statistical language models and try
362
+
363
+ 0:11:22.121 --> 0:11:24.587
364
+ to model something.
365
+
366
+ 0:11:24.924 --> 0:11:35.096
367
+ So something like what is the probability
368
+ of the sequences of to, and that is what.
369
+
370
+ 0:11:35.495 --> 0:11:43.076
371
+ The advantage of doing it statistically is
372
+ that we can train large text databases so we
373
+
374
+ 0:11:43.076 --> 0:11:44.454
375
+ can train them.
376
+
377
+ 0:11:44.454 --> 0:11:52.380
378
+ We don't have to define it and most of these
379
+ cases we don't want to have the hard decision.
380
+
381
+ 0:11:52.380 --> 0:11:55.481
382
+ This is a sentence of the language.
383
+
384
+ 0:11:55.815 --> 0:11:57.914
385
+ Why we want to have some type of probability?
386
+
387
+ 0:11:57.914 --> 0:11:59.785
388
+ How probable is this part of the center?
389
+
390
+ 0:12:00.560 --> 0:12:04.175
391
+ Because yeah, even for a few minutes, it's
392
+ not always clear.
393
+
394
+ 0:12:04.175 --> 0:12:06.782
395
+ Is this a sentence that you can use or not?
396
+
397
+ 0:12:06.782 --> 0:12:12.174
398
+ I mean, I just in this presentation gave several
399
+ sentences, which are not correct English.
400
+
401
+ 0:12:12.174 --> 0:12:17.744
402
+ So it might still happen that people speak
403
+ sentences or write sentences that I'm not correct,
404
+
405
+ 0:12:17.744 --> 0:12:19.758
406
+ and you want to deal with all of.
407
+
408
+ 0:12:20.020 --> 0:12:25.064
409
+ So that is then, of course, a big advantage
410
+ if you use your more statistical models.
411
+
412
+ 0:12:25.705 --> 0:12:35.810
413
+ The disadvantage is that you need a suitably
414
+ large text database, which might exist for
415
+
416
+ 0:12:35.810 --> 0:12:37.567
417
+ many languages.
418
+
419
+ 0:12:37.857 --> 0:12:46.511
420
+ Nowadays you see that there is of course issues
421
+ that you need large computational resources
422
+
423
+ 0:12:46.511 --> 0:12:47.827
424
+ to deal with.
425
+
426
+ 0:12:47.827 --> 0:12:56.198
427
+ You need to collect all these crawlers on
428
+ the internet which can create enormous amounts
429
+
430
+ 0:12:56.198 --> 0:12:57.891
431
+ of training data.
432
+
433
+ 0:12:58.999 --> 0:13:08.224
434
+ So if we want to build this then the question
435
+ is of course how can we estimate the probability?
436
+
437
+ 0:13:08.448 --> 0:13:10.986
438
+ So how probable is the sentence good morning?
439
+
440
+ 0:13:11.871 --> 0:13:15.450
441
+ And you all know basic statistics.
442
+
443
+ 0:13:15.450 --> 0:13:21.483
444
+ So if you see this you have a large database
445
+ of sentences.
446
+
447
+ 0:13:21.901 --> 0:13:28.003
448
+ Made this a real example, so this was from
449
+ the TED talks.
450
+
451
+ 0:13:28.003 --> 0:13:37.050
452
+ I guess most of you have heard about them,
453
+ and if you account for all many sentences,
454
+
455
+ 0:13:37.050 --> 0:13:38.523
456
+ good morning.
457
+
458
+ 0:13:38.718 --> 0:13:49.513
459
+ It happens so the probability of good morning
460
+ is three point something times ten to the power minus.
461
+
462
+ 0:13:50.030 --> 0:13:53.755
463
+ Okay, so this is a very easy thing.
464
+
465
+ 0:13:53.755 --> 0:13:58.101
466
+ We can directly model the language model.
467
+
468
+ 0:13:58.959 --> 0:14:03.489
469
+ Does anybody see a problem why this might
470
+ not be the final solution?
471
+
472
+ 0:14:06.326 --> 0:14:14.962
473
+ Think we would need a folder of more sentences
474
+ to make anything useful of this.
475
+
476
+ 0:14:15.315 --> 0:14:29.340
477
+ Because the probability of the talk starting
478
+ with good morning, good morning is much higher
479
+
480
+ 0:14:29.340 --> 0:14:32.084
481
+ than ten minutes.
482
+
483
+ 0:14:33.553 --> 0:14:41.700
484
+ In all the probability presented in this face,
485
+ not how we usually think about it.
486
+
487
+ 0:14:42.942 --> 0:14:55.038
488
+ The probability is even OK, but you're going
489
+ into the right direction about the large data.
490
+
491
+ 0:14:55.038 --> 0:14:59.771
492
+ Yes, you can't form a new sentence.
493
+
494
+ 0:15:00.160 --> 0:15:04.763
495
+ It's about a large data, so you said it's
496
+ hard to get enough data.
497
+
498
+ 0:15:04.763 --> 0:15:05.931
499
+ It's impossible.
500
+
501
+ 0:15:05.931 --> 0:15:11.839
502
+ I would say we are always saying sentences
503
+ which have never been said and we are able
504
+
505
+ 0:15:11.839 --> 0:15:12.801
506
+ to deal with.
507
+
508
+ 0:15:13.133 --> 0:15:25.485
509
+ The problem with the sparsity of the data
510
+ will have a lot of perfect English sentences.
511
+
512
+ 0:15:26.226 --> 0:15:31.338
513
+ And this is, of course, not what we want to
514
+ deal with.
515
+
516
+ 0:15:31.338 --> 0:15:39.332
517
+ If we want to model that, we need to have
518
+ a model which can really estimate how good.
519
+
520
+ 0:15:39.599 --> 0:15:47.970
521
+ And if we are just like counting this way,
522
+ most of it will get a zero probability, which
523
+
524
+ 0:15:47.970 --> 0:15:48.722
525
+ is not.
526
+
527
+ 0:15:49.029 --> 0:15:56.572
528
+ So we need to make things a bit different.
529
+
530
+ 0:15:56.572 --> 0:16:06.221
531
+ For the models we had already some idea of
532
+ doing that.
533
+
534
+ 0:16:06.486 --> 0:16:08.058
535
+ And that we can do here again.
536
+
537
+ 0:16:08.528 --> 0:16:12.866
538
+ So we can especially use the chain rule.
539
+
540
+ 0:16:12.772 --> 0:16:19.651
541
+ The chain rule and the definition of conditional
542
+ probability solve the conditional probability.
543
+
544
+ 0:16:19.599 --> 0:16:26.369
545
+ Of an event B given in an event A is the probability
546
+ of A and B divided to the probability of A.
547
+
548
+ 0:16:26.369 --> 0:16:32.720
549
+ Yes, I recently had an exam on automatic speech
550
+ recognition and the examiner said this is not
551
+
552
+ 0:16:32.720 --> 0:16:39.629
553
+ called a chain rule because I use this terminology,
554
+ and he said it's just applying Bayes another time.
555
+
556
+ 0:16:40.500 --> 0:16:56.684
557
+ But this is definitely the definition of the
558
+ condition of probability.
559
+
560
+ 0:16:57.137 --> 0:17:08.630
561
+ The probability is defined as P of A and B
562
+ divided by P of A.
563
+
564
+ 0:17:08.888 --> 0:17:16.392
565
+ And that can be easily rewritten into P of A
566
+ times P of B given A.
567
+
568
+ 0:17:16.816 --> 0:17:35.279
569
+ And the nice thing is, we can easily extend
570
+ it, of course, into more variables so we can
571
+
572
+ 0:17:35.279 --> 0:17:38.383
573
+ have: And so on.
574
+
575
+ 0:17:38.383 --> 0:17:49.823
576
+ So more generally you can do that for now
577
+ any length of sequence.
578
+
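As a compact reference, the chain-rule factorization described above can be written (a sketch in my own LaTeX notation, not from the lecture slides) as:

    P(w_1, w_2, \dots, w_n) = \prod_{i=1}^{n} P(w_i \mid w_1, \dots, w_{i-1})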
579
+ 0:17:50.650 --> 0:18:04.802
580
+ So if we are now going back to words, we can
581
+ model that as the probability of the sequence
582
+
583
+ 0:18:04.802 --> 0:18:08.223
584
+ is given its history.
585
+
586
+ 0:18:08.908 --> 0:18:23.717
587
+ Maybe it's more clear if we're looking at
588
+ real words, so if we have P of its water
589
+
590
+ 0:18:23.717 --> 0:18:26.914
591
+ is so transparent.
592
+
593
+ 0:18:26.906 --> 0:18:39.136
594
+ So this way we are able to model the probability
595
+ of the whole sentence given the sequence by
596
+
597
+ 0:18:39.136 --> 0:18:42.159
598
+ looking at each word.
599
+
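A minimal Python sketch of this word-by-word scoring; p_word is a hypothetical stand-in for whatever model supplies P(word | history), it is not something defined in the lecture:

    def sentence_probability(words, p_word):
        # multiply P(w_i | w_1 ... w_{i-1}) over all positions, as in the chain rule
        prob = 1.0
        for i, w in enumerate(words):
            prob *= p_word(w, words[:i])
        return prob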
600
+ 0:18:42.762 --> 0:18:49.206
601
+ And of course the big advantage is that each
602
+ word occurs less often than the full sentence.
603
+
604
+ 0:18:49.206 --> 0:18:54.991
605
+ So hopefully we see that still, of course,
606
+ the problem the word doesn't occur.
607
+
608
+ 0:18:54.991 --> 0:19:01.435
609
+ Then this doesn't work, but let's recover
610
+ most of the lectures today about dealing with
611
+
612
+ 0:19:01.435 --> 0:19:01.874
613
+ this.
614
+
615
+ 0:19:02.382 --> 0:19:08.727
616
+ So by first of all, we generally is at least
617
+ easier as the thing we have before.
618
+
619
+ 0:19:13.133 --> 0:19:23.531
620
+ That we really make sense easier, no, because
621
+ those jumps get utterly long and we have central.
622
+
623
+ 0:19:23.943 --> 0:19:29.628
624
+ Yes exactly, so when we look at the last probability
625
+ here, we still have to have seen the full.
626
+
627
+ 0:19:30.170 --> 0:19:38.146
628
+ So if we want a molecule of transparent, if
629
+ water is so we have to see the food sequence.
630
+
631
+ 0:19:38.578 --> 0:19:48.061
632
+ So in first step we didn't really have to
633
+ have seen the full sentence.
634
+
635
+ 0:19:48.969 --> 0:19:52.090
636
+ However, a little bit of a step nearer.
637
+
638
+ 0:19:52.512 --> 0:19:59.673
639
+ So this is still a problem and we will never
640
+ have seen it for all the time.
641
+
642
+ 0:20:00.020 --> 0:20:08.223
643
+ So you can look at this if you have a vocabulary
644
+ of words.
645
+
646
+ 0:20:08.223 --> 0:20:17.956
647
+ Now, for example, if the average sentence
648
+ is, you would leave to the.
649
+
650
+ 0:20:18.298 --> 0:20:22.394
651
+ And we are quite sure we have never seen that
652
+ much data.
653
+
654
+ 0:20:22.902 --> 0:20:26.246
655
+ So this is, we cannot really compute this
656
+ probability.
657
+
658
+ 0:20:26.786 --> 0:20:37.794
659
+ However, there's a trick how we can do that
660
+ and that's the idea behind most of the language models.
661
+
662
+ 0:20:38.458 --> 0:20:44.446
663
+ So instead of saying how often does this work
664
+ happen to exactly this history, we are trying
665
+
666
+ 0:20:44.446 --> 0:20:50.433
667
+ to do some kind of clustering and cluster a
668
+ lot of different histories into the same class,
669
+
670
+ 0:20:50.433 --> 0:20:55.900
671
+ and then we are modeling the probability of
672
+ the word given this class of histories.
673
+
674
+ 0:20:56.776 --> 0:21:06.245
675
+ And then, of course, the big design decision
676
+ is how to be modeled like how to cluster history.
677
+
678
+ 0:21:06.666 --> 0:21:17.330
679
+ So how do we put all these histories together
680
+ so that we have seen each of one off enough
681
+
682
+ 0:21:17.330 --> 0:21:18.396
683
+ so that.
684
+
685
+ 0:21:20.320 --> 0:21:25.623
686
+ So there is quite different types of things
687
+ people can do.
688
+
689
+ 0:21:25.623 --> 0:21:33.533
690
+ You can add some speech texts, you can do
691
+ semantic words, you can model the similarity,
692
+
693
+ 0:21:33.533 --> 0:21:46.113
694
+ you can model grammatical content, and things
695
+ like: However, like quite often in these statistical
696
+
697
+ 0:21:46.113 --> 0:21:53.091
698
+ models, if you have a very simple solution.
699
+
700
+ 0:21:53.433 --> 0:21:58.455
701
+ And this is what most statistical models do.
702
+
703
+ 0:21:58.455 --> 0:22:09.616
704
+ They are based on the so-called Markov assumption,
705
+ and that means we are assuming all this history
706
+
707
+ 0:22:09.616 --> 0:22:12.183
708
+ is not that important.
709
+
710
+ 0:22:12.792 --> 0:22:25.895
711
+ So we are modeling the probability of transparent given
712
+ is so, or we have maybe only the last two
713
+
714
+ 0:22:25.895 --> 0:22:29.534
715
+ words by having a fixed history length.
716
+
717
+ 0:22:29.729 --> 0:22:38.761
718
+ So the class of all our history from word
719
+ to word minus one is just the last two words.
720
+
721
+ 0:22:39.679 --> 0:22:45.229
722
+ And by doing this classification, which of
723
+ course does not need any additional knowledge.
724
+
725
+ 0:22:45.545 --> 0:22:51.176
726
+ It's very easy to calculate and we have now limited
727
+ our histories.
728
+
729
+ 0:22:51.291 --> 0:23:00.906
730
+ So instead of an arbitrary long one here,
731
+ we have here only like.
732
+
733
+ 0:23:00.906 --> 0:23:10.375
734
+ For example, if we have two grams, a lot of
735
+ them will not occur.
736
+
737
+ 0:23:10.930 --> 0:23:20.079
738
+ So it's a very simple trick to make all these
739
+ classes into a few classes and motivated by,
740
+
741
+ 0:23:20.079 --> 0:23:24.905
742
+ of course, the language the nearest things
743
+ are.
744
+
745
+ 0:23:24.944 --> 0:23:33.043
746
+ Like a lot of sequences, they mainly depend
747
+ on the previous one, and things which are far
748
+
749
+ 0:23:33.043 --> 0:23:33.583
750
+ away.
751
+
752
+ 0:23:38.118 --> 0:23:47.361
753
+ In our product here everything is just modeled
754
+ not by the whole history but by the last n
755
+
756
+ 0:23:47.361 --> 0:23:48.969
757
+ minus one words.
758
+
759
+ 0:23:50.470 --> 0:23:54.322
760
+ So and this is typically expressed by people.
761
+
762
+ 0:23:54.322 --> 0:24:01.776
763
+ They're therefore also talking by an N gram
764
+ language model because we are always looking
765
+
766
+ 0:24:01.776 --> 0:24:06.550
767
+ at these chunks of N words and modeling the
768
+ probability.
769
+
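In symbols (my notation, with n the n-gram order), the Markov assumption described here truncates each history to the last n-1 words:

    P(w_i \mid w_1, \dots, w_{i-1}) \approx P(w_i \mid w_{i-n+1}, \dots, w_{i-1})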
770
+ 0:24:07.527 --> 0:24:10.485
771
+ So again start with the most simple case.
772
+
773
+ 0:24:10.485 --> 0:24:15.485
774
+ Even extreme is the unigram case, so we're
775
+ ignoring the whole history.
776
+
777
+ 0:24:15.835 --> 0:24:24.825
778
+ The probability of a sequence of words is
779
+ just the probability of each of the words in
780
+
781
+ 0:24:24.825 --> 0:24:25.548
782
+ there.
783
+
784
+ 0:24:26.046 --> 0:24:32.129
785
+ And therefore we are removing the whole context.
786
+
787
+ 0:24:32.129 --> 0:24:40.944
788
+ The most probable sequence would be something
789
+ like one of them is the.
790
+
791
+ 0:24:42.162 --> 0:24:44.694
792
+ Most probable wordsuit by itself.
793
+
794
+ 0:24:44.694 --> 0:24:49.684
795
+ It might not make sense, but it, of course,
796
+ can give you a bit of.
797
+
798
+ 0:24:49.629 --> 0:24:52.682
799
+ Intuition like which types of words should
800
+ be more frequent.
801
+
802
+ 0:24:53.393 --> 0:25:00.012
803
+ And if you what you can do is train such a
804
+ button and you can just automatically generate.
805
+
806
+ 0:25:00.140 --> 0:25:09.496
807
+ And this sequence is generated by sampling,
808
+ so we will later come in the lecture too.
809
+
810
+ 0:25:09.496 --> 0:25:16.024
811
+ The sampling is that you randomly pick a word
812
+ but based on.
813
+
814
+ 0:25:16.096 --> 0:25:22.711
815
+ So if the probability of one word is zero
816
+ point two then you'll put it on and if another
817
+
818
+ 0:25:22.711 --> 0:25:23.157
819
+ word.
820
+
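A minimal sketch of this sampling idea, assuming a toy unigram distribution with made-up probabilities; random.choices draws words proportionally to their weights:

    import random

    unigram_probs = {"the": 0.2, "of": 0.1, "is": 0.1, "house": 0.05, "small": 0.02}
    words, weights = zip(*unigram_probs.items())
    # draw ten tokens, each independently of the previous ones
    print(random.choices(words, weights=weights, k=10))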
821
+ 0:25:23.483 --> 0:25:36.996
822
+ And if you see that you'll see here now, for
823
+ example, it seems that these are two occurring
824
+
825
+ 0:25:36.996 --> 0:25:38.024
826
+ posts.
827
+
828
+ 0:25:38.138 --> 0:25:53.467
829
+ But you see there's not really any continuing
830
+ type of structure because each word is modeled
831
+
832
+ 0:25:53.467 --> 0:25:55.940
833
+ independently.
834
+
835
+ 0:25:57.597 --> 0:26:03.037
836
+ But you can do better by going to
837
+ a bigram, so then we're having a bit of context.
838
+
839
+ 0:26:03.037 --> 0:26:08.650
840
+ Of course, it's still very small, so the probability
841
+ of your word of the actual word only depends
842
+
843
+ 0:26:08.650 --> 0:26:12.429
844
+ on the previous word and all the context before
845
+ there is ignored.
846
+
847
+ 0:26:13.133 --> 0:26:18.951
848
+ This of course will come to that wrong, but
849
+ it models a regular language significantly
850
+
851
+ 0:26:18.951 --> 0:26:19.486
852
+ better.
853
+
854
+ 0:26:19.779 --> 0:26:28.094
855
+ Seeing some things here still doesn't really
856
+ make a lot of sense, but you're seeing some
857
+
858
+ 0:26:28.094 --> 0:26:29.682
859
+ typical phrases.
860
+
861
+ 0:26:29.949 --> 0:26:39.619
862
+ In this hope doesn't make sense, but in this
863
+ issue is also frequent.
864
+
865
+ 0:26:39.619 --> 0:26:51.335
866
+ Issue is also: Very nice is this year new
867
+ car parking lot after, so if you have the word
868
+
869
+ 0:26:51.335 --> 0:26:53.634
870
+ new then the word.
871
+
872
+ 0:26:53.893 --> 0:27:01.428
873
+ Is also quite common, but new car they wouldn't
874
+ put parking.
875
+
876
+ 0:27:01.428 --> 0:27:06.369
877
+ Often the continuation is parking lot.
878
+
879
+ 0:27:06.967 --> 0:27:12.417
880
+ And now it's very interesting because here
881
+ we see the two semantic meanings of lot: You
882
+
883
+ 0:27:12.417 --> 0:27:25.889
884
+ have a parking lot, but in general if you just
885
+ think about the history, the most common use
886
+
887
+ 0:27:25.889 --> 0:27:27.353
888
+ is a lot.
889
+
890
+ 0:27:27.527 --> 0:27:33.392
891
+ So you see that he's really not using the
892
+ context before, but he's only using the current
893
+
894
+ 0:27:33.392 --> 0:27:33.979
895
+ context.
896
+
897
+ 0:27:38.338 --> 0:27:41.371
898
+ So in general we can of course do that longer.
899
+
900
+ 0:27:41.371 --> 0:27:43.888
901
+ We can do unigrams, bigrams, trigrams.
902
+
903
+ 0:27:45.845 --> 0:27:52.061
904
+ People typically went up to four or five grams,
905
+ and then it's getting difficult because.
906
+
907
+ 0:27:52.792 --> 0:27:56.671
908
+ There are so many five grams that it's getting
909
+ complicated.
910
+
911
+ 0:27:56.671 --> 0:28:02.425
912
+ Storing all of them and storing these models
913
+ get so big that it's no longer working, and
914
+
915
+ 0:28:02.425 --> 0:28:08.050
916
+ of course at some point the calculation of
917
+ the probabilities again gets too difficult,
918
+
919
+ 0:28:08.050 --> 0:28:09.213
920
+ and each of them.
921
+
922
+ 0:28:09.429 --> 0:28:14.777
923
+ If you have a small corpus, of course you
924
+ will use a smaller n-gram length.
925
+
926
+ 0:28:14.777 --> 0:28:16.466
927
+ You will take a larger.
928
+
929
+ 0:28:18.638 --> 0:28:24.976
930
+ What is important to keep in mind is that,
931
+ of course, this is wrong.
932
+
933
+ 0:28:25.285 --> 0:28:36.608
934
+ So we have long range dependencies, and if
935
+ we really want to model everything in language
936
+
937
+ 0:28:36.608 --> 0:28:37.363
938
+ then.
939
+
940
+ 0:28:37.337 --> 0:28:46.965
941
+ So here is like one of these extreme cases,
942
+ the computer, which has just put into the machine
943
+
944
+ 0:28:46.965 --> 0:28:49.423
945
+ room in the slow crash.
946
+
947
+ 0:28:49.423 --> 0:28:55.978
948
+ Like somehow, there is a dependency between
949
+ computer and crash.
950
+
951
+ 0:28:57.978 --> 0:29:10.646
952
+ However, in most situations these are typically
953
+ rare and normally most important things happen
954
+
955
+ 0:29:10.646 --> 0:29:13.446
956
+ in the near context.
957
+
958
+ 0:29:15.495 --> 0:29:28.408
959
+ But of course it's important to keep that
960
+ in mind that you can't model the thing so you
961
+
962
+ 0:29:28.408 --> 0:29:29.876
963
+ can't do.
964
+
965
+ 0:29:33.433 --> 0:29:50.200
966
+ The next question is again how can we train
967
+ so we have to estimate these probabilities.
968
+
969
+ 0:29:51.071 --> 0:30:00.131
970
+ And the question is how we do that, and again
971
+ the most simple thing.
972
+
973
+ 0:30:00.440 --> 0:30:03.168
974
+ The thing is exactly what's maximum likelihood
975
+ estimation.
976
+
977
+ 0:30:03.168 --> 0:30:12.641
978
+ What gives you the right answer is: So how
979
+ probable is that the word is following minus
980
+
981
+ 0:30:12.641 --> 0:30:13.370
982
+ one?
983
+
984
+ 0:30:13.370 --> 0:30:20.946
985
+ You just count how often does this sequence
986
+ happen?
987
+
988
+ 0:30:21.301 --> 0:30:28.165
989
+ So guess this is what most of you would have
990
+ intuitively done, and this also works best.
991
+
992
+ 0:30:28.568 --> 0:30:39.012
993
+ So it's not a complicated train, so you once
994
+ have to go over your corpus, you have to count
995
+
996
+ 0:30:39.012 --> 0:30:48.662
997
+ our bigrams and unigrams, and then you can
998
+ directly train the basic language model.
999
+
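A minimal sketch of this training procedure in Python, assuming a toy tokenized corpus (the sentences and variable names are illustrative, not the lecture's data): one pass to count unigrams and bigrams, then relative frequencies give the maximum likelihood estimates:

    from collections import Counter

    corpus = [["<s>", "i", "am", "sam", "</s>"],
              ["<s>", "sam", "i", "am", "</s>"]]

    unigrams = Counter(w for sent in corpus for w in sent)
    bigrams = Counter((s[i], s[i + 1]) for s in corpus for i in range(len(s) - 1))

    def p_mle(word, prev):
        # P(word | prev) = count(prev, word) / count(prev)
        return bigrams[(prev, word)] / unigrams[prev]

    print(p_mle("i", "<s>"))  # 0.5: "i" starts one of the two toy sentences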
1000
+ 0:30:49.189 --> 0:30:50.651
1001
+ Who is it difficult?
1002
+
1003
+ 0:30:50.651 --> 0:30:58.855
1004
+ There are two difficulties: The basic language
1005
+ well doesn't work that well because of zero
1006
+
1007
+ 0:30:58.855 --> 0:31:03.154
1008
+ counts and how we address that and the second.
1009
+
1010
+ 0:31:03.163 --> 0:31:13.716
1011
+ Because we saw that especially if you go for
1012
+ larger you have to store all these engrams
1013
+
1014
+ 0:31:13.716 --> 0:31:15.275
1015
+ efficiently.
1016
+
1017
+ 0:31:17.697 --> 0:31:21.220
1018
+ So how we can do that?
1019
+
1020
+ 0:31:21.220 --> 0:31:24.590
1021
+ Here's some examples.
1022
+
1023
+ 0:31:24.590 --> 0:31:33.626
1024
+ For example, if you have the sequence your
1025
+ training curve.
1026
+
1027
+ 0:31:33.713 --> 0:31:41.372
1028
+ You see that the word happens, ascends the
1029
+ star and the sequence happens two times.
1030
+
1031
+ 0:31:42.182 --> 0:31:45.651
1032
+ We have three times.
1033
+
1034
+ 0:31:45.651 --> 0:31:58.043
1035
+ The same starts as the probability is to thirds
1036
+ and the other probability.
1037
+
1038
+ 0:31:58.858 --> 0:32:09.204
1039
+ Here we have what is following so you have
1040
+ twice and once do so again two thirds and one.
1041
+
1042
+ 0:32:09.809 --> 0:32:20.627
1043
+ And this is all that you need to know here
1044
+ about it, so you can do this calculation.
1045
+
1046
+ 0:32:23.723 --> 0:32:35.506
1047
+ So the question then, of course, is what do
1048
+ we really learn in these types of models?
1049
+
1050
+ 0:32:35.506 --> 0:32:45.549
1051
+ Here are examples from the Europarl corpus:
1052
+ The green, the red, and the blue, and here
1053
+
1054
+ 0:32:45.549 --> 0:32:48.594
1055
+ you have the probabilities which is the next.
1056
+
1057
+ 0:32:48.989 --> 0:33:01.897
1058
+ That there is a lot more than just like the
1059
+ syntax because the initial phrase is all the
1060
+
1061
+ 0:33:01.897 --> 0:33:02.767
1062
+ same.
1063
+
1064
+ 0:33:03.163 --> 0:33:10.132
1065
+ For example, you see the green paper in the
1066
+ green group.
1067
+
1068
+ 0:33:10.132 --> 0:33:16.979
1069
+ It's more European Parliament, the red cross,
1070
+ which is by.
1071
+
1072
+ 0:33:17.197 --> 0:33:21.777
1073
+ What you also see that it's like sometimes
1074
+ Indian, sometimes it's more difficult.
1075
+
1076
+ 0:33:22.302 --> 0:33:28.345
1077
+ So, for example, following the rats, in one
1078
+ hundred cases it was a red cross.
1079
+
1080
+ 0:33:28.668 --> 0:33:48.472
1081
+ So it seems to be easier to guess the next
1082
+ word.
1083
+
1084
+ 0:33:48.528 --> 0:33:55.152
1085
+ So there is different types of information
1086
+ coded in that you also know that I guess sometimes
1087
+
1088
+ 0:33:55.152 --> 0:33:58.675
1089
+ you directly know all the speakers will continue.
1090
+
1091
+ 0:33:58.675 --> 0:34:04.946
1092
+ It's not a lot of new information in the next
1093
+ word, but in other cases like blue there's
1094
+
1095
+ 0:34:04.946 --> 0:34:06.496
1096
+ a lot of information.
1097
+
1098
+ 0:34:11.291 --> 0:34:14.849
1099
+ Another example is this Berkeley restaurant
1100
+ sentences.
1101
+
1102
+ 0:34:14.849 --> 0:34:21.059
1103
+ It's collected at Berkeley and you have sentences
1104
+ like can you tell me about any good spaghetti
1105
+
1106
+ 0:34:21.059 --> 0:34:21.835
1107
+ restaurant.
1108
+
1109
+ 0:34:21.835 --> 0:34:27.463
1110
+ Mid-priced Thai food is what I'm looking for, so
1111
+ it's more like a dialogue system and people
1112
+
1113
+ 0:34:27.463 --> 0:34:31.215
1114
+ have collected this data and of course you
1115
+ can also look.
1116
+
1117
+ 0:34:31.551 --> 0:34:46.878
1118
+ Into this and get the counts, so you count
1119
+ the vibrants in the top, so the color is the.
1120
+
1121
+ 0:34:49.409 --> 0:34:52.912
1122
+ This is a bigram which is the first word of
1123
+ West.
1124
+
1125
+ 0:34:52.912 --> 0:34:54.524
1126
+ This one fuzzy is one.
1127
+
1128
+ 0:34:56.576 --> 0:35:12.160
1129
+ One because want to hyperability, but want
1130
+ a lot less, and there where you see it, for
1131
+
1132
+ 0:35:12.160 --> 0:35:17.004
1133
+ example: So here you see after I want.
1134
+
1135
+ 0:35:17.004 --> 0:35:23.064
1136
+ It's very often for I eat, but an island which
1137
+ is not just.
1138
+
1139
+ 0:35:27.347 --> 0:35:39.267
1140
+ The absolute counts of how often each word
1141
+ occurs, and then you can see here the probabilities
1142
+
1143
+ 0:35:39.267 --> 0:35:40.145
1144
+ again.
1145
+
1146
+ 0:35:42.422 --> 0:35:54.519
1147
+ Then do that if you want to do iwan Dutch
1148
+ food you get the sequence you have to multiply
1149
+
1150
+ 0:35:54.519 --> 0:35:55.471
1151
+ olive.
1152
+
1153
+ 0:35:55.635 --> 0:36:00.281
1154
+ And then you of course get a bit of interesting
1155
+ experience on that.
1156
+
1157
+ 0:36:00.281 --> 0:36:04.726
1158
+ For example: Information is there.
1159
+
1160
+ 0:36:04.726 --> 0:36:15.876
1161
+ So, for example, if you compare I want Dutch
1162
+ or I want Chinese, it seems that.
1163
+
1164
+ 0:36:16.176 --> 0:36:22.910
1165
+ That the sentence often starts with eye.
1166
+
1167
+ 0:36:22.910 --> 0:36:31.615
1168
+ You have it after two is possible, but after
1169
+ one it.
1170
+
1171
+ 0:36:31.731 --> 0:36:39.724
1172
+ And you cannot say want, but you have to say
1173
+ want to spend, so there's grammical information.
1174
+
1175
+ 0:36:40.000 --> 0:36:51.032
1176
+ To main information and source: Here before
1177
+ we're going into measuring quality, is there
1178
+
1179
+ 0:36:51.032 --> 0:36:58.297
1180
+ any questions about language model and the
1181
+ idea of modeling?
1182
+
1183
+ 0:37:02.702 --> 0:37:13.501
1184
+ Hope that doesn't mean everybody sleeping,
1185
+ and so when we're doing the training these
1186
+
1187
+ 0:37:13.501 --> 0:37:15.761
1188
+ language models,.
1189
+
1190
+ 0:37:16.356 --> 0:37:26.429
1191
+ You need to model what is the engrum length
1192
+ should we use a trigram or a forkrum.
1193
+
1194
+ 0:37:27.007 --> 0:37:34.040
1195
+ So in order to decide how can you now decide
1196
+ which of the two models are better?
1197
+
1198
+ 0:37:34.914 --> 0:37:40.702
1199
+ And if you would have to do that, how would
1200
+ you decide taking language model or taking
1201
+
1202
+ 0:37:40.702 --> 0:37:41.367
1203
+ language?
1204
+
1205
+ 0:37:43.263 --> 0:37:53.484
1206
+ I take some test text and see which model
1207
+ assigns a higher probability to me.
1208
+
1209
+ 0:37:54.354 --> 0:38:03.978
1210
+ It's very good, so that's even the second
1211
+ thing, so the first thing maybe would have
1212
+
1213
+ 0:38:03.978 --> 0:38:04.657
1214
+ been.
1215
+
1216
+ 0:38:05.925 --> 0:38:12.300
1217
+ The problem is the and then you take the language
1218
+ language language and machine translation.
1219
+
1220
+ 0:38:13.193 --> 0:38:18.773
1221
+ Problems: First of all you have to build a
1222
+ whole system which is very time consuming and
1223
+
1224
+ 0:38:18.773 --> 0:38:21.407
1225
+ it might not only depend on the language.
1226
+
1227
+ 0:38:21.407 --> 0:38:24.730
1228
+ On the other hand, that's of course what the
1229
+ end is.
1230
+
1231
+ 0:38:24.730 --> 0:38:30.373
1232
+ The end want and the pressure will model each
1233
+ component individually or do you want to do
1234
+
1235
+ 0:38:30.373 --> 0:38:31.313
1236
+ an end to end.
1237
+
1238
+ 0:38:31.771 --> 0:38:35.463
1239
+ What can also happen is you'll see your metric
1240
+ model.
1241
+
1242
+ 0:38:35.463 --> 0:38:41.412
1243
+ This is a very good language model, but it
1244
+ somewhat doesn't really work well with your
1245
+
1246
+ 0:38:41.412 --> 0:38:42.711
1247
+ translation model.
1248
+
1249
+ 0:38:43.803 --> 0:38:49.523
1250
+ But of course it's very good to also have
1251
+ this type of intrinsic evaluation where the
1252
+
1253
+ 0:38:49.523 --> 0:38:52.116
1254
+ assumption should be as a pointed out.
1255
+
1256
+ 0:38:52.116 --> 0:38:57.503
1257
+ If we have Good English it shouldn't be a
1258
+ high probability and it's bad English.
1259
+
1260
+ 0:38:58.318 --> 0:39:07.594
1261
+ And this is measured by the take a held out
1262
+ data set, so some data which you don't train
1263
+
1264
+ 0:39:07.594 --> 0:39:12.596
1265
+ on then calculate the probability of this data.
1266
+
1267
+ 0:39:12.912 --> 0:39:26.374
1268
+ Then you're just looking at the language model
1269
+ and you take the language model.
1270
+
1271
+ 0:39:27.727 --> 0:39:33.595
1272
+ You're not directly using the probability,
1273
+ but you're taking the perplexity.
1274
+
1275
+ 0:39:33.595 --> 0:39:40.454
1276
+ The perplexity is two to the power of the
1277
+ cross entropy, and you see in the cross entropy
1278
+
1279
+ 0:39:40.454 --> 0:39:46.322
1280
+ you're doing something like an average probability
1281
+ of always coming to this.
1282
+
1283
+ 0:39:46.846 --> 0:39:54.721
1284
+ Not so how exactly is that define perplexity
1285
+ is typically what people refer to all across.
1286
+
1287
+ 0:39:54.894 --> 0:40:02.328
1288
+ The cross edge is negative and average, and
1289
+ then you have the lock of the probability of
1290
+
1291
+ 0:40:02.328 --> 0:40:03.246
1292
+ the whole.
1293
+
1294
+ 0:40:04.584 --> 0:40:10.609
1295
+ We are modeling this probability as the product
1296
+ of each of the words.
1297
+
1298
+ 0:40:10.609 --> 0:40:18.613
1299
+ That's how the end gram was defined and now
1300
+ you hopefully can remember the rules of logarism
1301
+
1302
+ 0:40:18.613 --> 0:40:23.089
1303
+ so you can get the probability within the logarism.
1304
+
1305
+ 0:40:23.063 --> 0:40:31.036
1306
+ The sum here: so the cross entropy is minus one
1307
+ over N, and the sum over all your words
1308
+
1309
+ 0:40:31.036 --> 0:40:35.566
1310
+ of the logarithm of the probability of each
1311
+ word.
1312
+
1313
+ 0:40:36.176 --> 0:40:39.418
1314
+ And then the perplexity is just like two to
1315
+ the power.
1316
+
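A small Python sketch of the computation just defined, with made-up per-word probabilities: the cross entropy is the negative average log2 probability, and the perplexity is two to that power:

    import math

    word_probs = [0.2, 0.1, 0.05, 0.3]  # P(w_i | history) for each word of a test text
    cross_entropy = -sum(math.log2(p) for p in word_probs) / len(word_probs)
    perplexity = 2 ** cross_entropy
    print(cross_entropy, perplexity)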
1317
+ 0:40:41.201 --> 0:40:44.706
1318
+ Why can this be interpreted as a branching
1319
+ factor?
1320
+
1321
+ 0:40:44.706 --> 0:40:50.479
1322
+ So it gives you a bit like the average thing,
1323
+ like how many possibilities you have.
1324
+
1325
+ 0:40:51.071 --> 0:41:02.249
1326
+ You have a digit task and you have no idea,
1327
+ but the probability of the next digit is like
1328
+
1329
+ 0:41:02.249 --> 0:41:03.367
1330
+ one tenth.
1331
+
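As a quick check of this digit example (my arithmetic): with ten equally likely digits each has probability 1/10, so the cross entropy is -log2(1/10), roughly 3.32 bits, and the perplexity is 2^3.32 = 10, exactly the number of choices.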
1332
+ 0:41:03.783 --> 0:41:09.354
1333
+ And if you then take a later perplexity, it
1334
+ will be exactly ten.
1335
+
1336
+ 0:41:09.849 --> 0:41:24.191
1337
+ And that is like this perplexity gives you
1338
+ a million interpretations, so how much randomness
1339
+
1340
+ 0:41:24.191 --> 0:41:27.121
1341
+ is still in there?
1342
+
1343
+ 0:41:27.307 --> 0:41:32.433
1344
+ Of course, now it's good to have a lower perplexity.
1345
+
1346
+ 0:41:32.433 --> 0:41:36.012
1347
+ We have less ambiguity in there and.
1348
+
1349
+ 0:41:35.976 --> 0:41:48.127
1350
+ If you have a hundred words and you only have
1351
+ to uniformly compare it to ten different, so
1352
+
1353
+ 0:41:48.127 --> 0:41:49.462
1354
+ you have.
1355
+
1356
+ 0:41:49.609 --> 0:41:53.255
1357
+ Yes, think so it should be.
1358
+
1359
+ 0:41:53.255 --> 0:42:03.673
1360
+ You had here logarism and then to the power
1361
+ and that should then be eliminated.
1362
+
1363
+ 0:42:03.743 --> 0:42:22.155
1364
+ So which logarithm you use is not that important
1365
+ because it's a constant factor to reformulate.
1366
+
1367
+ 0:42:23.403 --> 0:42:28.462
1368
+ Yes and Yeah So the Best.
1369
+
1370
+ 0:42:31.931 --> 0:42:50.263
1371
+ The best model is always like you want to
1372
+ have a high probability.
1373
+
1374
+ 0:42:51.811 --> 0:43:04.549
1375
+ Time you see here, so here the probabilities
1376
+ would like to commend the rapporteur on his
1377
+
1378
+ 0:43:04.549 --> 0:43:05.408
1379
+ work.
1380
+
1381
+ 0:43:05.285 --> 0:43:14.116
1382
+ You have then locked two probabilities and
1383
+ then the average, so this is not the perplexity
1384
+
1385
+ 0:43:14.116 --> 0:43:18.095
1386
+ but the cross entropy as mentioned here.
1387
+
1388
+ 0:43:18.318 --> 0:43:26.651
1389
+ And then due to the power of that we'll give
1390
+ you the perplexity of the center.
1391
+
1392
+ 0:43:29.329 --> 0:43:40.967
1393
+ And these metrics of perplexity are essential
1394
+ in modeling that and we'll also see nowadays.
1395
+
1396
+ 0:43:41.121 --> 0:43:47.898
1397
+ You also measure like quality often in perplexity
1398
+ or cross entropy, which gives you how good
1399
+
1400
+ 0:43:47.898 --> 0:43:50.062
1401
+ is it in estimating the same.
1402
+
1403
+ 0:43:50.010 --> 0:43:53.647
1404
+ The better the model is, the more information
1405
+ you have about this.
1406
+
1407
+ 0:43:55.795 --> 0:44:03.106
1408
+ Talked about isomic ability or quit sentences,
1409
+ but don't most have to any much because.
1410
+
1411
+ 0:44:03.463 --> 0:44:12.512
1412
+ You are doing that in this way implicitly
1413
+ because of the correct word.
1414
+
1415
+ 0:44:12.512 --> 0:44:19.266
1416
+ If you are modeling this one, the sun over
1417
+ all next.
1418
+
1419
+ 0:44:20.020 --> 0:44:29.409
1420
+ Therefore, you have that implicitly in there
1421
+ because in each position you're modeling the
1422
+
1423
+ 0:44:29.409 --> 0:44:32.957
1424
+ probability of this witch behind.
1425
+
1426
+ 0:44:35.515 --> 0:44:43.811
1427
+ You have a very large number of negative examples
1428
+ because all the possible extensions which are
1429
+
1430
+ 0:44:43.811 --> 0:44:49.515
1431
+ not there are incorrect, which of course might
1432
+ also be a problem.
1433
+
1434
+ 0:44:52.312 --> 0:45:00.256
1435
+ And the biggest challenge of these types of
1436
+ models is how to model unseen events.
1437
+
1438
+ 0:45:00.840 --> 0:45:04.973
1439
+ So that can be unknown words or it can be
1440
+ unknown vibrants.
1441
+
1442
+ 0:45:05.245 --> 0:45:10.096
1443
+ So that's important also like you've seen
1444
+ all the words.
1445
+
1446
+ 0:45:10.096 --> 0:45:17.756
1447
+ But if you have a bigram language model, if
1448
+ you haven't seen the bigram, you'll still get
1449
+
1450
+ 0:45:17.756 --> 0:45:23.628
1451
+ a zero probability because we know that the
1452
+ bigram's divided by the.
1453
+
1454
+ 0:45:24.644 --> 0:45:35.299
1455
+ If you have unknown words, the problem gets
1456
+ even bigger because one word typically causes
1457
+
1458
+ 0:45:35.299 --> 0:45:37.075
1459
+ a lot of zero.
1460
+
1461
+ 0:45:37.217 --> 0:45:41.038
1462
+ So if you, for example, if your vocabulary
1463
+ is go to and care it,.
1464
+
1465
+ 0:45:41.341 --> 0:45:43.467
1466
+ And you have not a sentence.
1467
+
1468
+ 0:45:43.467 --> 0:45:47.941
1469
+ I want to pay a T, so you have one word, which
1470
+ is here 'an'.
1471
+
1472
+ 0:45:47.887 --> 0:45:54.354
1473
+ It is unknow then you have the proper.
1474
+
1475
+ 0:45:54.354 --> 0:46:02.147
1476
+ It is I get a sentence star and sentence star.
1477
+
1478
+ 0:46:02.582 --> 0:46:09.850
1479
+ To model this probability you always have
1480
+ to take the account from these sequences divided
1481
+
1482
+ 0:46:09.850 --> 0:46:19.145
1483
+ by: Since when does it occur, all of these
1484
+ angrams can also occur because of the word
1485
+
1486
+ 0:46:19.145 --> 0:46:19.961
1487
+ middle.
1488
+
1489
+ 0:46:20.260 --> 0:46:27.800
1490
+ So all of these probabilities are directly
1491
+ zero.
1492
+
1493
+ 0:46:27.800 --> 0:46:33.647
1494
+ You see that just by having a single.
1495
+
1496
+ 0:46:34.254 --> 0:46:47.968
1497
+ Tells you it might not always be better to
1498
+ have larger grams because if you have a gram
1499
+
1500
+ 0:46:47.968 --> 0:46:50.306
1501
+ language more.
1502
+
1503
+ 0:46:50.730 --> 0:46:57.870
1504
+ So sometimes it's better to have a smaller
1505
+ angram counter because the chances that you're
1506
+
1507
+ 0:46:57.870 --> 0:47:00.170
1508
+ seeing the angram is higher.
1509
+
1510
+ 0:47:00.170 --> 0:47:07.310
1511
+ On the other hand, you want to have a larger
1512
+ account because the larger the count is, the
1513
+
1514
+ 0:47:07.310 --> 0:47:09.849
1515
+ longer the context is modeling.
1516
+
1517
+ 0:47:10.670 --> 0:47:17.565
1518
+ So how can we address this type of problem?
1519
+
1520
+ 0:47:17.565 --> 0:47:28.064
1521
+ We address this type of problem by somehow
1522
+ adjusting our counts.
1523
+
1524
+ 0:47:29.749 --> 0:47:40.482
1525
+ We have often, but most of your entries in
1526
+ the table are zero, and if one of these n-grams
1527
+
1528
+ 0:47:40.482 --> 0:47:45.082
1529
+ occurs you'll have a zero probability.
1530
+
1531
+ 0:47:46.806 --> 0:48:06.999
1532
+ So therefore we need to find some of our ways
1533
+ in order to estimate this type of event because:
1534
+
1535
+ 0:48:07.427 --> 0:48:11.619
1536
+ So there are different ways of how to model
1537
+ it and how to adjust it.
1538
+
1539
+ 0:48:11.619 --> 0:48:15.326
1540
+ The one idea here is to do smoothing and that's
1541
+ the first thing.
1542
+
1543
+ 0:48:15.326 --> 0:48:20.734
1544
+ So in smoothing you're saying okay, we take
1545
+ a bit of the probability mass of our seen
1546
+
1547
+ 0:48:20.734 --> 0:48:23.893
1548
+ events and distribute this thing we're taking
1549
+ away.
1550
+
1551
+ 0:48:23.893 --> 0:48:26.567
1552
+ We're distributing to all the other events.
1553
+
1554
+ 0:48:26.946 --> 0:48:33.927
1555
+ The nice thing is in this case oh now each
1556
+ event has a non zero probability and that is
1557
+
1558
+ 0:48:33.927 --> 0:48:39.718
1559
+ of course very helpful because we don't have
1560
+ zero probabilities anymore.
1561
+
1562
+ 0:48:40.180 --> 0:48:48.422
1563
+ It smoothed out, but at least you have some
1564
+ kind of probability everywhere, so you take
1565
+
1566
+ 0:48:48.422 --> 0:48:50.764
1567
+ some of the probability.
1568
+
1569
+ 0:48:53.053 --> 0:49:05.465
1570
+ You can also do that more here when you have
1571
+ the endgram, for example, and this is your
1572
+
1573
+ 0:49:05.465 --> 0:49:08.709
1574
+ original distribution.
1575
+
1576
+ 0:49:08.648 --> 0:49:15.463
1577
+ Then you are taking some mass away from here
1578
+ and distributing this mass to all the other
1579
+
1580
+ 0:49:15.463 --> 0:49:17.453
1581
+ words that you have seen.
1582
+
1583
+ 0:49:18.638 --> 0:49:26.797
1584
+ And thereby you are now making sure that it's
1585
+ yeah, that it's now possible to model that.
1586
+
1587
+ 0:49:28.828 --> 0:49:36.163
1588
+ The other idea we're coming into more detail
1589
+ on how we can do this type of smoking, but
1590
+
1591
+ 0:49:36.163 --> 0:49:41.164
1592
+ one other idea you can do is to do some type
1593
+ of clustering.
1594
+
1595
+ 0:49:41.501 --> 0:49:48.486
1596
+ And that means if we are can't model go Kit's,
1597
+ for example because we haven't seen that.
1598
+
1599
+ 0:49:49.349 --> 0:49:56.128
1600
+ Then we're just looking at the full thing
1601
+ and we're just going to live directly how probable.
1602
+
1603
+ 0:49:56.156 --> 0:49:58.162
1604
+ Go two ways or so.
1605
+
1606
+ 0:49:58.162 --> 0:50:09.040
1607
+ Then we are modeling just only the word interpolation
1608
+ where you're interpolating all the probabilities
1609
+
1610
+ 0:50:09.040 --> 0:50:10.836
1611
+ and thereby can.
1612
+
1613
+ 0:50:11.111 --> 0:50:16.355
1614
+ These are the two things which are helpful
1615
+ in order to better calculate all these types.
1616
+
1617
+ 0:50:19.499 --> 0:50:28.404
1618
+ Let's start with what counts news so the idea
1619
+ is okay.
1620
+
1621
+ 0:50:28.404 --> 0:50:38.119
1622
+ We have not seen an event and then the probability
1623
+ is zero.
1624
+
1625
+ 0:50:38.618 --> 0:50:50.902
1626
+ It's not that high, but you should always
1627
+ be aware that there might be new things happening
1628
+
1629
+ 0:50:50.902 --> 0:50:55.308
1630
+ and somehow be able to estimate.
1631
+
1632
+ 0:50:56.276 --> 0:50:59.914
1633
+ So the idea is okay.
1634
+
1635
+ 0:50:59.914 --> 0:51:09.442
1636
+ We can also assign a positive probability
1637
+ to a higher.
1638
+
1639
+ 0:51:10.590 --> 0:51:23.233
1640
+ We are changing so currently we worked on
1641
+ imperial accounts so how often we have seen
1642
+
1643
+ 0:51:23.233 --> 0:51:25.292
1644
+ the accounts.
1645
+
1646
+ 0:51:25.745 --> 0:51:37.174
1647
+ And now we are going on to expect account
1648
+ how often this would occur in an unseen.
1649
+
1650
+ 0:51:37.517 --> 0:51:39.282
1651
+ So we are directly trying to model that.
1652
+
1653
+ 0:51:39.859 --> 0:51:45.836
1654
+ Of course, the empirical accounts are a good
1655
+ starting point, so if you've seen the world
1656
+
1657
+ 0:51:45.836 --> 0:51:51.880
1658
+ very often in your training data, it's a good
1659
+ estimation of how often you would see it in
1660
+
1661
+ 0:51:51.880 --> 0:51:52.685
1662
+ the future.
1663
+
1664
+ 0:51:52.685 --> 0:51:58.125
1665
+ However, it might make sense to think about
1666
+ it only because you haven't seen it.
1667
+
1668
+ 0:51:58.578 --> 0:52:10.742
1669
+ So does anybody have a very simple idea how
1670
+ you start with smoothing it?
1671
+
1672
+ 0:52:10.742 --> 0:52:15.241
1673
+ What count would you give?
1674
+
1675
+ 0:52:21.281 --> 0:52:32.279
1676
+ Now you have the probability to calculation
1677
+ how often have you seen the biogram with zero
1678
+
1679
+ 0:52:32.279 --> 0:52:33.135
1680
+ count.
1681
+
1682
+ 0:52:33.193 --> 0:52:39.209
1683
+ So what count would you give in order to still
1684
+ do this calculation?
1685
+
1686
+ 0:52:39.209 --> 0:52:41.509
1687
+ We have to smooth, so we.
1688
+
1689
+ 0:52:44.884 --> 0:52:52.151
1690
+ We could clump together all the rare words,
1691
+ for example everywhere we have only seen ones.
1692
+
1693
+ 0:52:52.652 --> 0:52:56.904
1694
+ And then just we can do the massive moment
1695
+ of those and don't.
1696
+
1697
+ 0:52:56.936 --> 0:53:00.085
1698
+ So remove the real ones.
1699
+
1700
+ 0:53:00.085 --> 0:53:06.130
1701
+ Yes, and then every unseen word is one of
1702
+ them.
1703
+
1704
+ 0:53:06.130 --> 0:53:13.939
1705
+ Yeah, but it's not only about unseen words,
1706
+ it's even unseen.
1707
+
1708
+ 0:53:14.874 --> 0:53:20.180
1709
+ You can even start easier and that's what
1710
+ people do at the first thing.
1711
+
1712
+ 0:53:20.180 --> 0:53:22.243
1713
+ That's at one smooth thing.
1714
+
1715
+ 0:53:22.243 --> 0:53:28.580
1716
+ You'll see it's not working good but the variation
1717
+ works fine and we're just as here.
1718
+
1719
+ 0:53:28.580 --> 0:53:30.644
1720
+ We've seen everything once.
1721
+
1722
+ 0:53:31.771 --> 0:53:39.896
1723
+ That's similar to this because you're clustering
1724
+ the one and the zero together and you just
1725
+
1726
+ 0:53:39.896 --> 0:53:45.814
1727
+ say you've seen everything once or have seen
1728
+ them twice and so on.
1729
+
1730
+ 0:53:46.386 --> 0:53:53.249
1731
+ And if you've done that wow, there's no probability
1732
+ because each event has happened once.
1733
+
1734
+ 0:53:55.795 --> 0:54:02.395
1735
+ If you otherwise have seen the bigram five
1736
+ times, you would not now do five times but
1737
+
1738
+ 0:54:02.395 --> 0:54:03.239
1739
+ six times.
1740
+
1741
+ 0:54:03.363 --> 0:54:09.117
1742
+ So the nice thing is to have seen everything.
1743
+
1744
+ 0:54:09.117 --> 0:54:19.124
1745
+ Once the probability of the engrap is now
1746
+ out, you have seen it divided by the.
1747
+
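A hedged Python sketch of the add-one (Laplace) idea described here, reusing bigram and unigram counters like the ones sketched earlier; vocab_size, the number of distinct word types, is an assumption of this sketch:

    def p_add_one(word, prev, bigrams, unigrams, vocab_size):
        # pretend every possible continuation of prev was seen once more
        return (bigrams[(prev, word)] + 1) / (unigrams[prev] + vocab_size)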
1748
+ 0:54:20.780 --> 0:54:23.763
1749
+ How long ago there's one big big problem with
1750
+ it?
1751
+
1752
+ 0:54:24.064 --> 0:54:38.509
1753
+ Just imagine that you have a vocabulary of
1754
+ words, and you have a corpus of thirty million
1755
+
1756
+ 0:54:38.509 --> 0:54:39.954
1757
+ bigrams.
1758
+
1759
+ 0:54:39.954 --> 0:54:42.843
1760
+ So if you have a.
1761
+
1762
+ 0:54:43.543 --> 0:54:46.580
1763
+ Simple Things So You've Seen Them Thirty Million
1764
+ Times.
1765
+
1766
+ 0:54:47.247 --> 0:54:49.818
1767
+ That is your count, your distributing.
1768
+
1769
+ 0:54:49.818 --> 0:54:55.225
1770
+ According to your gain, the problem is yet
1771
+ how many possible bigrams do you have?
1772
+
1773
+ 0:54:55.225 --> 0:55:00.895
1774
+ You have seven point five billion possible
1775
+ bigrams, and each of them you are counting
1776
+
1777
+ 0:55:00.895 --> 0:55:04.785
1778
+ now as give up your ability, like you give
1779
+ account of one.
1780
+
1781
+ 0:55:04.785 --> 0:55:07.092
1782
+ So each of them is saying a curse.
1783
+
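To make the mismatch concrete (my arithmetic; only the 7.5 billion and 30 million figures come from the lecture): with V word types there are V squared possible bigrams, and V of roughly 87,000 gives about 7.5 billion, so the one-count-per-bigram smoothing mass dwarfs the 30 million real counts.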
1784
+ 0:55:07.627 --> 0:55:16.697
1785
+ Then this number of possible vigrams is many
1786
+ times larger than the number you really see.
1787
+
1788
+ 0:55:17.537 --> 0:55:21.151
1789
+ You're mainly doing equal distribution.
1790
+
1791
+ 0:55:21.151 --> 0:55:26.753
1792
+ Everything gets the same because this is much
1793
+ more important.
1794
+
1795
+ 0:55:26.753 --> 0:55:31.541
1796
+ Most of your probability mass is used for
1797
+ smoothing.
1798
+
1799
+ 0:55:32.412 --> 0:55:37.493
1800
+ Because most of the probability miles have
1801
+ to be distributed that you at least give every
1802
+
1803
+ 0:55:37.493 --> 0:55:42.687
1804
+ biogram at least a count of one, and the other
1805
+ counts are only the thirty million, so seven
1806
+
1807
+ 0:55:42.687 --> 0:55:48.219
1808
+ point five billion counts go to like a distribute
1809
+ around all the engrons, and only thirty million
1810
+
1811
+ 0:55:48.219 --> 0:55:50.026
1812
+ are according to your frequent.
1813
+
1814
+ 0:55:50.210 --> 0:56:02.406
1815
+ So you put a lot too much mass on your smoothing
1816
+ and you're doing some kind of extreme smoothing.
1817
+
1818
+ 0:56:02.742 --> 0:56:08.986
1819
+ So that of course is a bit bad then and will
1820
+ give you not the best performance.
1821
+
1822
+ 0:56:10.130 --> 0:56:16.160
1823
+ However, there's a nice thing and that means
1824
+ to do probability calculations.
1825
+
1826
+ 0:56:16.160 --> 0:56:21.800
1827
+ We are doing it based on counts, but to do
1828
+ this division we don't need.
1829
+
1830
+ 0:56:22.302 --> 0:56:32.112
1831
+ So we can also do that with floating point
1832
+ values and there is still a valid type of calculation.
1833
+
1834
+ 0:56:32.392 --> 0:56:39.380
1835
+ So we can have less probability mass to unseen
1836
+ events.
1837
+
1838
+ 0:56:39.380 --> 0:56:45.352
1839
+ We don't have to give one because if we count.
1840
+
1841
+ 0:56:45.785 --> 0:56:50.976
1842
+ But to do our calculation we can also give
1843
+ zero point zero to something like that, so
1844
+
1845
+ 0:56:50.976 --> 0:56:56.167
1846
+ very small value, and thereby we have less
1847
+ value on the smooth thing, and we are more
1848
+
1849
+ 0:56:56.167 --> 0:56:58.038
1850
+ focusing on the actual corpus.
1851
+
1852
+ 0:56:58.758 --> 0:57:03.045
1853
+ And that is what people refer to as Alpha
1854
+ smoothing.
1855
+
1856
+ 0:57:03.223 --> 0:57:12.032
1857
+ You see that we are now adding not one to
1858
+ it but only alpha, and then we are giving less
1859
+
1860
+ 0:57:12.032 --> 0:57:19.258
1861
+ probability to the unseen event and more probability
1862
+ to the really seen.
1863
+
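The same sketch with a small alpha instead of one, as described here; an alpha below one keeps most of the probability mass on events that were actually observed (the default value is only an example):

    def p_add_alpha(word, prev, bigrams, unigrams, vocab_size, alpha=0.02):
        return (bigrams[(prev, word)] + alpha) / (unigrams[prev] + alpha * vocab_size)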
1864
+ 0:57:20.780 --> 0:57:24.713
1865
+ Questions: Of course, how do you find see
1866
+ also?
1867
+
1868
+ 0:57:24.713 --> 0:57:29.711
1869
+ I'm here to either use some help out data
1870
+ and optimize them.
1871
+
1872
+ 0:57:30.951 --> 0:57:35.153
1873
+ So what what does it now really mean?
1874
+
1875
+ 0:57:35.153 --> 0:57:40.130
1876
+ This gives you a bit of an idea behind that.
1877
+
1878
+ 0:57:40.700 --> 0:57:57.751
1879
+ So here you have the grams which occur one
1880
+ time, for example all grams which occur one.
1881
+
1882
+ 0:57:57.978 --> 0:58:10.890
1883
+ So, for example, that means that if you have
1884
+ engrams which occur one time, then.
1885
+
1886
+ 0:58:11.371 --> 0:58:22.896
1887
+ If you look at all the engrams which occur
1888
+ two times, then they occur.
1889
+
1890
+ 0:58:22.896 --> 0:58:31.013
1891
+ If you look at the engrams that occur zero,
1892
+ then.
1893
+
1894
+ 0:58:32.832 --> 0:58:46.511
1895
+ So if you are now doing the smoothing you
1896
+ can look what is the probability estimating
1897
+
1898
+ 0:58:46.511 --> 0:58:47.466
1899
+ them.
1900
+
1901
+ 0:58:47.847 --> 0:59:00.963
1902
+ You see that for all the endbreaks you heavily
1903
+ underestimate how often they occur in the test
1904
+
1905
+ 0:59:00.963 --> 0:59:01.801
1906
+ card.
1907
+
1908
+ 0:59:02.002 --> 0:59:10.067
1909
+ So what you want is very good to estimate
1910
+ this distribution, so for each Enron estimate
1911
+
1912
+ 0:59:10.067 --> 0:59:12.083
1913
+ quite well how often.
1914
+
1915
+ 0:59:12.632 --> 0:59:16.029
1916
+ You're quite bad at that for all of them.
1917
+
1918
+ 0:59:16.029 --> 0:59:22.500
1919
+ You're apparently underestimating only for
1920
+ the top ones which you haven't seen.
1921
+
1922
+ 0:59:22.500 --> 0:59:24.845
1923
+ You'll heavily overestimate.
1924
+
1925
+ 0:59:25.645 --> 0:59:30.887
1926
+ If you're doing alpha smoothing and optimize
1927
+ that to fit on the zero count because that's
1928
+
1929
+ 0:59:30.887 --> 0:59:36.361
1930
+ not completely fair because this alpha is now
1931
+ optimizes the test counter, you see that you're
1932
+
1933
+ 0:59:36.361 --> 0:59:37.526
1934
+ doing a lot better.
1935
+
1936
+ 0:59:37.526 --> 0:59:42.360
1937
+ It's not perfect, but you're a lot better
1938
+ in estimating how often they will occur.
1939
+
1940
+ 0:59:45.545 --> 0:59:49.316
1941
+ So this is one idea of doing it.
1942
+
1943
+ 0:59:49.316 --> 0:59:57.771
1944
+ Of course there's other ways and this is like
1945
+ a large research direction.
1946
+
1947
+ 0:59:58.318 --> 1:00:03.287
1948
+ So there is this needed estimation.
1949
+
1950
+ 1:00:03.287 --> 1:00:11.569
1951
+ What you are doing is filling your trading
1952
+ data into parts.
1953
+
1954
+ 1:00:11.972 --> 1:00:19.547
1955
+ Looking at how many engrams occur exactly
1956
+ are types, which engrams occur are times in
1957
+
1958
+ 1:00:19.547 --> 1:00:20.868
1959
+ your training.
1960
+
1961
+ 1:00:21.281 --> 1:00:27.716
1962
+ And then you look for these ones.
1963
+
1964
+ 1:00:27.716 --> 1:00:36.611
1965
+ How often do they occur in your training data?
1966
+
1967
+ 1:00:36.611 --> 1:00:37.746
1968
+ It's.
1969
+
1970
+ 1:00:38.118 --> 1:00:45.214
1971
+ And then you say oh this engram, the expector
1972
+ counts how often will see.
1973
+
1974
+ 1:00:45.214 --> 1:00:56.020
1975
+ It is divided by: Some type of clustering
1976
+ you're putting all the engrams which occur
1977
+
1978
+ 1:00:56.020 --> 1:01:04.341
1979
+ are at times in your data together and in order
1980
+ to estimate how often.
1981
+
1982
+ 1:01:05.185 --> 1:01:12.489
1983
+ And if you do half your data related to your
1984
+ final estimation by just using those statistics,.
1985
+
1986
+ 1:01:14.014 --> 1:01:25.210
1987
+ So this is called added estimation, and thereby
1988
+ you are not able to estimate better how often
1989
+
1990
+ 1:01:25.210 --> 1:01:25.924
1991
+ does.
1992
+
1993
+ 1:01:28.368 --> 1:01:34.559
1994
+ And again we can do the same look and compare
1995
+ it to the expected counts.
1996
+
1997
+ 1:01:34.559 --> 1:01:37.782
1998
+ Again we have exactly the same table.
1999
+
2000
+ 1:01:38.398 --> 1:01:47.611
2001
+ So then we're having to hear how many engrams
2002
+ that does exist.
2003
+
2004
+ 1:01:47.611 --> 1:01:55.361
2005
+ So, for example, there's like engrams which
2006
+ you can.
2007
+
2008
+ 1:01:55.835 --> 1:02:08.583
2009
+ Then you look into your other half and how
2010
+ often do these N grams occur in your 2nd part
2011
+
2012
+ 1:02:08.583 --> 1:02:11.734
2013
+ of the training data?
2014
+
2015
+ 1:02:12.012 --> 1:02:22.558
2016
+ For example, an unseen N gram I expect to
2017
+ occur, an engram which occurs one time.
2018
+
2019
+ 1:02:22.558 --> 1:02:25.774
2020
+ I expect that it occurs.
2021
+
2022
+ 1:02:27.527 --> 1:02:42.564
2023
+ Yeah, the number of zero counts are if take
2024
+ my one grams and then just calculate how many
2025
+
2026
+ 1:02:42.564 --> 1:02:45.572
2027
+ possible bigrams.
2028
+
2029
+ 1:02:45.525 --> 1:02:50.729
2030
+ Yes, so in this case we are now not assuming
2031
+ about having a more larger cattle because then,
2032
+
2033
+ 1:02:50.729 --> 1:02:52.127
2034
+ of course, it's getting.
2035
+
2036
+ 1:02:52.272 --> 1:02:54.730
2037
+ So you're doing that given the current gram.
2038
+
2039
+ 1:02:54.730 --> 1:03:06.057
2040
+ The cavalry is better to: So yeah, there's
2041
+ another problem in how to deal with them.
2042
+
2043
+ 1:03:06.057 --> 1:03:11.150
2044
+ This is more about how to smuse the engram
2045
+ counts to also deal.
2046
+
2047
+ 1:03:14.394 --> 1:03:18.329
2048
+ Certainly as I Think The.
2049
+
2050
+ 1:03:18.198 --> 1:03:25.197
2051
+ Yes, the last idea of doing this is so called
2052
+ Good-Turing, and the idea here is
2053
+ similar, so there is a typical mathematical proof,
2054
+ 1:03:25.197 --> 1:03:32.747
2055
+ similar, so there is a typical mathematic approve,
2056
+ but you can show that a very good estimation
2057
+
2058
+ 1:03:32.747 --> 1:03:34.713
2059
+ for the expected counts.
2060
+
2061
+ 1:03:34.654 --> 1:03:42.339
2062
+ Is that you take the number of n-grams which
2063
+ occur one time more, divided by the number of
2064
+ n-grams which occur R times, and times R plus one.
2065
+ 1:03:42.339 --> 1:03:46.011
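A minimal Python sketch of the Good-Turing adjusted count r* = (r + 1) * N(r+1) / N(r), where N(r) is the number of distinct n-grams seen exactly r times; as noted just below, in practice this is only applied for small r, with curve fitting for larger r, which this sketch ignores:

    def good_turing_count(r, count_of_counts):
        n_r = count_of_counts.get(r, 0)
        n_r_next = count_of_counts.get(r + 1, 0)
        if n_r == 0:
            return float(r)  # no information for this count, keep the raw value
        return (r + 1) * n_r_next / n_r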
2066
+ engram which occur R times and R plus one.
2067
+
2068
+ 1:03:46.666 --> 1:03:49.263
2069
+ So this is then the estimation of.
2070
+
2071
+ 1:03:49.549 --> 1:04:05.911
2072
+ So if you are looking now at an n-gram which
2073
+ occurs r times, then you are looking at how many
2074
+
2075
+ 1:04:05.911 --> 1:04:08.608
2076
+ n-grams occur r plus one times.
2077
+
2078
+ 1:04:09.009 --> 1:04:18.938
2079
+ It's very simple, so in this one you only
2080
+ have to count all the bigrams, how many different
2081
+
2082
+ 1:04:18.938 --> 1:04:23.471
2083
+ bigrams out there, and that is very good.
2084
+
2085
+ 1:04:23.903 --> 1:04:33.137
2086
+ So if you are thinking now about n-grams which
2087
+ occur r times.
2088
+
2089
+ 1:04:33.473 --> 1:04:46.626
2090
+ It might be that there are some occurring
2091
+ r times, but none r plus one times, and then.
2092
+
2093
+ 1:04:46.866 --> 1:04:54.721
2094
+ So what you normally do is you are doing for
2095
+ small R, and for large R you do some curve
2096
+
2097
+ 1:04:54.721 --> 1:04:55.524
2098
+ fitting.
2099
+
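A minimal sketch of the Good-Turing adjusted counts just described, r* = (r + 1) * N_{r+1} / N_r, where N_r is the number of n-grams occurring exactly r times; the fallback for large or gapped r stands in for the curve fitting mentioned above:

from collections import Counter

def good_turing_adjusted(ngram_counts):
    # N_r: how many distinct n-grams occur exactly r times.
    n = Counter(ngram_counts.values())
    adjusted = {}
    for r in sorted(n):
        if n.get(r + 1, 0) > 0:
            adjusted[r] = (r + 1) * n[r + 1] / n[r]
        else:
            adjusted[r] = float(r)  # keep the raw count where N_{r+1} is missing
    return adjusted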
2100
+ 1:04:56.016 --> 1:05:07.377
2101
+ In general this type of smoothing is important
2102
+ for n-grams which occur rarely.
2103
+
2104
+ 1:05:07.377 --> 1:05:15.719
2105
+ If an n-gram occurs very often it changes little, so this is more important
2106
+ for rare events.
2107
+
2108
+ 1:05:17.717 --> 1:05:25.652
2109
+ So here again you see you have the counts
2110
+ and then based on that you get the adjusted
2111
+
2112
+ 1:05:25.652 --> 1:05:26.390
2113
+ counts.
2114
+
2115
+ 1:05:26.390 --> 1:05:34.786
2116
+ This is here and if you compare it's a test
2117
+ count you see that it really works quite well.
2118
+
2119
+ 1:05:35.035 --> 1:05:41.093
2120
+ But for the low numbers it's a very good modeling
2121
+ of how much how good this works.
2122
+
2123
+ 1:05:45.005 --> 1:05:50.018
2124
+ Then, of course, the question is how good
2125
+ does it work in language modeling?
2126
+
2127
+ 1:05:50.018 --> 1:05:51.516
2128
+ We also want tomorrow.
2129
+
2130
+ 1:05:52.372 --> 1:05:54.996
2131
+ We can measure that perplexity.
2132
+
2133
+ 1:05:54.996 --> 1:05:59.261
2134
+ We learned that before and then we have everyone's.
2135
+
2136
+ 1:05:59.579 --> 1:06:07.326
2137
+ You saw that a lot of too much probability
2138
+ mass is put to the events which have zero probability.
2139
+
2140
+ 1:06:07.667 --> 1:06:11.098
2141
+ Then you have an alpha smoothing.
2142
+
2143
+ 1:06:11.098 --> 1:06:16.042
2144
+ Here's a start because it's not completely
2145
+ fair.
2146
+
2147
+ 1:06:16.042 --> 1:06:20.281
2148
+ The alpha was maximized on the test data.
2149
+
2150
+ 1:06:20.480 --> 1:06:25.904
2151
+ But you see that the deleted estimation
2152
+ or the Good-Turing gives you a similar performance.
2153
+
2154
+ 1:06:26.226 --> 1:06:29.141
2155
+ So they seem to really work quite well.
2156
+
2157
+ 1:06:32.232 --> 1:06:41.552
2158
+ So this is about all assigning probability
2159
+ mass to n-grams, which we have not seen
2160
+
2161
+ 1:06:41.552 --> 1:06:50.657
2162
+ in order to also estimate their probability
2163
+ before we're going to the interpolation.
2164
+
2165
+ 1:06:55.635 --> 1:07:00.207
2166
+ Good, so now we have.
2167
+
2168
+ 1:07:00.080 --> 1:07:11.818
2169
+ Done this estimation, and the problem is we
2170
+ have this general.
2171
+
2172
+ 1:07:11.651 --> 1:07:19.470
2173
+ We want to have a longer context because we
2174
+ can model the language better because of
2175
+
2176
+ 1:07:19.470 --> 1:07:21.468
2177
+ long-range dependencies.
2178
+
2179
+ 1:07:21.701 --> 1:07:26.745
2180
+ On the other hand, we have limited data so
2181
+ we want to have stored angrums because they
2182
+
2183
+ 1:07:26.745 --> 1:07:28.426
2184
+ reach angrums at first more.
2185
+
2186
+ 1:07:29.029 --> 1:07:43.664
2187
+ And about the smoothing and the discounting
2188
+ we did before, it always treats all n-grams the same.
2189
+
2190
+ 1:07:44.024 --> 1:07:46.006
2191
+ So we didn't really look at the n-grams themselves.
2192
+
2193
+ 1:07:46.006 --> 1:07:48.174
2194
+ They were all classed into how often they
2195
+ are.
2196
+
2197
+ 1:07:49.169 --> 1:08:00.006
2198
+ However, sometimes this might not be very
2199
+ helpful, so for example look at the n-grams
2200
+
2201
+ 1:08:00.006 --> 1:08:06.253
2202
+ Scottish beer drinkers and Scottish beer eaters.
2203
+
2204
+ 1:08:06.686 --> 1:08:12.037
2205
+ Because we have not seen the trigram, so you
2206
+ will estimate the trigram probability by the
2207
+
2208
+ 1:08:12.037 --> 1:08:14.593
2209
+ probability you assign to the zero county.
2210
+
2211
+ 1:08:15.455 --> 1:08:26.700
2212
+ However, if you look at the bigram probability
2213
+ that you might have seen and might be helpful,.
2214
+
2215
+ 1:08:26.866 --> 1:08:34.538
2216
+ So beer drinker is more probable to see than
2217
+ Scottish beer drinker, and beer drinker should
2218
+
2219
+ 1:08:34.538 --> 1:08:36.039
2220
+ be more probable.
2221
+
2222
+ 1:08:36.896 --> 1:08:39.919
2223
+ So this type of information is somehow ignored.
2224
+
2225
+ 1:08:39.919 --> 1:08:45.271
2226
+ So if we have the Trigram language model,
2227
+ we are only looking at trigrams divided by
2228
+
2229
+ 1:08:45.271 --> 1:08:46.089
2230
+ the bigrams.
2231
+
2232
+ 1:08:46.089 --> 1:08:49.678
2233
+ But if we have not seen the trigram, we are
2234
+ not looking.
2235
+
2236
+ 1:08:49.678 --> 1:08:53.456
2237
+ Oh, maybe we will have seen the bigram and
2238
+ we can back off.
2239
+
2240
+ 1:08:54.114 --> 1:09:01.978
2241
+ And that is what people do in interpolation
2242
+ and back off.
2243
+
2244
+ 1:09:01.978 --> 1:09:09.164
2245
+ The idea is if we haven't seen the large
2246
+ n-grams.
2247
+
2248
+ 1:09:09.429 --> 1:09:16.169
2249
+ Then we have to go to a shorter sequence
2250
+ and try to estimate the probability on this one.
2251
+
2252
+ 1:09:16.776 --> 1:09:20.730
2253
+ And this is the idea of interpolation.
2254
+
2255
+ 1:09:20.730 --> 1:09:25.291
2256
+ There's like two different ways of doing it.
2257
+
2258
+ 1:09:25.291 --> 1:09:26.507
2259
+ One is the.
2260
+
2261
+ 1:09:26.646 --> 1:09:29.465
2262
+ The easiest thing is like okay.
2263
+
2264
+ 1:09:29.465 --> 1:09:32.812
2265
+ If we have bigrams, we have trigrams.
2266
+
2267
+ 1:09:32.812 --> 1:09:35.103
2268
+ If we have programs, why?
2269
+
2270
+ 1:09:35.355 --> 1:09:46.544
2271
+ Mean, of course, we have the larger ones,
2272
+ the larger context, but the short n-grams are
2273
+
2274
+ 1:09:46.544 --> 1:09:49.596
2275
+ maybe better estimated.
2276
+
2277
+ 1:09:50.090 --> 1:10:00.487
2278
+ Time just by taking the probability of just
2279
+ the word class of probability of and.
2280
+
2281
+ 1:10:01.261 --> 1:10:07.052
2282
+ And of course the weights need to sum to one, because otherwise
2283
+ we don't have a probability distribution, but
2284
+
2285
+ 1:10:07.052 --> 1:10:09.332
2286
+ we can somehow optimize the weights.
2287
+
2288
+ 1:10:09.332 --> 1:10:15.930
2289
+ For example, the health out data set: And
2290
+ thereby we have now a probability distribution
2291
+
2292
+ 1:10:15.930 --> 1:10:17.777
2293
+ which takes both into account.
2294
+
2295
+ 1:10:18.118 --> 1:10:23.705
2296
+ Think again about the Scottish beer drinker example.
2297
+
2298
+ 1:10:23.705 --> 1:10:33.763
2299
+ The trigram probability will be the same for
2300
+ both phrases because they both occur zero
2301
+
2302
+ 1:10:33.763 --> 1:10:34.546
2303
+ times.
2304
+
2305
+ 1:10:36.116 --> 1:10:45.332
2306
+ But the bigram probability will hopefully
2307
+ be different because we might have seen beer
2308
+
2309
+ 1:10:45.332 --> 1:10:47.611
2310
+ drinkers but not beer eaters, and therefore.
2311
+
2312
+ 1:10:48.668 --> 1:10:57.296
2313
+ The idea that sometimes it's better to have
2314
+ different models and combine them instead.
2315
+
2316
+ 1:10:58.678 --> 1:10:59.976
2317
+ Another idea in style.
2318
+
2319
+ 1:11:00.000 --> 1:11:08.506
2320
+ Of this overall interpolation is you can also
2321
+ do this type of recursive interpolation.
2322
+
2323
+ 1:11:08.969 --> 1:11:23.804
2324
+ The probability of the word given its history
2325
+ is lambda times the current n-gram language model probability.
2326
+
2327
+ 1:11:24.664 --> 1:11:30.686
2328
+ Plus one minus lambda, so that these two sum
2329
+ to one, and here it's an interpolated probability
2330
+
2331
+ 1:11:30.686 --> 1:11:36.832
2332
+ from the n minus one gram, and then of course
2333
+ it goes recursively on until you are at the unigram
2334
+
2335
+ 1:11:36.832 --> 1:11:37.639
2336
+ probability.
2337
+
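A minimal sketch of the linear interpolation just described, combining trigram, bigram and unigram relative frequencies with weights that sum to one; the lambda values are placeholders that would be tuned on held-out data:

def interpolated_trigram_prob(w, u, v, uni, bi, tri, num_tokens,
                              lambdas=(0.6, 0.3, 0.1)):
    l3, l2, l1 = lambdas                                        # must sum to 1
    p3 = tri.get((u, v, w), 0) / bi[(u, v)] if bi.get((u, v)) else 0.0
    p2 = bi.get((v, w), 0) / uni[v] if uni.get(v) else 0.0
    p1 = uni.get(w, 0) / num_tokens
    return l3 * p3 + l2 * p2 + l1 * p1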
2338
+ 1:11:38.558 --> 1:11:49.513
2339
+ What you can also do, you can not only do
2340
+ the same weights for all our words, but you
2341
+
2342
+ 1:11:49.513 --> 1:12:06.020
2343
+ can for example: For example, for n-grams,
2344
+ which you have seen very often, you put more
2345
+
2346
+ 1:12:06.020 --> 1:12:10.580
2347
+ weight on the trigrams.
2348
+
2349
+ 1:12:13.673 --> 1:12:29.892
2350
+ The other thing you can do is the back off
2351
+ and the difference in back off is we are not
2352
+
2353
+ 1:12:29.892 --> 1:12:32.656
2354
+ interpolating.
2355
+
2356
+ 1:12:32.892 --> 1:12:41.954
2357
+ If we have seen the trigram probability so
2358
+ if the trigram count is bigger than zero then we take
2359
+
2360
+ 1:12:41.954 --> 1:12:48.412
2361
+ the trigram probability, and only if we have not seen
2362
+ this one then we back off.
2363
+
2364
+ 1:12:48.868 --> 1:12:54.092
2365
+ So that is the difference.
2366
+
2367
+ 1:12:54.092 --> 1:13:06.279
2368
+ In interpolation we are always taking all the n-gram probabilities,
2369
+ and in back-off only when needed.
2370
+
2371
+ 1:13:07.147 --> 1:13:09.941
2372
+ Why do we need to do this just a minute?
2373
+
2374
+ 1:13:09.941 --> 1:13:13.621
2375
+ So why have we here just take the probability
2376
+ of the.
2377
+
2378
+ 1:13:15.595 --> 1:13:18.711
2379
+ Yes, because otherwise the probabilities don't
2380
+ sum up to one.
2381
+
2382
+ 1:13:19.059 --> 1:13:28.213
2383
+ In order to make them still sum to one, we
2384
+ have to take away a bit of a probability mass
2385
+
2386
+ 1:13:28.213 --> 1:13:29.773
2387
+ from the seen events.
2388
+
2389
+ 1:13:29.709 --> 1:13:38.919
2390
+ The difference is we are no longer distributing
2391
+ it equally as before to the unseen, but we
2392
+
2393
+ 1:13:38.919 --> 1:13:40.741
2394
+ are distributing.
2395
+
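A minimal sketch of the back-off idea in contrast to interpolation: the lower-order estimate is only used when the higher-order n-gram was never seen, and the weight alpha is a placeholder for the probability mass freed up by discounting the seen events:

def backoff_trigram_prob(w, u, v, uni, bi, tri, num_tokens, alpha=0.4):
    if tri.get((u, v, w), 0) > 0:
        return tri[(u, v, w)] / bi[(u, v)]                  # seen trigram: use it
    if bi.get((v, w), 0) > 0:
        return alpha * bi[(v, w)] / uni[v]                  # else back off to the bigram
    return alpha * alpha * uni.get(w, 0) / num_tokens       # else to the unigram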
2396
+ 1:13:44.864 --> 1:13:56.220
2397
+ For example, this can be done with Good-Turing,
2398
+ so the expected counts in Good-Turing, we saw,
2399
+
2400
+ 1:13:57.697 --> 1:13:59.804
2401
+ The adjusted counts.
2402
+
2403
+ 1:13:59.804 --> 1:14:04.719
2404
+ They are always lower than the ones we see
2405
+ here.
2406
+
2407
+ 1:14:04.719 --> 1:14:14.972
2408
+ These counts are always: See that so you can
2409
+ now take this difference and distribute this
2410
+
2411
+ 1:14:14.972 --> 1:14:18.852
2412
+ weight to the lower-order n-grams.
2413
+
2414
+ 1:14:23.323 --> 1:14:29.896
2415
+ Is how we can distribute things.
2416
+
2417
+ 1:14:29.896 --> 1:14:43.442
2418
+ Then there is one last thing people are doing,
2419
+ especially how much.
2420
+
2421
+ 1:14:43.563 --> 1:14:55.464
2422
+ And there's one thing which is called Witten-
2423
+ Bell smoothing.
2424
+
2425
+ 1:14:55.315 --> 1:15:01.335
2426
+ In the back-off, like in the back-off,
2427
+ it might make sense to look at the words and
2428
+
2429
+ 1:15:01.335 --> 1:15:04.893
2430
+ see how probable it is that you need to back off.
2431
+
2432
+ 1:15:05.425 --> 1:15:11.232
2433
+ So look at these two words, spite and constant.
2434
+
2435
+ 1:15:11.232 --> 1:15:15.934
2436
+ Those occur exactly the same number of times in the.
2437
+
2438
+ 1:15:16.316 --> 1:15:27.804
2439
+ They would be treated exactly the same because
2440
+ both occur the same number of times, and it would be
2441
+
2442
+ 1:15:27.804 --> 1:15:29.053
2443
+ the same.
2444
+
2445
+ 1:15:29.809 --> 1:15:48.401
2446
+ However, they shouldn't really be modeled the same.
2447
+
2448
+ 1:15:48.568 --> 1:15:57.447
2449
+ If you compare that for constant there are
2450
+ four hundred different continuations of this
2451
+
2452
+ 1:15:57.447 --> 1:16:01.282
2453
+ work, so there is nearly always this.
2454
+
2455
+ 1:16:02.902 --> 1:16:11.203
2456
+ So if you're now seeing a new bigram or a
2457
+ biogram with Isaac Constant or Spite starting
2458
+
2459
+ 1:16:11.203 --> 1:16:13.467
2460
+ and then another word,.
2461
+
2462
+ 1:16:15.215 --> 1:16:25.606
2463
+ In constant, it's very frequent that you see
2464
+ new angrups because there are many different
2465
+
2466
+ 1:16:25.606 --> 1:16:27.222
2467
+ combinations.
2468
+
2469
+ 1:16:27.587 --> 1:16:35.421
2470
+ Therefore, it might help not only to look
2471
+ at the counts of the n-grams, but also at how
2472
+
2473
+ 1:16:35.421 --> 1:16:37.449
2474
+ many extensions a word has.
2475
+
2476
+ 1:16:38.218 --> 1:16:43.222
2477
+ And this is done by Witten-Bell smoothing.
2478
+
2479
+ 1:16:43.222 --> 1:16:51.032
2480
+ The idea is we count how many possible extensions
2481
+ in this case.
2482
+
2483
+ 1:16:51.371 --> 1:17:01.966
2484
+ So we had for spite, we had its possible extensions,
2485
+ and for constant we had a lot more.
2486
+
2487
+ 1:17:02.382 --> 1:17:09.394
2488
+ And then how much we put into our back-off model,
2489
+ how much weight we put into the back-off is,
2490
+
2491
+ 1:17:09.394 --> 1:17:13.170
2492
+ depending on this number of possible extensions.
2493
+
2494
+ 1:17:14.374 --> 1:17:15.557
2495
+ Style.
2496
+
2497
+ 1:17:15.557 --> 1:17:29.583
2498
+ We have it here, so this is the weight you
2499
+ put on your lower-order n-gram probability.
2500
+
2501
+ 1:17:29.583 --> 1:17:46.596
2502
+ For example: And if you compare these two
2503
+ numbers, so for spite you do how many extensions
2504
+
2505
+ 1:17:46.596 --> 1:17:55.333
2506
+ does spite have divided by: While for constant
2507
+ you have zero point three, you know,.
2508
+
2509
+ 1:17:55.815 --> 1:18:05.780
2510
+ So you're putting a lot more weight to like
2511
+ it's not as bad to fall back to the back-off
2512
+
2513
+ 1:18:05.780 --> 1:18:06.581
2514
+ model.
2515
+
2516
+ 1:18:06.581 --> 1:18:10.705
2517
+ So for spite it's really unusual.
2518
+
2519
+ 1:18:10.730 --> 1:18:13.369
2520
+ For constant there's a lot of probability
2521
+ mass in there.
2522
+
2523
+ 1:18:13.369 --> 1:18:15.906
2524
+ The chances that you're doing that is quite
2525
+ high.
2526
+
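A minimal sketch of the Witten-Bell idea: the weight put on the back-off distribution depends on how many distinct continuations the history has, so a history like "constant" (many continuations) gives more weight to the back-off than "spite"; backing off to a uniform distribution here is a simplification of backing off to the lower-order n-gram model:

def witten_bell_bigram_prob(w, history, uni, bi, vocab_size):
    continuations = {x for (h, x) in bi if h == history}    # distinct words seen after history
    types, tokens = len(continuations), uni.get(history, 0)
    if types + tokens == 0:
        return 1.0 / vocab_size
    lam = types / (types + tokens)                          # mass reserved for unseen continuations
    p_ml = bi.get((history, w), 0) / tokens
    return (1 - lam) * p_ml + lam / vocab_size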
2527
+ 1:18:20.000 --> 1:18:26.209
2528
+ Similarly, but just from the other way around,
2529
+ it's now looking at this probability distribution.
2530
+
2531
+ 1:18:26.546 --> 1:18:37.103
2532
+ So now when we back off the probability distribution
2533
+ for the lower-order n-grams, we calculate it exactly
2534
+
2535
+ 1:18:37.103 --> 1:18:40.227
2536
+ the same as the probability.
2537
+
2538
+ 1:18:40.320 --> 1:18:48.254
2539
+ However, they are used in a different way,
2540
+ so the lower order n-grams are only used
2541
+
2542
+ 1:18:48.254 --> 1:18:49.361
2543
+ if we have.
2544
+
2545
+ 1:18:50.410 --> 1:18:54.264
2546
+ So it's like you're modeling something different.
2547
+
2548
+ 1:18:54.264 --> 1:19:01.278
2549
+ You're not modeling how probable this n-gram
2550
+ is if we haven't seen the larger n-gram, and that
2551
+
2552
+ 1:19:01.278 --> 1:19:04.361
2553
+ is captured by the diversity of histories.
2554
+
2555
+ 1:19:04.944 --> 1:19:14.714
2556
+ For example, if you look at York, that's a
2557
+ quite frequent word.
2558
+
2559
+ 1:19:14.714 --> 1:19:18.530
2560
+ It occurs as many times.
2561
+
2562
+ 1:19:19.559 --> 1:19:27.985
2563
+ However, four hundred seventy three times
2564
+ the word before it was New.
2565
+
2566
+ 1:19:29.449 --> 1:19:40.237
2567
+ So if you now think the unigram model is only
2568
+ used, the probability of York as a unigram
2569
+
2570
+ 1:19:40.237 --> 1:19:49.947
2571
+ model should be very, very low because: So
2572
+ you should have a lower probability for York
2573
+
2574
+ 1:19:49.947 --> 1:19:56.292
2575
+ than, for example, for foods, although you
2576
+ have seen both of them at the same time, and
2577
+
2578
+ 1:19:56.292 --> 1:20:02.853
2579
+ this is done by Kneser-Ney smoothing where
2580
+ you are not counting the words itself, but
2581
+
2582
+ 1:20:02.853 --> 1:20:05.377
2583
+ you count the number of histories.
2584
+
2585
+ 1:20:05.845 --> 1:20:15.233
2586
+ So, the other way around: by how many different
2587
+ words was it preceded?
2588
+
2589
+ 1:20:15.233 --> 1:20:28.232
2590
+ Then instead of the normal way you count the
2591
+ words: So you don't need to know all the formulas
2592
+
2593
+ 1:20:28.232 --> 1:20:28.864
2594
+ here.
2595
+
2596
+ 1:20:28.864 --> 1:20:33.498
2597
+ The more important thing is this intuition.
2598
+
2599
+ 1:20:34.874 --> 1:20:44.646
2600
+ More than it means already that I haven't
2601
+ seen the larger n-gram, and therefore
2602
+
2603
+ 1:20:44.646 --> 1:20:49.704
2604
+ it might be better to model it differently.
2605
+
2606
+ 1:20:49.929 --> 1:20:56.976
2607
+ So if there's a new engram with something
2608
+ in New York that's very unprofitable compared
2609
+
2610
+ 1:20:56.976 --> 1:20:57.297
2611
+ to.
2612
+
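A minimal sketch of the Kneser-Ney intuition just discussed: for the lower-order model you count how many different histories a word follows, not how often it occurs, so "York" (almost always preceded by "New") gets a low continuation probability despite being frequent:

from collections import Counter

def continuation_probs(bigram_counts):
    # For each word w: number of distinct left contexts v with count(v, w) > 0,
    # normalized by the total number of distinct bigram types.
    histories = Counter(w for (v, w) in bigram_counts)
    total_bigram_types = len(bigram_counts)
    return {w: h / total_bigram_types for w, h in histories.items()}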
2613
+ 1:21:00.180 --> 1:21:06.130
2614
+ And yeah, this modified Kneser-Ney smoothing
2615
+ is what people took into use.
2616
+
2617
+ 1:21:06.130 --> 1:21:08.249
2618
+ That's the fall approach.
2619
+
2620
+ 1:21:08.728 --> 1:21:20.481
2621
+ Has an absolute discounting for small and
2622
+ grams, and then bells smoothing, and for it
2623
+
2624
+ 1:21:20.481 --> 1:21:27.724
2625
+ uses the discounting of histories which we
2626
+ just had.
2627
+
2628
+ 1:21:28.028 --> 1:21:32.207
2629
+ And there's even two versions of it, like
2630
+ the back-off and the interpolated one.
2631
+
2632
+ 1:21:32.472 --> 1:21:34.264
2633
+ So that may be interesting.
2634
+
2635
+ 1:21:34.264 --> 1:21:40.216
2636
+ These are here even works well for interpolation,
2637
+ although your assumption is even no longer
2638
+
2639
+ 1:21:40.216 --> 1:21:45.592
2640
+ true because you're using the lower n-grams
2641
+ even if you've seen the higher n-grams.
2642
+
2643
+ 1:21:45.592 --> 1:21:49.113
2644
+ But since you're then focusing on the higher
2645
+ n-grams.
2646
+
2647
+ 1:21:49.929 --> 1:21:53.522
2648
+ So if you see that some beats on the perfectities,.
2649
+
2650
+ 1:21:54.754 --> 1:22:00.262
2651
+ So you see normally that interpolated modified
2652
+ Kneser-Ney gives you some of the best
2653
+
2654
+ 1:22:00.262 --> 1:22:00.980
2655
+ performing models.
2656
+
2657
+ 1:22:02.022 --> 1:22:08.032
2658
+ You see the larger your n-gram is, the better it is
2659
+ with interpolation.
2660
+
2661
+ 1:22:08.032 --> 1:22:15.168
2662
+ You also get significant better so you can
2663
+ not only look at the last words.
2664
+
2665
+ 1:22:18.638 --> 1:22:32.725
2666
+ Good so much for these types of things, and
2667
+ we will finish with some special things about
2668
+
2669
+ 1:22:32.725 --> 1:22:34.290
2670
+ language.
2671
+
2672
+ 1:22:38.678 --> 1:22:44.225
2673
+ One thing we talked about the unknown words,
2674
+ so there is different ways of doing it because
2675
+
2676
+ 1:22:44.225 --> 1:22:49.409
2677
+ in all the estimations we were still assuming
2678
+ mostly that we have a fixed vocabulary.
2679
+
2680
+ 1:22:50.270 --> 1:23:06.372
2681
+ So you can often, for example, create an unknown
2682
+ token and use that in the statistical language model.
2683
+
2684
+ 1:23:06.766 --> 1:23:16.292
2685
+ It was mainly useful language processing since
2686
+ newer models are coming, but maybe it's surprising.
2687
+
2688
+ 1:23:18.578 --> 1:23:30.573
2689
+ What is also nice is that if you're going
2690
+ to really large n-gram models, it's more
2691
+
2692
+ 1:23:30.573 --> 1:23:33.114
2693
+ about efficiency.
2694
+
2695
+ 1:23:33.093 --> 1:23:37.378
2696
+ And then you have to remember lock it in your
2697
+ model.
2698
+
2699
+ 1:23:37.378 --> 1:23:41.422
2700
+ In a lot of situations it's not really important.
2701
+
2702
+ 1:23:41.661 --> 1:23:46.964
2703
+ It's more about ranking so which one is better
2704
+ and if they don't sum up to one that's not
2705
+
2706
+ 1:23:46.964 --> 1:23:47.907
2707
+ that important.
2708
+
2709
+ 1:23:47.907 --> 1:23:53.563
2710
+ Of course then you cannot calculate any perplexity
2711
+ anymore because if this is not a probability
2712
+
2713
+ 1:23:53.563 --> 1:23:58.807
2714
+ mass then the thing we had about the negative
2715
+ example doesn't fit anymore and that's not
2716
+
2717
+ 1:23:58.807 --> 1:23:59.338
2718
+ working.
2719
+
2720
+ 1:23:59.619 --> 1:24:02.202
2721
+ However, anification is also very helpful.
2722
+
2723
+ 1:24:02.582 --> 1:24:13.750
2724
+ And that is why there is this stupid back-off,
2725
+ presented to remove all these complicated things
2726
+
2727
+ 1:24:13.750 --> 1:24:14.618
2728
+ which.
2729
+
2730
+ 1:24:15.055 --> 1:24:28.055
2731
+ And it just does once we directly take the
2732
+ absolute account, and otherwise we're doing.
2733
+
2734
+ 1:24:28.548 --> 1:24:41.867
2735
+ There is no longer any discounting anymore, so it's
2736
+ very, very simple and however they show you
2737
+
2738
+ 1:24:41.867 --> 1:24:47.935
2739
+ have to calculate a lot less statistics.
2740
+
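A minimal sketch of "stupid back-off" as just described: no discounting and no renormalization, just relative frequencies with a fixed back-off factor (0.4 in the original paper), so the result is a score rather than a true probability:

def stupid_backoff_score(words, counts, num_tokens, alpha=0.4):
    # words: n-gram as a tuple, e.g. (u, v, w); counts: dict over n-grams of all orders.
    if len(words) == 1:
        return counts.get(words, 0) / num_tokens
    if counts.get(words, 0) > 0:
        return counts[words] / counts[words[:-1]]
    return alpha * stupid_backoff_score(words[1:], counts, num_tokens, alpha)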
2741
+ 1:24:50.750 --> 1:24:57.525
2742
+ In addition you can have other type of language
2743
+ models.
2744
+
2745
+ 1:24:57.525 --> 1:25:08.412
2746
+ We had word based language models and they
2747
+ normally go up to four or five or six grams.
2748
+
2749
+ 1:25:08.412 --> 1:25:10.831
2750
+ They are too large.
2751
+
2752
+ 1:25:11.531 --> 1:25:20.570
2753
+ So what people have then looked also into
2754
+ is what is referred to as part of speech language
2755
+
2756
+ 1:25:20.570 --> 1:25:21.258
2757
+ model.
2758
+
2759
+ 1:25:21.258 --> 1:25:29.806
2760
+ So instead of looking at the word sequence
2761
+ you're modeling directly the part of speech
2762
+
2763
+ 1:25:29.806 --> 1:25:30.788
2764
+ sequence.
2765
+
2766
+ 1:25:31.171 --> 1:25:34.987
2767
+ Then of course now you're only modeling
2768
+ syntax.
2769
+
2770
+ 1:25:34.987 --> 1:25:41.134
2771
+ There's no semantic information anymore in
2772
+ the part-of-speech tags, but now you might go
2773
+
2774
+ 1:25:41.134 --> 1:25:47.423
2775
+ to a larger context length so you can do seven,
2776
+ eight or nine grams and then you can capture some
2777
+
2778
+ 1:25:47.423 --> 1:25:50.320
2779
+ of the long range dependencies in order.
2780
+
2781
+ 1:25:52.772 --> 1:25:59.833
2782
+ And there's other things people have done
2783
+ like cache language models, so the idea in a cache
2784
+
2785
+ 1:25:59.833 --> 1:26:07.052
2786
+ language model is that yes words that you have
2787
+ recently seen are more frequent, so they are
2788
+
2789
+ 1:26:07.052 --> 1:26:11.891
2790
+ more probable to reoccur if you want to model
2791
+ the dynamics.
2792
+
2793
+ 1:26:12.152 --> 1:26:20.734
2794
+ If you're just talking here, we talked about
2795
+ language models in my presentation.
2796
+
2797
+ 1:26:20.734 --> 1:26:23.489
2798
+ There will be a lot more.
2799
+
2800
+ 1:26:23.883 --> 1:26:37.213
2801
+ Can do that by having a dynamic and a static
2802
+ component, and then you have a dynamic component
2803
+
2804
+ 1:26:37.213 --> 1:26:41.042
2805
+ which looks at the bigram.
2806
+
2807
+ 1:26:41.261 --> 1:26:49.802
2808
+ And thereby, for example, if you once generate
2809
+ language model of probability, it's increased
2810
+
2811
+ 1:26:49.802 --> 1:26:52.924
2812
+ and you're modeling that problem.
2813
+
2814
+ 1:26:56.816 --> 1:27:03.114
2815
+ Said the dynamic component is trained on the
2816
+ text translated so far.
2817
+
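A minimal sketch of the cache language model idea described here: interpolate a static model with a dynamic component estimated from the recently produced (for instance, already translated) words; the mixing weight is a placeholder:

from collections import Counter

def cache_lm_prob(w, recent_words, static_prob, mix=0.9):
    cache = Counter(recent_words)
    p_cache = cache[w] / len(recent_words) if recent_words else 0.0
    return mix * static_prob(w) + (1 - mix) * p_cache       # static + dynamic component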
2818
+ 1:27:04.564 --> 1:27:12.488
2819
+ To train them what you just have done, there's
2820
+ no human feedback there.
2821
+
2822
+ 1:27:12.712 --> 1:27:25.466
2823
+ The speech model all the time and then it
2824
+ will repeat its errors and that is, of course,.
2825
+
2826
+ 1:27:25.966 --> 1:27:31.506
2827
+ A similar idea is people have looked into
2828
+ trigger language model whereas one word occurs
2829
+
2830
+ 1:27:31.506 --> 1:27:34.931
2831
+ then you increase the probability of some other
2832
+ words.
2833
+
2834
+ 1:27:34.931 --> 1:27:40.596
2835
+ So if you're talking about money that will
2836
+ increase the probability of bank saving account
2837
+
2838
+ 1:27:40.596 --> 1:27:41.343
2839
+ dollar and.
2840
+
2841
+ 1:27:41.801 --> 1:27:47.352
2842
+ Because then you have to somehow model this
2843
+ dependency, but it's somehow also an idea of
2844
+
2845
+ 1:27:47.352 --> 1:27:52.840
2846
+ modeling long range dependency, because if
2847
+ one word occurs very often in your document,
2848
+
2849
+ 1:27:52.840 --> 1:27:58.203
2850
+ you like somehow like learning which other
2851
+ words to occur because they are more often
2852
+
2853
+ 1:27:58.203 --> 1:27:59.201
2854
+ than by chance.
2855
+
2856
+ 1:28:02.822 --> 1:28:10.822
2857
+ Yes, then the last thing is, of course, especially
2858
+ for languages which are, which are morphologically
2859
+
2860
+ 1:28:10.822 --> 1:28:11.292
2861
+ rich.
2862
+
2863
+ 1:28:11.292 --> 1:28:18.115
2864
+ You can do something similar to BPE so you
2865
+ can now do morphemes or so, and then model
2866
+
2867
+ 1:28:18.115 --> 1:28:22.821
2868
+ the morpheme sequence because the morphemes
2869
+ are more frequent.
2870
+
2871
+ 1:28:23.023 --> 1:28:26.877
2872
+ However, the problem is of course that your
2873
+ sequence length also gets longer.
2874
+
2875
+ 1:28:27.127 --> 1:28:33.185
2876
+ And so if they have a four gram language model,
2877
+ it's not counting the last three words but
2878
+
2879
+ 1:28:33.185 --> 1:28:35.782
2880
+ only the last three morphemes, which.
2881
+
2882
+ 1:28:36.196 --> 1:28:39.833
2883
+ So of course then it's a bit challenging and
2884
+ know how to deal with.
2885
+
2886
+ 1:28:40.680 --> 1:28:51.350
2887
+ What about language is finished by the idea
2888
+ of a position at the end of the world?
2889
+
2890
+ 1:28:51.350 --> 1:28:58.807
2891
+ Yeah, but there you can typically do something
2892
+ like that.
2893
+
2894
+ 1:28:59.159 --> 1:29:02.157
2895
+ It is not the one perfect solution.
2896
+
2897
+ 1:29:02.157 --> 1:29:05.989
2898
+ You have to do a bit of testing what is best.
2899
+
2900
+ 1:29:06.246 --> 1:29:13.417
2901
+ One way of dealing with a large vocabulary
2902
+ that you haven't seen is to split these words
2903
+
2904
+ 1:29:13.417 --> 1:29:20.508
2905
+ into subparts that are either more
2906
+ linguistically motivated, like morphemes, or more
2907
+
2908
+ 1:29:20.508 --> 1:29:25.826
2909
+ statistically motivated like we have in the
2910
+ byte pair encoding.
2911
+
2912
+ 1:29:28.188 --> 1:29:33.216
2913
+ The representation of your text is different.
2914
+
2915
+ 1:29:33.216 --> 1:29:41.197
2916
+ How you are later doing all the counting and
2917
+ the statistics is the same.
2918
+
2919
+ 1:29:41.197 --> 1:29:44.914
2920
+ What you assume is your sequence.
2921
+
2922
+ 1:29:45.805 --> 1:29:49.998
2923
+ That's the same thing for the other things
2924
+ we had here.
2925
+
2926
+ 1:29:49.998 --> 1:29:55.390
2927
+ Here you don't have words, but everything
2928
+ you're doing is done exactly.
2929
+
2930
+ 1:29:57.857 --> 1:29:59.457
2931
+ Some practical issues.
2932
+
2933
+ 1:29:59.457 --> 1:30:05.646
2934
+ Typically you're doing things in log space
2935
+ and you're adding, because multiplying very
2936
+
2937
+ 1:30:05.646 --> 1:30:09.819
2938
+ small values gives you sometimes problems with
2939
+ calculation.
2940
+
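A small illustration of the practical point just made: multiplying many small probabilities underflows floating point, so one adds log probabilities instead:

import math

def sentence_logprob(token_probs):
    # log P(sentence) = sum of log P(token | history)
    return sum(math.log(p) for p in token_probs)

# e.g. 200 tokens with probability 1e-4 each: the raw product underflows to 0.0,
# while the sum of logs is simply 200 * log(1e-4).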
2941
+ 1:30:10.230 --> 1:30:16.687
2942
+ Good thing is you don't have to care with
2943
+ this mostly so there are very good toolkits
2944
+
2945
+ 1:30:16.687 --> 1:30:23.448
2946
+ like SRILM or KenLM where you can
2947
+ just give your data and they will train the
2948
+
2949
+ 1:30:23.448 --> 1:30:30.286
2950
+ language model, then do all the complicated maths
2951
+ behind that and you are able to run them.
2952
+
2953
+ 1:30:31.911 --> 1:30:39.894
2954
+ So what you should keep from today is what
2955
+ is a language model and how we can do maximum
2956
+
2957
+ 1:30:39.894 --> 1:30:44.199
2958
+ likelihood training on that and different language models.
2959
+
2960
+ 1:30:44.199 --> 1:30:49.939
2961
+ Similar ideas we use for a lot of different
2962
+ statistical models.
2963
+
2964
+ 1:30:50.350 --> 1:30:52.267
2965
+ Where You Always Have the Problem.
2966
+
2967
+ 1:30:53.233 --> 1:31:01.608
2968
+ Different way of looking at it and doing it
2969
+ will do it on Thursday when we will go to language.
2970
+
demo_data/lectures/Lecture-06-09.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fe56576cf62256b2c62b8fdcf6e502ce1931907278fc420d397cd360774f72
3
+ size 129548573
demo_data/lectures/Lecture-07-11.05.2023/English.vtt ADDED
@@ -0,0 +1,2596 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.301 --> 0:00:05.676
4
+ Introduction: Okay, so welcome to today's lecture.
5
+
6
+ 0:00:06.066 --> 0:00:12.592
7
+ I'm sorry for the inconvenience.
8
+
9
+ 0:00:12.394 --> 0:00:19.823
10
+ Sometimes they are project meetings.
11
+
12
+ 0:00:19.622 --> 0:00:25.853
13
+ There will be one other time.
14
+
15
+ 0:00:26.806 --> 0:00:40.863
16
+ So what we want to talk today about is want
17
+ to start with neural approaches to machine
18
+
19
+ 0:00:40.863 --> 0:00:42.964
20
+ translation.
21
+
22
+ 0:00:43.123 --> 0:00:51.285
23
+ I guess you have heard about other types of
24
+ neural models for other types of neural language
25
+
26
+ 0:00:51.285 --> 0:00:52.339
27
+ processing.
28
+
29
+ 0:00:52.251 --> 0:00:59.888
30
+ This was some of the first steps in introducing
31
+ neal networks to machine translation.
32
+
33
+ 0:01:00.600 --> 0:01:06.203
34
+ They are similar to what you know they see
35
+ in as large language models.
36
+
37
+ 0:01:06.666 --> 0:01:11.764
38
+ And today we look into what are these neural language
39
+ models?
40
+
41
+ 0:01:11.676 --> 0:01:13.831
42
+ What is the difference?
43
+
44
+ 0:01:13.741 --> 0:01:15.989
45
+ What is the motivation?
46
+
47
+ 0:01:16.316 --> 0:01:21.445
48
+ And first will use them in statistics and
49
+ machine translation.
50
+
51
+ 0:01:21.364 --> 0:01:28.918
52
+ So if you remember how fully like two or three
53
+ weeks ago we had this log-linear model where you
54
+
55
+ 0:01:28.918 --> 0:01:31.053
56
+ can integrate easily any.
57
+
58
+ 0:01:31.351 --> 0:01:40.967
59
+ We just have another model which evaluates
60
+ how good a system is or how good a fluent language
61
+
62
+ 0:01:40.967 --> 0:01:41.376
63
+ is.
64
+
65
+ 0:01:41.274 --> 0:01:55.291
66
+ The main advantage compared to the statistical
67
+ models we saw on Tuesday is: Next week we will
68
+
69
+ 0:01:55.291 --> 0:02:06.475
70
+ then go for a neural machine translation where
71
+ we replace the whole model.
72
+
73
+ 0:02:11.211 --> 0:02:21.078
74
+ Just as a remember from Tuesday, we've seen
75
+ the main challenge in language world was that
76
+
77
+ 0:02:21.078 --> 0:02:25.134
78
+ most of the n-grams we haven't seen.
79
+
80
+ 0:02:26.946 --> 0:02:33.967
81
+ So this was therefore difficult to estimate
82
+ any probability because you've seen that normally
83
+
84
+ 0:02:33.967 --> 0:02:39.494
85
+ if you have not seen the n-gram you will assign
86
+ the probability of zero.
87
+
88
+ 0:02:39.980 --> 0:02:49.420
89
+ However, this is not really very good because
90
+ we don't want to give zero probabilities to
91
+
92
+ 0:02:49.420 --> 0:02:54.979
93
+ sentences, which still might be a very good
94
+ English.
95
+
96
+ 0:02:55.415 --> 0:03:02.167
97
+ And then we learned a lot of techniques and
98
+ that is the main challenging statistical machine
99
+
100
+ 0:03:02.167 --> 0:03:04.490
101
+ translate statistical language.
102
+
103
+ 0:03:04.417 --> 0:03:10.630
104
+ What's how we can give a good estimate of
105
+ probability to events that we haven't seen
106
+
107
+ 0:03:10.630 --> 0:03:12.238
108
+ smoothing techniques?
109
+
110
+ 0:03:12.165 --> 0:03:15.310
111
+ We've seen this interpolation and back-off.
112
+
113
+ 0:03:15.435 --> 0:03:21.637
114
+ And they invent or develop very specific techniques.
115
+
116
+ 0:03:21.520 --> 0:03:26.906
117
+ To deal with that, however, it might not be.
118
+
119
+ 0:03:28.568 --> 0:03:43.190
120
+ And therefore maybe we can do things different,
121
+ so if we have not seen an n-gram before in statistical
122
+
123
+ 0:03:43.190 --> 0:03:44.348
124
+ models.
125
+
126
+ 0:03:45.225 --> 0:03:51.361
127
+ Before and we can only get information from
128
+ exactly the same words.
129
+
130
+ 0:03:51.411 --> 0:04:06.782
131
+ We don't have some on like approximate matching
132
+ like that, maybe in a sentence that cures similarly.
133
+
134
+ 0:04:06.629 --> 0:04:10.289
135
+ So if you have seen a.
136
+
137
+ 0:04:11.191 --> 0:04:17.748
138
+ And so you would like to have more something
139
+ like that where n-grams are represented, more
140
+
141
+ 0:04:17.748 --> 0:04:21.953
142
+ in a general space, and we can generalize similar
143
+ numbers.
144
+
145
+ 0:04:22.262 --> 0:04:29.874
146
+ So if you learn something about walk then
147
+ maybe we can use this knowledge and also apply.
148
+
149
+ 0:04:30.290 --> 0:04:42.596
150
+ The same as we have done before, but we can
151
+ really better model how similar they are and
152
+
153
+ 0:04:42.596 --> 0:04:45.223
154
+ transfer to other.
155
+
156
+ 0:04:47.047 --> 0:04:54.236
157
+ And we maybe want to do that in a more hierarchical
158
+ approach that we know okay.
159
+
160
+ 0:04:54.146 --> 0:05:02.743
161
+ Some words are similar but like go and walk
162
+ is somehow similar and I and P and G and therefore
163
+
164
+ 0:05:02.743 --> 0:05:06.997
165
+ like maybe if we then merge them in an n-gram.
166
+
167
+ 0:05:07.387 --> 0:05:15.861
168
+ If we learn something about our walk, then
169
+ it should tell us also something about Hugo.
170
+
171
+ 0:05:15.765 --> 0:05:17.121
172
+ He walks or.
173
+
174
+ 0:05:17.197 --> 0:05:27.327
175
+ You see that there is some relations which
176
+ we need to integrate for you.
177
+
178
+ 0:05:27.188 --> 0:05:35.516
179
+ We need to add the s, but maybe walks should
180
+ also be here.
181
+
182
+ 0:05:37.137 --> 0:05:45.319
183
+ And luckily there is one really convincing
184
+ method in doing that: And that is by using
185
+
186
+ 0:05:45.319 --> 0:05:47.222
187
+ a neural mechanism.
188
+
189
+ 0:05:47.387 --> 0:05:58.497
190
+ That's what we will introduce today so we
191
+ can use this type of neural networks to try
192
+
193
+ 0:05:58.497 --> 0:06:04.053
194
+ to learn this similarity and to learn how.
195
+
196
+ 0:06:04.324 --> 0:06:14.355
197
+ And that is one of the main advantages that
198
+ we have by switching from the standard statistical
199
+
200
+ 0:06:14.355 --> 0:06:15.200
201
+ models.
202
+
203
+ 0:06:15.115 --> 0:06:22.830
204
+ To learn similarities between words and generalized,
205
+ and learn what is called hidden representations
206
+
207
+ 0:06:22.830 --> 0:06:29.705
208
+ or representations of words, where we can measure
209
+ similarity in some dimensions of words.
210
+
211
+ 0:06:30.290 --> 0:06:42.384
212
+ So we can measure in which way words are similar.
213
+
214
+ 0:06:42.822 --> 0:06:48.902
215
+ We had it before and we've seen that words
216
+ were just easier.
217
+
218
+ 0:06:48.802 --> 0:06:51.994
219
+ The only thing we did is like.
220
+
221
+ 0:06:52.192 --> 0:07:02.272
222
+ But this energies don't have any meaning,
223
+ so it wasn't that word is more similar to words.
224
+
225
+ 0:07:02.582 --> 0:07:12.112
226
+ So we couldn't learn anything about words
227
+ in the statistical model and that's a big challenge.
228
+
229
+ 0:07:12.192 --> 0:07:23.063
230
+ About words even like in morphology, so going
231
+ goes is somehow more similar because the person
232
+
233
+ 0:07:23.063 --> 0:07:24.219
234
+ singular.
235
+
236
+ 0:07:24.264 --> 0:07:34.924
237
+ The basic models we had up to now have no idea
238
+ about that, and goes is as similar to go as it
239
+
240
+ 0:07:34.924 --> 0:07:37.175
241
+ might be to sleep.
242
+
243
+ 0:07:39.919 --> 0:07:44.073
244
+ So what we want to do today.
245
+
246
+ 0:07:43.930 --> 0:07:53.098
247
+ In order to go to this we will have a short
248
+ introduction into.
249
+
250
+ 0:07:53.954 --> 0:08:05.984
251
+ It very short just to see how we use them
252
+ here, but that's a good thing, so most of you
253
+
254
+ 0:08:05.984 --> 0:08:08.445
255
+ think it will be.
256
+
257
+ 0:08:08.928 --> 0:08:14.078
258
+ And then we will first look into a feed-forward
259
+ neural network language models.
260
+
261
+ 0:08:14.454 --> 0:08:23.706
262
+ And there we will still have this approximation.
263
+
264
+ 0:08:23.517 --> 0:08:33.906
265
+ We have before we are looking only at a fixed
266
+ window.
267
+
268
+ 0:08:34.154 --> 0:08:35.030
269
+ The case.
270
+
271
+ 0:08:34.942 --> 0:08:38.217
272
+ However, we have the umbellent here.
273
+
274
+ 0:08:38.129 --> 0:08:43.353
275
+ That's why they're already better in order
276
+ to generalize.
277
+
278
+ 0:08:44.024 --> 0:08:53.169
279
+ And then at the end we'll look at language
280
+ models where we then have the additional advantage.
281
+
282
+ 0:08:53.093 --> 0:09:04.317
283
+ Case that we need to have a fixed history,
284
+ but in theory we can model arbitrary long dependencies.
285
+
286
+ 0:09:04.304 --> 0:09:12.687
287
+ And we talked about on Tuesday where it is
288
+ not clear what type of information it is to.
289
+
290
+ 0:09:16.396 --> 0:09:24.981
291
+ So in general molecular networks I normally
292
+ learn to prove that they perform some tasks.
293
+
294
+ 0:09:25.325 --> 0:09:33.472
295
+ We have the structure and we are learning
296
+ them from samples so that is similar to what
297
+
298
+ 0:09:33.472 --> 0:09:34.971
299
+ we have before.
300
+
301
+ 0:09:34.877 --> 0:09:42.277
302
+ So now we have the same task here, a language
303
+ model giving input or forwards.
304
+
305
+ 0:09:42.642 --> 0:09:48.959
306
+ And is somewhat originally motivated by human
307
+ brain.
308
+
309
+ 0:09:48.840 --> 0:10:00.640
310
+ However, when you now need to know about artificial
311
+ neural networks, it's hard to get similarity.
312
+
313
+ 0:10:00.540 --> 0:10:02.889
314
+ There seemed to be not that point.
315
+
316
+ 0:10:03.123 --> 0:10:11.014
317
+ So what they are mainly doing is summoning
318
+ multiplication and then one non-linear activation.
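A minimal sketch of such a unit: a weighted sum of a fixed number of inputs (plus a bias), followed by a non-linear, differentiable activation; the sigmoid here is just one possible choice:

import math

def neuron(inputs, weights, bias):
    z = sum(x * w for x, w in zip(inputs, weights)) + bias   # weighted sum
    return 1.0 / (1.0 + math.exp(-z))                        # non-linear activation (sigmoid)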
319
+
320
+ 0:10:12.692 --> 0:10:16.085
321
+ So the basic units are these type of.
322
+
323
+ 0:10:17.937 --> 0:10:29.891
324
+ Perceptron basic blocks which we have and
325
+ this does processing so we have a fixed number
326
+
327
+ 0:10:29.891 --> 0:10:36.070
328
+ of input features and that will be important.
329
+
330
+ 0:10:36.096 --> 0:10:39.689
331
+ So we have here numbers to xn as input.
332
+
333
+ 0:10:40.060 --> 0:10:53.221
334
+ And this makes partly of course language processing
335
+ difficult.
336
+
337
+ 0:10:54.114 --> 0:10:57.609
338
+ So we have to model this time on and then
339
+ go stand home and model.
340
+
341
+ 0:10:58.198 --> 0:11:02.099
342
+ Then we are having weights, which are the
343
+ parameters and the number of weights exactly
344
+
345
+ 0:11:02.099 --> 0:11:03.668
346
+ the same as the number of weights.
347
+
348
+ 0:11:04.164 --> 0:11:06.322
349
+ Of input features.
350
+
351
+ 0:11:06.208 --> 0:11:15.070
352
+ Sometimes he has his fires in there, and then
353
+ it's not really an input from.
354
+
355
+ 0:11:15.195 --> 0:11:19.205
356
+ And what you then do is multiply.
357
+
358
+ 0:11:19.087 --> 0:11:26.166
359
+ Each input resists weight and then you sum
360
+ it up and then.
361
+
362
+ 0:11:26.606 --> 0:11:34.357
363
+ What is then additionally later important
364
+ is that we have an activation function and
365
+
366
+ 0:11:34.357 --> 0:11:42.473
367
+ it's important that this activation function
368
+ is non linear, so we come to just a linear.
369
+
370
+ 0:11:43.243 --> 0:11:54.088
371
+ And later it will be important that this is
372
+ differentiable because otherwise all the training.
373
+
374
+ 0:11:54.714 --> 0:12:01.907
375
+ This model by itself is not very powerful.
376
+
377
+ 0:12:01.739 --> 0:12:10.440
378
+ It was originally shown that this is not powerful.
379
+
380
+ 0:12:10.710 --> 0:12:19.463
381
+ However, there is a very easy extension, the
382
+ multi layer perceptual, and then things get
383
+
384
+ 0:12:19.463 --> 0:12:20.939
385
+ very powerful.
386
+
387
+ 0:12:21.081 --> 0:12:27.719
388
+ The thing is you just connect a lot of these
389
+ in this layer of structures and we have our
390
+
391
+ 0:12:27.719 --> 0:12:35.029
392
+ input layer where we have the inputs and our
393
+ hidden layer at least one where there is everywhere.
394
+
395
+ 0:12:35.395 --> 0:12:39.817
396
+ And then we can combine them all to do that.
397
+
398
+ 0:12:40.260 --> 0:12:48.320
399
+ The input layer is of course somewhat given
400
+ by a problem of dimension.
401
+
402
+ 0:12:48.206 --> 0:13:00.014
403
+ The outward layer is also given by your dimension,
404
+ but the hidden layer is of course a hyperparameter.
405
+
406
+ 0:13:01.621 --> 0:13:06.982
407
+ How we represent wordsSo let's start with
408
+ the first question, now more language related,
409
+
410
+ 0:13:06.982 --> 0:13:08.788
411
+ and that is how we represent.
412
+
413
+ 0:13:09.149 --> 0:13:23.460
414
+ So we've seen here we have the but the question
415
+ is now how can we put in a word into this?
416
+
417
+ 0:13:26.866 --> 0:13:34.117
418
+ Noise: The first thing we're able to be better
419
+ is by the fact that like you are said,.
420
+
421
+ 0:13:34.314 --> 0:13:43.028
422
+ That is not that easy because the continuous
423
+ vector will come to that.
424
+
425
+ 0:13:42.905 --> 0:13:50.395
426
+ So from the neo-network we can directly put
427
+ in the bedding.
428
+
429
+ 0:13:50.630 --> 0:13:57.277
430
+ But if we need to input a word into the needle
431
+ network, it has to be something which is easily
432
+
433
+ 0:13:57.277 --> 0:13:57.907
434
+ defined.
435
+
436
+ 0:13:59.079 --> 0:14:12.492
437
+ The one hood encoding, and then we have one
438
+ out of encoding, so one value is one, and all
439
+
440
+ 0:14:12.492 --> 0:14:15.324
441
+ the others is the.
442
+
443
+ 0:14:16.316 --> 0:14:25.936
444
+ That means we are always dealing with fixed
445
+ vocabulary because what said is we cannot.
446
+
447
+ 0:14:26.246 --> 0:14:38.017
448
+ So you cannot easily extend your vocabulary
449
+ because if you mean you would extend your vocabulary.
450
+
451
+ 0:14:39.980 --> 0:14:41.502
452
+ That's also motivating.
453
+
454
+ 0:14:41.438 --> 0:14:43.683
455
+ We're talked about biperriagoding.
456
+
457
+ 0:14:43.619 --> 0:14:45.383
458
+ That's a nice thing there.
459
+
460
+ 0:14:45.318 --> 0:14:47.214
461
+ We have a fixed vocabulary.
462
+
463
+ 0:14:48.048 --> 0:14:55.804
464
+ The big advantage of this one encoding is
465
+ that we don't implicitly sum our implement
466
+
467
+ 0:14:55.804 --> 0:15:04.291
468
+ similarity between words, but really re-learning
469
+ because if you first think about this, this
470
+
471
+ 0:15:04.291 --> 0:15:06.938
472
+ is a very, very inefficient.
473
+
474
+ 0:15:07.227 --> 0:15:15.889
475
+ So you need like to represent end words, you
476
+ need a dimension of an end dimensional vector.
477
+
478
+ 0:15:16.236 --> 0:15:24.846
479
+ Imagine you could do binary encoding so you
480
+ could represent words as binary vectors.
481
+
482
+ 0:15:24.745 --> 0:15:26.474
483
+ Then you would.
484
+
485
+ 0:15:26.806 --> 0:15:31.177
486
+ Will be significantly more efficient.
487
+
488
+ 0:15:31.062 --> 0:15:36.781
489
+ However, then you have some implicit similarity.
490
+
491
+ 0:15:36.664 --> 0:15:39.121
492
+ Some numbers share.
493
+
494
+ 0:15:39.559 --> 0:15:46.958
495
+ Would somehow be bad because you would force
496
+ someone to do this by hand or clear how to
497
+
498
+ 0:15:46.958 --> 0:15:47.631
499
+ define.
500
+
501
+ 0:15:48.108 --> 0:15:55.135
502
+ So therefore currently this is the most successful
503
+ approach to just do this one watch.
504
+
505
+ 0:15:55.095 --> 0:15:59.563
506
+ Representations, so we take a fixed vocabulary.
507
+
508
+ 0:15:59.470 --> 0:16:06.124
509
+ We map each word to the inise, and then we
510
+ represent a word like this.
511
+
512
+ 0:16:06.030 --> 0:16:13.248
513
+ So if home will be one, the representation
514
+ will be one zero zero zero, and.
515
+
516
+ 0:16:14.514 --> 0:16:30.639
517
+ But this dimension here is a vocabulary size
518
+ and that is quite high, so we are always trying
519
+
520
+ 0:16:30.639 --> 0:16:33.586
521
+ to be efficient.
522
+
523
+ 0:16:33.853 --> 0:16:43.792
524
+ We are doing then some type of efficiency
525
+ because typically we are having this next layer.
526
+
527
+ 0:16:44.104 --> 0:16:51.967
528
+ It can be still maybe two hundred or five
529
+ hundred or one thousand neurons, but this is
530
+
531
+ 0:16:51.967 --> 0:16:53.323
532
+ significantly.
533
+
534
+ 0:16:53.713 --> 0:17:03.792
535
+ You can learn that directly and there we then
536
+ have similarity between words.
537
+
538
+ 0:17:03.662 --> 0:17:07.462
539
+ Then it is that some words.
540
+
541
+ 0:17:07.807 --> 0:17:14.772
542
+ But the nice thing is that this is then learned
543
+ that we are not need to hand define that.
544
+
545
+ 0:17:17.117 --> 0:17:32.742
546
+ We'll come later to the explicit architecture
547
+ of the neural language one, and there we can
548
+
549
+ 0:17:32.742 --> 0:17:35.146
550
+ see how it's.
551
+
552
+ 0:17:38.418 --> 0:17:44.857
553
+ So we're seeing that the other one or our
554
+ representation always has the same similarity.
555
+
556
+ 0:17:45.105 --> 0:17:59.142
557
+ Then we're having this continuous factor which
558
+ is a lot smaller dimension and that's important
559
+
560
+ 0:17:59.142 --> 0:18:00.768
561
+ for later.
562
+
563
+ 0:18:01.121 --> 0:18:06.989
564
+ What we are doing then is learning these representations
565
+ so that they are best for language.
566
+
567
+ 0:18:07.487 --> 0:18:14.968
568
+ So the representations are implicitly training
569
+ the language for the cards.
570
+
571
+ 0:18:14.869 --> 0:18:19.061
572
+ This is the best way for doing language.
573
+
574
+ 0:18:19.479 --> 0:18:32.564
575
+ And the nice thing that was found out later
576
+ is these representations are really good.
577
+
578
+ 0:18:33.153 --> 0:18:39.253
579
+ And that is why they are now even called word
580
+ embeddings by themselves and used for other
581
+
582
+ 0:18:39.253 --> 0:18:39.727
583
+ tasks.
584
+
585
+ 0:18:40.360 --> 0:18:49.821
586
+ And they are somewhat describing very different
587
+ things so they can describe and semantic similarities.
588
+
589
+ 0:18:49.789 --> 0:18:58.650
590
+ Are looking at the very example of today mass
591
+ vector space by adding words and doing some
592
+
593
+ 0:18:58.650 --> 0:19:00.618
594
+ interesting things.
595
+
596
+ 0:19:00.940 --> 0:19:11.178
597
+ So they got really like the first big improvement
598
+ when switching to neurostaff.
599
+
600
+ 0:19:11.491 --> 0:19:20.456
601
+ Are like part of the model, but with more
602
+ complex representation, but they are the basic
603
+
604
+ 0:19:20.456 --> 0:19:21.261
605
+ models.
606
+
607
+ 0:19:23.683 --> 0:19:36.979
608
+ In the output layer we are also having one
609
+ output layer structure and a connection function.
610
+
611
+ 0:19:36.997 --> 0:19:46.525
612
+ That is, for language learning we want to
613
+ predict what is the most common word.
614
+
615
+ 0:19:47.247 --> 0:19:56.453
616
+ And that can be done very well with this so
617
+ called soft back layer, where again the dimension.
618
+
619
+ 0:19:56.376 --> 0:20:02.825
620
+ Vocabulary size, so this is a vocabulary size,
621
+ and again the case neural represents the case
622
+
623
+ 0:20:02.825 --> 0:20:03.310
624
+ class.
625
+
626
+ 0:20:03.241 --> 0:20:09.759
627
+ So in our case we have again one round representation,
628
+ someone saying this is a core report.
629
+
630
+ 0:20:10.090 --> 0:20:17.255
631
+ Our probability distribution is a probability
632
+ distribution over all works, so the case entry
633
+
634
+ 0:20:17.255 --> 0:20:21.338
635
+ tells us how probable is that the next word
636
+ is this.
637
+
638
+ 0:20:22.682 --> 0:20:33.885
639
+ So we need to have some probability distribution
640
+ at our output in order to achieve that this
641
+
642
+ 0:20:33.885 --> 0:20:37.017
643
+ activation function goes.
644
+
645
+ 0:20:37.197 --> 0:20:46.944
646
+ And we can achieve that with a soft max activation
647
+ we take the input to the form of the value,
648
+
649
+ 0:20:46.944 --> 0:20:47.970
650
+ and then.
651
+
652
+ 0:20:48.288 --> 0:20:58.021
653
+ So by having this type of activation function
654
+ we are really getting this type of probability.
655
+
656
+ 0:20:59.019 --> 0:21:15.200
657
+ At the beginning was also very challenging
658
+ because again we have this inefficient representation.
659
+
660
+ 0:21:15.235 --> 0:21:29.799
661
+ You can imagine that something over is maybe
662
+ a bit inefficient with cheap users, but definitely.
663
+
664
+ 0:21:36.316 --> 0:21:44.072
665
+ And then for training the models that will
666
+ be fine, so we have to use architecture now.
667
+
668
+ 0:21:44.264 --> 0:21:48.491
669
+ We need to minimize the arrow.
670
+
671
+ 0:21:48.355 --> 0:21:53.196
672
+ Are we doing it taking the output?
673
+
674
+ 0:21:53.058 --> 0:21:58.180
675
+ We are comparing it to our targets.
676
+
677
+ 0:21:58.298 --> 0:22:03.830
678
+ So one important thing is by training them.
679
+
680
+ 0:22:03.705 --> 0:22:07.531
681
+ How can we measure the error?
682
+
683
+ 0:22:07.403 --> 0:22:12.763
684
+ So what is if we are training the ideas?
685
+
686
+ 0:22:13.033 --> 0:22:15.163
687
+ And how well we are measuring.
688
+
689
+ 0:22:15.094 --> 0:22:19.769
690
+ It is in natural language processing, typically
691
+ the cross entropy.
692
+
693
+ 0:22:19.960 --> 0:22:35.575
694
+ And that means we are comparing the target
695
+ with the output.
696
+
697
+ 0:22:35.335 --> 0:22:44.430
698
+ It gets optimized and you're seeing that this,
699
+ of course, makes it again very nice and easy
700
+
701
+ 0:22:44.430 --> 0:22:49.868
702
+ because our target is again a one-hour representation.
703
+
704
+ 0:22:50.110 --> 0:23:00.116
705
+ So all of these are always zero, and what
706
+ we are then doing is we are taking the one.
707
+
708
+ 0:23:00.100 --> 0:23:04.615
709
+ And we only need to multiply the one with
710
+ the logarithm here, and that is all the feedback
711
+
712
+ 0:23:04.615 --> 0:23:05.955
713
+ signal we are taking here.
714
+
715
+ 0:23:06.946 --> 0:23:13.885
716
+ Of course, this is not always influenced by
717
+ all the others.
718
+
719
+ 0:23:13.770 --> 0:23:17.936
720
+ Why is this influenced by all the.
721
+
722
+ 0:23:24.304 --> 0:23:34.382
723
+ Have the activation function, which is the
724
+ current activation divided by some of the others.
725
+
726
+ 0:23:34.354 --> 0:23:45.924
727
+ Otherwise it could easily just increase this
728
+ volume and ignore the others, but if you increase
729
+
730
+ 0:23:45.924 --> 0:23:49.090
731
+ one value all the others.
732
+
733
+ 0:23:51.351 --> 0:23:59.912
734
+ Then we can do with neometrics one very nice
735
+ and easy type of training that is done in all
736
+
737
+ 0:23:59.912 --> 0:24:07.721
738
+ the neometrics where we are now calculating
739
+ our error and especially the gradient.
740
+
741
+ 0:24:07.707 --> 0:24:11.640
742
+ So in which direction does the error show?
743
+
744
+ 0:24:11.548 --> 0:24:18.632
745
+ And then if we want to go to a smaller arrow
746
+ that's what we want to achieve.
747
+
748
+ 0:24:18.540 --> 0:24:26.629
749
+ We are taking the inverse direction of the
750
+ gradient and thereby trying to minimize our
751
+
752
+ 0:24:26.629 --> 0:24:27.280
753
+ error.
754
+
755
+ 0:24:27.287 --> 0:24:31.041
756
+ And we have to do that, of course, for all
757
+ the weights.
758
+
759
+ 0:24:30.974 --> 0:24:36.630
760
+ And to calculate the error of all the weights,
761
+ we won't do the full derivation here.
762
+
763
+ 0:24:36.563 --> 0:24:41.376
764
+ But what you can do is you can propagate
765
+ the error which we measured.
766
+
767
+ 0:24:41.309 --> 0:24:46.394
768
+ At the end you can propagate it back its basic
769
+ math and basic derivation.
770
+
771
+ 0:24:46.706 --> 0:24:58.854
772
+ For each weight in your model you measure how much
773
+ you contribute to the error and then change
774
+
775
+ 0:24:58.854 --> 0:25:01.339
776
+ it in a way that the error gets smaller.
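A minimal sketch of the update described here, going against the gradient with a hypothetical learning rate (illustrative only):

def gradient_step(weight, gradient, learning_rate=0.1):
    # take the inverse direction of the gradient to reduce the error
    return weight - learning_rate * gradient

print(gradient_step(0.5, 0.2))   # 0.48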
777
+
778
+ 0:25:04.524 --> 0:25:11.265
779
+ Multilayer Language Model: So to summarize what
780
+ for at least machine translation on your machine
781
+
782
+ 0:25:11.265 --> 0:25:18.502
783
+ translation should remember, you know, to understand
784
+ on this problem is that this is how a multilayer
785
+
786
+ 0:25:18.502 --> 0:25:20.631
787
+ perceptron looks like.
788
+
789
+ 0:25:20.580 --> 0:25:28.251
790
+ There are fully connected layers and no connections
791
+
792
+ 0:25:28.108 --> 0:25:29.759
793
+ Across layers.
794
+
795
+ 0:25:29.829 --> 0:25:35.153
796
+ And what they're doing is always just a weighted
797
+ sum here and then an activation function.
798
+
799
+ 0:25:35.415 --> 0:25:38.792
800
+ And in order to train you have this forward
801
+ and backward pass.
802
+
803
+ 0:25:39.039 --> 0:25:41.384
804
+ So We Put in Here.
805
+
806
+ 0:25:41.281 --> 0:25:41.895
807
+ Inputs.
808
+
809
+ 0:25:41.818 --> 0:25:45.285
810
+ We have some random values at the beginning.
811
+
812
+ 0:25:45.208 --> 0:25:47.300
813
+ Then calculate the output.
814
+
815
+ 0:25:47.222 --> 0:25:54.160
816
+ We are measuring how our error is propagating
817
+ the error back and then changing our model
818
+
819
+ 0:25:54.160 --> 0:25:57.902
820
+ in a way that we hopefully get a smaller error.
821
+
822
+ 0:25:57.824 --> 0:25:59.621
823
+ And then that is how.
824
+
825
+ 0:26:01.962 --> 0:26:12.893
826
+ So before we're coming into our neural networks
827
+ language models, how can we use this type of
828
+
829
+ 0:26:12.893 --> 0:26:17.595
830
+ neural network to do language modeling?
831
+
832
+ 0:26:23.103 --> 0:26:33.157
833
+ So how can we use them in natural language
834
+ processing, especially machine translation?
835
+
836
+ 0:26:33.042 --> 0:26:41.398
837
+ The first idea of using them was to estimate:
838
+ So we have seen that the output can be modeled
839
+
840
+ 0:26:41.398 --> 0:26:42.630
841
+ here as well.
842
+
843
+ 0:26:43.603 --> 0:26:50.311
844
+ A probability distribution and if we have
845
+ a full vocabulary we could mainly hear estimating
846
+
847
+ 0:26:50.311 --> 0:26:56.727
848
+ how probable each next word is and then use
849
+ that in our language model fashion as we've
850
+
851
+ 0:26:56.727 --> 0:26:58.112
852
+ done it last time.
853
+
854
+ 0:26:58.039 --> 0:27:03.217
855
+ We got the probability of a full sentence
856
+ as a product of individual.
857
+
858
+ 0:27:04.544 --> 0:27:12.555
859
+ And: That was done in the ninety seven years
860
+ and it's very easy to integrate it into this
861
+
862
+ 0:27:12.555 --> 0:27:14.602
863
+ log-linear model.
864
+
865
+ 0:27:14.513 --> 0:27:19.553
866
+ So we have said that this is how the log-linear
867
+ model looks like.
868
+
869
+ 0:27:19.478 --> 0:27:25.121
870
+ So we are searching the best translation which
871
+ maximizes this weighted sum.
872
+
873
+ 0:27:25.125 --> 0:27:26.362
874
+ Of the feature values.
875
+
876
+ 0:27:26.646 --> 0:27:31.647
877
+ We have that with minimum error rate training
878
+ if you can remember where we search for the
879
+
880
+ 0:27:31.647 --> 0:27:32.147
881
+ optimal.
882
+
883
+ 0:27:32.512 --> 0:27:40.422
884
+ The language model and many others, and we
885
+ can just add here a neuromodel, have a knock
886
+
887
+ 0:27:40.422 --> 0:27:41.591
888
+ of features.
889
+
890
+ 0:27:41.861 --> 0:27:45.761
891
+ So that is quite easy as said.
892
+
893
+ 0:27:45.635 --> 0:27:53.140
894
+ That was how statistical machine translation
895
+ was improved.
896
+
897
+ 0:27:53.013 --> 0:27:57.088
898
+ You just add one more feature.
899
+
900
+ 0:27:58.798 --> 0:28:07.631
901
+ So how can we model the language modeling
902
+ with a network?
903
+
904
+ 0:28:07.479 --> 0:28:16.010
905
+ So what we have to do is model the probability
906
+ of the.
907
+
908
+ 0:28:16.656 --> 0:28:25.047
909
+ The problem in general in the head is that
910
+ mostly we haven't seen long sequences.
911
+
912
+ 0:28:25.085 --> 0:28:35.650
913
+ Mostly we have to back off to very short sequences
914
+ and we are working on this discrete space where
915
+
916
+ 0:28:35.650 --> 0:28:36.944
917
+ similarity.
918
+
919
+ 0:28:37.337 --> 0:28:50.163
920
+ So the idea is if we have now a real network,
921
+ we can make words into continuous representation.
922
+
923
+ 0:28:51.091 --> 0:29:00.480
924
+ And the structure then looks like this, so
925
+ this is a basic still feed forward neural network.
926
+
927
+ 0:29:01.361 --> 0:29:10.645
928
+ We are doing this approximation again, so
929
+ we are not putting in all previous words, but
930
+
931
+ 0:29:10.645 --> 0:29:11.375
932
+ it is.
933
+
934
+ 0:29:11.691 --> 0:29:25.856
935
+ This is done because we said that in the real
936
+ network we can have only a fixed type of input.
937
+
938
+ 0:29:25.945 --> 0:29:31.886
939
+ You can only do a fixed step and then we'll
940
+ be doing that with exactly n minus one words.
941
+
942
+ 0:29:33.593 --> 0:29:39.536
943
+ So here you are, for example, three words
944
+ and three different words.
945
+
946
+ 0:29:39.450 --> 0:29:50.934
947
+ One and all the others are: And then we're
948
+ having the first layer of the neural network,
949
+
950
+ 0:29:50.934 --> 0:29:56.225
951
+ which like you learns is word embedding.
952
+
953
+ 0:29:57.437 --> 0:30:04.976
954
+ There is one thing which is maybe special
955
+ compared to the standard neural network.
956
+
957
+ 0:30:05.345 --> 0:30:11.918
958
+ So the representation of this word we want
959
+ to learn first of all position independence.
960
+
961
+ 0:30:11.843 --> 0:30:19.014
962
+ So we just want to learn what is the general
963
+ meaning of the word independent of its neighbors.
964
+
965
+ 0:30:19.299 --> 0:30:26.239
966
+ And therefore the representation you get here
967
+ should be the same as if in the second position.
968
+
969
+ 0:30:27.247 --> 0:30:36.865
970
+ The nice thing you can achieve is that this
971
+ weights which you're using here you're reusing
972
+
973
+ 0:30:36.865 --> 0:30:41.727
974
+ here and reusing here so we are forcing them.
975
+
976
+ 0:30:42.322 --> 0:30:48.360
977
+ You then learn your word embedding, which
978
+ is contextual, independent, so it's the same
979
+
980
+ 0:30:48.360 --> 0:30:49.678
981
+ for each position.
982
+
983
+ 0:30:49.909 --> 0:31:03.482
984
+ So that's the idea that you want to learn
985
+ the representation first of and you don't want
986
+
987
+ 0:31:03.482 --> 0:31:07.599
988
+ to really use the context.
989
+
990
+ 0:31:08.348 --> 0:31:13.797
991
+ That of course might have a different meaning
992
+ depending on where it stands, but we'll learn
993
+
994
+ 0:31:13.797 --> 0:31:14.153
995
+ that.
996
+
997
+ 0:31:14.514 --> 0:31:20.386
998
+ So first we are learning here representational
999
+ words, which is just the representation.
1000
+
1001
+ 0:31:20.760 --> 0:31:32.498
1002
+ Normally we said in neurons all input neurons
1003
+ here are connected to all here, but we're reducing
1004
+
1005
+ 0:31:32.498 --> 0:31:37.338
1006
+ the complexity by saying these neurons.
1007
+
1008
+ 0:31:37.857 --> 0:31:47.912
1009
+ Then we have a lot denser representation that
1010
+ is our three word embedded in here, and now
1011
+
1012
+ 0:31:47.912 --> 0:31:57.408
1013
+ we are learning this interaction between words,
1014
+ a direction between words not based.
1015
+
1016
+ 0:31:57.677 --> 0:32:08.051
1017
+ So we have at least one connected layer here,
1018
+ which takes a three embedding input and then
1019
+
1020
+ 0:32:08.051 --> 0:32:14.208
1021
+ learns a new embedding which now represents
1022
+ the full.
1023
+
1024
+ 0:32:15.535 --> 0:32:16.551
1025
+ Layers.
1026
+
1027
+ 0:32:16.424 --> 0:32:27.856
1028
+ It is the output layer which now and then
1029
+ again the probability distribution of all the.
1030
+
1031
+ 0:32:28.168 --> 0:32:48.612
1032
+ So here is your target prediction.
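A compact sketch, assuming PyTorch (my own code, not the lecturer's), of the feed-forward language model described here: a shared embedding for the n-1 context words, one hidden layer, and a softmax output over the vocabulary; all sizes are made up.

import torch
import torch.nn as nn

class FeedForwardLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=256, context=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)        # shared for all positions
        self.hidden = nn.Linear(context * emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_ids):                         # shape (batch, context)
        e = self.emb(context_ids).flatten(1)                # concatenated word embeddings
        h = torch.tanh(self.hidden(e))
        return torch.log_softmax(self.out(h), dim=-1)       # log-probabilities of the next word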
1033
+
1034
+ 0:32:48.688 --> 0:32:56.361
1035
+ The nice thing is that you learn everything
1036
+ together, so you don't have to teach them what
1037
+
1038
+ 0:32:56.361 --> 0:32:58.722
1039
+ a good word representation.
1040
+
1041
+ 0:32:59.079 --> 0:33:08.306
1042
+ Training the whole network together, so it
1043
+ learns what a good representation for a word
1044
+
1045
+ 0:33:08.306 --> 0:33:13.079
1046
+ you get in order to perform your final task.
1047
+
1048
+ 0:33:15.956 --> 0:33:19.190
1049
+ Yeah, that is the main idea.
1050
+
1051
+ 0:33:20.660 --> 0:33:32.731
1052
+ This is nowadays often referred to as one
1053
+ way of self-supervised learning.
1054
+
1055
+ 0:33:33.053 --> 0:33:37.120
1056
+ The output is the next word and the input
1057
+ is the previous word.
1058
+
1059
+ 0:33:37.377 --> 0:33:46.783
1060
+ But it's not really that we created labels,
1061
+ but we artificially created a task out of unlabeled.
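A small sketch of how such a task can be created from unlabeled text, pairing each position's previous words with the next word (illustrative only):

def make_examples(tokens, context=2):
    # one (previous words, next word) pair per position
    return [(tokens[i - context:i], tokens[i]) for i in range(context, len(tokens))]

print(make_examples("i go home".split()))   # [(['i', 'go'], 'home')]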
1062
+
1063
+ 0:33:46.806 --> 0:34:02.452
1064
+ We just had pure text, and then we created
1065
+ the task ourselves by predicting the next word,
1066
+
1067
+ 0:34:02.452 --> 0:34:18.818
1068
+ which is: Say we have like two sentences like
1069
+ go home and the second one is go to prepare.
1070
+
1071
+ 0:34:18.858 --> 0:34:30.135
1072
+ And then we have to predict the next series
1073
+ and my questions in the labels for the album.
1074
+
1075
+ 0:34:31.411 --> 0:34:42.752
1076
+ We model this as one vector with like probability
1077
+ for possible weights starting again.
1078
+
1079
+ 0:34:44.044 --> 0:34:57.792
1080
+ Multiple examples, so then you would twice
1081
+ train one to predict KRT, one to predict home,
1082
+
1083
+ 0:34:57.792 --> 0:35:02.374
1084
+ and then of course the easel.
1085
+
1086
+ 0:35:04.564 --> 0:35:13.568
1087
+ Is a very good point, so you are not aggregating
1088
+ examples beforehand, but you are taking each.
1089
+
1090
+ 0:35:19.259 --> 0:35:37.204
1091
+ So when you do it simultaneously learn the
1092
+ projection layer and the n-gram probabilities
1093
+
1094
+ 0:35:37.204 --> 0:35:39.198
1095
+ and then.
1096
+
1097
+ 0:35:39.499 --> 0:35:47.684
1098
+ And later analyze it that these representations
1099
+ are very powerful.
1100
+
1101
+ 0:35:47.562 --> 0:35:56.360
1102
+ The task is just a very important task to
1103
+ model what is the next word.
1104
+
1105
+ 0:35:56.816 --> 0:35:59.842
1106
+ Is motivated by nowadays.
1107
+
1108
+ 0:35:59.726 --> 0:36:10.668
1109
+ In order to get the meaning of the word you
1110
+ have to look at the company it keeps, the context.
1111
+
1112
+ 0:36:10.790 --> 0:36:16.048
1113
+ If you read texts in days of word which you
1114
+ have never seen, you often can still estimate
1115
+
1116
+ 0:36:16.048 --> 0:36:21.130
1117
+ the meaning of this word because you do not
1118
+ know how it is used, and this is typically
1119
+
1120
+ 0:36:21.130 --> 0:36:22.240
1121
+ used as a city or.
1122
+
1123
+ 0:36:22.602 --> 0:36:25.865
1124
+ Just imagine you read a text about some city.
1125
+
1126
+ 0:36:25.794 --> 0:36:32.033
1127
+ Even if you've never seen the city before,
1128
+ you often know from the context of how it's
1129
+
1130
+ 0:36:32.033 --> 0:36:32.464
1131
+ used.
1132
+
1133
+ 0:36:34.094 --> 0:36:42.483
1134
+ So what is now the big advantage of using
1135
+ neural networks?
1136
+
1137
+ 0:36:42.343 --> 0:36:51.853
1138
+ So just imagine we have to estimate that I
1139
+ bought my first iPhone.
1140
+
1141
+ 0:36:52.052 --> 0:36:56.608
1142
+ So you have to model the probability of
1143
+ this word given the previous ones.
1144
+
1145
+ 0:36:56.530 --> 0:37:00.239
1146
+ Now imagine iPhone, which you have never seen.
1147
+
1148
+ 0:37:00.600 --> 0:37:11.588
1149
+ So all the techniques we had last time at
1150
+ the end, if you haven't seen iPhone you will
1151
+
1152
+ 0:37:11.588 --> 0:37:14.240
1153
+ always fall back to.
1154
+
1155
+ 0:37:15.055 --> 0:37:26.230
1156
+ You have no idea how to deal that you won't
1157
+ have seen the diagram, the trigram, and all
1158
+
1159
+ 0:37:26.230 --> 0:37:27.754
1160
+ the others.
1161
+
1162
+ 0:37:28.588 --> 0:37:43.441
1163
+ If you're having this type of model, what
1164
+ does it do if you have my first and then something?
1165
+
1166
+ 0:37:43.483 --> 0:37:50.270
1167
+ Maybe this representation is really messed
1168
+ up because it is an out-of-vocabulary word.
1169
+
1170
+ 0:37:50.730 --> 0:37:57.793
1171
+ However, you have still these two information
1172
+ that two words before was first and therefore.
1173
+
1174
+ 0:37:58.098 --> 0:38:06.954
1175
+ So you have a lot of information in order
1176
+ to estimate how good it is.
1177
+
1178
+ 0:38:06.827 --> 0:38:13.282
1179
+ There could be more information if you know
1180
+ that.
1181
+
1182
+ 0:38:13.593 --> 0:38:25.168
1183
+ So all this type of modeling we can do that
1184
+ we couldn't do beforehand because we always
1185
+
1186
+ 0:38:25.168 --> 0:38:25.957
1187
+ have.
1188
+
1189
+ 0:38:27.027 --> 0:38:37.524
1190
+ Good point, so typically you would have one
1191
+ token for a vocabulary so that you could, for
1192
+
1193
+ 0:38:37.524 --> 0:38:45.922
1194
+ example: Or you're doing byte-pair encoding
1195
+ where you have a fixed vocabulary.
1196
+
1197
+ 0:38:46.226 --> 0:38:49.437
1198
+ Oh yeah, you have to do something like that
1199
+ that that that's true.
1200
+
1201
+ 0:38:50.050 --> 0:38:55.420
1202
+ So yeah, out-of-vocabulary words are handled by byte-pair encoding where
1203
+ you don't have unknown words anymore.
1204
+
1205
+ 0:38:55.735 --> 0:39:06.295
1206
+ But then, of course, you might be getting
1207
+ very long previous things, and your sequence
1208
+
1209
+ 0:39:06.295 --> 0:39:11.272
1210
+ length gets very long for unknown words.
1211
+
1212
+ 0:39:17.357 --> 0:39:20.067
1213
+ Any more questions on the basic setup?
1214
+
1215
+ 0:39:23.783 --> 0:39:36.719
1216
+ For this model, what we then want to continue
1217
+ is looking a bit into how complex or how we
1218
+
1219
+ 0:39:36.719 --> 0:39:39.162
1220
+ can make things.
1221
+
1222
+ 0:39:40.580 --> 0:39:49.477
1223
+ Because at the beginning there was definitely
1224
+ a major challenge, it's still not that easy,
1225
+
1226
+ 0:39:49.477 --> 0:39:58.275
1227
+ and I mean our likeers followed the talk about
1228
+ their environmental footprint and so on.
1229
+
1230
+ 0:39:58.478 --> 0:40:05.700
1231
+ So this calculation is not really heavy, and
1232
+ if you build systems yourselves you have to
1233
+
1234
+ 0:40:05.700 --> 0:40:06.187
1235
+ wait.
1236
+
1237
+ 0:40:06.466 --> 0:40:14.683
1238
+ So it's good to know a bit about how complex
1239
+ things are in order to do a good or efficient
1240
+
1241
+ 0:40:14.683 --> 0:40:15.405
1242
+ affair.
1243
+
1244
+ 0:40:15.915 --> 0:40:24.211
1245
+ So one thing where most of the calculation
1246
+ really happens is if you're doing it in a bad
1247
+
1248
+ 0:40:24.211 --> 0:40:24.677
1249
+ way.
1250
+
1251
+ 0:40:25.185 --> 0:40:33.523
1252
+ So in generally all these layers we are talking
1253
+ about networks and zones fancy.
1254
+
1255
+ 0:40:33.419 --> 0:40:46.713
1256
+ In the end it is: So what you have to do in
1257
+ order to calculate here, for example, these
1258
+
1259
+ 0:40:46.713 --> 0:40:52.454
1260
+ activations: So make it simple a bit.
1261
+
1262
+ 0:40:52.303 --> 0:41:06.633
1263
+ Let's see where outputs and you just do metric
1264
+ multiplication between your weight matrix and
1265
+
1266
+ 0:41:06.633 --> 0:41:08.482
1267
+ your input.
1268
+
1269
+ 0:41:08.969 --> 0:41:20.992
1270
+ So that is why computers are so powerful for
1271
+ neural networks because they are very good
1272
+
1273
+ 0:41:20.992 --> 0:41:22.358
1274
+ in doing.
1275
+
1276
+ 0:41:22.782 --> 0:41:28.013
1277
+ However, for some type for the embedding layer
1278
+ this is really very inefficient.
1279
+
1280
+ 0:41:28.208 --> 0:41:39.652
1281
+ So because remember we're having this one
1282
+ hot encoding in this input, it's always like
1283
+
1284
+ 0:41:39.652 --> 0:41:42.940
1285
+ one and everything else.
1286
+
1287
+ 0:41:42.809 --> 0:41:47.022
1288
+ It's zero if we're doing this.
1289
+
1290
+ 0:41:47.387 --> 0:41:55.552
1291
+ So therefore you can do at least the forward
1292
+ pass a lot more efficient if you don't really
1293
+
1294
+ 0:41:55.552 --> 0:42:01.833
1295
+ do this calculation, but you can select the
1296
+ one color where there is.
1297
+
1298
+ 0:42:01.743 --> 0:42:07.218
1299
+ Therefore, you also see this is called your
1300
+ word embedding.
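A sketch of why the forward pass can skip the multiplication here: a one-hot vector times the weight matrix just selects one row (NumPy, made-up sizes):

import numpy as np

vocab_size, emb_dim = 10000, 100
E = np.random.randn(vocab_size, emb_dim)     # embedding matrix, one row per word

word_id = 42
one_hot = np.zeros(vocab_size)
one_hot[word_id] = 1.0

assert np.allclose(one_hot @ E, E[word_id])  # same result, no full multiplication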
1301
+
1302
+ 0:42:08.348 --> 0:42:19.542
1303
+ So the weight matrix of the embedding layer
1304
+ is just that in each column you have the embedding
1305
+
1306
+ 0:42:19.542 --> 0:42:20.018
1307
+ of.
1308
+
1309
+ 0:42:20.580 --> 0:42:30.983
1310
+ So this is like how your initial weights look
1311
+ like and how you can interpret or understand.
1312
+
1313
+ 0:42:32.692 --> 0:42:39.509
1314
+ And this is already relatively important because
1315
+ remember this is a huge dimensional thing.
1316
+
1317
+ 0:42:39.435 --> 0:42:46.071
1318
+ So typically here we have the number of words
1319
+ is ten thousand or so, so this is the word
1320
+
1321
+ 0:42:46.071 --> 0:42:51.365
1322
+ embeddings metrics, typically the most expensive
1323
+ to calculate metrics.
1324
+
1325
+ 0:42:51.451 --> 0:42:59.741
1326
+ Because it's the largest one there, we have
1327
+ ten thousand entries, while for the hours we
1328
+
1329
+ 0:42:59.741 --> 0:43:00.393
1330
+ maybe.
1331
+
1332
+ 0:43:00.660 --> 0:43:03.408
1333
+ So therefore the addition to a little bit
1334
+ more to make this.
1335
+
1336
+ 0:43:06.206 --> 0:43:10.538
1337
+ Then you can go where else the calculations
1338
+ are very difficult.
1339
+
1340
+ 0:43:10.830 --> 0:43:20.389
1341
+ So here we then have our network, so we have
1342
+ the word embeddings.
1343
+
1344
+ 0:43:20.244 --> 0:43:29.516
1345
+ We have one hidden there, and then you can
1346
+ look how difficult.
1347
+
1348
+ 0:43:30.270 --> 0:43:38.746
1349
+ Could save a lot of calculation by not really
1350
+ calculating the selection because that is always.
1351
+
1352
+ 0:43:40.600 --> 0:43:46.096
1353
+ The number of calculations you have to do
1354
+ here is so.
1355
+
1356
+ 0:43:45.994 --> 0:43:51.695
1357
+ The length of this layer is minus one type
1358
+ projection.
1359
+
1360
+ 0:43:52.993 --> 0:43:56.321
1361
+ That is a hint size.
1362
+
1363
+ 0:43:56.162 --> 0:44:10.270
1364
+ So the first step of calculation for this
1365
+ metrics modification is how much calculation.
1366
+
1367
+ 0:44:10.730 --> 0:44:18.806
1368
+ Then you have to do some activation function
1369
+ and then you have to do again the calculation.
1370
+
1371
+ 0:44:19.339 --> 0:44:27.994
1372
+ Here we need the vocabulary size because we
1373
+ need to calculate the probability for each
1374
+
1375
+ 0:44:27.994 --> 0:44:29.088
1376
+ next word.
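A rough back-of-the-envelope sketch with assumed sizes, showing why the output layer dominates the cost in this architecture:

context, emb_dim, hidden, vocab = 3, 100, 256, 100000

hidden_cost = context * emb_dim * hidden    # (n-1) * projection size * hidden size
output_cost = hidden * vocab                # hidden size * vocabulary size

print(hidden_cost, output_cost)             # 76800 versus 25600000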
1377
+
1378
+ 0:44:29.889 --> 0:44:42.016
1379
+ And if you look at these numbers, so if you
1380
+ have a projector size of and a vocabulary size
1381
+
1382
+ 0:44:42.016 --> 0:44:53.609
1383
+ of, you see: And that is why there has been
1384
+ especially at the beginning some ideas how
1385
+
1386
+ 0:44:53.609 --> 0:44:55.608
1387
+ we can reduce.
1388
+
1389
+ 0:44:55.956 --> 0:45:01.942
1390
+ And if we really need to calculate all of
1391
+ our capabilities, or if we can calculate only
1392
+
1393
+ 0:45:01.942 --> 0:45:02.350
1394
+ some.
1395
+
1396
+ 0:45:02.582 --> 0:45:10.871
1397
+ And there again the one important thing to
1398
+ think about is for what will use my language
1399
+
1400
+ 0:45:10.871 --> 0:45:11.342
1401
+ model.
1402
+
1403
+ 0:45:11.248 --> 0:45:19.607
1404
+ I can use it for generations and that's what
1405
+ we will see next week in an achiever which
1406
+
1407
+ 0:45:19.607 --> 0:45:22.457
1408
+ really is guiding the search.
1409
+
1410
+ 0:45:23.123 --> 0:45:30.899
1411
+ If it just uses a feature, we do not want
1412
+ to use it for generations, but we want to only
1413
+
1414
+ 0:45:30.899 --> 0:45:32.559
1415
+ know how probable.
1416
+
1417
+ 0:45:32.953 --> 0:45:39.325
1418
+ There we might not be really interested in
1419
+ all the probabilities, but we already know
1420
+
1421
+ 0:45:39.325 --> 0:45:46.217
1422
+ we just want to know the probability of this
1423
+ one word, and then it might be very inefficient
1424
+
1425
+ 0:45:46.217 --> 0:45:49.403
1426
+ to really calculate all the probabilities.
1427
+
1428
+ 0:45:51.231 --> 0:45:52.919
1429
+ And how can you do that so?
1430
+
1431
+ 0:45:52.859 --> 0:45:56.297
1432
+ Initially, for example, the people look into
1433
+ shortlists.
1434
+
1435
+ 0:45:56.756 --> 0:46:02.276
1436
+ So this calculation at the end is really very
1437
+ expensive.
1438
+
1439
+ 0:46:02.179 --> 0:46:05.765
1440
+ So can we make that more efficient.
1441
+
1442
+ 0:46:05.945 --> 0:46:17.375
1443
+ And most words occur very rarely, and maybe
1444
+ we don't need anger, and so there we may want
1445
+
1446
+ 0:46:17.375 --> 0:46:18.645
1447
+ to focus.
1448
+
1449
+ 0:46:19.019 --> 0:46:29.437
1450
+ And so they use the smaller vocabulary, which
1451
+ is maybe.
1452
+
1453
+ 0:46:29.251 --> 0:46:34.581
1454
+ This layer is used from to.
1455
+
1456
+ 0:46:34.391 --> 0:46:37.640
1457
+ Then you merge.
1458
+
1459
+ 0:46:37.937 --> 0:46:45.162
1460
+ So you're taking, if the word is in the shortlist,
1461
+ so in the two thousand most frequent words.
1462
+
1463
+ 0:46:45.825 --> 0:46:58.299
1464
+ Of this short word by some normalization here,
1465
+ and otherwise you take a back of probability
1466
+
1467
+ 0:46:58.299 --> 0:46:59.655
1468
+ from the.
1469
+
1470
+ 0:47:00.020 --> 0:47:04.933
1471
+ It will not be as good, but the idea is okay.
1472
+
1473
+ 0:47:04.826 --> 0:47:13.994
1474
+ Then we don't have to calculate all these
1475
+ probabilities here at the end, but we only
1476
+
1477
+ 0:47:13.994 --> 0:47:16.043
1478
+ have to calculate.
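A sketch of the shortlist idea under these assumptions: a softmax only over the frequent words and a back-off value for the rest; backoff_prob is a hypothetical function, e.g. from an n-gram model.

import math

def shortlist_prob(word, scores, shortlist, backoff_prob):
    if word in shortlist:
        exps = {w: math.exp(scores[w]) for w in shortlist}
        return exps[word] / sum(exps.values())   # normalize only over the shortlist
    return backoff_prob(word)                    # infrequent word: no neural softmax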
1479
+
1480
+ 0:47:19.599 --> 0:47:32.097
1481
+ With some type of cost because it means we
1482
+ don't model the probability of the infrequent
1483
+
1484
+ 0:47:32.097 --> 0:47:39.399
1485
+ words, and maybe it's even very important to
1486
+ model.
1487
+
1488
+ 0:47:39.299 --> 0:47:46.671
1489
+ And one idea is to do what is reported as
1490
+ so so structured out there.
1491
+
1492
+ 0:47:46.606 --> 0:47:49.571
1493
+ Network language models you see some years
1494
+ ago.
1495
+
1496
+ 0:47:49.510 --> 0:47:53.155
1497
+ People were very creative and giving names
1498
+ to new models.
1499
+
1500
+ 0:47:53.813 --> 0:48:00.341
1501
+ And there the idea is that we model the output
1502
+ vocabulary as a cluster tree.
1503
+
1504
+ 0:48:00.680 --> 0:48:06.919
1505
+ So you don't need to model all of our bodies
1506
+ directly, but you are putting words into a
1507
+
1508
+ 0:48:06.919 --> 0:48:08.479
1509
+ sequence of clusters.
1510
+
1511
+ 0:48:08.969 --> 0:48:15.019
1512
+ So maybe a very infrequent word is first
1513
+ in cluster three and then in cluster three.
1514
+
1515
+ 0:48:14.949 --> 0:48:21.212
1516
+ You have subclusters again and there is subclusters
1517
+ seven and subclusters and there is.
1518
+
1519
+ 0:48:21.541 --> 0:48:40.134
1520
+ And this is the path, so that is what was
1521
+ the man in the past.
1522
+
1523
+ 0:48:40.340 --> 0:48:52.080
1524
+ And then you can calculate the probability
1525
+ of the word again just by the product of the
1526
+
1527
+ 0:48:52.080 --> 0:48:55.548
1528
+ class probabilities along this path.
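A toy sketch of this factorization with a one-level class tree and made-up numbers: the word probability is the class probability times the in-class probability.

class_of = {"cat": "animals", "dog": "animals", "car": "things"}
p_class = {"animals": 0.6, "things": 0.4}                   # P(class | history)
p_in_class = {"animals": {"cat": 0.7, "dog": 0.3},          # P(word | class, history)
              "things": {"car": 1.0}}

def word_prob(word):
    c = class_of[word]
    return p_class[c] * p_in_class[c][word]

print(word_prob("dog"))   # 0.6 * 0.3 = 0.18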
1529
+
1530
+ 0:48:57.617 --> 0:49:07.789
1531
+ That it may be more clear where you have this
1532
+ architecture, so this is all the same.
1533
+
1534
+ 0:49:07.670 --> 0:49:13.775
1535
+ But then you first predict here which main
1536
+ class.
1537
+
1538
+ 0:49:14.154 --> 0:49:24.226
1539
+ Then you go to the appropriate subclass, then
1540
+ you calculate the probability of the subclass
1541
+
1542
+ 0:49:24.226 --> 0:49:26.415
1543
+ and maybe the cell.
1544
+
1545
+ 0:49:27.687 --> 0:49:35.419
1546
+ Anybody have an idea why this is more efficient
1547
+ or if you do it first, it looks a lot more.
1548
+
1549
+ 0:49:42.242 --> 0:49:51.788
1550
+ You have to do less calculations, so maybe
1551
+ if you do it here you have to calculate the
1552
+
1553
+ 0:49:51.788 --> 0:49:59.468
1554
+ element there, but you don't have to do all
1555
+ the one hundred thousand.
1556
+
1557
+ 0:49:59.980 --> 0:50:06.115
1558
+ The probabilities in the set classes that
1559
+ you're going through and not for all of them.
1560
+
1561
+ 0:50:06.386 --> 0:50:18.067
1562
+ Therefore, it's more efficient if you don't
1563
+ need all output proficient because you have
1564
+
1565
+ 0:50:18.067 --> 0:50:21.253
1566
+ to calculate the class.
1567
+
1568
+ 0:50:21.501 --> 0:50:28.936
1569
+ So it's only more efficient in scenarios
1570
+ where you really need to use a language model
1571
+
1572
+ 0:50:28.936 --> 0:50:30.034
1573
+ to evaluate.
1574
+
1575
+ 0:50:35.275 --> 0:50:52.456
1576
+ How this works was that you can train first
1577
+ in your language one on the short list.
1578
+
1579
+ 0:50:52.872 --> 0:51:03.547
1580
+ But on the input layer you have your full
1581
+ vocabulary because at the input we saw that
1582
+
1583
+ 0:51:03.547 --> 0:51:06.650
1584
+ this is not complicated.
1585
+
1586
+ 0:51:06.906 --> 0:51:26.638
1587
+ And then you can cluster down all your words
1588
+ here into classes and use that as your classes.
1589
+
1590
+ 0:51:29.249 --> 0:51:34.148
1591
+ That is one idea of doing it.
1592
+
1593
+ 0:51:33.985 --> 0:51:44.930
1594
+ There is also a second idea of doing it, and
1595
+ again we don't need.
1596
+
1597
+ 0:51:45.025 --> 0:51:53.401
1598
+ So sometimes it doesn't really need to be
1599
+ a probability to evaluate.
1600
+
1601
+ 0:51:53.280 --> 0:51:56.562
1602
+ It's only important that.
1603
+
1604
+ 0:51:58.298 --> 0:52:04.989
1605
+ And: Here it's called self normalization what
1606
+ people have done so.
1607
+
1608
+ 0:52:04.889 --> 0:52:11.552
1609
+ We have seen that the probability is in this
1610
+ soft mechanism always to the input divided
1611
+
1612
+ 0:52:11.552 --> 0:52:18.214
1613
+ by our normalization, and the normalization
1614
+ is a summary of the vocabulary to the power
1615
+
1616
+ 0:52:18.214 --> 0:52:19.274
1617
+ of the spell.
1618
+
1619
+ 0:52:19.759 --> 0:52:25.194
1620
+ So this is how we calculate the software.
1621
+
1622
+ 0:52:25.825 --> 0:52:41.179
1623
+ In self normalization of the idea, if this
1624
+ would be zero then we don't need to calculate
1625
+
1626
+ 0:52:41.179 --> 0:52:42.214
1627
+ that.
1628
+
1629
+ 0:52:42.102 --> 0:52:54.272
1630
+ Will be zero, and then you don't even have
1631
+ to calculate the normalization because it's.
1632
+
1633
+ 0:52:54.514 --> 0:53:08.653
1634
+ So how can we achieve that and then the nice
1635
+ thing in your networks?
1636
+
1637
+ 0:53:09.009 --> 0:53:23.928
1638
+ And now we're just adding a second note with
1639
+ some either permitted here.
1640
+
1641
+ 0:53:24.084 --> 0:53:29.551
1642
+ And the second lost just tells us he'll be
1643
+ strained away.
1644
+
1645
+ 0:53:29.457 --> 0:53:31.630
1646
+ The locks at is zero.
1647
+
1648
+ 0:53:32.352 --> 0:53:38.614
1649
+ So then if it's nearly zero at the end we
1650
+ don't need to calculate this and it's also
1651
+
1652
+ 0:53:38.614 --> 0:53:39.793
1653
+ very efficient.
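A sketch of a self-normalized loss under these assumptions: the usual cross-entropy term plus a penalty that pushes the log of the normalization towards zero, weighted by a hypothetical alpha:

import math

def self_normalized_loss(scores, target_index, alpha=0.1):
    log_z = math.log(sum(math.exp(s) for s in scores))
    nll = -(scores[target_index] - log_z)     # normal negative log-likelihood
    return nll + alpha * log_z ** 2           # extra term: keep log Z close to zero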
1654
+
1655
+ 0:53:40.540 --> 0:53:49.498
1656
+ One important thing is this, of course, is
1657
+ only in inference.
1658
+
1659
+ 0:53:49.354 --> 0:54:02.039
1660
+ During tests we don't need to calculate that
1661
+ because: You can do a bit of a hyperparameter
1662
+
1663
+ 0:54:02.039 --> 0:54:14.446
1664
+ here where you do the weighting, so how good
1665
+ should it be estimating the probabilities and
1666
+
1667
+ 0:54:14.446 --> 0:54:16.816
1668
+ how much effort?
1669
+
1670
+ 0:54:18.318 --> 0:54:28.577
1671
+ The only disadvantage is no speed up during
1672
+ training.
1673
+
1674
+ 0:54:28.387 --> 0:54:43.971
1675
+ There are other ways of doing that, for example:
1676
+ Englishman is in case you get it.
1677
+
1678
+ 0:54:44.344 --> 0:54:48.540
1679
+ Then we are coming very, very briefly like
1680
+ just one idea.
1681
+
1682
+ 0:54:48.828 --> 0:54:53.058
1683
+ That there is more things on different types
1684
+ of language models.
1685
+
1686
+ 0:54:52.992 --> 0:54:58.003
1687
+ We are having a very short view on restricted
1688
+ person-based language models.
1689
+
1690
+ 0:54:58.298 --> 0:55:08.931
1691
+ Talk about recurrent neural networks for language
1692
+ models because they have the advantage that
1693
+
1694
+ 0:55:08.931 --> 0:55:17.391
1695
+ we can even further improve by not having a
1696
+ continuous representation on.
1697
+
1698
+ 0:55:18.238 --> 0:55:23.845
1699
+ So there's different types of neural networks.
1700
+
1701
+ 0:55:23.726 --> 0:55:30.171
1702
+ These are these Boltzmann machines and the interesting.
1703
+
1704
+ 0:55:30.330 --> 0:55:38.519
1705
+ They have these: And they define like an energy
1706
+ function on the network, which can be in restricted
1707
+
1708
+ 0:55:38.519 --> 0:55:44.415
1709
+ balsam machines efficiently calculated in general
1710
+ and restricted needs.
1711
+
1712
+ 0:55:44.333 --> 0:55:51.138
1713
+ You only have connection between the input
1714
+ and the hidden layer, but you don't have connections
1715
+
1716
+ 0:55:51.138 --> 0:55:53.123
1717
+ in the input or within the.
1718
+
1719
+ 0:55:53.393 --> 0:56:00.194
1720
+ So you see here you don't have an input output,
1721
+ you just have an input, and you calculate.
1722
+
1723
+ 0:56:00.460 --> 0:56:15.612
1724
+ Which of course nicely fits with the idea
1725
+ we're having, so you can then use this for
1726
+
1727
+ 0:56:15.612 --> 0:56:19.177
1728
+ an N Gram language.
1729
+
1730
+ 0:56:19.259 --> 0:56:25.189
1731
+ Retaining the flexibility of the input by
1732
+ this type of neon networks.
1733
+
1734
+ 0:56:26.406 --> 0:56:30.589
1735
+ And the advantage of this type of model was
1736
+ there's.
1737
+
1738
+ 0:56:30.550 --> 0:56:37.520
1739
+ Very, very fast to integrate it, so that one
1740
+ was the first one which was used during the
1741
+
1742
+ 0:56:37.520 --> 0:56:38.616
1743
+ coding model.
1744
+
1745
+ 0:56:38.938 --> 0:56:45.454
1746
+ The engram language models were that they
1747
+ were very good and gave performance.
1748
+
1749
+ 0:56:45.371 --> 0:56:50.073
1750
+ However, calculation still with all these
1751
+ tricks takes.
1752
+
1753
+ 0:56:50.230 --> 0:56:58.214
1754
+ We have talked about n-best lists, so they
1755
+ generated an n-best list of the most probable
1756
+
1757
+ 0:56:58.214 --> 0:57:05.836
1758
+ outputs and then they took this n-best list and
1759
+ scored each entry with a neural network.
1760
+
1761
+ 0:57:06.146 --> 0:57:09.306
1762
+ A language model, and then only change the
1763
+ order again.
1764
+
1765
+ 0:57:09.250 --> 0:57:10.889
1766
+ Select based on that which.
1767
+
1768
+ 0:57:11.231 --> 0:57:17.187
1769
+ The n-best list is maybe only like hundred
1770
+ entries.
1771
+
1772
+ 0:57:17.083 --> 0:57:21.788
1773
+ When decoding you look at several thousand.
1774
+
1775
+ 0:57:26.186 --> 0:57:35.196
1776
+ Let's look at the context so we have now seen
1777
+ your language models.
1778
+
1779
+ 0:57:35.063 --> 0:57:43.678
1780
+ There is the big advantage we can use this
1781
+ word similarity and.
1782
+
1783
+ 0:57:44.084 --> 0:57:52.266
1784
+ Remember for engram language ones is not always
1785
+ minus one words because sometimes you have
1786
+
1787
+ 0:57:52.266 --> 0:57:59.909
1788
+ to back off or interpolation to lower engrams
1789
+ and you don't know the previous words.
1790
+
1791
+ 0:58:00.760 --> 0:58:04.742
1792
+ And however in neural models we always have
1793
+ all of this importance.
1794
+
1795
+ 0:58:04.684 --> 0:58:05.508
1796
+ Can some of.
1797
+
1798
+ 0:58:07.147 --> 0:58:20.288
1799
+ The disadvantage is that you are still limited
1800
+ in your context, and if you remember the sentence
1801
+
1802
+ 0:58:20.288 --> 0:58:22.998
1803
+ from last lecture,.
1804
+
1805
+ 0:58:22.882 --> 0:58:28.328
1806
+ Sometimes you need more context and there
1807
+ is unlimited context that you might need and
1808
+
1809
+ 0:58:28.328 --> 0:58:34.086
1810
+ you can always create sentences where you may
1811
+ need this five context in order to put a good
1812
+
1813
+ 0:58:34.086 --> 0:58:34.837
1814
+ estimation.
1815
+
1816
+ 0:58:35.315 --> 0:58:44.956
1817
+ Can also do it different in order to understand
1818
+ that it makes sense to view language.
1819
+
1820
+ 0:58:45.445 --> 0:58:58.559
1821
+ Sequence Labeling: So sequence labeling tasks are
1822
+ a very common type of task in language processing
1823
+
1824
+ 0:58:58.559 --> 0:59:03.442
1825
+ where you have the input sequence.
1826
+
1827
+ 0:59:03.323 --> 0:59:05.976
1828
+ So you have one output for each input.
1829
+
1830
+ 0:59:05.908 --> 0:59:12.337
1831
+ Machine translation is not a secret labeling
1832
+ cast because the number of inputs and the number
1833
+
1834
+ 0:59:12.337 --> 0:59:14.046
1835
+ of outputs is different.
1836
+
1837
+ 0:59:13.978 --> 0:59:19.940
1838
+ So you put in a string German which has five
1839
+ words and the output can be: See, for example,
1840
+
1841
+ 0:59:19.940 --> 0:59:24.088
1842
+ you always have the same number and the same
1843
+ number of offices.
1844
+
1845
+ 0:59:24.944 --> 0:59:39.779
1846
+ And you can more language waddling as that,
1847
+ and you just say the label for each word is
1848
+
1849
+ 0:59:39.779 --> 0:59:43.151
1850
+ always the next word.
1851
+
1852
+ 0:59:45.705 --> 0:59:50.312
1853
+ This is the more generous you can think of
1854
+ it.
1855
+
1856
+ 0:59:50.214 --> 0:59:56.195
1857
+ For example, Paddle Speech Taking named Entity
1858
+ Recognition.
1859
+
1860
+ 0:59:58.938 --> 1:00:12.703
1861
+ And if you look at now, this output token
1862
+ and generally sequenced labeling can depend
1863
+
1864
+ 1:00:12.703 --> 1:00:26.788
1865
+ on: The input tokens are the same so we can
1866
+ easily model it and they only depend on the
1867
+
1868
+ 1:00:26.788 --> 1:00:29.028
1869
+ input tokens.
1870
+
1871
+ 1:00:31.011 --> 1:00:42.306
1872
+ But we can always look at one specific type
1873
+ of sequence labeling, unidirectional sequence
1874
+
1875
+ 1:00:42.306 --> 1:00:44.189
1876
+ labeling type.
1877
+
1878
+ 1:00:44.584 --> 1:01:00.855
1879
+ The probability of the next word only depends
1880
+ on the previous words that we are having here.
1881
+
1882
+ 1:01:01.321 --> 1:01:05.998
1883
+ That's also not completely true in language.
1884
+
1885
+ 1:01:05.894 --> 1:01:14.419
1886
+ Well, the back context might also be helpful
1887
+ by direction of the model's Google.
1888
+
1889
+ 1:01:14.654 --> 1:01:23.039
1890
+ We will always model the probability of the
1891
+ word given on its history.
1892
+
1893
+ 1:01:23.623 --> 1:01:30.562
1894
+ And currently there is approximation and sequence
1895
+ labeling that we have this windowing approach.
1896
+
1897
+ 1:01:30.951 --> 1:01:43.016
1898
+ So in order to predict this type of word we
1899
+ always look at the previous three words.
1900
+
1901
+ 1:01:42.874 --> 1:01:48.414
1902
+ This is this type of windowing model.
1903
+
1904
+ 1:01:49.389 --> 1:01:54.780
1905
+ If you're into neural networks you recognize
1906
+ this type of structure.
1907
+
1908
+ 1:01:54.702 --> 1:01:57.517
1909
+ Also, the typical neural networks.
1910
+
1911
+ 1:01:58.938 --> 1:02:11.050
1912
+ Yes, yes, so like n-gram models you can, at
1913
+ least in some way, prepare for that type of
1914
+
1915
+ 1:02:11.050 --> 1:02:12.289
1916
+ context.
1917
+
1918
+ 1:02:14.334 --> 1:02:23.321
1919
+ Are also other types of neonamic structures
1920
+ which we can use for sequence labeling and which
1921
+
1922
+ 1:02:23.321 --> 1:02:30.710
1923
+ might help us where we don't have this type
1924
+ of fixed size representation.
1925
+
1926
+ 1:02:32.812 --> 1:02:34.678
1927
+ That we can do so.
1928
+
1929
+ 1:02:34.580 --> 1:02:39.348
1930
+ The idea in recurrent neural networks is:
1931
+
1932
+ 1:02:39.249 --> 1:02:43.225
1933
+ We are saving complete history in one.
1934
+
1935
+ 1:02:43.623 --> 1:02:56.946
1936
+ So again we have to do this fixed size representation
1937
+ because the neural networks always need a habit.
1938
+
1939
+ 1:02:57.157 --> 1:03:09.028
1940
+ And then the network should look like that,
1941
+ so we start with an initial value for our storage.
1942
+
1943
+ 1:03:08.903 --> 1:03:15.903
1944
+ We are giving our first input and calculating
1945
+ the new.
1946
+
1947
+ 1:03:16.196 --> 1:03:33.972
1948
+ So again in your network with two types of
1949
+ inputs: Then you can apply it to the next type
1950
+
1951
+ 1:03:33.972 --> 1:03:41.676
1952
+ of input and you're again having this.
1953
+
1954
+ 1:03:41.478 --> 1:03:46.395
1955
+ You're taking this hidden state.
1956
+
1957
+ 1:03:47.367 --> 1:03:53.306
1958
+ Nice thing is now that you can do now step
1959
+ by step by step, so all the way over.
1960
+
1961
+ 1:03:55.495 --> 1:04:06.131
1962
+ The nice thing we are having here now is that
1963
+ now we are having context information from
1964
+
1965
+ 1:04:06.131 --> 1:04:07.206
1966
+ all the.
1967
+
1968
+ 1:04:07.607 --> 1:04:14.181
1969
+ So if you're looking like based on which words
1970
+ do you, you calculate the probability of varying.
1971
+
1972
+ 1:04:14.554 --> 1:04:20.090
1973
+ It depends on this part.
1974
+
1975
+ 1:04:19.868 --> 1:04:33.158
1976
+ It depends on and this hidden state was influenced
1977
+ by two.
1978
+
1979
+ 1:04:33.473 --> 1:04:38.259
1980
+ So now we're having something new.
1981
+
1982
+ 1:04:38.122 --> 1:04:46.465
1983
+ We can model like the word probability not
1984
+ only on a fixed.
1985
+
1986
+ 1:04:46.906 --> 1:04:53.565
1987
+ Because the hidden states we are having here
1988
+ in our RNN are influenced by all the previous words.
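A minimal NumPy sketch of the recurrent step described here: the new hidden state mixes the previous hidden state with the current word representation (random weights, illustrative only):

import numpy as np

hidden_dim, emb_dim = 8, 4
W_h = np.random.randn(hidden_dim, hidden_dim) * 0.1
W_x = np.random.randn(hidden_dim, emb_dim) * 0.1

def rnn_step(h_prev, x):
    return np.tanh(W_h @ h_prev + W_x @ x)

h = np.zeros(hidden_dim)
for x in np.random.randn(5, emb_dim):   # five "word" vectors
    h = rnn_step(h, x)                  # h now summarizes the whole history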
1989
+
1990
+ 1:04:56.296 --> 1:05:02.578
1991
+ So how is there to be Singapore?
1992
+
1993
+ 1:05:02.388 --> 1:05:16.289
1994
+ But then we have the initial idea about this
1995
+ P of given on the history.
1996
+
1997
+ 1:05:16.736 --> 1:05:25.300
1998
+ So do not need to do any clustering here,
1999
+ and you also see how things are put together
2000
+
2001
+ 1:05:25.300 --> 1:05:26.284
2002
+ in order.
2003
+
2004
+ 1:05:29.489 --> 1:05:43.449
2005
+ The green box this night since we are starting
2006
+ from the left to the right.
2007
+
2008
+ 1:05:44.524 --> 1:05:51.483
2009
+ Voices: Yes, that's right, so there are clusters,
2010
+ and here is also sometimes clustering happens.
2011
+
2012
+ 1:05:51.871 --> 1:05:58.687
2013
+ The small difference does matter again, so
2014
+ if you have now a lot of different histories,
2015
+
2016
+ 1:05:58.687 --> 1:06:01.674
2017
+ the similarity which you have in here.
2018
+
2019
+ 1:06:01.597 --> 1:06:08.239
2020
+ If two of the histories are very similar,
2021
+ these representations will be the same, and
2022
+
2023
+ 1:06:08.239 --> 1:06:10.787
2024
+ then you're treating them again.
2025
+
2026
+ 1:06:11.071 --> 1:06:15.789
2027
+ Because in order to do the final restriction
2028
+ you only do a good base on the green box.
2029
+
2030
+ 1:06:16.156 --> 1:06:28.541
2031
+ So you are now still learning some type of
2032
+ clustering in there, but you are learning it
2033
+
2034
+ 1:06:28.541 --> 1:06:30.230
2035
+ implicitly.
2036
+
2037
+ 1:06:30.570 --> 1:06:38.200
2038
+ The only restriction you're giving is you
2039
+ have to stall everything that is important
2040
+
2041
+ 1:06:38.200 --> 1:06:39.008
2042
+ in this.
2043
+
2044
+ 1:06:39.359 --> 1:06:54.961
2045
+ So it's a different type of limitation, so
2046
+ you calculate the probability based on the
2047
+
2048
+ 1:06:54.961 --> 1:06:57.138
2049
+ last words.
2050
+
2051
+ 1:06:57.437 --> 1:07:04.430
2052
+ And that is how you still need to somehow
2053
+ cluster things together in order to do efficiently.
2054
+
2055
+ 1:07:04.356 --> 1:07:09.564
2056
+ Of course, you need to do some type of clustering
2057
+ because otherwise.
2058
+
2059
+ 1:07:09.970 --> 1:07:18.865
2060
+ But this is where things get merged together
2061
+ in this type of hidden representation.
2062
+
2063
+ 1:07:18.760 --> 1:07:27.975
2064
+ So here the probability of the word first
2065
+ only depends on this hidden representation.
2066
+
2067
+ 1:07:28.288 --> 1:07:33.104
2068
+ On the previous words, but they are some other
2069
+ bottleneck in order to make a good estimation.
2070
+
2071
+ 1:07:34.474 --> 1:07:41.231
2072
+ So the idea is that we can store all our history
2073
+ into or into one lecture.
2074
+
2075
+ 1:07:41.581 --> 1:07:44.812
2076
+ Which is the one that makes it more strong.
2077
+
2078
+ 1:07:44.739 --> 1:07:51.238
2079
+ Next we come to problems that of course at
2080
+ some point it might be difficult if you have
2081
+
2082
+ 1:07:51.238 --> 1:07:57.811
2083
+ very long sequences and you always write all
2084
+ the information you have on this one block.
2085
+
2086
+ 1:07:58.398 --> 1:08:02.233
2087
+ Then maybe things get overwritten or you cannot
2088
+ store everything in there.
2089
+
2090
+ 1:08:02.662 --> 1:08:04.514
2091
+ So,.
2092
+
2093
+ 1:08:04.184 --> 1:08:09.569
2094
+ Therefore, yet for short things like single
2095
+ sentences that works well, but especially if
2096
+
2097
+ 1:08:09.569 --> 1:08:15.197
2098
+ you think of other tasks and like symbolizations
2099
+ with our document based on T where you need
2100
+
2101
+ 1:08:15.197 --> 1:08:20.582
2102
+ to consider the full document, these things
2103
+ got got a bit more more more complicated and
2104
+
2105
+ 1:08:20.582 --> 1:08:23.063
2106
+ will learn another type of architecture.
2107
+
2108
+ 1:08:24.464 --> 1:08:30.462
2109
+ In order to understand these neighbors, it
2110
+ is good to have all the bus use always.
2111
+
2112
+ 1:08:30.710 --> 1:08:33.998
2113
+ So this is the unrolled view.
2114
+
2115
+ 1:08:33.889 --> 1:08:43.754
2116
+ Somewhere you're over the type or in language
2117
+ over the words you're unrolling a network.
2118
+
2119
+ 1:08:44.024 --> 1:08:52.096
2120
+ Here is the article and here is the network
2121
+ which is connected by itself and that is recurrent.
2122
+
2123
+ 1:08:56.176 --> 1:09:04.982
2124
+ There is one challenge in this networks and
2125
+ training.
2126
+
2127
+ 1:09:04.819 --> 1:09:11.998
2128
+ We can train them first of all as forward.
2129
+
2130
+ 1:09:12.272 --> 1:09:19.397
2131
+ So we don't really know how to train them,
2132
+ but if you unroll them like this is a feet
2133
+
2134
+ 1:09:19.397 --> 1:09:20.142
2135
+ forward.
2136
+
2137
+ 1:09:20.540 --> 1:09:38.063
2138
+ Is exactly the same, so you can measure your
2139
+ arrows here and be back to your arrows.
2140
+
2141
+ 1:09:38.378 --> 1:09:45.646
2142
+ If you unroll something, it's a feature in
2143
+ your laptop and you can train it the same way.
2144
+
2145
+ 1:09:46.106 --> 1:09:57.606
2146
+ The only important thing is again, of course,
2147
+ for different inputs.
2148
+
2149
+ 1:09:57.837 --> 1:10:05.145
2150
+ But since parameters are shared, it's somehow
2151
+ a similar point you can train it.
2152
+
2153
+ 1:10:05.054 --> 1:10:08.802
2154
+ The training algorithm is very similar.
2155
+
2156
+ 1:10:10.310 --> 1:10:29.568
2157
+ One thing which makes things difficult is
2158
+ what is referred to as the vanishing gradient.
2159
+
2160
+ 1:10:29.809 --> 1:10:32.799
2161
+ That's a very strong thing in the motivation
2162
+ of using hardness.
2163
+
2164
+ 1:10:33.593 --> 1:10:44.604
2165
+ The influence here gets smaller and smaller,
2166
+ and the modems are not really able to monitor.
2167
+
2168
+ 1:10:44.804 --> 1:10:51.939
2169
+ Because the gradient gets smaller and smaller,
2170
+ and so the error here propagated to this one
2171
+
2172
+ 1:10:51.939 --> 1:10:58.919
2173
+ that contributes to the error is very small,
2174
+ and therefore you don't do any changes there
2175
+
2176
+ 1:10:58.919 --> 1:10:59.617
2177
+ anymore.
2178
+
2179
+ 1:11:00.020 --> 1:11:06.703
2180
+ And yeah, that's why standard art men are
2181
+ undifficult or have to pick them at custard.
2182
+
2183
+ 1:11:07.247 --> 1:11:11.442
2184
+ Long Short-Term Memory (LSTM): So when everybody is talking
2185
+ about RNNs nowadays,
2186
+
2187
+ 1:11:11.791 --> 1:11:23.333
2188
+ What we are typically meaning are LSTMs or
2189
+ long short-term memories.
2190
+
2191
+ 1:11:23.155 --> 1:11:30.972
2192
+ You see they are by now quite old already.
2193
+
2194
+ 1:11:31.171 --> 1:11:39.019
2195
+ So there was a model in the language model
2196
+ task.
2197
+
2198
+ 1:11:38.858 --> 1:11:44.789
2199
+ It's some more storing information.
2200
+
2201
+ 1:11:44.684 --> 1:11:51.556
2202
+ Because if you only look at the last words,
2203
+ it's often no longer clear this is a question
2204
+
2205
+ 1:11:51.556 --> 1:11:52.548
2206
+ or a normal.
2207
+
2208
+ 1:11:53.013 --> 1:12:05.318
2209
+ So there you have these mechanisms with ripgate
2210
+ in order to store things for a longer time
2211
+
2212
+ 1:12:05.318 --> 1:12:08.563
2213
+ into your hidden state.
2214
+
2215
+ 1:12:10.730 --> 1:12:20.162
2216
+ Here they are used in in in selling quite
2217
+ a lot of works.
2218
+
2219
+ 1:12:21.541 --> 1:12:29.349
2220
+ For especially machine translation now, the
2221
+ standard is to do transformer-based models which
2222
+
2223
+ 1:12:29.349 --> 1:12:30.477
2224
+ we'll learn.
2225
+
2226
+ 1:12:30.690 --> 1:12:38.962
2227
+ But for example, in architecture we have later
2228
+ one lecture about efficiency.
2229
+
2230
+ 1:12:38.855 --> 1:12:42.833
2231
+ So how can we build very efficient?
2232
+
2233
+ 1:12:42.882 --> 1:12:53.074
2234
+ And there in the decoder in parts of the networks
2235
+ they are still using.
2236
+
2237
+ 1:12:53.473 --> 1:12:57.518
2238
+ So it's not that yeah our hands are of no
2239
+ importance in the body.
2240
+
2241
+ 1:12:59.239 --> 1:13:09.810
2242
+ In order to make them strong, there are some
2243
+ more things which are helpful and should be:
2244
+
2245
+ 1:13:09.810 --> 1:13:19.677
2246
+ So one thing is there is a nice trick to make
2247
+ this new network stronger and better.
2248
+
2249
+ 1:13:19.739 --> 1:13:21.523
2250
+ So of course it doesn't work always.
2251
+
2252
+ 1:13:21.475 --> 1:13:23.452
2253
+ They have to have enough training data.
2254
+
2255
+ 1:13:23.763 --> 1:13:28.959
2256
+ But in general there's the easiest way of
2257
+ making your models bigger and stronger just
2258
+
2259
+ 1:13:28.959 --> 1:13:30.590
2260
+ to increase your parameters.
2261
+
2262
+ 1:13:30.630 --> 1:13:43.236
2263
+ And you've seen that with a large language
2264
+ models they are always bragging about.
2265
+
2266
+ 1:13:43.903 --> 1:13:56.463
2267
+ This is one way, so the question is how do
2268
+ you get more parameters?
2269
+
2270
+ 1:13:56.278 --> 1:14:01.273
2271
+ There's ways of doing it.
2272
+
2273
+ 1:14:01.521 --> 1:14:10.029
2274
+ And the other thing is to make your networks
2275
+ deeper so to have more legs in between.
2276
+
2277
+ 1:14:11.471 --> 1:14:13.827
2278
+ And then you can also get to get more calm.
2279
+
2280
+ 1:14:14.614 --> 1:14:23.340
2281
+ There's more traveling with this and it's
2282
+ very similar to what we just saw with our hand.
2283
+
2284
+ 1:14:23.603 --> 1:14:34.253
2285
+ We have this problem of gradient flow that
2286
+ if it flows so fast like a radiant gets very
2287
+
2288
+ 1:14:34.253 --> 1:14:35.477
2289
+ swollen,.
2290
+
2291
+ 1:14:35.795 --> 1:14:42.704
2292
+ Exactly the same thing happens in deep LSTM
2293
+ networks.
2294
+
2295
+ 1:14:42.563 --> 1:14:52.295
2296
+ If you take here the gradient, tell you what
2297
+ is the right or wrong.
2298
+
2299
+ 1:14:52.612 --> 1:14:56.439
2300
+ With three layers it's no problem, but if
2301
+ you're going to ten, twenty or hundred layers.
2302
+
2303
+ 1:14:57.797 --> 1:14:59.698
2304
+ That's Getting Typically Young.
2305
+
2306
+ 1:15:00.060 --> 1:15:07.000
2307
+ What people are doing is using what are called residual
2308
+ connections.
2309
+
2310
+ 1:15:06.880 --> 1:15:15.857
2311
+ That's a very helpful idea, which is maybe
2312
+ very surprising that it works.
2313
+
2314
+ 1:15:15.956 --> 1:15:20.309
2315
+ And so the idea is that these networks.
2316
+
2317
+ 1:15:20.320 --> 1:15:29.982
2318
+ In between should no longer calculate what
2319
+ is a new good representation, but they're more
2320
+
2321
+ 1:15:29.982 --> 1:15:31.378
2322
+ calculating.
2323
+
2324
+ 1:15:31.731 --> 1:15:37.588
2325
+ Therefore, in the end the output
2326
+ of a layer is always added to the input.
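A small PyTorch-style sketch of such a residual connection: the layer only learns the change, and the input is added back to its output.

import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())

    def forward(self, x):
        return x + self.layer(x)   # output = input + learned change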
2327
+
2328
+ 1:15:38.318 --> 1:15:48.824
2329
+ The nice thing is later, if you are doing back propagation,
2330
+ the gradient flows back very fast through this.
2331
+
2332
+ 1:15:49.209 --> 1:16:02.540
2333
+ Nowadays in very deep architectures, not only
2334
+ on other but always has this residual or highway
2335
+
2336
+ 1:16:02.540 --> 1:16:04.224
2337
+ connection.
2338
+
2339
+ 1:16:04.704 --> 1:16:06.616
2340
+ Has two advantages.
2341
+
2342
+ 1:16:06.520 --> 1:16:15.383
2343
+ On the one hand, these layers don't need to
2344
+ learn a representation, they only need to learn
2345
+
2346
+ 1:16:15.383 --> 1:16:18.755
2347
+ how to change the representation.
2348
+
2349
+ 1:16:22.082 --> 1:16:24.172
2350
+ Good.
2351
+
2352
+ 1:16:23.843 --> 1:16:31.768
2353
+ That much for the new map before, so the last
2354
+ thing now means this.
2355
+
2356
+ 1:16:31.671 --> 1:16:33.750
2357
+ Language was are yeah.
2358
+
2359
+ 1:16:33.660 --> 1:16:44.081
2360
+ I were used in the molds itself and now were
2361
+ seeing them again, but one thing which at the
2362
+
2363
+ 1:16:44.081 --> 1:16:55.076
2364
+ beginning they were reading was very essential
2365
+ was: So people really train part of the language
2366
+
2367
+ 1:16:55.076 --> 1:17:00.000
2368
+ models only to get this type of embedding.
2369
+
2370
+ 1:16:59.886 --> 1:17:04.198
2371
+ Therefore, we want to look.
2372
+
2373
+ 1:17:09.229 --> 1:17:15.678
2374
+ So now some last words to the word embeddings.
2375
+
2376
+ 1:17:15.541 --> 1:17:27.205
2377
+ The interesting thing is that word embeddings
2378
+ can be used for very different tasks.
2379
+
2380
+ 1:17:27.347 --> 1:17:31.329
2381
+ The nice thing is you can train that on just
2382
+ large amounts of data.
2383
+
2384
+ 1:17:31.931 --> 1:17:41.569
2385
+ And then if you have these word embeddings
2386
+ we have seen that they reduce the parameters.
2387
+
2388
+ 1:17:41.982 --> 1:17:52.217
2389
+ So then you can train your small model to do
2390
+ any other task and therefore you are more efficient.
2391
+
2392
+ 1:17:52.532 --> 1:17:55.218
2393
+ These initial word embeddings is important.
2394
+
2395
+ 1:17:55.157 --> 1:18:00.492
2396
+ They really depend only on the word itself,
2397
+ so if you look at the two meanings of can,
2398
+
2399
+ 1:18:00.492 --> 1:18:06.318
2400
+ the can of beans or I can do that, they will
2401
+ have the same embedding, so some of the embedding
2402
+
2403
+ 1:18:06.318 --> 1:18:08.709
2404
+ has to save the ambiguity inside that.
2405
+
2406
+ 1:18:09.189 --> 1:18:12.486
2407
+ That cannot be resolved.
2408
+
2409
+ 1:18:12.354 --> 1:18:24.727
2410
+ Therefore, if you look at the higher levels
2411
+ in the context, but in the word embedding layers
2412
+
2413
+ 1:18:24.727 --> 1:18:27.920
2414
+ that really depends on.
2415
+
2416
+ 1:18:29.489 --> 1:18:33.757
2417
+ However, even this one has quite very interesting.
2418
+
2419
+ 1:18:34.034 --> 1:18:39.558
2420
+ So that people like to visualize them.
2421
+
2422
+ 1:18:39.417 --> 1:18:47.211
2423
+ They're always difficult because if you look
2424
+ at this.
2425
+
2426
+ 1:18:47.767 --> 1:18:52.879
2427
+ And drawing your five hundred damage, the
2428
+ vector is still a bit challenging.
2429
+
2430
+ 1:18:53.113 --> 1:19:12.472
2431
+ So you cannot directly do that, so people
2432
+ have to do it like they look at some type of.
2433
+
2434
+ 1:19:13.073 --> 1:19:17.209
2435
+ And of course then yes some information is
2436
+ getting lost by a bunch of control.
2437
+
2438
+ 1:19:18.238 --> 1:19:24.802
2439
+ And you see, for example, this is the most
2440
+ famous and common example, so what you can
2441
+
2442
+ 1:19:24.802 --> 1:19:31.289
2443
+ look is you can look at the difference between
2444
+ the male and the female word in English.
2445
+
2446
+ 1:19:31.213 --> 1:19:37.854
2447
+ This is here in your embedding of king, and
2448
+ this is the embedding of queen, and this.
2449
+
2450
+ 1:19:38.058 --> 1:19:40.394
2451
+ You can do that for very different words.
2452
+
2453
+ 1:19:40.780 --> 1:19:45.407
2454
+ And that is where the masks come into, that
2455
+ is what people then look into.
2456
+
2457
+ 1:19:45.725 --> 1:19:50.995
2458
+ So what you can now, for example, do is you
2459
+ can calculate the difference between man and
2460
+
2461
+ 1:19:50.995 --> 1:19:51.410
2462
+ woman?
2463
+
2464
+ 1:19:52.232 --> 1:19:55.511
2465
+ Then you can take the embedding of tea.
2466
+
2467
+ 1:19:55.429 --> 1:20:02.764
2468
+ You can add on it the difference between man
2469
+ and woman, and then you can notice what are
2470
+
2471
+ 1:20:02.764 --> 1:20:04.330
2472
+ the similar words.
2473
+
2474
+ 1:20:04.248 --> 1:20:08.926
2475
+ So you won't, of course, directly hit the
2476
+ correct word.
2477
+
2478
+ 1:20:08.843 --> 1:20:10.518
2479
+ It's a continuous.
2480
+
2481
+ 1:20:10.790 --> 1:20:23.127
2482
+ But you can look what are the nearest neighbors
2483
+ to this sum, and often these words are near
2484
+
2485
+ 1:20:23.127 --> 1:20:24.056
2486
+ there.
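A toy sketch of this vector arithmetic with made-up two-dimensional embeddings; real embeddings are learned and much higher-dimensional:

import numpy as np

emb = {"king": np.array([0.9, 0.1]), "man": np.array([0.5, 0.0]),
       "woman": np.array([0.5, 0.8]), "queen": np.array([0.9, 0.9])}

query = emb["king"] - emb["man"] + emb["woman"]

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

print(max(emb, key=lambda w: cosine(query, emb[w])))   # queen, with these toy vectors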
2487
+
2488
+ 1:20:24.224 --> 1:20:33.913
2489
+ So it somehow learns that the difference between
2490
+ these words is always the same.
2491
+
2492
+ 1:20:34.374 --> 1:20:37.746
2493
+ You can do that for different things.
2494
+
2495
+ 1:20:37.658 --> 1:20:41.236
2496
+ He also imagines that it's not perfect.
2497
+
2498
+ 1:20:41.146 --> 1:20:49.019
2499
+ He says the world tends to be swimming and
2500
+ swimming, and with walking and walking you.
2501
+
2502
+ 1:20:49.469 --> 1:20:51.639
2503
+ So you can try to use them.
2504
+
2505
+ 1:20:51.561 --> 1:20:58.970
2506
+ It's no longer like saying yeah, but the interesting
2507
+ thing is this is completely unsupervised.
2508
+
2509
+ 1:20:58.892 --> 1:21:03.963
2510
+ So nobody taught it the principle of
2511
+ gender in language.
2512
+
2513
+ 1:21:04.284 --> 1:21:09.910
2514
+ So it's purely trained on the task of doing
2515
+ the next word prediction.
2516
+
2517
+ 1:21:10.230 --> 1:21:20.658
2518
+ And even for really cementing information
2519
+ like the capital, this is the difference between
2520
+
2521
+ 1:21:20.658 --> 1:21:23.638
2522
+ the city and the capital.
2523
+
2524
+ 1:21:23.823 --> 1:21:25.518
2525
+ Visualization.
2526
+
2527
+ 1:21:25.405 --> 1:21:33.768
2528
+ Here we have done the same things of the difference
2529
+ between country and.
2530
+
2531
+ 1:21:33.853 --> 1:21:41.991
2532
+ You see it's not perfect, but it's building
2533
+ something in the right direction, so you can
2534
+
2535
+ 1:21:41.991 --> 1:21:43.347
2536
+ even use them.
2537
+
2538
+ 1:21:43.257 --> 1:21:51.286
2539
+ For example, for question answering, if you
2540
+ have the difference between them, you apply
2541
+
2542
+ 1:21:51.286 --> 1:21:53.384
2543
+ that to a new country.
2544
+
2545
+ 1:21:54.834 --> 1:22:02.741
2546
+ So it seems these ones are able to really
2547
+ learn a lot of information and collapse all
2548
+
2549
+ 1:22:02.741 --> 1:22:04.396
2550
+ this information.
2551
+
2552
+ 1:22:05.325 --> 1:22:12.301
2553
+ At just to do the next word prediction: And
2554
+ that also explains a bit maybe or not explains
2555
+
2556
+ 1:22:12.301 --> 1:22:19.276
2557
+ but rather motivates, what is the main
2558
+ advantage of this type of neural models that
2559
+
2560
+ 1:22:19.276 --> 1:22:26.022
2561
+ we can use this type of hidden representation,
2562
+ transfer them and use them in different tasks.
2563
+
2564
+ 1:22:28.568 --> 1:22:41.948
2565
+ SummarySo to summarize what we did today,
2566
+ what you should hopefully take away is how language models are used
2567
+
2568
+ 1:22:41.948 --> 1:22:45.883
2569
+ for machine translation.
2570
+
2571
+ 1:22:45.805 --> 1:22:49.149
2572
+ Then how we can do language modeling with
2573
+ neural networks.
2574
+
2575
+ 1:22:49.449 --> 1:22:56.046
2576
+ We looked at three different architectures:
2577
+ We looked into the feed-forward language model
2578
+
2579
+ 1:22:56.046 --> 1:22:59.052
2580
+ and the one based on restricted Boltzmann machines.
2581
+
2582
+ 1:22:59.039 --> 1:23:05.366
2583
+ And finally there are different architectures
2584
+ to do neural networks.
2585
+
2586
+ 1:23:05.275 --> 1:23:14.405
2587
+ We have seen feed-forward neural networks and we'll
2588
+ see the next lectures, the last type of architecture.
2589
+
2590
+ 1:23:15.915 --> 1:23:17.412
2591
+ Have Any Questions.
2592
+
2593
+ 1:23:20.680 --> 1:23:27.341
2594
+ Then thanks a lot, and next on Tuesday we
2595
+ will be again in our order to know how to play.
2596
+
demo_data/lectures/Lecture-07-11.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418
3
+ size 150440033
demo_data/lectures/Lecture-07-16.05.2023/English.vtt ADDED
@@ -0,0 +1,2523 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.301 --> 0:00:05.664
4
+ IntroductionOkay, so we're welcome to today's
5
+ lecture.
6
+
7
+ 0:00:06.066 --> 0:00:18.128
8
+ A bit desperate in a small room and I'm sorry
9
+ for the inconvenience.
10
+
11
+ 0:00:17.953 --> 0:00:25.824
12
+ Sometimes there are project meetings where.
13
+
14
+ 0:00:26.806 --> 0:00:40.863
15
+ So what we want to talk today about is want
16
+ to start with neural approaches to machine
17
+
18
+ 0:00:40.863 --> 0:00:42.964
19
+ translation.
20
+
21
+ 0:00:43.123 --> 0:00:55.779
22
+ Guess I've heard about other types of neural
23
+ models for natural language processing.
24
+
25
+ 0:00:55.630 --> 0:00:59.954
26
+ This was some of the first.
27
+
28
+ 0:01:00.600 --> 0:01:06.203
29
+ They are similar to what you know they see
30
+ in as large language models.
31
+
32
+ 0:01:06.666 --> 0:01:14.810
33
+ And we want today look into what are these
34
+ neural language models, how we can build them,
35
+
36
+ 0:01:14.810 --> 0:01:15.986
37
+ what is the.
38
+
39
+ 0:01:16.316 --> 0:01:23.002
40
+ And first we'll show how to use them in statistical
41
+ machine translation.
42
+
43
+ 0:01:22.910 --> 0:01:31.058
44
+ RecapIf you remember weeks ago, we had this
45
+ log-linear model where you can integrate easily.
46
+
47
+ 0:01:31.351 --> 0:01:42.756
48
+ And that was how they first were used, so
49
+ we just had another model that evaluates how
50
+
51
+ 0:01:42.756 --> 0:01:49.180
52
+ good a system is or how good a lot of languages.
53
+
54
+ 0:01:50.690 --> 0:02:04.468
55
+ And next week we will go for a neuromachine
56
+ translation where we replace the whole model
57
+
58
+ 0:02:04.468 --> 0:02:06.481
59
+ by one huge.
60
+
61
+ 0:02:11.211 --> 0:02:18.079
62
+ The main challenge in statistical language
63
+ modelingSo just as a reminder from Tuesday we've
64
+
65
+ 0:02:18.079 --> 0:02:25.101
66
+ seen, the main challenge in language modeling
67
+ was that most of the n-grams we haven't seen.
68
+
69
+ 0:02:26.946 --> 0:02:34.167
70
+ So this was therefore difficult to estimate
71
+ any probability because we've seen that yet
72
+
73
+ 0:02:34.167 --> 0:02:39.501
74
+ normally if you have not seen the N
75
+ gram you will assign a zero probability.
76
+
77
+ 0:02:39.980 --> 0:02:53.385
78
+ However, this is not really very good because
79
+ we don't want to give zero probabilities to
80
+
81
+ 0:02:53.385 --> 0:02:55.023
82
+ sentences.
83
+
84
+ 0:02:55.415 --> 0:03:10.397
85
+ And then we learned a lot of techniques and
86
+ that is the main challenge in statistical language.
87
+
88
+ 0:03:10.241 --> 0:03:15.396
89
+ How we can give somehow a good.
90
+
91
+ 0:03:15.435 --> 0:03:23.835
92
+ And they developed very specific, very good
93
+ techniques to deal with that.
94
+
95
+ 0:03:23.721 --> 0:03:26.904
96
+ However, this is the best.
97
+
98
+ 0:03:28.568 --> 0:03:33.907
99
+ And therefore we can do things different.
100
+
101
+ 0:03:33.780 --> 0:03:44.332
102
+ If we have not seen an N gram before in statistical
103
+ models, we have to have seen.
104
+
105
+ 0:03:45.225 --> 0:03:51.361
106
+ Before, and we can only get information from
107
+ exactly the same word.
108
+
109
+ 0:03:51.411 --> 0:03:57.567
110
+ We don't have an approximate matching like
111
+ that.
112
+
113
+ 0:03:57.441 --> 0:04:10.256
114
+ Maybe it stood together in some way or similar,
115
+ and in a sentence we might generalize the knowledge.
116
+
117
+ 0:04:11.191 --> 0:04:21.227
118
+ Would like to have more something like that
119
+ where n-grams are represented more in a general
120
+
121
+ 0:04:21.227 --> 0:04:21.990
122
+ space.
123
+
124
+ 0:04:22.262 --> 0:04:29.877
125
+ So if you learn something about eyewalk then
126
+ maybe we can use this knowledge and also.
127
+
128
+ 0:04:30.290 --> 0:04:43.034
129
+ And thereby no longer treat all or at least
130
+ a lot of the n-grams as we've done before.
131
+
132
+ 0:04:42.887 --> 0:04:45.242
133
+ We can really.
134
+
135
+ 0:04:47.047 --> 0:04:56.157
136
+ And we maybe want to even do that in a more
137
+ hierarchical approach, but we know okay some
138
+
139
+ 0:04:56.157 --> 0:05:05.268
140
+ words are similar like go and walk is somehow
141
+ similar and and therefore like maybe if we
142
+
143
+ 0:05:05.268 --> 0:05:07.009
144
+ then merge them.
145
+
146
+ 0:05:07.387 --> 0:05:16.104
147
+ If we learn something about walk, then it
148
+ should tell us also something about go or
149
+
150
+ 0:05:16.104 --> 0:05:17.118
151
+ he walks.
152
+
153
+ 0:05:17.197 --> 0:05:18.970
154
+ We see already.
155
+
156
+ 0:05:18.859 --> 0:05:22.207
157
+ It's, of course, not so easy.
158
+
159
+ 0:05:22.095 --> 0:05:31.774
160
+ We see that there is some relations which
161
+ we need to integrate, for example, for you.
162
+
163
+ 0:05:31.661 --> 0:05:35.491
164
+ We need to add the S, but maybe.
165
+
166
+ 0:05:37.137 --> 0:05:42.984
167
+ And luckily there is one really yeah, convincing
168
+ methods in doing that.
169
+
170
+ 0:05:42.963 --> 0:05:47.239
171
+ And that is by using a neural network.
172
+
173
+ 0:05:47.387 --> 0:05:57.618
174
+ That's what we will introduce today so we
175
+ can use this type of neural networks to try
176
+
177
+ 0:05:57.618 --> 0:06:04.042
178
+ to learn this similarity and to learn how some
179
+ words.
180
+
181
+ 0:06:04.324 --> 0:06:13.711
182
+ And that is one of the main advantages that
183
+ we have by switching from the standard statistical
184
+
185
+ 0:06:13.711 --> 0:06:15.193
186
+ models to the.
187
+
188
+ 0:06:15.115 --> 0:06:22.840
189
+ To learn similarities between words and generalized
190
+ and learn what we call hidden representations.
191
+
192
+ 0:06:22.762 --> 0:06:29.708
193
+ So somehow representations of words where
194
+ we can measure similarity in some dimensions.
195
+
196
+ 0:06:30.290 --> 0:06:42.275
197
+ So in representations where as a tubically
198
+ continuous vector or a vector of a fixed size.
199
+
200
+ 0:06:42.822 --> 0:06:52.002
201
+ We had it before and we've seen that the only
202
+ thing we did is we don't want to do.
203
+
204
+ 0:06:52.192 --> 0:06:59.648
205
+ But these indices don't have any meaning,
206
+ so it wasn't that word five is more similar
207
+
208
+ 0:06:59.648 --> 0:07:02.248
209
+ to words twenty than to word.
210
+
211
+ 0:07:02.582 --> 0:07:09.059
212
+ So we couldn't learn anything about words
213
+ in the statistical model.
214
+
215
+ 0:07:08.964 --> 0:07:12.110
216
+ That's a big challenge because.
217
+
218
+ 0:07:12.192 --> 0:07:24.232
219
+ If you think about words even in morphology,
220
+ so go and go is more similar because the person.
221
+
222
+ 0:07:24.264 --> 0:07:36.265
223
+ While the basic models we have up to now,
224
+ they have no idea about that and goes as similar
225
+
226
+ 0:07:36.265 --> 0:07:37.188
227
+ to go.
228
+
229
+ 0:07:39.919 --> 0:07:49.062
230
+ A short introduction to network language modelsSo
231
+ what we want to do today, in order to go to
232
+
233
+ 0:07:49.062 --> 0:07:53.050
234
+ this, we will have a short introduction.
235
+
236
+ 0:07:53.954 --> 0:08:06.667
237
+ It very short just to see how we use them
238
+ here, but that's the good thing that are important
239
+
240
+ 0:08:06.667 --> 0:08:08.445
241
+ for dealing.
242
+
243
+ 0:08:08.928 --> 0:08:14.083
244
+ And then we'll first look into feed-forward
245
+ neural network language models.
246
+
247
+ 0:08:14.454 --> 0:08:21.221
248
+ And there we will still have this approximation
249
+ we had before, then we are looking only at
250
+
251
+ 0:08:21.221 --> 0:08:22.336
252
+ fixed windows.
253
+
254
+ 0:08:22.262 --> 0:08:28.773
255
+ So if you remember we have this classroom
256
+ of language models, and to determine what is
257
+
258
+ 0:08:28.773 --> 0:08:33.788
259
+ the probability of a word, we only look at
260
+ the past and minus one.
261
+
262
+ 0:08:34.154 --> 0:08:36.878
263
+ This is the theory of the case.
264
+
265
+ 0:08:36.793 --> 0:08:43.349
266
+ However, we have the ability and that's why
267
+ they're really better in order.
268
+
269
+ 0:08:44.024 --> 0:08:51.953
270
+ And then at the end we'll look at recurrent
271
+ neural network language models where we then have
272
+
273
+ 0:08:51.953 --> 0:08:53.166
274
+ a different.
275
+
276
+ 0:08:53.093 --> 0:09:01.922
277
+ And thereby it is no longer the case that
278
+ we need to have a fixed history, but in theory
279
+
280
+ 0:09:01.922 --> 0:09:04.303
281
+ we can model arbitrary.
282
+
283
+ 0:09:04.304 --> 0:09:06.854
284
+ And we can log this phenomenon.
285
+
286
+ 0:09:06.774 --> 0:09:12.673
287
+ We talked about a Tuesday where it's not clear
288
+ what type of information.
289
+
290
+ 0:09:16.396 --> 0:09:24.982
291
+ So yeah, generally neural networks are normally
292
+ learned to improve and perform some tasks.
293
+
294
+ 0:09:25.325 --> 0:09:38.934
295
+ We have this structure and we are learning
296
+ them from samples so that is similar to what
297
+
298
+ 0:09:38.934 --> 0:09:42.336
299
+ we had before so now.
300
+
301
+ 0:09:42.642 --> 0:09:49.361
302
+ And is somehow originally motivated by the
303
+ human brain.
304
+
305
+ 0:09:49.241 --> 0:10:00.641
306
+ However, when you now need to know artificial
307
+ neural networks, it's hard to get a similarity.
308
+
309
+ 0:10:00.540 --> 0:10:02.884
310
+ There seems to be not that important.
311
+
312
+ 0:10:03.123 --> 0:10:11.013
313
+ So what they are mainly doing is summation and
314
+ multiplication and then one non-linear activation.
315
+
316
+ 0:10:12.692 --> 0:10:16.078
317
+ So so the basic units are these type of.
318
+
319
+ 0:10:17.937 --> 0:10:29.837
320
+ Perceptron is a basic block which we have
321
+ and this does exactly the processing.
322
+
323
+ 0:10:29.688 --> 0:10:36.088
324
+ We have a fixed number of input features.
325
+
326
+ 0:10:36.096 --> 0:10:39.668
327
+ So we have here numbers six zero to x and
328
+ as input.
329
+
330
+ 0:10:40.060 --> 0:10:48.096
331
+ And this makes language processing difficult
332
+ because we know that it's not the case.
333
+
334
+ 0:10:48.002 --> 0:10:53.109
335
+ If we're dealing with language, it doesn't
336
+ have any.
337
+
338
+ 0:10:54.114 --> 0:10:57.609
339
+ So we have to model this somehow and understand
340
+ how we model this.
341
+
342
+ 0:10:58.198 --> 0:11:03.681
343
+ Then we have the weights, which are the parameters
344
+ and the number of weights exactly the same.
345
+
346
+ 0:11:04.164 --> 0:11:15.069
347
+ Of input features sometimes you have the spires
348
+ in there that always and then it's not really.
349
+
350
+ 0:11:15.195 --> 0:11:19.656
351
+ And what you then do is very simple.
352
+
353
+ 0:11:19.535 --> 0:11:26.168
354
+ It's just the weighted sum, so you
355
+ multiply.
356
+
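As a rough sketch of that weighted sum followed by an activation (not code from the lecture; the weights below are made up):

```python
import math

def perceptron(inputs, weights, bias):
    # Multiply each input with its weight, sum up, then apply a
    # differentiable activation function (here a sigmoid).
    z = sum(x * w for x, w in zip(inputs, weights)) + bias
    return 1.0 / (1.0 + math.exp(-z))

print(perceptron([1.0, 0.0, 2.0], [0.5, -0.3, 0.1], bias=0.1))
```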
357
+ 0:11:26.606 --> 0:11:38.405
358
+ What is then additionally important is we
359
+ have an activation function and it's important
360
+
361
+ 0:11:38.405 --> 0:11:42.514
362
+ that this activation function.
363
+
364
+ 0:11:43.243 --> 0:11:54.088
365
+ And later it will be important that this is
366
+ differentiable because otherwise all the training.
367
+
368
+ 0:11:54.714 --> 0:12:01.471
369
+ This model by itself is not very powerful.
370
+
371
+ 0:12:01.314 --> 0:12:10.430
372
+ We have the XOR problem and with this simple
373
+ perceptron you can't solve it.
374
+
375
+ 0:12:10.710 --> 0:12:15.489
376
+ However, there is a very easy and nice extension.
377
+
378
+ 0:12:15.393 --> 0:12:20.938
379
+ The multi-layer perceptron and things get
380
+ very powerful.
381
+
382
+ 0:12:21.081 --> 0:12:32.953
383
+ The thing is you just connect a lot of these
384
+ in these layers of structures where we have
385
+
386
+ 0:12:32.953 --> 0:12:35.088
387
+ the inputs and.
388
+
389
+ 0:12:35.395 --> 0:12:47.936
390
+ And then we can combine them, or to do them:
391
+ The input layer is of course given by your
392
+
393
+ 0:12:47.936 --> 0:12:51.926
394
+ problem with the dimension.
395
+
396
+ 0:12:51.784 --> 0:13:00.065
397
+ The output layer is also given by your dimension.
398
+
399
+ 0:13:01.621 --> 0:13:08.802
400
+ So let's start with the first question, now
401
+ more language related, and that is how we represent.
402
+
403
+ 0:13:09.149 --> 0:13:19.282
404
+ So we have seen here input to x, but the question
405
+ is now okay.
406
+
407
+ 0:13:19.121 --> 0:13:23.470
408
+ How can we put into this?
409
+
410
+ 0:13:26.866 --> 0:13:34.123
411
+ The first thing that we're able to do is we're
412
+ going to set it in the inspector.
413
+
414
+ 0:13:34.314 --> 0:13:45.651
415
+ Yeah, and that is not that easy because the
416
+ continuous vector will come to that.
417
+
418
+ 0:13:45.511 --> 0:13:46.953
419
+ We can't.
420
+
421
+ 0:13:46.809 --> 0:13:50.420
422
+ We don't want to do it.
423
+
424
+ 0:13:50.630 --> 0:13:57.237
425
+ But if we need to input the word into the
426
+ neural network, it has to be something easily
427
+
428
+ 0:13:57.237 --> 0:13:57.912
429
+ defined.
430
+
431
+ 0:13:59.079 --> 0:14:11.511
432
+ One is the typical thing, the one-hot encoded
433
+ vector, so we have a vector where the dimension
434
+
435
+ 0:14:11.511 --> 0:14:15.306
436
+ is the vocabulary, and then.
437
+
438
+ 0:14:16.316 --> 0:14:25.938
439
+ So the first thing you are ready to see that
440
+ means we are always dealing with fixed.
441
+
442
+ 0:14:26.246 --> 0:14:34.961
443
+ So you cannot easily extend your vocabulary,
444
+ but if you mean your vocabulary would increase
445
+
446
+ 0:14:34.961 --> 0:14:37.992
447
+ the size of this input vector,.
448
+
449
+ 0:14:39.980 --> 0:14:42.423
450
+ That's maybe also motivating.
451
+
452
+ 0:14:42.341 --> 0:14:45.324
453
+ We'll talk about byte pair encoding.
454
+
455
+ 0:14:45.241 --> 0:14:47.233
456
+ That's the nice thing.
457
+
458
+ 0:14:48.048 --> 0:15:01.803
459
+ The big advantage of this one-hot encoding
460
+ is that we don't implement similarity between
461
+
462
+ 0:15:01.803 --> 0:15:06.999
463
+ words, but we're really learning.
464
+
465
+ 0:15:07.227 --> 0:15:11.219
466
+ So you need like to represent any words.
467
+
468
+ 0:15:11.121 --> 0:15:15.895
469
+ You need a dimension of and dimensional vector.
470
+
471
+ 0:15:16.236 --> 0:15:26.480
472
+ Imagine you could eat no binary encoding,
473
+ so you could represent words as binary vectors.
474
+
475
+ 0:15:26.806 --> 0:15:32.348
476
+ So you will be significantly more efficient.
477
+
478
+ 0:15:32.225 --> 0:15:39.124
479
+ However, you have some more digits than other
480
+ numbers.
481
+
482
+ 0:15:39.559 --> 0:15:46.482
483
+ Would somehow be bad because you would force
484
+ the one to do this and it's by hand not clear
485
+
486
+ 0:15:46.482 --> 0:15:47.623
487
+ how to define.
488
+
489
+ 0:15:48.108 --> 0:15:55.135
490
+ So therefore currently this is the most successful
491
+ approach to just do this one-hot encoding.
492
+
493
+ 0:15:55.095 --> 0:15:59.344
494
+ We take a fixed vocabulary.
495
+
496
+ 0:15:59.192 --> 0:16:10.238
497
+ We map each word to the initial and then we
498
+ represent a word like this.
499
+
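A small sketch of this one-hot representation over a fixed vocabulary (the vocabulary below is invented for illustration):

```python
vocab = ["<unk>", "i", "go", "home", "walk"]        # fixed vocabulary
word2idx = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = [0] * len(vocab)                          # dimension = vocabulary size
    vec[word2idx.get(word, word2idx["<unk>"])] = 1  # exactly one entry is one
    return vec

print(one_hot("go"))   # [0, 0, 1, 0, 0]
```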
500
+ 0:16:10.084 --> 0:16:13.314
501
+ The representation.
502
+
503
+ 0:16:14.514 --> 0:16:27.019
504
+ But this dimension here is a secondary size,
505
+ and if you think ten thousand that's quite
506
+
507
+ 0:16:27.019 --> 0:16:33.555
508
+ high, so we're always trying to be efficient.
509
+
510
+ 0:16:33.853 --> 0:16:42.515
511
+ And we are doing the same type of efficiency
512
+ because then we are having a very small one
513
+
514
+ 0:16:42.515 --> 0:16:43.781
515
+ compared to.
516
+
517
+ 0:16:44.104 --> 0:16:53.332
518
+ It can be still a maybe or neurons, but this
519
+ is significantly smaller, of course, as before.
520
+
521
+ 0:16:53.713 --> 0:17:04.751
522
+ So you are learning there this word as you
523
+ said, but you can learn it directly, and there
524
+
525
+ 0:17:04.751 --> 0:17:07.449
526
+ we have similarities.
527
+
528
+ 0:17:07.807 --> 0:17:14.772
529
+ But the nice thing is that this is then learned,
530
+ and we do not need to like hand define.
531
+
532
+ 0:17:17.117 --> 0:17:32.377
533
+ So yes, so that is how we're typically adding
534
+ at least a single word into the language world.
535
+
536
+ 0:17:32.215 --> 0:17:42.390
537
+ Then we can see: So we're seeing that you
538
+ have the one-hot representation always of
539
+
540
+ 0:17:42.390 --> 0:17:44.904
541
+ the same similarity.
542
+
543
+ 0:17:45.105 --> 0:18:00.803
544
+ Then we're having this continuous vector which
545
+ is a lot smaller dimension and that's.
546
+
547
+ 0:18:01.121 --> 0:18:06.984
548
+ What we are doing then is learning these representations
549
+ so that they are best for language modeling.
550
+
551
+ 0:18:07.487 --> 0:18:19.107
552
+ So the representations are implicitly because
553
+ we're training on the language.
554
+
555
+ 0:18:19.479 --> 0:18:30.115
556
+ And the nice thing was found out later is
557
+ these representations are really, really good
558
+
559
+ 0:18:30.115 --> 0:18:32.533
560
+ for a lot of other.
561
+
562
+ 0:18:33.153 --> 0:18:39.729
563
+ And that is why they are now called word embedded
564
+ space themselves, and used for other tasks.
565
+
566
+ 0:18:40.360 --> 0:18:49.827
567
+ And they are somehow describing different
568
+ things so they can describe and semantic similarities.
569
+
570
+ 0:18:49.789 --> 0:18:58.281
571
+ We are looking at the very example of today
572
+ that you can do in this vector space by adding
573
+
574
+ 0:18:58.281 --> 0:19:00.613
575
+ some interesting things.
576
+
577
+ 0:19:00.940 --> 0:19:11.174
578
+ And so they got really was a first big improvement
579
+ when switching to neural staff.
580
+
581
+ 0:19:11.491 --> 0:19:20.736
582
+ They are like part of the model still with
583
+ more complex representation alert, but they
584
+
585
+ 0:19:20.736 --> 0:19:21.267
586
+ are.
587
+
588
+ 0:19:23.683 --> 0:19:34.975
589
+ Then we are having the output layer, and in
590
+ the output layer we also have output structure
591
+
592
+ 0:19:34.975 --> 0:19:36.960
593
+ and activation.
594
+
595
+ 0:19:36.997 --> 0:19:44.784
596
+ That is the language we want to predict, which
597
+ word should be the next.
598
+
599
+ 0:19:44.675 --> 0:19:46.521
600
+ We always have.
601
+
602
+ 0:19:47.247 --> 0:19:56.454
603
+ And that can be done very well with the softmax
604
+ layer, where again the dimension.
605
+
606
+ 0:19:56.376 --> 0:20:03.971
607
+ Is the vocabulary, so this is a vocabulary
608
+ size, and again the case neuro represents the
609
+
610
+ 0:20:03.971 --> 0:20:09.775
611
+ case class, so in our case we have again a
612
+ one-hot representation.
613
+
614
+ 0:20:10.090 --> 0:20:18.929
615
+ Ours is a probability distribution and the
616
+ end is a probability distribution of all works.
617
+
618
+ 0:20:18.832 --> 0:20:27.112
619
+ The case entry tells us: So we need to have
620
+ some of our probability distribution at our
621
+
622
+ 0:20:27.112 --> 0:20:36.144
623
+ output, and in order to achieve that this activation
624
+ function goes, it needs to be that all the
625
+
626
+ 0:20:36.144 --> 0:20:36.990
627
+ outputs.
628
+
629
+ 0:20:37.197 --> 0:20:47.993
630
+ And we can achieve that with a softmax activation
631
+ we take each of the value and then.
632
+
633
+ 0:20:48.288 --> 0:20:58.020
634
+ So by having this type of activation function
635
+ we are really getting that at the end we always.
636
+
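A sketch of that softmax activation, which turns arbitrary scores into positive values that sum to one:

```python
import math

def softmax(scores):
    # Subtracting the maximum is only for numerical stability.
    m = max(scores)
    exps = [math.exp(s - m) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

probs = softmax([2.0, 1.0, 0.1])
print(probs, sum(probs))   # a probability distribution over the output classes
```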
637
+ 0:20:59.019 --> 0:21:12.340
638
+ The beginning was very challenging because
639
+ again we have this inefficient representation
640
+
641
+ 0:21:12.340 --> 0:21:15.184
642
+ of our vocabulary.
643
+
644
+ 0:21:15.235 --> 0:21:27.500
645
+ And then you can imagine escalating over to
646
+ something over a thousand is maybe a bit inefficient
647
+
648
+ 0:21:27.500 --> 0:21:29.776
649
+ with cheap users.
650
+
651
+ 0:21:36.316 --> 0:21:43.664
652
+ And then yeah, for training the models, that
653
+ is how we refine, so we have this architecture
654
+
655
+ 0:21:43.664 --> 0:21:44.063
656
+ now.
657
+
658
+ 0:21:44.264 --> 0:21:52.496
659
+ We need to minimize the error by taking the
660
+ output.
661
+
662
+ 0:21:52.338 --> 0:21:58.200
663
+ We are comparing it to our targets.
664
+
665
+ 0:21:58.298 --> 0:22:07.670
666
+ So one important thing is, of course, how
667
+ can we measure the error?
668
+
669
+ 0:22:07.532 --> 0:22:12.774
670
+ So what if we're training the ideas?
671
+
672
+ 0:22:13.033 --> 0:22:19.770
673
+ And how well when measuring it is in natural
674
+ language processing, typically the cross entropy.
675
+
676
+ 0:22:19.960 --> 0:22:32.847
677
+ That means we are comparing the target with
678
+ the output, so we're taking the value multiplying
679
+
680
+ 0:22:32.847 --> 0:22:35.452
681
+ with the horizons.
682
+
683
+ 0:22:35.335 --> 0:22:43.454
684
+ Which gets optimized and you're seeing that
685
+ this, of course, makes it again very nice and
686
+
687
+ 0:22:43.454 --> 0:22:49.859
688
+ easy because our target, we said, is again
689
+ a one-hot representation.
690
+
691
+ 0:22:50.110 --> 0:23:00.111
692
+ So except for one, all of these are always
693
+ zero, and what we are doing is taking the one.
694
+
695
+ 0:23:00.100 --> 0:23:05.970
696
+ And we only need to multiply the one with
697
+ the logarithm here, and that is all the feedback.
698
+
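Because the target is one-hot, the cross entropy collapses to the negative log probability of the single correct word; a sketch:

```python
import math

def cross_entropy(predicted_probs, target_index):
    # Only the entry of the correct word contributes; all other targets are zero.
    return -math.log(predicted_probs[target_index])

print(cross_entropy([0.1, 0.7, 0.2], target_index=1))   # -log(0.7)
```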
699
+ 0:23:06.946 --> 0:23:14.194
700
+ Of course, this is not always influenced by
701
+ all the others.
702
+
703
+ 0:23:14.073 --> 0:23:17.942
704
+ Why is this influenced by all?
705
+
706
+ 0:23:24.304 --> 0:23:33.554
707
+ Through the softmax activation function, which is
708
+ the current activation divided by the sum of the
709
+
710
+ 0:23:33.554 --> 0:23:34.377
711
+ others.
712
+
713
+ 0:23:34.354 --> 0:23:44.027
714
+ Because otherwise it could of course easily
715
+ just increase this value and ignore the others,
716
+
717
+ 0:23:44.027 --> 0:23:49.074
718
+ but if you increase one value or the other,
719
+ so.
720
+
721
+ 0:23:51.351 --> 0:24:04.433
722
+ And then we can do with neural networks one
723
+ very nice and easy type of training that is
724
+
725
+ 0:24:04.433 --> 0:24:07.779
726
+ done in all the neon.
727
+
728
+ 0:24:07.707 --> 0:24:12.664
729
+ So in which direction does the arrow show?
730
+
731
+ 0:24:12.548 --> 0:24:23.120
732
+ And then if we want to go to a smaller like
733
+ smaller error, that's what we want to achieve.
734
+
735
+ 0:24:23.004 --> 0:24:27.306
736
+ We're trying to minimize our error.
737
+
738
+ 0:24:27.287 --> 0:24:32.875
739
+ And we have to do that, of course, for all
740
+ the weights, and to calculate the error of
741
+
742
+ 0:24:32.875 --> 0:24:36.709
743
+ all the weights we use backpropagation
744
+ here.
745
+
746
+ 0:24:36.644 --> 0:24:41.289
747
+ But what you can do is you can propagate the
748
+ error which you measured.
749
+
750
+ 0:24:41.223 --> 0:24:43.741
751
+ At the end you can propagate it back.
752
+
753
+ 0:24:43.675 --> 0:24:46.394
754
+ That's basic math and basic derivation.
755
+
756
+ 0:24:46.706 --> 0:24:59.557
757
+ Then you can do each weight in your model
758
+ and measure how much it contributes to this
759
+
760
+ 0:24:59.557 --> 0:25:01.350
761
+ individual.
762
+
763
+ 0:25:04.524 --> 0:25:17.712
764
+ To summarize what your machine translation
765
+ should be, to understand all this problem is
766
+
767
+ 0:25:17.712 --> 0:25:20.710
768
+ that this is how a.
769
+
770
+ 0:25:20.580 --> 0:25:23.056
771
+ The nodes are perceptrons.
772
+
773
+ 0:25:22.976 --> 0:25:28.169
774
+ They are fully connected between two layers
775
+ and no connections.
776
+
777
+ 0:25:28.108 --> 0:25:29.759
778
+ Across layers.
779
+
780
+ 0:25:29.829 --> 0:25:35.152
781
+ And what they're doing is always just a weighted
782
+ sum here and then an activation function.
783
+
784
+ 0:25:35.415 --> 0:25:38.794
785
+ And in order to train you have this forward
786
+ and backward pass.
787
+
788
+ 0:25:39.039 --> 0:25:41.384
789
+ So we put in here.
790
+
791
+ 0:25:41.281 --> 0:25:46.540
792
+ Our inputs have some random values at the
793
+ beginning.
794
+
795
+ 0:25:46.441 --> 0:25:49.140
796
+ They calculate the output.
797
+
798
+ 0:25:49.040 --> 0:25:58.631
799
+ We are measuring how big our error is, propagating
800
+ the error back, and then changing our model
801
+
802
+ 0:25:58.631 --> 0:25:59.640
803
+ in a way.
804
+
805
+ 0:26:01.962 --> 0:26:10.408
806
+ How can we use neural networks for language
807
+ modeling?So before we're coming into the neural
808
+
809
+ 0:26:10.408 --> 0:26:17.569
810
+ networks, how can we use this type of neural
811
+ network to do language modeling?
812
+
813
+ 0:26:23.103 --> 0:26:25.520
814
+ So the question is now okay.
815
+
816
+ 0:26:25.437 --> 0:26:32.988
817
+ How can we use them in natural language processing
818
+ and especially in machine translation?
819
+
820
+ 0:26:32.904 --> 0:26:38.443
821
+ The first idea of using them was to estimate
822
+ the language model.
823
+
824
+ 0:26:38.999 --> 0:26:42.599
825
+ So we have seen that the output can be monitored
826
+ here as well.
827
+
828
+ 0:26:43.603 --> 0:26:49.308
829
+ Has a probability distribution, and if we
830
+ have a full vocabulary, we could mainly hear
831
+
832
+ 0:26:49.308 --> 0:26:55.209
833
+ estimate how probable each next word is, and
834
+ then use that in our language model fashion,
835
+
836
+ 0:26:55.209 --> 0:27:02.225
837
+ as we've done it last time, we've got the probability
838
+ of a full sentence as a product of all probabilities
839
+
840
+ 0:27:02.225 --> 0:27:03.208
841
+ of individual.
842
+
843
+ 0:27:04.544 --> 0:27:06.695
844
+ And UM.
845
+
846
+ 0:27:06.446 --> 0:27:09.776
847
+ That was done and in ninety seven years.
848
+
849
+ 0:27:09.695 --> 0:27:17.370
850
+ It's very easy to integrate it into this log-linear
851
+ model, so we have said that this is how the
852
+
853
+ 0:27:17.370 --> 0:27:24.636
854
+ log-linear model looks like, so we're searching
855
+ the best translation, which minimizes each
856
+
857
+ 0:27:24.636 --> 0:27:25.126
858
+ wage.
859
+
860
+ 0:27:25.125 --> 0:27:26.371
861
+ The feature value.
862
+
863
+ 0:27:26.646 --> 0:27:31.642
864
+ We have that with the minimum error training,
865
+ if you can remember when we search for the
866
+
867
+ 0:27:31.642 --> 0:27:32.148
868
+ optimal.
869
+
870
+ 0:27:32.512 --> 0:27:40.927
871
+ We have the phrasetable probabilities, the
872
+ language model, and we can just add here and
873
+
874
+ 0:27:40.927 --> 0:27:41.597
875
+ there.
876
+
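Roughly, integrating the neural language model just means one more weighted feature in the log-linear score; a sketch with invented feature names, values and weights:

```python
def loglinear_score(features, weights):
    # Translation score = weighted sum of the feature values (log probabilities).
    return sum(weights[name] * value for name, value in features.items())

features = {                  # made-up values for one candidate translation
    "phrase_table": -4.2,
    "ngram_lm":     -6.3,
    "neural_lm":    -5.1,     # the additional feature discussed here
}
weights = {"phrase_table": 1.0, "ngram_lm": 0.5, "neural_lm": 0.5}
print(loglinear_score(features, weights))
```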
877
+ 0:27:41.861 --> 0:27:46.077
878
+ So that is quite easy as said.
879
+
880
+ 0:27:45.941 --> 0:27:54.065
881
+ That was how statistical machine translation
882
+ was improved.
883
+
884
+ 0:27:53.927 --> 0:27:57.101
885
+ Add one more feature.
886
+
887
+ 0:27:58.798 --> 0:28:11.220
888
+ So how can we model the language model probability
889
+ with a neural network?
890
+
891
+ 0:28:11.035 --> 0:28:22.438
892
+ So what we have to do is: And the problem
893
+ in generally in the head is that most we haven't
894
+
895
+ 0:28:22.438 --> 0:28:25.070
896
+ seen long sequences.
897
+
898
+ 0:28:25.085 --> 0:28:36.956
899
+ Mostly we have to back off to very short sequences
900
+ and we are working on this discrete space where.
901
+
902
+ 0:28:37.337 --> 0:28:48.199
903
+ So the idea is if we have a neural network we
904
+ can map words into continuous representation
905
+
906
+ 0:28:48.199 --> 0:28:50.152
907
+ and that helps.
908
+
909
+ 0:28:51.091 --> 0:28:59.598
910
+ And the structure then looks like this, so
911
+ this is the basic still feed forward neural
912
+
913
+ 0:28:59.598 --> 0:29:00.478
914
+ network.
915
+
916
+ 0:29:01.361 --> 0:29:10.744
917
+ We are doing this approximation again, so
918
+ we are not putting in all previous words, but
919
+
920
+ 0:29:10.744 --> 0:29:11.376
921
+ it's.
922
+
923
+ 0:29:11.691 --> 0:29:21.525
924
+ And this is done because in a neural network we
925
+ can have only a fixed size of input, so we
926
+
927
+ 0:29:21.525 --> 0:29:31.359
928
+ can: Can only do a fixed set, and they are
929
+ going to be doing exactly the same in minus
930
+
931
+ 0:29:31.359 --> 0:29:31.924
932
+ one.
933
+
934
+ 0:29:33.593 --> 0:29:44.134
935
+ And then we have, for example, three words
936
+ and three different words, which are in these
937
+
938
+ 0:29:44.134 --> 0:29:54.911
939
+ positions: And then we're having the first
940
+ layer of the neural network, which learns words
941
+
942
+ 0:29:54.911 --> 0:29:56.214
943
+ and words.
944
+
945
+ 0:29:57.437 --> 0:30:04.976
946
+ There is one thing which is maybe special
947
+ compared to the standard neural memory.
948
+
949
+ 0:30:05.345 --> 0:30:13.163
950
+ So the representation of this word we want
951
+ to learn first of all position independence,
952
+
953
+ 0:30:13.163 --> 0:30:19.027
954
+ so we just want to learn what is the general
955
+ meaning of the word.
956
+
957
+ 0:30:19.299 --> 0:30:26.244
958
+ Therefore, the representation you get here
959
+ should be the same as if you put it in there.
960
+
961
+ 0:30:27.247 --> 0:30:35.069
962
+ The nice thing is you can achieve that in
963
+ networks the same way you achieve it.
964
+
965
+ 0:30:34.972 --> 0:30:41.720
966
+ This way you're reusing ears so we are forcing
967
+ them to always stay.
968
+
969
+ 0:30:42.322 --> 0:30:49.689
970
+ And that's why you then learn your word embedding,
971
+ which is contextual and independent, so.
972
+
973
+ 0:30:49.909 --> 0:31:05.561
974
+ So the idea is you have the bigram go home
975
+ and you don't want to use the context.
976
+
977
+ 0:31:05.373 --> 0:31:07.654
978
+ First you.
979
+
980
+ 0:31:08.348 --> 0:31:14.155
981
+ That of course it might have a different meaning
982
+ depending on where it stands, but learn that.
983
+
984
+ 0:31:14.514 --> 0:31:19.623
985
+ First, we're learning key representation of
986
+ the words, which is just the representation
987
+
988
+ 0:31:19.623 --> 0:31:20.378
989
+ of the word.
990
+
991
+ 0:31:20.760 --> 0:31:37.428
992
+ So it's also not like normally all input neurons
993
+ are connected to all neurons.
994
+
995
+ 0:31:37.857 --> 0:31:47.209
996
+ This is the first layer of representation,
997
+ and then we have a lot denser representation,
998
+
999
+ 0:31:47.209 --> 0:31:56.666
1000
+ that is, our three word embeddings here, and
1001
+ now we are learning this interaction between
1002
+
1003
+ 0:31:56.666 --> 0:31:57.402
1004
+ words.
1005
+
1006
+ 0:31:57.677 --> 0:32:08.265
1007
+ So now we have at least one connected, fully
1008
+ connected layer here, which takes the three
1009
+
1010
+ 0:32:08.265 --> 0:32:14.213
1011
+ embedded input and then learns the new embedding.
1012
+
1013
+ 0:32:15.535 --> 0:32:27.871
1014
+ And then if you had one of several layers
1015
+ of lining which is your output layer, then.
1016
+
1017
+ 0:32:28.168 --> 0:32:46.222
1018
+ So here the size is a vocabulary size, and
1019
+ then you put as target what is the probability
1020
+
1021
+ 0:32:46.222 --> 0:32:48.228
1022
+ for each.
1023
+
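A compact sketch of such a feed-forward n-gram language model, assuming PyTorch is available; the sizes are arbitrary and the embedding layer is shared across all input positions, as described above:

```python
import torch
import torch.nn as nn

class FeedForwardLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=256, context=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)      # shared for all positions
        self.hidden = nn.Linear(context * emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)      # one score per word

    def forward(self, context_ids):                       # shape: (batch, context)
        e = self.emb(context_ids)                         # (batch, context, emb_dim)
        h = torch.tanh(self.hidden(e.flatten(1)))         # concatenated embeddings
        return torch.log_softmax(self.out(h), dim=-1)     # log P(next word | context)

model = FeedForwardLM(vocab_size=10000)
log_probs = model(torch.randint(0, 10000, (2, 3)))        # two example contexts
print(log_probs.shape)                                    # torch.Size([2, 10000])
```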
1024
+ 0:32:48.688 --> 0:32:56.778
1025
+ The nice thing is that you learn everything
1026
+ together, so you're not learning what is a
1027
+
1028
+ 0:32:56.778 --> 0:32:58.731
1029
+ good representation.
1030
+
1031
+ 0:32:59.079 --> 0:33:12.019
1032
+ When you are training the whole network together,
1033
+ it learns what representation for a word you
1034
+
1035
+ 0:33:12.019 --> 0:33:13.109
1036
+ get in.
1037
+
1038
+ 0:33:15.956 --> 0:33:19.176
1039
+ It's Yeah That Is the Main Idea.
1040
+
1041
+ 0:33:20.660 --> 0:33:32.695
1042
+ Nowadays often referred to as one way of self-supervised
1043
+ learning, why self-supervisory learning?
1044
+
1045
+ 0:33:33.053 --> 0:33:37.120
1046
+ The output is the next word and the input
1047
+ is the previous word.
1048
+
1049
+ 0:33:37.377 --> 0:33:46.778
1050
+ But somehow it's self-supervised because it's
1051
+ not really that we created labels, but we artificially.
1052
+
1053
+ 0:33:46.806 --> 0:34:01.003
1054
+ We just have pure text, and then we created
1055
+ the task.
1056
+
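A sketch of how such training examples can be created from plain text alone, which is why this is called self-supervised; the sentence is a toy example:

```python
def make_examples(tokens, context_size=3):
    # Each example: the previous N-1 words as input, the following word as label.
    return [(tokens[i - context_size:i], tokens[i])
            for i in range(context_size, len(tokens))]

print(make_examples("i go home and he goes home".split()))
# [(['i', 'go', 'home'], 'and'), (['go', 'home', 'and'], 'he'), ...]
```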
1057
+ 0:34:05.905 --> 0:34:12.413
1058
+ Say we have two sentences like go home again.
1059
+
1060
+ 0:34:12.272 --> 0:34:18.783
1061
+ Second one is go to creative again, so both.
1062
+
1063
+ 0:34:18.858 --> 0:34:30.737
1064
+ The starboard bygo and then we have to predict
1065
+ the next four years and my question is: Be
1066
+
1067
+ 0:34:30.737 --> 0:34:40.769
1068
+ modeled this ability as one vector with like
1069
+ probability or possible works.
1070
+
1071
+ 0:34:40.637 --> 0:34:42.746
1072
+ We have musical.
1073
+
1074
+ 0:34:44.044 --> 0:34:56.438
1075
+ You have multiple examples, so you would twice
1076
+ train, once you predict, once you predict,
1077
+
1078
+ 0:34:56.438 --> 0:35:02.359
1079
+ and then, of course, the best performance.
1080
+
1081
+ 0:35:04.564 --> 0:35:11.772
1082
+ A very good point, so you're not aggregating
1083
+ examples beforehand, but you're taking each
1084
+
1085
+ 0:35:11.772 --> 0:35:13.554
1086
+ example individually.
1087
+
1088
+ 0:35:19.259 --> 0:35:33.406
1089
+ So what you do is you simultaneously learn
1090
+ the projection layer which represents this
1091
+
1092
+ 0:35:33.406 --> 0:35:39.163
1093
+ word and the N gram probabilities.
1094
+
1095
+ 0:35:39.499 --> 0:35:48.390
1096
+ And what people then later analyzed is that
1097
+ these representations are very powerful.
1098
+
1099
+ 0:35:48.286 --> 0:35:56.342
1100
+ The task is just a very important task to
1101
+ model like what is the next word.
1102
+
1103
+ 0:35:56.816 --> 0:36:09.429
1104
+ It's a bit motivated by people saying in order
1105
+ to get the meaning of the word you have to
1106
+
1107
+ 0:36:09.429 --> 0:36:10.690
1108
+ look at.
1109
+
1110
+ 0:36:10.790 --> 0:36:18.467
1111
+ If you read the text in there, which you have
1112
+ never seen, you can still estimate the meaning
1113
+
1114
+ 0:36:18.467 --> 0:36:22.264
1115
+ of this word because you know how it is used.
1116
+
1117
+ 0:36:22.602 --> 0:36:26.667
1118
+ Just imagine you read this text about some
1119
+ city.
1120
+
1121
+ 0:36:26.584 --> 0:36:32.476
1122
+ Even if you've never seen the city before
1123
+ heard, you often know from.
1124
+
1125
+ 0:36:34.094 --> 0:36:44.809
1126
+ So what is now the big advantage of using
1127
+ neural networks?
1128
+
1129
+ 0:36:44.628 --> 0:36:56.941
1130
+ Just imagine we have to estimate this: So
1131
+ you have to monitor the probability of ad hip
1132
+
1133
+ 0:36:56.941 --> 0:37:00.300
1134
+ and now imagine iPhone.
1135
+
1136
+ 0:37:00.600 --> 0:37:06.837
1137
+ So all the techniques we have at the last
1138
+ time.
1139
+
1140
+ 0:37:06.707 --> 0:37:14.246
1141
+ At the end, if you haven't seen iPhone, you
1142
+ will always.
1143
+
1144
+ 0:37:15.055 --> 0:37:19.502
1145
+ Because you haven't seen the previous words,
1146
+ so you have no idea how to do that.
1147
+
1148
+ 0:37:19.447 --> 0:37:24.366
1149
+ You won't have seen the bigram, the trigram
1150
+ and all the others, so the probability here
1151
+
1152
+ 0:37:24.366 --> 0:37:27.682
1153
+ will just be based on the probability of ad,
1154
+ so it uses no.
1155
+
1156
+ 0:37:28.588 --> 0:37:38.328
1157
+ If you're having this type of model, what
1158
+ does it do so?
1159
+
1160
+ 0:37:38.157 --> 0:37:43.460
1161
+ This is the last three words.
1162
+
1163
+ 0:37:43.483 --> 0:37:49.837
1164
+ Maybe this representation is messed up because
1165
+ it's mainly on a particular word or source
1166
+
1167
+ 0:37:49.837 --> 0:37:50.260
1168
+ that.
1169
+
1170
+ 0:37:50.730 --> 0:38:00.426
1171
+ Now anyway you have these two information
1172
+ that were two words before was first and therefore:
1173
+
1174
+ 0:38:00.426 --> 0:38:07.234
1175
+ So you have a lot of information here to estimate
1176
+ how good it is.
1177
+
1178
+ 0:38:07.131 --> 0:38:13.293
1179
+ Of course, there could be more information.
1180
+
1181
+ 0:38:13.593 --> 0:38:25.958
1182
+ So all this type of modeling we can do and
1183
+ that we couldn't do beforehand because we always.
1184
+
1185
+ 0:38:27.027 --> 0:38:31.905
1186
+ Don't guess how we do it now.
1187
+
1188
+ 0:38:31.742 --> 0:38:41.826
1189
+ Typically you would have one token for out-of-vocabulary
1190
+ words.
1191
+
1192
+ 0:38:42.602 --> 0:38:45.855
1193
+ All you're doing by carrying coding when it
1194
+ has a fixed dancing.
1195
+
1196
+ 0:38:46.226 --> 0:38:49.439
1197
+ Yeah, you have to do something like that that
1198
+ the opposite way.
1199
+
1200
+ 0:38:50.050 --> 0:38:55.413
1201
+ So yeah, all the vocabulary are by thankcoding
1202
+ where you don't have have all the vocabulary.
1203
+
1204
+ 0:38:55.735 --> 0:39:07.665
1205
+ But then, of course, the byte pair encoding
1206
+ is better with arbitrary context because a
1207
+
1208
+ 0:39:07.665 --> 0:39:11.285
1209
+ problem with byte pair encoding.
1210
+
1211
+ 0:39:17.357 --> 0:39:20.052
1212
+ Anymore questions to the basic same little
1213
+ things.
1214
+
1215
+ 0:39:23.783 --> 0:39:36.162
1216
+ This model we then want to continue is to
1217
+ look into how complex that is or can make things
1218
+
1219
+ 0:39:36.162 --> 0:39:39.155
1220
+ maybe more efficient.
1221
+
1222
+ 0:39:40.580 --> 0:39:47.404
1223
+ At the beginning there was definitely a major
1224
+ challenge.
1225
+
1226
+ 0:39:47.284 --> 0:39:50.431
1227
+ It's still not that easy.
1228
+
1229
+ 0:39:50.310 --> 0:39:58.301
1230
+ All guess follow the talk about their environmental
1231
+ fingerprint.
1232
+
1233
+ 0:39:58.478 --> 0:40:05.686
1234
+ So this calculation is normally heavy, and
1235
+ if you build systems yourself, you have to
1236
+
1237
+ 0:40:05.686 --> 0:40:06.189
1238
+ wait.
1239
+
1240
+ 0:40:06.466 --> 0:40:15.412
1241
+ So it's good to know a bit about how complex
1242
+ things are in order to do a good or efficient.
1243
+
1244
+ 0:40:15.915 --> 0:40:24.706
1245
+ So one thing where most of the calculation
1246
+ really happens is if you're.
1247
+
1248
+ 0:40:25.185 --> 0:40:34.649
1249
+ So in generally all these layers, of course,
1250
+ we're talking about networks and the zones
1251
+
1252
+ 0:40:34.649 --> 0:40:35.402
1253
+ fancy.
1254
+
1255
+ 0:40:35.835 --> 0:40:48.305
1256
+ So what you have to do in order to calculate
1257
+ here these activations, you have this weight.
1258
+
1259
+ 0:40:48.488 --> 0:41:05.021
1260
+ So to make it simple, let's see we have three
1261
+ outputs, and then you just do a matrix multiplication
1262
+
1263
+ 0:41:05.021 --> 0:41:08.493
1264
+ between your weight.
1265
+
1266
+ 0:41:08.969 --> 0:41:19.641
1267
+ That is why the use is so powerful for neural
1268
+ networks because they are very good in doing
1269
+
1270
+ 0:41:19.641 --> 0:41:22.339
1271
+ matrix multiplication.
1272
+
1273
+ 0:41:22.782 --> 0:41:28.017
1274
+ However, for some type of embedding layer
1275
+ this is really very inefficient.
1276
+
1277
+ 0:41:28.208 --> 0:41:37.547
1278
+ So in this input we are doing this calculation.
1279
+
1280
+ 0:41:37.352 --> 0:41:47.085
1281
+ What we are mainly doing is selecting one
1282
+ column.
1283
+
1284
+ 0:41:47.387 --> 0:42:03.570
1285
+ So therefore you can do at least the forward
1286
+ pass a lot more efficient if you don't really
1287
+
1288
+ 0:42:03.570 --> 0:42:07.304
1289
+ do this calculation.
1290
+
1291
+ 0:42:08.348 --> 0:42:20.032
1292
+ So the weight matrix of the first embedding
1293
+ layer is just that in each column you have.
1294
+
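A sketch of why the first layer does not need a full matrix multiplication: with a one-hot input, multiplying by the weight matrix is the same as reading off a single column (the numbers are invented):

```python
W = [                        # hypothetical weight matrix, one column per word
    [0.1, 0.7, 0.3],
    [0.4, 0.2, 0.9],
]
one_hot = [0, 1, 0]          # word number 1

# Full matrix-vector multiplication ...
full = [sum(row[i] * one_hot[i] for i in range(len(one_hot))) for row in W]
# ... gives exactly the same result as simply selecting column 1.
column = [row[1] for row in W]
print(full, column)          # [0.7, 0.2] [0.7, 0.2]
```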
1295
+ 0:42:20.580 --> 0:42:30.990
1296
+ So this is how your initial weights look like
1297
+ and how you can interpret or understand.
1298
+
1299
+ 0:42:32.692 --> 0:42:42.042
1300
+ And this is already relatively important because
1301
+ remember this is a huge dimensional thing,
1302
+
1303
+ 0:42:42.042 --> 0:42:51.392
1304
+ so typically here we have the number of words
1305
+ ten thousand, so this is the word embeddings.
1306
+
1307
+ 0:42:51.451 --> 0:43:00.400
1308
+ Because it's the largest one there, we have
1309
+ entries, while for the others we maybe have.
1310
+
1311
+ 0:43:00.660 --> 0:43:03.402
1312
+ So they are a little bit efficient and are
1313
+ important to make this in.
1314
+
1315
+ 0:43:06.206 --> 0:43:10.529
1316
+ And then you can look at where else the calculations
1317
+ are very difficult.
1318
+
1319
+ 0:43:10.830 --> 0:43:20.294
1320
+ So here we have our individual network, so
1321
+ here are the word embeddings.
1322
+
1323
+ 0:43:20.164 --> 0:43:29.500
1324
+ Then we have one hidden layer, and then you
1325
+ can look at how difficult.
1326
+
1327
+ 0:43:30.270 --> 0:43:42.863
1328
+ We could save a lot of calculations by calculating
1329
+ that by just doing like do the selection because:
1330
+
1331
+ 0:43:42.863 --> 0:43:51.716
1332
+ And then the number of calculations you have
1333
+ to do here is the length.
1334
+
1335
+ 0:43:52.993 --> 0:44:06.206
1336
+ Then we have here the hidden size, that is the
1337
+ hidden size, so the first step of calculation
1338
+
1339
+ 0:44:06.206 --> 0:44:10.260
1340
+ for this metric is an age.
1341
+
1342
+ 0:44:10.730 --> 0:44:20.639
1343
+ Then you have to do some activation function
1344
+ which is this: This is the hidden size hymn
1345
+
1346
+ 0:44:20.639 --> 0:44:29.100
1347
+ because we need the vocabulary socks to calculate
1348
+ the probability for each.
1349
+
1350
+ 0:44:29.889 --> 0:44:40.474
1351
+ And if you look at this number, so if you
1352
+ have a projection sign of one hundred and a
1353
+
1354
+ 0:44:40.474 --> 0:44:45.027
1355
+ vocabulary sign of one hundred, you.
1356
+
1357
+ 0:44:45.425 --> 0:44:53.958
1358
+ And that's why there has been especially at
1359
+ the beginning some ideas on how we can reduce
1360
+
1361
+ 0:44:53.958 --> 0:44:55.570
1362
+ the calculation.
1363
+
1364
+ 0:44:55.956 --> 0:45:02.352
1365
+ And if we really need to calculate all our
1366
+ probabilities, or if we can calculate only some.
1367
+
1368
+ 0:45:02.582 --> 0:45:13.061
1369
+ And there again one important thing to think
1370
+ about is for what you will use my language.
1371
+
1372
+ 0:45:12.943 --> 0:45:21.885
1373
+ One can use it for generations and that's
1374
+ where we will see the next week.
1375
+
1376
+ 0:45:21.766 --> 0:45:22.511
1377
+ And.
1378
+
1379
+ 0:45:23.123 --> 0:45:32.164
1380
+ Initially, if it's just used as a feature,
1381
+ we do not want to use it for generation, but
1382
+
1383
+ 0:45:32.164 --> 0:45:32.575
1384
+ we.
1385
+
1386
+ 0:45:32.953 --> 0:45:41.913
1387
+ And there we might not be interested in all
1388
+ the probabilities, but we already know all
1389
+
1390
+ 0:45:41.913 --> 0:45:49.432
1391
+ the probability of this one word, and then
1392
+ it might be very inefficient.
1393
+
1394
+ 0:45:51.231 --> 0:45:53.638
1395
+ And how can you do that so initially?
1396
+
1397
+ 0:45:53.575 --> 0:45:56.301
1398
+ For example, people look into shortlists.
1399
+
1400
+ 0:45:56.756 --> 0:46:03.321
1401
+ So the idea was this calculation at the end
1402
+ is really very expensive.
1403
+
1404
+ 0:46:03.227 --> 0:46:05.763
1405
+ So can we make that more.
1406
+
1407
+ 0:46:05.945 --> 0:46:17.135
1408
+ And the idea was okay, most words occur
1409
+ very rarely, and some few words occur very,
1410
+
1411
+ 0:46:17.135 --> 0:46:18.644
1412
+ very often.
1413
+
1414
+ 0:46:19.019 --> 0:46:37.644
1415
+ And so they use the smaller imagery, which
1416
+ is maybe very small, and then you merge a new.
1417
+
1418
+ 0:46:37.937 --> 0:46:45.174
1419
+ So you're taking, if the word is in the shortlist,
1420
+ so in the most frequent words.
1421
+
1422
+ 0:46:45.825 --> 0:46:58.287
1423
+ You're taking the probability of this short
1424
+ word by some normalization here, and otherwise
1425
+
1426
+ 0:46:58.287 --> 0:46:59.656
1427
+ you take.
1428
+
1429
+ 0:47:00.020 --> 0:47:00.836
1430
+ Course.
1431
+
1432
+ 0:47:00.734 --> 0:47:09.773
1433
+ It will not be as good, but then we don't
1434
+ have to calculate all the probabilities at the
1435
+
1436
+ 0:47:09.773 --> 0:47:16.038
1437
+ end, but we only have to calculate it for the
1438
+ most frequent.
1439
+
1440
+ 0:47:19.599 --> 0:47:39.477
1441
+ Machines about that, but of course we don't
1442
+ model the probability of the infrequent words.
1443
+
1444
+ 0:47:39.299 --> 0:47:46.658
1445
+ And one idea is to do what is reported as
1446
+ soles for the structure of the layer.
1447
+
1448
+ 0:47:46.606 --> 0:47:53.169
1449
+ You see how some years ago people were very
1450
+ creative in giving names to newer models.
1451
+
1452
+ 0:47:53.813 --> 0:48:00.338
1453
+ And there the idea is that we model the output
1454
+ vocabulary as a cluster tree.
1455
+
1456
+ 0:48:00.680 --> 0:48:08.498
1457
+ So you don't need to mold all of your bodies
1458
+ directly, but you are putting words into.
1459
+
1460
+ 0:48:08.969 --> 0:48:20.623
1461
+ A very intricate word is first in and then
1462
+ in and then in and that is in sub-sub-clusters
1463
+
1464
+ 0:48:20.623 --> 0:48:21.270
1465
+ and.
1466
+
1467
+ 0:48:21.541 --> 0:48:29.936
1468
+ And this is what was mentioned in the past
1469
+ of the work, so these are the subclasses that
1470
+
1471
+ 0:48:29.936 --> 0:48:30.973
1472
+ always go.
1473
+
1474
+ 0:48:30.879 --> 0:48:40.756
1475
+ So if it's in cluster one at the first position
1476
+ then you only look at all the words which are:
1477
+
1478
+ 0:48:40.756 --> 0:48:50.217
1479
+ And then you can calculate the probability
1480
+ of a word again just by the product over these,
1481
+
1482
+ 0:48:50.217 --> 0:48:55.519
1483
+ so the probability of the word is the probability
1484
+ of its class times the probability within the class.
1485
+
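As a sketch of that factorization (all numbers invented): the probability of a word is the probability of its class times the probability of the word within the class, so only small distributions have to be evaluated:

```python
# P(word | history) = P(class | history) * P(word | class, history)
p_class = {"frequent": 0.6, "rare": 0.4}
p_word_given_class = {
    "frequent": {"the": 0.5, "house": 0.3, "is": 0.2},
    "rare":     {"aardvark": 0.7, "zygote": 0.3},
}

def word_prob(word, cls):
    return p_class[cls] * p_word_given_class[cls][word]

print(word_prob("house", "frequent"))   # 0.6 * 0.3 = 0.18
```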
1486
+ 0:48:57.617 --> 0:49:12.331
1487
+ It's maybe more clear where you have the sole
1488
+ architecture, so what you will do is first
1489
+
1490
+ 0:49:12.331 --> 0:49:13.818
1491
+ predict.
1492
+
1493
+ 0:49:14.154 --> 0:49:26.435
1494
+ Then you go to the appropriate sub-class,
1495
+ then you calculate the probability of the sub-class.
1496
+
1497
+ 0:49:27.687 --> 0:49:34.932
1498
+ Anybody have an idea why this is more, more
1499
+ efficient, or if people do it first, it looks
1500
+
1501
+ 0:49:34.932 --> 0:49:35.415
1502
+ more.
1503
+
1504
+ 0:49:42.242 --> 0:49:56.913
1505
+ Yes, so you have to do less calculations,
1506
+ or maybe here you have to calculate the element
1507
+
1508
+ 0:49:56.913 --> 0:49:59.522
1509
+ there, but you.
1510
+
1511
+ 0:49:59.980 --> 0:50:06.116
1512
+ The capabilities in the set classes that you're
1513
+ going through and not for all of them.
1514
+
1515
+ 0:50:06.386 --> 0:50:16.688
1516
+ Therefore, it's only more efficient if you
1517
+ don't need all awkward preferences because
1518
+
1519
+ 0:50:16.688 --> 0:50:21.240
1520
+ you have to even calculate the class.
1521
+
1522
+ 0:50:21.501 --> 0:50:30.040
1523
+ So it's only more efficient in scenarios where
1524
+ you really need to use a language to evaluate.
1525
+
1526
+ 0:50:35.275 --> 0:50:50.164
1527
+ How this works is that on the output layer
1528
+ you only have a vocabulary of: But on the input
1529
+
1530
+ 0:50:50.164 --> 0:51:04.563
1531
+ layer you have always your full vocabulary
1532
+ because at the input we saw that this is not
1533
+
1534
+ 0:51:04.563 --> 0:51:06.690
1535
+ complicated.
1536
+
1537
+ 0:51:06.906 --> 0:51:19.778
1538
+ And then you can cluster down all your words,
1539
+ embedding series of classes, and use that as
1540
+
1541
+ 0:51:19.778 --> 0:51:23.031
1542
+ your classes for that.
1543
+
1544
+ 0:51:22.890 --> 0:51:26.573
1545
+ So yeah, you have words.
1546
+
1547
+ 0:51:29.249 --> 0:51:32.593
1548
+ Is one idea of doing it.
1549
+
1550
+ 0:51:32.459 --> 0:51:44.899
1551
+ There is also a second idea of doing it again,
1552
+ the idea that we don't need the probability.
1553
+
1554
+ 0:51:45.025 --> 0:51:53.401
1555
+ So sometimes it doesn't really need to be
1556
+ a probability to evaluate.
1557
+
1558
+ 0:51:53.280 --> 0:52:05.637
1559
+ It's only important that: And: Here is called
1560
+ self-normalization.
1561
+
1562
+ 0:52:05.450 --> 0:52:19.350
1563
+ What people have done so: in the softmax it
1564
+ is always e to the input divided by the normalization.
1565
+
1566
+ 0:52:19.759 --> 0:52:25.194
1567
+ So this is how we calculate the softmax.
1568
+
1569
+ 0:52:25.825 --> 0:52:42.224
1570
+ And in self-normalization now, the idea is
1571
+ that we don't need to calculate the logarithm.
1572
+
1573
+ 0:52:42.102 --> 0:52:54.284
1574
+ That would be zero, and then you don't even
1575
+ have to calculate the normalization.
1576
+
1577
+ 0:52:54.514 --> 0:53:01.016
1578
+ So how can we achieve that?
1579
+
1580
+ 0:53:00.784 --> 0:53:08.687
1581
+ And then there's the nice thing.
1582
+
1583
+ 0:53:09.009 --> 0:53:14.743
1584
+ And our normal loss aims to maximize the probability.
1585
+
1586
+ 0:53:14.635 --> 0:53:23.833
1587
+ We have this cross entropy loss so that the probability
1588
+ is higher, and now we're just adding.
1589
+
1590
+ 0:53:24.084 --> 0:53:31.617
1591
+ And the second loss just tells us: please
1592
+ train the weights so that the log of Z is zero.
1593
+
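A sketch of that combined training objective: the usual cross entropy plus a penalty that pushes the log of the softmax normalizer towards zero (alpha is the weighting hyperparameter mentioned below; all values are invented):

```python
import math

def self_normalized_loss(scores, target_index, alpha=0.1):
    log_z = math.log(sum(math.exp(s) for s in scores))   # log of the normalizer
    cross_entropy = -(scores[target_index] - log_z)      # the usual loss
    return cross_entropy + alpha * log_z ** 2            # push log Z towards zero

print(self_normalized_loss([2.0, -1.0, 0.5], target_index=0))
```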
1594
+ 0:53:32.352 --> 0:53:38.625
1595
+ So then if it's nearly zero at the end you
1596
+ don't need to calculate this and it's also
1597
+
1598
+ 0:53:38.625 --> 0:53:39.792
1599
+ very efficient.
1600
+
1601
+ 0:53:40.540 --> 0:53:57.335
1602
+ One important thing is this only helps at inference,
1603
+ so during testing we don't need to calculate it.
1604
+
1605
+ 0:54:00.480 --> 0:54:15.006
1606
+ You can do a bit of a hyperparameter here
1607
+ where you do the waiting and how much effort
1608
+
1609
+ 0:54:15.006 --> 0:54:16.843
1610
+ should be.
1611
+
1612
+ 0:54:18.318 --> 0:54:35.037
1613
+ The only disadvantage is that it's no speed
1614
+ up during training and there are other ways
1615
+
1616
+ 0:54:35.037 --> 0:54:37.887
1617
+ of doing that.
1618
+
1619
+ 0:54:41.801 --> 0:54:43.900
1620
+ I'm with you all.
1621
+
1622
+ 0:54:44.344 --> 0:54:48.540
1623
+ Then we are coming very, very briefly like
1624
+ this one here.
1625
+
1626
+ 0:54:48.828 --> 0:54:53.692
1627
+ There are more things on different types of
1628
+ language models.
1629
+
1630
+ 0:54:53.604 --> 0:54:58.028
1631
+ We are having a very short view of restricted Boltzmann machines.
1632
+
1633
+ 0:54:58.298 --> 0:55:09.737
1634
+ And then we'll talk about recurrent neural
1635
+ networks for our language models because they
1636
+
1637
+ 0:55:09.737 --> 0:55:17.407
1638
+ have the advantage that we can even further
1639
+ improve.
1640
+
1641
+ 0:55:18.238 --> 0:55:24.343
1642
+ Different types of neural networks: There's
1643
+ also different types of neural networks.
1644
+
1645
+ 0:55:24.269 --> 0:55:30.178
1646
+ These Boltzmann machines are not having input and output.
1647
+
1648
+ 0:55:30.330 --> 0:55:39.180
1649
+ They have these binary units: And they define
1650
+ an energy function on the network, which can
1651
+
1652
+ 0:55:39.180 --> 0:55:46.864
1653
+ be in respect of bottom machines efficiently
1654
+ calculated, and restricted means:
1655
+
1656
+ 0:55:46.767 --> 0:55:53.149
1657
+ You only have connections between the input
1658
+ and the hidden layer.
1659
+
1660
+ 0:55:53.393 --> 0:56:00.190
1661
+ So you see here you don't have input and output,
1662
+ you just have an input and you calculate what.
1663
+
1664
+ 0:56:00.460 --> 0:56:16.429
1665
+ Which of course nicely fits with the idea
1666
+ we're having, so you can use this for N gram
1667
+
1668
+ 0:56:16.429 --> 0:56:19.182
1669
+ language models.
1670
+
1671
+ 0:56:19.259 --> 0:56:25.187
1672
+ Calculating the probability of the input by
1673
+ this type of neural networks.
1674
+
1675
+ 0:56:26.406 --> 0:56:30.582
1676
+ And the advantage of this type of model, the
1677
+ Boltzmann machine, is that it is:
1678
+
1679
+ 0:56:30.550 --> 0:56:38.629
1680
+ Very fast to integrate it, so that one was
1681
+ the first one which was used during decoding.
1682
+
1683
+ 0:56:38.938 --> 0:56:50.103
1684
+ The problem with the other neural language
1685
+ models is that they were too slow at performing the calculation.
1686
+
1687
+ 0:56:50.230 --> 0:57:00.114
1688
+ So what people typically did is we talked
1689
+ about a best list, so they generated a most
1690
+
1691
+ 0:57:00.114 --> 0:57:05.860
1692
+ probable output, and then they scored each
1693
+ entry.
1694
+
1695
+ 0:57:06.146 --> 0:57:10.884
1696
+ With the language model, and then they only changed
1697
+ the order of the entries based on that score.
1698
+
1699
+ 0:57:11.231 --> 0:57:20.731
1700
+ The n-best list is maybe only a hundred entries,
1701
+ while during decoding you will look at several
1702
+
1703
+ 0:57:20.731 --> 0:57:21.787
1704
+ thousand.
1705
+
1706
+ 0:57:26.186 --> 0:57:40.437
1707
+ This but let's look at the context, so we
1708
+ have now seen neural language models.
1709
+
1710
+ 0:57:40.254 --> 0:57:43.737
1711
+ There is the big question of the context.
1712
+
1713
+ 0:57:44.084 --> 0:57:57.552
1714
+ Remember in n-gram language models the context is not always N minus one words
1715
+ because sometimes you have to back off or interpolate
1716
+
1717
+ 0:57:57.552 --> 0:57:59.953
1718
+ to lower order n-grams.
1719
+
1720
+ 0:58:00.760 --> 0:58:05.504
1721
+ However, in neural models we always have all
1722
+ of these inputs and some of these.
1723
+
1724
+ 0:58:07.147 --> 0:58:21.262
1725
+ The disadvantage is that you are still limited
1726
+ in your context, and if you remember the sentence
1727
+
1728
+ 0:58:21.262 --> 0:58:23.008
1729
+ from last,.
1730
+
1731
+ 0:58:22.882 --> 0:58:28.445
1732
+ Sometimes you need more context and there's
1733
+ unlimited contexts that you might need and
1734
+
1735
+ 0:58:28.445 --> 0:58:34.838
1736
+ you can always create sentences where you need
1737
+ this file context in order to put a good estimation.
1738
+
1739
+ 0:58:35.315 --> 0:58:44.955
1740
+ Can we also do it different in order to better
1741
+ understand that it makes sense to view?
1742
+
1743
+ 0:58:45.445 --> 0:58:56.160
1744
+ Sequence labeling tasks: So sequence labeling
1745
+ tasks are a very common type of task in natural
1746
+
1747
+ 0:58:56.160 --> 0:59:03.418
1748
+ language processing where you have an input
1749
+ sequence and then.
1750
+
1751
+ 0:59:03.323 --> 0:59:08.663
1752
+ An output token, so you have one output for each
1753
+ input, so machine translation is not a sequence
1754
+
1755
+ 0:59:08.663 --> 0:59:14.063
1756
+ labeling task because the number of inputs
1757
+ and the number of outputs is different so you
1758
+
1759
+ 0:59:14.063 --> 0:59:19.099
1760
+ put in a string German which has five words
1761
+ and the output can be six or seven or.
1762
+
1763
+ 0:59:19.619 --> 0:59:20.155
1764
+ In sequence
1765
+
1766
+ 0:59:20.095 --> 0:59:24.084
1767
+ labeling you always have the same number of inputs
1768
+ and the same number of outputs.
1769
+
1770
+ 0:59:24.944 --> 0:59:40.940
1771
+ And you can model language modeling as that,
1772
+ and you just say a label for each word is always
1773
+
1774
+ 0:59:40.940 --> 0:59:43.153
1775
+ a next word.
1776
+
1777
+ 0:59:45.705 --> 0:59:54.823
1778
+ This is the more general you can think of
1779
+ it, for example part-of-speech tagging or named entity
1780
+
1781
+ 0:59:54.823 --> 0:59:56.202
1782
+ recognition.
1783
+
1784
+ 0:59:58.938 --> 1:00:08.081
1785
+ And if you look now at the output tokens, in
1786
+ general sequence labeling they can depend on all input
1787
+
1788
+ 1:00:08.081 --> 1:00:08.912
1789
+ tokens.
1790
+
1791
+ 1:00:09.869 --> 1:00:11.260
1792
+ Nice thing.
1793
+
1794
+ 1:00:11.144 --> 1:00:21.872
1795
+ In our case, the output tokens are the same
1796
+ so we can easily model it that they only depend
1797
+
1798
+ 1:00:21.872 --> 1:00:24.787
1799
+ on all the input tokens.
1800
+
1801
+ 1:00:24.670 --> 1:00:28.988
1802
+ So we have this whether it's or so.
1803
+
1804
+ 1:00:31.011 --> 1:00:42.945
1805
+ But we can also look at a specific
1806
+ type of sequence labeling, unidirectional sequence
1807
+
1808
+ 1:00:42.945 --> 1:00:44.188
1809
+ labeling.
1810
+
1811
+ 1:00:44.584 --> 1:00:58.215
1812
+ And that's exactly what we want for language
1813
+ modeling: the next word only depends on all the previous
1814
+
1815
+ 1:00:58.215 --> 1:01:00.825
1816
+ words that we're.
1817
+
1818
+ 1:01:01.321 --> 1:01:12.899
1819
+ Mean, of course, that's not completely true
1820
+ in language, since the following context might also
1821
+
1822
+ 1:01:12.899 --> 1:01:14.442
1823
+ be helpful.
1824
+
1825
+ 1:01:14.654 --> 1:01:22.468
1826
+ We will model always the probability of a
1827
+ word given on its history, and therefore we
1828
+
1829
+ 1:01:22.468 --> 1:01:23.013
1830
+ need.
1831
+
1832
+ 1:01:23.623 --> 1:01:29.896
1833
+ And currently we did there this approximation
1834
+ in sequence labeling that we have this windowing
1835
+
1836
+ 1:01:29.896 --> 1:01:30.556
1837
+ approach.
1838
+
1839
+ 1:01:30.951 --> 1:01:43.975
1840
+ So in order to predict this type of word we
1841
+ always look at the previous three words and
1842
+
1843
+ 1:01:43.975 --> 1:01:48.416
1844
+ then to do this one we again.
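A minimal sketch of such a windowed feed-forward language model, assuming PyTorch; the vocabulary size, dimensions and window length are illustrative, not taken from the lecture:

```python
# Windowed (n-gram style) feed-forward LM: predict the next word from a fixed context window.
import torch
import torch.nn as nn

class WindowLM(nn.Module):
    def __init__(self, vocab_size=10000, emb_dim=128, window=3, hidden=256):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)        # shared word embeddings
        self.hidden = nn.Linear(window * emb_dim, hidden)   # concatenated context window
        self.out = nn.Linear(hidden, vocab_size)            # scores over the full vocabulary

    def forward(self, context_ids):                         # (batch, window)
        e = self.emb(context_ids).flatten(1)                # (batch, window * emb_dim)
        h = torch.tanh(self.hidden(e))
        return self.out(h)                                  # logits for the next word

logits = WindowLM()(torch.randint(0, 10000, (2, 3)))        # predict word 4 from words 1-3
```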
1845
+
1846
+ 1:01:49.389 --> 1:01:55.137
1847
+ If you are into neural networks you recognize
1848
+ this type of structure.
1849
+
1850
+ 1:01:55.055 --> 1:01:57.522
1851
+ Also are the typical neural.
1852
+
1853
+ 1:01:58.938 --> 1:02:09.688
1854
+ Yes, so this is like the n-gram language model,
1855
+ and at least in some way compared to the original,
1856
+
1857
+ 1:02:09.688 --> 1:02:12.264
1858
+ you're always looking.
1859
+
1860
+ 1:02:14.334 --> 1:02:30.781
1861
+ However, there are also other types of neural
1862
+ network structures which we can use for sequence.
1863
+
1864
+ 1:02:32.812 --> 1:02:34.678
1865
+ That we can do so.
1866
+
1867
+ 1:02:34.580 --> 1:02:39.646
1868
+ The idea is in recurrent neural network structure.
1869
+
1870
+ 1:02:39.547 --> 1:02:43.225
1871
+ We are saving the complete history.
1872
+
1873
+ 1:02:43.623 --> 1:02:55.118
1874
+ So again we have to do like this fix size
1875
+ representation because neural networks always
1876
+
1877
+ 1:02:55.118 --> 1:02:56.947
1878
+ need to have.
1879
+
1880
+ 1:02:57.157 --> 1:03:05.258
1881
+ And then we start with an initial value for
1882
+ our storage.
1883
+
1884
+ 1:03:05.116 --> 1:03:15.919
1885
+ We are giving our first input and then calculating
1886
+ the new representation.
1887
+
1888
+ 1:03:16.196 --> 1:03:26.328
1889
+ If you look at this, it's just again your
1890
+ network with two types of inputs: your word and
1891
+
1892
+ 1:03:26.328 --> 1:03:29.743
1893
+ in your initial hidden state.
1894
+
1895
+ 1:03:30.210 --> 1:03:46.468
1896
+ Then you can apply it to the next type of
1897
+ input and you're again having.
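A minimal sketch of this recurrent update, assuming PyTorch; names and sizes are illustrative:

```python
# The new hidden state is computed from the current input and the previous hidden
# state, and the same parameters are reused at every position of the sequence.
import torch
import torch.nn as nn

emb_dim, hid_dim = 64, 128
W_x = nn.Linear(emb_dim, hid_dim)
W_h = nn.Linear(hid_dim, hid_dim)

def step(x_t, h_prev):
    return torch.tanh(W_x(x_t) + W_h(h_prev))

h = torch.zeros(1, hid_dim)                 # initial value for the "storage"
for x_t in torch.randn(5, 1, emb_dim):      # run over a sequence of 5 word embeddings
    h = step(x_t, h)                        # h now summarizes the whole history so far
```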
1898
+
1899
+ 1:03:47.367 --> 1:03:53.306
1900
+ Nice thing is now that you can do now step
1901
+ by step by step, so all the way over.
1902
+
1903
+ 1:03:55.495 --> 1:04:05.245
1904
+ The nice thing that we are having here now
1905
+ is that we are having context information from
1906
+
1907
+ 1:04:05.245 --> 1:04:07.195
1908
+ all the previous.
1909
+
1910
+ 1:04:07.607 --> 1:04:13.582
1911
+ So if you're looking like based on which words
1912
+ do you use here, calculate your ability of
1913
+
1914
+ 1:04:13.582 --> 1:04:14.180
1915
+ varying.
1916
+
1917
+ 1:04:14.554 --> 1:04:20.128
1918
+ It depends on is based on this path.
1919
+
1920
+ 1:04:19.977 --> 1:04:33.085
1921
+ It depends on and this hidden state was influenced
1922
+ by this one and this hidden state.
1923
+
1924
+ 1:04:33.473 --> 1:04:37.741
1925
+ A new way to model probabilitySo now we're
1926
+ having something new.
1927
+
1928
+ 1:04:37.675 --> 1:04:46.451
1929
+ We can really model the word probability not
1930
+ only on a fixed context.
1931
+
1932
+ 1:04:46.906 --> 1:04:53.570
1933
+ Because the in-states we're having here in
1934
+ our area are influenced by all the trivia.
1935
+
1936
+ 1:04:56.296 --> 1:05:00.909
1937
+ So how is that to mean?
1938
+
1939
+ 1:05:00.717 --> 1:05:16.290
1940
+ If you're not thinking about the history of
1941
+ clustering, we said the clustering.
1942
+
1943
+ 1:05:16.736 --> 1:05:24.261
1944
+ So do not need to do any clustering here,
1945
+ and we also see how things are put together
1946
+
1947
+ 1:05:24.261 --> 1:05:26.273
1948
+ in order to really do.
1949
+
1950
+ 1:05:29.489 --> 1:05:43.433
1951
+ In the green box this way since we are starting
1952
+ from the left point to the right.
1953
+
1954
+ 1:05:44.524 --> 1:05:48.398
1955
+ And that's right, so they're clustered in
1956
+ some parts.
1957
+
1958
+ 1:05:48.326 --> 1:05:58.827
1959
+ Here is some type of clustering happening:
1960
+ It's continuous representations, but a smaller
1961
+
1962
+ 1:05:58.827 --> 1:06:02.677
1963
+ difference doesn't matter again.
1964
+
1965
+ 1:06:02.560 --> 1:06:10.846
1966
+ So if you have a lot of different histories,
1967
+ the similarity.
1968
+
1969
+ 1:06:11.071 --> 1:06:15.791
1970
+ Because in order to do the final restriction
1971
+ you only do it based on the green box.
1972
+
1973
+ 1:06:16.156 --> 1:06:24.284
1974
+ So you are now again still learning some type
1975
+ of clasp.
1976
+
1977
+ 1:06:24.139 --> 1:06:30.238
1978
+ You don't have to do this hard decision.
1979
+
1980
+ 1:06:30.570 --> 1:06:39.013
1981
+ The only restriction you are giving is you
1982
+ have to install everything that is important.
1983
+
1984
+ 1:06:39.359 --> 1:06:54.961
1985
+ So it's a different type of limitation, so
1986
+ you calculate the probability based on the
1987
+
1988
+ 1:06:54.961 --> 1:06:57.138
1989
+ last words.
1990
+
1991
+ 1:06:57.437 --> 1:07:09.645
1992
+ That is how you still need some cluster things
1993
+ in order to do it efficiently.
1994
+
1995
+ 1:07:09.970 --> 1:07:25.311
1996
+ But this is where things get merged together
1997
+ in this type of hidden representation, which
1998
+
1999
+ 1:07:25.311 --> 1:07:28.038
2000
+ is then merged.
2001
+
2002
+ 1:07:28.288 --> 1:07:33.104
2003
+ On the previous words, but they are some other
2004
+ bottleneck in order to make a good estimation.
2005
+
2006
+ 1:07:34.474 --> 1:07:41.242
2007
+ So the idea is that we can store all our history
2008
+ into one lecture.
2009
+
2010
+ 1:07:41.581 --> 1:07:47.351
2011
+ Which is very good and makes it more strong.
2012
+
2013
+ 1:07:47.223 --> 1:07:51.636
2014
+ Next we come to problems of that.
2015
+
2016
+ 1:07:51.507 --> 1:07:57.870
2017
+ Of course, at some point it might be difficult.
2018
+
2019
+ 1:07:58.398 --> 1:08:02.230
2020
+ Then maybe things get all overwritten, or
2021
+ you cannot store everything in there.
2022
+
2023
+ 1:08:02.662 --> 1:08:04.514
2024
+ So,.
2025
+
2026
+ 1:08:04.184 --> 1:08:10.252
2027
+ Therefore, yet for short things like signal
2028
+ sentences that works well, but especially if
2029
+
2030
+ 1:08:10.252 --> 1:08:16.184
2031
+ you think of other tasks like harmonisation
2032
+ where a document based on T where you need
2033
+
2034
+ 1:08:16.184 --> 1:08:22.457
2035
+ to consider a full document, these things got
2036
+ a bit more complicated and we learned another
2037
+
2038
+ 1:08:22.457 --> 1:08:23.071
2039
+ type of.
2040
+
2041
+ 1:08:24.464 --> 1:08:30.455
2042
+ For the further in order to understand these
2043
+ networks, it's good to have both views always.
2044
+
2045
+ 1:08:30.710 --> 1:08:39.426
2046
+ So this is the unroll view, so you have this
2047
+ type of network.
2048
+
2049
+ 1:08:39.285 --> 1:08:47.769
2050
+ Therefore, it can be shown as: We have here
2051
+ the output and here's your network which is
2052
+
2053
+ 1:08:47.769 --> 1:08:52.108
2054
+ connected by itself and that is a recurrent.
2055
+
2056
+ 1:08:56.176 --> 1:09:11.033
2057
+ There is one challenge in these networks and
2058
+ that is the training so the nice thing is train
2059
+
2060
+ 1:09:11.033 --> 1:09:11.991
2061
+ them.
2062
+
2063
+ 1:09:12.272 --> 1:09:20.147
2064
+ So the idea is we don't really know how to
2065
+ train them, but if you unroll them like this,.
2066
+
2067
+ 1:09:20.540 --> 1:09:38.054
2068
+ It's exactly the same so you can measure your
2069
+ arrows and then you propagate your arrows.
2070
+
2071
+ 1:09:38.378 --> 1:09:45.647
2072
+ Now the nice thing is if you unroll something,
2073
+ it's a feet forward and you can train it.
2074
+
2075
+ 1:09:46.106 --> 1:09:56.493
2076
+ The only important thing is, of course, for
2077
+ different inputs you have to take that into
2078
+
2079
+ 1:09:56.493 --> 1:09:57.555
2080
+ account.
2081
+
2082
+ 1:09:57.837 --> 1:10:07.621
2083
+ But since parameters are shared, it's somehow
2084
+ similar and you can train that the training
2085
+
2086
+ 1:10:07.621 --> 1:10:08.817
2087
+ algorithm.
2088
+
2089
+ 1:10:10.310 --> 1:10:16.113
2090
+ One thing which makes things difficult is
2091
+ what is referred to as the vanishing gradient.
2092
+
2093
+ 1:10:16.048 --> 1:10:21.683
2094
+ So we are saying there is a big advantage
2095
+ of these models and that's why we are using
2096
+
2097
+ 1:10:21.683 --> 1:10:22.076
2098
+ that.
2099
+
2100
+ 1:10:22.010 --> 1:10:27.960
2101
+ The output here does not only depend on the
2102
+ current input of a last three but on anything
2103
+
2104
+ 1:10:27.960 --> 1:10:29.415
2105
+ that was said before.
2106
+
2107
+ 1:10:29.809 --> 1:10:32.803
2108
+ That's a very strong thing is the motivation
2109
+ of using RNNs.
2110
+
2111
+ 1:10:33.593 --> 1:10:44.599
2112
+ However, if you're using standard RNNs, the influence
2113
+ here gets smaller and smaller, and the models.
2114
+
2115
+ 1:10:44.804 --> 1:10:55.945
2116
+ Because the gradients get smaller and smaller,
2117
+ and so the error here propagated to this one,
2118
+
2119
+ 1:10:55.945 --> 1:10:59.659
2120
+ this contributes to the error.
2121
+
2122
+ 1:11:00.020 --> 1:11:06.710
2123
+ And yeah, that's why standard R&amp;S are
2124
+ difficult or have to become boosters.
2125
+
2126
+ 1:11:07.247 --> 1:11:11.439
2127
+ How to make neural networks more complex: So
2128
+ if we are talking about RNNs nowadays,
2129
+
2130
+ 1:11:11.791 --> 1:11:19.532
2131
+ What we are typically meaning are long short
2132
+ memories.
2133
+
2134
+ 1:11:19.391 --> 1:11:30.933
2135
+ You see there by now quite old already, but
2136
+ they have special gating mechanisms.
2137
+
2138
+ 1:11:31.171 --> 1:11:41.911
2139
+ So in the language model tasks, for example
2140
+ in some other story information, all this sentence
2141
+
2142
+ 1:11:41.911 --> 1:11:44.737
2143
+ started with a question.
2144
+
2145
+ 1:11:44.684 --> 1:11:51.886
2146
+ Because if you only look at the five last
2147
+ five words, it's often no longer clear as a
2148
+
2149
+ 1:11:51.886 --> 1:11:52.556
2150
+ normal.
2151
+
2152
+ 1:11:53.013 --> 1:12:06.287
2153
+ So there you have these mechanisms with the
2154
+ right gate in order to store things for a longer
2155
+
2156
+ 1:12:06.287 --> 1:12:08.571
2157
+ time into your.
2158
+
2159
+ 1:12:10.730 --> 1:12:20.147
2160
+ Here they are used in, in, in, in selling
2161
+ quite a lot of works.
2162
+
2163
+ 1:12:21.541 --> 1:12:30.487
2164
+ For especially text machine translation now,
2165
+ the standard is to do transformer base models.
2166
+
2167
+ 1:12:30.690 --> 1:12:42.857
2168
+ But for example, this type of in architecture
2169
+ we have later one lecture about efficiency.
2170
+
2171
+ 1:12:42.882 --> 1:12:53.044
2172
+ And there in the decoder and partial networks
2173
+ they are still using our edges because then.
2174
+
2175
+ 1:12:53.473 --> 1:12:57.542
2176
+ So it's not that our ends are of no importance.
2177
+
2178
+ 1:12:59.239 --> 1:13:09.178
2179
+ In order to make them strong, there are some
2180
+ more things which are helpful and should be:
2181
+
2182
+ 1:13:09.178 --> 1:13:19.669
2183
+ So one thing is it's a very easy and nice trick
2184
+ to make this neon network stronger and better.
2185
+
2186
+ 1:13:19.739 --> 1:13:21.619
2187
+ So, of course, it doesn't work always.
2188
+
2189
+ 1:13:21.571 --> 1:13:23.452
2190
+ They have to have enough training to.
2191
+
2192
+ 1:13:23.763 --> 1:13:29.583
2193
+ But in general that is the easiest way of
2194
+ making your model bigger and stronger is to
2195
+
2196
+ 1:13:29.583 --> 1:13:30.598
2197
+ increase your.
2198
+
2199
+ 1:13:30.630 --> 1:13:43.244
2200
+ And you've seen that with a large size model
2201
+ they are always braggling about.
2202
+
2203
+ 1:13:43.903 --> 1:13:53.657
2204
+ This is one way so the question is how do
2205
+ you get more parameters?
2206
+
2207
+ 1:13:53.511 --> 1:14:04.947
2208
+ There's two ways you can make your representations:
2209
+ And the other thing is its octave deep learning,
2210
+
2211
+ 1:14:04.947 --> 1:14:10.043
2212
+ so the other thing is to make your networks.
2213
+
2214
+ 1:14:11.471 --> 1:14:13.831
2215
+ And then you can also get more work off.
2216
+
2217
+ 1:14:14.614 --> 1:14:19.931
2218
+ There's one problem with this and with more
2219
+ deeper networks.
2220
+
2221
+ 1:14:19.844 --> 1:14:23.332
2222
+ It's very similar to what we saw with.
2223
+
2224
+ 1:14:23.603 --> 1:14:34.755
2225
+ With RNNs we have this problem of gradient flow:
2226
+ as it flows back, the gradient gets
2227
+
2228
+ 1:14:34.755 --> 1:14:35.475
2229
+ very.
2230
+
2231
+ 1:14:35.795 --> 1:14:41.114
2232
+ Exactly the same thing happens in deep.
2233
+
2234
+ 1:14:40.981 --> 1:14:52.286
2235
+ If you take the gradient and tell it's the
2236
+ right or wrong, then you're propagating.
2237
+
2238
+ 1:14:52.612 --> 1:14:53.228
2239
+ Three layers.
2240
+
2241
+ 1:14:53.184 --> 1:14:56.440
2242
+ It's no problem, but if you're going to ten,
2243
+ twenty or a hundred layers.
2244
+
2245
+ 1:14:57.797 --> 1:14:59.690
2246
+ That is getting typically a problem.
2247
+
2248
+ 1:15:00.060 --> 1:15:10.659
2249
+ People are doing and they are using what is
2250
+ called residual connections.
2251
+
2252
+ 1:15:10.510 --> 1:15:15.889
2253
+ That's a very helpful idea, which.
2254
+
2255
+ 1:15:15.956 --> 1:15:20.309
2256
+ And so the idea is that these networks.
2257
+
2258
+ 1:15:20.320 --> 1:15:30.694
2259
+ In between should calculate really what is
2260
+ a new representation, but they are calculating
2261
+
2262
+ 1:15:30.694 --> 1:15:31.386
2263
+ what.
2264
+
2265
+ 1:15:31.731 --> 1:15:37.585
2266
+ And therefore in the end the output of a layer
2267
+ is always added to the input.
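A minimal sketch of such a residual connection, assuming PyTorch; the block inside is illustrative:

```python
# The layer only learns what to change; its output is added to its input,
# so the error signal can also flow back directly through the skip path.
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim=256):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, x):
        return x + self.f(x)   # output = input + learned change

y = ResidualBlock()(torch.randn(8, 256))
```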
2268
+
2269
+ 1:15:38.318 --> 1:15:48.824
2270
+ The nice thing is that later, if you are doing
2271
+ back propagation, this gives a very fast path back.
2272
+
2273
+ 1:15:49.209 --> 1:16:01.896
2274
+ So that is what you're seeing nowadays in
2275
+ very deep architectures, not only as others,
2276
+
2277
+ 1:16:01.896 --> 1:16:04.229
2278
+ but you always.
2279
+
2280
+ 1:16:04.704 --> 1:16:07.388
2281
+ Has two advantages.
2282
+
2283
+ 1:16:07.253 --> 1:16:15.264
2284
+ On the one hand, it's more easy to learn a
2285
+ representation.
2286
+
2287
+ 1:16:15.128 --> 1:16:18.799
2288
+ On the other hand, these.
2289
+
2290
+ 1:16:22.082 --> 1:16:24.114
2291
+ Goods.
2292
+
2293
+ 1:16:23.843 --> 1:16:31.763
2294
+ That much for the new record before, so the
2295
+ last thing now means this.
2296
+
2297
+ 1:16:31.671 --> 1:16:36.400
2298
+ Language was used in the molds itself.
2299
+
2300
+ 1:16:36.279 --> 1:16:46.709
2301
+ Now we're seeing them again, but one thing
2302
+ that at the beginning was very essential.
2303
+
2304
+ 1:16:46.967 --> 1:16:57.655
2305
+ So people really trained, in part, the language
2306
+ models only to get this type of embeddings
2307
+
2308
+ 1:16:57.655 --> 1:17:04.166
2309
+ and therefore we want to look a bit more into
2310
+ these.
2311
+
2312
+ 1:17:09.229 --> 1:17:13.456
2313
+ Some last words on the word embeddings.
2314
+
2315
+ 1:17:13.353 --> 1:17:22.080
2316
+ The interesting thing is that word embeddings
2317
+ can be used for very different tasks.
2318
+
2319
+ 1:17:21.976 --> 1:17:27.173
2320
+ The advantage is we can train the word embedded.
2321
+
2322
+ 1:17:27.347 --> 1:17:31.334
2323
+ The nice thing is you can train that on just large
2324
+ amounts of data.
2325
+
2326
+ 1:17:31.931 --> 1:17:40.937
2327
+ And then if you have these word embeddings
2328
+ you don't have a layer of ten thousand any
2329
+
2330
+ 1:17:40.937 --> 1:17:41.566
2331
+ more.
2332
+
2333
+ 1:17:41.982 --> 1:17:52.231
2334
+ So then you can train a small model to do
2335
+ any other tasks and therefore you're more.
2336
+
2337
+ 1:17:52.532 --> 1:17:58.761
2338
+ Initial word embeddings really depend only
2339
+ on the word itself.
2340
+
2341
+ 1:17:58.662 --> 1:18:07.350
2342
+ If you look at the two meanings of can, the
2343
+ can of beans, or can they do that, some of
2344
+
2345
+ 1:18:07.350 --> 1:18:08.748
2346
+ the embedded.
2347
+
2348
+ 1:18:09.189 --> 1:18:12.395
2349
+ That cannot be resolved.
2350
+
2351
+ 1:18:12.267 --> 1:18:23.907
2352
+ Therefore, you need to know the context, and
2353
+ if you look at the higher levels that people
2354
+
2355
+ 1:18:23.907 --> 1:18:27.917
2356
+ are doing in the context, but.
2357
+
2358
+ 1:18:29.489 --> 1:18:33.757
2359
+ However, even this one has quite very interesting.
2360
+
2361
+ 1:18:34.034 --> 1:18:44.644
2362
+ So people like to visualize that they're always
2363
+ a bit difficult because if you look at this
2364
+
2365
+ 1:18:44.644 --> 1:18:47.182
2366
+ word, vector or word.
2367
+
2368
+ 1:18:47.767 --> 1:18:52.879
2369
+ And drawing your five hundred dimensional
2370
+ vector is still a bit challenging.
2371
+
2372
+ 1:18:53.113 --> 1:19:12.464
2373
+ So you cannot directly do that, so what people
2374
+ have to do is learn some type of dimensionality reduction.
2375
+
2376
+ 1:19:13.073 --> 1:19:17.216
2377
+ And of course then yes some information gets
2378
+ lost but you can try it.
2379
+
2380
+ 1:19:18.238 --> 1:19:28.122
2381
+ And you see, for example, this is the most
2382
+ famous and common example, so what you can
2383
+
2384
+ 1:19:28.122 --> 1:19:37.892
2385
+ look is you can look at the difference between
2386
+ the male and the female word English.
2387
+
2388
+ 1:19:38.058 --> 1:19:40.389
2389
+ And you can do that for very different words.
2390
+
2391
+ 1:19:40.780 --> 1:19:45.403
2392
+ And that is where, where the masks come into
2393
+ that, what people then look into.
2394
+
2395
+ 1:19:45.725 --> 1:19:50.995
2396
+ So what you can now, for example, do is you
2397
+ can calculate the difference between man and
2398
+
2399
+ 1:19:50.995 --> 1:19:51.410
2400
+ woman.
2401
+
2402
+ 1:19:52.232 --> 1:19:56.356
2403
+ And what you can do then you can take the
2404
+ embedding of king.
2405
+
2406
+ 1:19:56.290 --> 1:20:02.341
2407
+ You can add on it the difference between men
2408
+ and women and where people get really excited.
2409
+
2410
+ 1:20:02.275 --> 1:20:05.524
2411
+ Then you can look at what are the similar
2412
+ words.
2413
+
2414
+ 1:20:05.457 --> 1:20:09.220
2415
+ So you won't, of course, directly hit the
2416
+ correct word.
2417
+
2418
+ 1:20:09.153 --> 1:20:10.501
2419
+ It's a continuous.
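A small sketch of this embedding arithmetic with cosine nearest neighbours; the random vectors below are placeholders for real trained embeddings:

```python
# king - man + woman, then look at the nearest neighbours of the resulting vector.
import numpy as np

def nearest(query, emb, topk=3):
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
    return sorted(emb, key=lambda w: -cos(query, emb[w]))[:topk]

rng = np.random.default_rng(0)
emb = {w: rng.normal(size=50) for w in ["king", "man", "woman", "queen", "berlin"]}
query = emb["king"] - emb["man"] + emb["woman"]
print(nearest(query, emb))   # with real embeddings, "queen" is typically among the neighbours
```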
2420
+
2421
+ 1:20:10.790 --> 1:20:24.062
2422
+ But you can look at what are the nearest neighbors
2423
+ to the same, and often these words are near.
2424
+
2425
+ 1:20:24.224 --> 1:20:33.911
2426
+ So it's somehow weird that the difference
2427
+ between these words is always the same.
2428
+
2429
+ 1:20:34.374 --> 1:20:37.308
2430
+ Can do different things.
2431
+
2432
+ 1:20:37.191 --> 1:20:47.506
2433
+ You can also imagine that for word tenses,
2434
+ swimming relates to swim, and with walking and
2435
+
2436
+ 1:20:47.506 --> 1:20:49.047
2437
+ walking you.
2438
+
2439
+ 1:20:49.469 --> 1:20:53.040
2440
+ So you can try to use him.
2441
+
2442
+ 1:20:52.907 --> 1:20:56.254
2443
+ It's no longer like say.
2444
+
2445
+ 1:20:56.120 --> 1:21:04.020
2446
+ The interesting thing is nobody taught the model
2447
+ the principle.
2448
+
2449
+ 1:21:04.284 --> 1:21:09.910
2450
+ So it's purely trained on the task of doing
2451
+ the next word prediction.
2452
+
2453
+ 1:21:10.230 --> 1:21:23.669
2454
+ And even for some information like the capital,
2455
+ this is the difference between the capital.
2456
+
2457
+ 1:21:23.823 --> 1:21:33.760
2458
+ Is another visualization here where you have
2459
+ done the same things on the difference between.
2460
+
2461
+ 1:21:33.853 --> 1:21:41.342
2462
+ And you see it's not perfect, but it's pointing
2463
+ in the right direction, so you can even use that for
2464
+
2465
+ 1:21:41.342 --> 1:21:42.936
2466
+ question answering.
2467
+
2468
+ 1:21:42.856 --> 1:21:50.322
2469
+ If you know three countries and their capitals,
2470
+ you can do what is the difference between them.
2471
+
2472
+ 1:21:50.242 --> 1:21:53.375
2473
+ You apply that to a new country, and.
2474
+
2475
+ 1:21:54.834 --> 1:22:02.280
2476
+ So these models are able to really learn a
2477
+ lot of information and collapse this information
2478
+
2479
+ 1:22:02.280 --> 1:22:04.385
2480
+ into this representation.
2481
+
2482
+ 1:22:05.325 --> 1:22:07.679
2483
+ And this just by doing next word prediction.
2484
+
2485
+ 1:22:07.707 --> 1:22:22.358
2486
+ And that also explains a bit maybe or explains
2487
+ strongly, but motivates what is the main advantage
2488
+
2489
+ 1:22:22.358 --> 1:22:26.095
2490
+ of this type of neural models.
2491
+
2492
+ 1:22:28.568 --> 1:22:41.599
2493
+ So to summarize what we did today, so what
2494
+ you should hopefully have with you is: Then
2495
+
2496
+ 1:22:41.599 --> 1:22:49.238
2497
+ how we can do language modeling with neural networks.
2498
+
2499
+ 1:22:49.449 --> 1:22:55.849
2500
+ We looked at three different architectures:
2501
+ We looked into the feed-forward language model,
2502
+
2503
+ 1:22:55.849 --> 1:22:59.050
2504
+ the R&amp;N, and the one based the balsamic.
2505
+
2506
+ 1:22:59.039 --> 1:23:04.559
2507
+ And finally, there are different architectures
2508
+ to do in neural networks.
2509
+
2510
+ 1:23:04.483 --> 1:23:10.961
2511
+ We have seen feed-forward neural networks and
2512
+ recurrence-based neural networks, and we'll see in the
2513
+
2514
+ 1:23:10.961 --> 1:23:14.390
2515
+ next lectures the last type of architecture.
2516
+
2517
+ 1:23:15.915 --> 1:23:17.438
2518
+ Any questions.
2519
+
2520
+ 1:23:20.680 --> 1:23:27.360
2521
+ Then thanks a lot, and next I'm just there,
2522
+ we'll be again on order to.
2523
+
demo_data/lectures/Lecture-07-16.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418
3
+ size 150440033
demo_data/lectures/Lecture-09-25.05.2023/English.vtt ADDED
@@ -0,0 +1,3039 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.721 --> 0:00:05.046
4
+ Introduction: Hey, and then welcome to today's
5
+ lecture.
6
+
7
+ 0:00:06.126 --> 0:00:13.861
8
+ What we want to do today is we will finish
9
+ with what we have done last time, so we started
10
+
11
+ 0:00:13.861 --> 0:00:22.192
12
+ looking at the neural machine translation system,
13
+ but we have had all the components of the sequence
14
+
15
+ 0:00:22.192 --> 0:00:22.787
16
+ model.
17
+
18
+ 0:00:22.722 --> 0:00:29.361
19
+ What we're still missing is the transformer-based
20
+ architecture, so mainly the self-attention.
21
+
22
+ 0:00:29.849 --> 0:00:31.958
23
+ Then we want to look at the beginning today.
24
+
25
+ 0:00:32.572 --> 0:00:39.315
26
+ And then the main part of the day's lecture
27
+ will be decoding.
28
+
29
+ 0:00:39.207 --> 0:00:43.995
30
+ That means we know how to train the model.
31
+
32
+ 0:00:44.624 --> 0:00:47.507
33
+ So decoding sewage all they can be.
34
+
35
+ 0:00:47.667 --> 0:00:53.359
36
+ Be useful that and the idea is how we find
37
+ that and what challenges are there.
38
+
39
+ 0:00:53.287 --> 0:00:59.051
40
+ Since it's autoregressive, we will see that
41
+ it's not as easy as for other tasks.
42
+
43
+ 0:00:59.359 --> 0:01:08.206
44
+ While generating the translation step by step,
45
+ we might make additional errors that lead.
46
+
47
+ 0:01:09.069 --> 0:01:15.547
48
+ Self-Attention: But let's start with self-
49
+ attention, so what we looked into was an RNN-
50
+
51
+ 0:01:15.547 --> 0:01:16.451
52
+ base model.
53
+
54
+ 0:01:16.816 --> 0:01:27.931
55
+ And then in RNN-based models you always take
56
+ the last hidden state, you take your input, you
57
+
58
+ 0:01:27.931 --> 0:01:31.513
59
+ generate a new hidden state.
60
+
61
+ 0:01:31.390 --> 0:01:35.222
62
+ This is more like a standard.
63
+
64
+ 0:01:35.675 --> 0:01:41.088
65
+ And one challenge in this is that we always
66
+ store all our history in one single hidden
67
+
68
+ 0:01:41.088 --> 0:01:41.523
69
+ stick.
70
+
71
+ 0:01:41.781 --> 0:01:50.235
72
+ We saw that this is a problem when going from
73
+ encoder to decoder, and that is why we then
74
+
75
+ 0:01:50.235 --> 0:01:58.031
76
+ introduced the attention mechanism so that
77
+ we can look back and see all the parts.
78
+
79
+ 0:01:59.579 --> 0:02:06.059
80
+ However, in the decoder we still have this
81
+ issue so we are still storing all information
82
+
83
+ 0:02:06.059 --> 0:02:12.394
84
+ in one hidden state and we might do things
85
+ like here that we start to overwrite things
86
+
87
+ 0:02:12.394 --> 0:02:13.486
88
+ and we forgot.
89
+
90
+ 0:02:14.254 --> 0:02:23.575
91
+ So the idea is, can we do something similar
92
+ which we do between encoder and decoder within
93
+
94
+ 0:02:23.575 --> 0:02:24.907
95
+ the decoder?
96
+
97
+ 0:02:26.526 --> 0:02:33.732
98
+ And the idea is each time we're generating
99
+ here a new hidden state, it will not only depend
100
+
101
+ 0:02:33.732 --> 0:02:40.780
102
+ on the previous one, but we will focus on the
103
+ whole sequence and look at different parts
104
+
105
+ 0:02:40.780 --> 0:02:46.165
106
+ as we did in attention in order to generate
107
+ our new representation.
108
+
109
+ 0:02:46.206 --> 0:02:53.903
110
+ So each time we generate a new representation
111
+ we will look into what is important now to
112
+
113
+ 0:02:53.903 --> 0:02:54.941
114
+ understand.
115
+
116
+ 0:02:55.135 --> 0:03:00.558
117
+ You may want to understand what much is important.
118
+
119
+ 0:03:00.452 --> 0:03:08.536
120
+ You might want to look to vary and to like
121
+ so that it's much about liking.
122
+
123
+ 0:03:08.808 --> 0:03:24.076
124
+ So the idea is that we are not storing everything
125
+ in one state; each time we are looking at the full sequence.
126
+
127
+ 0:03:25.125 --> 0:03:35.160
128
+ And that is achieved by no longer going really
129
+ sequential, and the hidden states here aren't dependent
130
+
131
+ 0:03:35.160 --> 0:03:37.086
132
+ on the same layer.
133
+
134
+ 0:03:36.984 --> 0:03:42.865
135
+ But instead we are always looking at the previous
136
+ layer.
137
+
138
+ 0:03:42.942 --> 0:03:45.510
139
+ We will always have more information that
140
+ we are coming.
141
+
142
+ 0:03:47.147 --> 0:03:51.572
143
+ So how does this self-attention work in detail?
144
+
145
+ 0:03:51.461 --> 0:03:56.076
146
+ So we start with our initial hidden states.
147
+
148
+ 0:03:55.964 --> 0:04:08.148
149
+ So, for example: Now where we had the three
150
+ terms already, the query, the key and the value,
151
+
152
+ 0:04:08.148 --> 0:04:12.603
153
+ it was motivated by our database.
154
+
155
+ 0:04:12.772 --> 0:04:20.746
156
+ We are comparing it to the keys to all the
157
+ other values, and then we are merging the values.
158
+
159
+ 0:04:21.321 --> 0:04:35.735
160
+ There was a difference between the decoder
161
+ and the encoder.
162
+
163
+ 0:04:35.775 --> 0:04:41.981
164
+ You can assume all the same because we are
165
+ curving ourselves.
166
+
167
+ 0:04:41.881 --> 0:04:49.490
168
+ However, we can make them different but just
169
+ learning a linear projection.
170
+
171
+ 0:04:49.529 --> 0:05:01.836
172
+ So you learn here some projection based on
173
+ what need to do in order to ask which question.
174
+
175
+ 0:05:02.062 --> 0:05:11.800
176
+ That is, the query and the key is to what
177
+ do want to compare and provide others, and
178
+
179
+ 0:05:11.800 --> 0:05:13.748
180
+ which values do.
181
+
182
+ 0:05:14.014 --> 0:05:23.017
183
+ This is not like hand defined, but learn,
184
+ so it's like three linear projections that
185
+
186
+ 0:05:23.017 --> 0:05:26.618
187
+ you apply on all of these hidden.
188
+
189
+ 0:05:26.512 --> 0:05:32.340
190
+ That is the first thing based on your initial
191
+ hidden.
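A minimal sketch of this query/key/value self-attention step, assuming PyTorch; dimensions are illustrative:

```python
# Three learned linear projections give queries, keys and values, and each position
# attends to all positions of the previous layer.
import torch
import torch.nn as nn

d = 64
W_q, W_k, W_v = nn.Linear(d, d), nn.Linear(d, d), nn.Linear(d, d)

def self_attention(X):                       # X: (seq_len, d), one layer's hidden states
    Q, K, V = W_q(X), W_k(X), W_v(X)
    scores = Q @ K.T / d ** 0.5              # compare every query with every key
    weights = torch.softmax(scores, dim=-1)  # normalized importance of each position
    return weights @ V                       # weighted sum of the values

H = self_attention(torch.randn(5, d))        # all positions are computed in parallel
```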
192
+
193
+ 0:05:32.612 --> 0:05:37.249
194
+ And now you can do exactly as before, you
195
+ can do the attention.
196
+
197
+ 0:05:37.637 --> 0:05:40.023
198
+ How did the attention work?
199
+
200
+ 0:05:39.937 --> 0:05:45.391
201
+ The first thing is we are comparing our query
202
+ to all the keys.
203
+
204
+ 0:05:45.445 --> 0:05:52.713
205
+ And that is now the difference before the
206
+ query was from the decoder, the keys were
207
+
208
+ 0:05:52.713 --> 0:05:54.253
209
+ from the encoder.
210
+
211
+ 0:05:54.167 --> 0:06:02.548
212
+ Now it's like all from the same, so we started
213
+ the first in state to the keys of all the others.
214
+
215
+ 0:06:02.582 --> 0:06:06.217
216
+ We're learning some value here.
217
+
218
+ 0:06:06.104 --> 0:06:12.808
219
+ How important are these information to better
220
+ understand?
221
+
222
+ 0:06:13.974 --> 0:06:19.103
223
+ And these are just like floating point numbers.
224
+
225
+ 0:06:18.996 --> 0:06:21.673
226
+ They are normalized so.
227
+
228
+ 0:06:22.762 --> 0:06:30.160
229
+ And that is the first step, so let's go first
230
+ for the first curve.
231
+
232
+ 0:06:30.470 --> 0:06:41.937
233
+ What we can then do is multiply each value
234
+ as we have done before with the importance
235
+
236
+ 0:06:41.937 --> 0:06:43.937
237
+ of each state.
238
+
239
+ 0:06:45.145 --> 0:06:47.686
240
+ And then we have in here the new hidden state.
241
+
242
+ 0:06:48.308 --> 0:06:57.862
243
+ See now this new hidden status is depending
244
+ on all the hidden state of all the sequences
245
+
246
+ 0:06:57.862 --> 0:06:59.686
247
+ of the previous.
248
+
249
+ 0:06:59.879 --> 0:07:01.739
250
+ One important thing.
251
+
252
+ 0:07:01.651 --> 0:07:08.738
253
+ This one doesn't really depend, so the hidden
254
+ states here don't depend on the.
255
+
256
+ 0:07:09.029 --> 0:07:15.000
257
+ So it only depends on the hidden state of
258
+ the previous layer, but it depends on all the
259
+
260
+ 0:07:15.000 --> 0:07:18.664
261
+ hidden states, and that is of course a big
262
+ advantage.
263
+
264
+ 0:07:18.596 --> 0:07:25.095
265
+ So on the one hand information can directly
266
+ flow from each hidden state before the information
267
+
268
+ 0:07:25.095 --> 0:07:27.215
269
+ flow was always a bit limited.
270
+
271
+ 0:07:28.828 --> 0:07:35.100
272
+ And the independence is important so we can
273
+ calculate all these in the states in parallel.
274
+
275
+ 0:07:35.031 --> 0:07:41.339
276
+ That's another big advantage of self attention
277
+ that we can calculate all the hidden states
278
+
279
+ 0:07:41.339 --> 0:07:46.816
280
+ in one layer in parallel and therefore it's
281
+ the ad designed for GPUs and fast.
282
+
283
+ 0:07:47.587 --> 0:07:50.235
284
+ Then we can do the same thing for the second
285
+ in the state.
286
+
287
+ 0:07:50.530 --> 0:08:06.866
288
+ And the only difference here is how we calculate
289
+ what is occurring.
290
+
291
+ 0:08:07.227 --> 0:08:15.733
292
+ Getting these values is different because
293
+ we use the different query and then getting
294
+
295
+ 0:08:15.733 --> 0:08:17.316
296
+ our new hidden.
297
+
298
+ 0:08:18.258 --> 0:08:26.036
299
+ Yes, this is the word of words that underneath
300
+ this case might, but this is simple.
301
+
302
+ 0:08:25.943 --> 0:08:26.522
303
+ Not.
304
+
305
+ 0:08:27.127 --> 0:08:33.359
306
+ That's a very good question that is like on
307
+ the initial thing.
308
+
309
+ 0:08:33.260 --> 0:08:38.452
310
+ That is exactly not one of you in the architecture.
311
+
312
+ 0:08:38.352 --> 0:08:44.045
313
+ Maybe first you would think of a very big
314
+ disadvantage.
315
+
316
+ 0:08:44.384 --> 0:08:49.804
317
+ So this hidden state would be the same if
318
+ the movie would be different.
319
+
320
+ 0:08:50.650 --> 0:08:59.983
321
+ And of course this estate is a site someone
322
+ should like, so if the estate would be here
323
+
324
+ 0:08:59.983 --> 0:09:06.452
325
+ except for this correspondence the word order
326
+ is completely.
327
+
328
+ 0:09:06.706 --> 0:09:17.133
329
+ Therefore, just doing self attention wouldn't
330
+ work at all because we know word order is important
331
+
332
+ 0:09:17.133 --> 0:09:21.707
333
+ and there is a complete different meaning.
334
+
335
+ 0:09:22.262 --> 0:09:26.277
336
+ We introduce the word position again.
337
+
338
+ 0:09:26.171 --> 0:09:33.040
339
+ The main idea is if the position is already
340
+ in your embeddings.
341
+
342
+ 0:09:33.533 --> 0:09:39.296
343
+ Then of course the position is there and you
344
+ don't lose it anymore.
345
+
346
+ 0:09:39.211 --> 0:09:46.908
347
+ So mainly if your life representation here
348
+ encodes at the second position and your output
349
+
350
+ 0:09:46.908 --> 0:09:48.533
351
+ will be different.
352
+
353
+ 0:09:49.049 --> 0:09:54.585
354
+ And that's how you encode it, but that's essential
355
+ in order to get this work.
356
+
357
+ 0:09:57.137 --> 0:10:06.015
358
+ Multi-head Attention: But before we are coming
359
+ to the next slide, one other thing that is
360
+
361
+ 0:10:06.015 --> 0:10:10.050
362
+ typically done is multi-head attention.
363
+
364
+ 0:10:10.430 --> 0:10:15.662
365
+ And it might be that in order to understand
366
+ much, it might be good that in some way we
367
+
368
+ 0:10:15.662 --> 0:10:19.872
369
+ focus on life, and in some way we can focus
370
+ on vary, but not equally.
371
+
372
+ 0:10:19.812 --> 0:10:25.346
373
+ But maybe it's like to understand again on
374
+ different dimensions we should look into these.
375
+
376
+ 0:10:25.905 --> 0:10:31.393
377
+ And therefore what we're doing is we're just
378
+ doing the self attention at once, but we're
379
+
380
+ 0:10:31.393 --> 0:10:35.031
381
+ doing it end times or based on your multi head
382
+ attentions.
383
+
384
+ 0:10:34.970 --> 0:10:43.517
385
+ So in typical examples, the number of heads
386
+ people are talking about is like: So you're
387
+
388
+ 0:10:43.517 --> 0:10:50.607
389
+ doing this process and have different queries
390
+ and keys so you can focus.
391
+
392
+ 0:10:50.790 --> 0:10:52.887
393
+ How can you generate eight different?
394
+
395
+ 0:10:53.593 --> 0:11:07.595
396
+ Things it's quite easy here, so instead of
397
+ having one linear projection you can have eight
398
+
399
+ 0:11:07.595 --> 0:11:09.326
400
+ different.
401
+
402
+ 0:11:09.569 --> 0:11:13.844
403
+ And it might be that sometimes you're looking
404
+ more into one thing, and sometimes you're Looking
405
+
406
+ 0:11:13.844 --> 0:11:14.779
407
+ more into the other.
408
+
409
+ 0:11:15.055 --> 0:11:24.751
410
+ So that's of course nice with this type of
411
+ learned approach because we can automatically
412
+
413
+ 0:11:24.751 --> 0:11:25.514
414
+ learn.
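The multi-head version can be sketched with PyTorch's built-in module; eight heads means eight sets of learned query/key/value projections (sizes are illustrative):

```python
import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
X = torch.randn(1, 5, 64)            # (batch, seq_len, dim)
out, attn_weights = mha(X, X, X)     # queries, keys and values all come from X
```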
415
+
416
+ 0:11:29.529 --> 0:11:36.629
417
+ And what you correctly said is its positional
418
+ independence, so it doesn't really matter the
419
+
420
+ 0:11:36.629 --> 0:11:39.176
421
+ order which should be important.
422
+
423
+ 0:11:39.379 --> 0:11:47.686
424
+ So how can we do that and the idea is we are
425
+ just encoding it directly into the embedding
426
+
427
+ 0:11:47.686 --> 0:11:52.024
428
+ so into the starting so that a representation.
429
+
430
+ 0:11:52.512 --> 0:11:55.873
431
+ How do we get that so we started with our
432
+ embeddings?
433
+
434
+ 0:11:55.810 --> 0:11:58.302
435
+ Just imagine this is embedding of eye.
436
+
437
+ 0:11:59.259 --> 0:12:06.169
438
+ And then we are having additionally this positional
439
+ encoding.
440
+
441
+ 0:12:06.057 --> 0:12:10.184
442
+ In this position, encoding is just.
443
+
444
+ 0:12:10.670 --> 0:12:19.564
445
+ With different wavelength, so with different
446
+ lengths of your signal as you see here.
447
+
448
+ 0:12:20.160 --> 0:12:37.531
449
+ And the number of functions you have is exactly
450
+ the number of dimensions you have in your embedding.
451
+
452
+ 0:12:38.118 --> 0:12:51.091
453
+ And what will then do is take the first one,
454
+ and based on your position you multiply your
455
+
456
+ 0:12:51.091 --> 0:12:51.955
457
+ word.
458
+
459
+ 0:12:52.212 --> 0:13:02.518
460
+ And you see now if you put it in this position,
461
+ of course it will get a different value.
462
+
463
+ 0:13:03.003 --> 0:13:12.347
464
+ And thereby in each position a different function
465
+ is multiplied.
466
+
467
+ 0:13:12.203 --> 0:13:19.826
468
+ This is a representation for at the first
469
+ position.
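A sketch of such sinusoidal position functions, following the standard transformer formulation (an assumption; the lecture slide may differ in detail):

```python
# One sine/cosine function per embedding dimension, each with a different wavelength;
# the value at a given position is combined with the word embedding.
import numpy as np

def positional_encoding(max_len, d_model):
    pos = np.arange(max_len)[:, None]                       # positions 0 .. max_len-1
    i = np.arange(d_model)[None, :]                         # embedding dimensions
    angle = pos / np.power(10000, (2 * (i // 2)) / d_model)
    return np.where(i % 2 == 0, np.sin(angle), np.cos(angle))  # (max_len, d_model)

print(positional_encoding(4, 8).shape)
```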
470
+
471
+ 0:13:20.020 --> 0:13:34.922
472
+ If you have it in the input already encoded
473
+ then of course the model is able to keep the
474
+
475
+ 0:13:34.922 --> 0:13:38.605
476
+ position information.
477
+
478
+ 0:13:38.758 --> 0:13:48.045
479
+ But your embeddings can also learn your embeddings
480
+ in a way that they are optimal collaborating
481
+
482
+ 0:13:48.045 --> 0:13:49.786
483
+ with these types.
484
+
485
+ 0:13:51.451 --> 0:13:59.351
486
+ Is that somehow clear where he is there?
487
+
488
+ 0:14:06.006 --> 0:14:13.630
489
+ Am the first position and second position?
490
+
491
+ 0:14:16.576 --> 0:14:17.697
492
+ Have a long wait period.
493
+
494
+ 0:14:17.652 --> 0:14:19.625
495
+ I'm not going to tell you how to turn the.
496
+
497
+ 0:14:21.441 --> 0:14:26.927
498
+ Be completely issued because if you have a
499
+ very short wavelength there might be quite
500
+
501
+ 0:14:26.927 --> 0:14:28.011
502
+ big differences.
503
+
504
+ 0:14:28.308 --> 0:14:33.577
505
+ And it might also be that then it depends,
506
+ of course, like what type of world embedding
507
+
508
+ 0:14:33.577 --> 0:14:34.834
509
+ you've learned like.
510
+
511
+ 0:14:34.774 --> 0:14:37.541
512
+ Is the dimension where you have long changes?
513
+
514
+ 0:14:37.481 --> 0:14:43.048
515
+ Is the report for your embedding or not so
516
+ that's what I mean so that the model can somehow
517
+
518
+ 0:14:43.048 --> 0:14:47.708
519
+ learn that by putting more information into
520
+ one of the embedding dimensions?
521
+
522
+ 0:14:48.128 --> 0:14:54.560
523
+ So incorporated and would assume it's learning
524
+ it a bit haven't seen.
525
+
526
+ 0:14:54.468 --> 0:14:57.412
527
+ Details studied how different.
528
+
529
+ 0:14:58.078 --> 0:15:07.863
530
+ It's also a bit difficult because really measuring
531
+ how similar or different a world isn't that
532
+
533
+ 0:15:07.863 --> 0:15:08.480
534
+ easy.
535
+
536
+ 0:15:08.377 --> 0:15:13.118
537
+ You can do, of course, the average distance.
538
+
539
+ 0:15:14.114 --> 0:15:21.393
540
+ Them, so are the weight tags not at model
541
+ two, or is there fixed weight tags that the
542
+
543
+ 0:15:21.393 --> 0:15:21.986
544
+ model.
545
+
546
+ 0:15:24.164 --> 0:15:30.165
547
+ To believe they are fixed and the mono learns
548
+ there's a different way of doing it.
549
+
550
+ 0:15:30.093 --> 0:15:32.987
551
+ The other thing you can do is you can.
552
+
553
+ 0:15:33.213 --> 0:15:36.945
554
+ So you can learn the second embedding which
555
+ says this is position one.
556
+
557
+ 0:15:36.893 --> 0:15:38.581
558
+ This is position two and so on.
559
+
560
+ 0:15:38.529 --> 0:15:42.502
561
+ Like for words you could learn fixed embeddings
562
+ and then add them upwards.
563
+
564
+ 0:15:42.449 --> 0:15:45.008
565
+ So then it would have the same thing it's
566
+ done.
567
+
568
+ 0:15:44.955 --> 0:15:46.836
569
+ There is one disadvantage of this.
570
+
571
+ 0:15:46.782 --> 0:15:51.405
572
+ There is anybody an idea what could be the
573
+ disadvantage of a more learned embedding.
574
+
575
+ 0:15:54.955 --> 0:16:00.000
576
+ Here maybe extra play this finger and ethnic
577
+ stuff that will be an art.
578
+
579
+ 0:15:59.929 --> 0:16:01.754
580
+ This will be an art for.
581
+
582
+ 0:16:02.502 --> 0:16:08.323
583
+ You would only be good at positions you have
584
+ seen often and especially for long sequences.
585
+
586
+ 0:16:08.259 --> 0:16:13.990
587
+ You might have seen the positions very rarely
588
+ and then normally not performing that well
589
+
590
+ 0:16:13.990 --> 0:16:17.982
591
+ while here it can better learn a more general
592
+ representation.
593
+
594
+ 0:16:18.298 --> 0:16:22.522
595
+ So that is another thing which we won't discuss
596
+ here.
597
+
598
+ 0:16:22.444 --> 0:16:25.965
599
+ Guess is what is called relative attention.
600
+
601
+ 0:16:25.945 --> 0:16:32.570
602
+ And in this case you don't learn absolute
603
+ positions, but in your calculation of the similarity
604
+
605
+ 0:16:32.570 --> 0:16:39.194
606
+ you take again the relative distance into account
607
+ and have a different similarity depending on
608
+
609
+ 0:16:39.194 --> 0:16:40.449
610
+ how far they are.
611
+
612
+ 0:16:40.660 --> 0:16:45.898
613
+ And then you don't need to encode it beforehand,
614
+ but you would more happen within your comparison.
615
+
616
+ 0:16:46.186 --> 0:16:53.471
617
+ So when you compare how similar things you
618
+ print, of course also take the relative position.
619
+
620
+ 0:16:55.715 --> 0:17:03.187
621
+ Because there are multiple ways to use the
622
+ one, to multiply all the embedding, or to use
623
+
624
+ 0:17:03.187 --> 0:17:03.607
625
+ all.
626
+
627
+ 0:17:17.557 --> 0:17:21.931
628
+ The encoder can be bidirectional.
629
+
630
+ 0:17:21.802 --> 0:17:30.681
631
+ We have everything from the beginning so we
632
+ can have a model where.
633
+
634
+ 0:17:31.111 --> 0:17:36.455
635
+ Decoder training of course has also everything
636
+ available but during inference you always have
637
+
638
+ 0:17:36.455 --> 0:17:41.628
639
+ only the past available so you can only look
640
+ into the previous one and not into the future
641
+
642
+ 0:17:41.628 --> 0:17:46.062
643
+ because if you generate word by word you don't
644
+ know what it will be there in.
645
+
646
+ 0:17:46.866 --> 0:17:53.180
647
+ And so we also have to consider this somehow
648
+ in the attention, and until now we look more
649
+
650
+ 0:17:53.180 --> 0:17:54.653
651
+ at the ecoder style.
652
+
653
+ 0:17:54.583 --> 0:17:58.613
654
+ So if you look at this type of model, it's
655
+ by direction.
656
+
657
+ 0:17:58.542 --> 0:18:03.775
658
+ So for this hill state we are looking into
659
+ the past and into the future.
660
+
661
+ 0:18:04.404 --> 0:18:14.436
662
+ So the question is, can we have to do this
663
+ like unidirectional so that you only look into
664
+
665
+ 0:18:14.436 --> 0:18:15.551
666
+ the past?
667
+
668
+ 0:18:15.439 --> 0:18:22.575
669
+ And the nice thing is, this is even easier
670
+ than for RNNs.
671
+
672
+ 0:18:23.123 --> 0:18:29.738
673
+ So we would have different types of parameters
674
+ and models because you have a forward direction.
675
+
676
+ 0:18:31.211 --> 0:18:35.679
677
+ For attention, that is very simple.
678
+
679
+ 0:18:35.555 --> 0:18:39.326
680
+ We are doing what is masking.
681
+
682
+ 0:18:39.200 --> 0:18:45.613
683
+ If you want to have a backward model, these
684
+ ones.
685
+
686
+ 0:18:45.845 --> 0:18:54.355
687
+ So on the first hit stage it's been over,
688
+ so it's maybe only looking at itself.
689
+
690
+ 0:18:54.894 --> 0:19:05.310
691
+ By the second it looks on the second and the
692
+ third, so you're always masking out all values
693
+
694
+ 0:19:05.310 --> 0:19:07.085
695
+ in the future.
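A minimal sketch of this masking, assuming PyTorch: future positions get a score of minus infinity before the softmax, so each state only attends to itself and the past:

```python
import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)                       # query-key similarities
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
weights = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(weights[0])                                            # first position attends only to itself
```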
696
+
697
+ 0:19:07.507 --> 0:19:13.318
698
+ And thereby you can have with the same parameters
699
+ the same model.
700
+
701
+ 0:19:13.230 --> 0:19:15.786
702
+ You can have then a unique.
703
+
704
+ 0:19:16.156 --> 0:19:29.895
705
+ In the decoder you do the masked self attention
706
+ where you only look into the past and you don't
707
+
708
+ 0:19:29.895 --> 0:19:30.753
709
+ look.
710
+
711
+ 0:19:32.212 --> 0:19:36.400
712
+ Then we only have, of course, looked onto
713
+ itself.
714
+
715
+ 0:19:36.616 --> 0:19:50.903
716
+ So the question: How can we combine forward
717
+ and decoder and then we can do a decoder and
718
+
719
+ 0:19:50.903 --> 0:19:54.114
720
+ just have a second?
721
+
722
+ 0:19:54.374 --> 0:20:00.286
723
+ And then we're doing the cross attention which
724
+ attends from the decoder to the encoder.
725
+
726
+ 0:20:00.540 --> 0:20:11.264
727
+ So in this time it's again that the queries
728
+ are the current state of the decoder, while the keys
729
+
730
+ 0:20:11.264 --> 0:20:22.821
731
+ are from the encoder: You can attend to yourself to get the
732
+ meaning on the target side and to get the meaning.
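A sketch of this cross-attention call, assuming PyTorch's multi-head attention module; shapes are illustrative:

```python
# Queries come from the decoder states, keys and values from the encoder output.
import torch
import torch.nn as nn

cross_attn = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
encoder_out = torch.randn(1, 7, 64)       # source-side hidden states
decoder_state = torch.randn(1, 3, 64)     # target-side hidden states so far
ctx, _ = cross_attn(query=decoder_state, key=encoder_out, value=encoder_out)
```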
733
+
734
+ 0:20:23.423 --> 0:20:25.928
735
+ So see then the full picture.
736
+
737
+ 0:20:25.844 --> 0:20:32.997
738
+ This is now the typical picture of the transformer
739
+ and where you use self attention.
740
+
741
+ 0:20:32.913 --> 0:20:36.702
742
+ So what you have first is your word embeddings.
743
+
744
+ 0:20:37.217 --> 0:20:43.408
745
+ What you then apply is here the positional
746
+ encoding. We are then doing the self attention
747
+
748
+ 0:20:43.408 --> 0:20:46.731
749
+ to all the others, and this can be bi-directional.
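One common choice for this positional encoding is the sinusoidal variant from the original Transformer paper; a small illustrative sketch (assumes an even model dimension):

```python
import numpy as np

def sinusoidal_positions(seq_len, d_model):
    """Positional encodings that are simply added to the word embeddings."""
    pos = np.arange(seq_len)[:, None]                  # (seq_len, 1)
    i = np.arange(d_model // 2)[None, :]               # (1, d_model/2)
    angles = pos / np.power(10000.0, 2 * i / d_model)  # a different frequency per dimension
    enc = np.zeros((seq_len, d_model))
    enc[:, 0::2] = np.sin(angles)                      # even dimensions
    enc[:, 1::2] = np.cos(angles)                      # odd dimensions
    return enc
```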
750
+
751
+ 0:20:47.707 --> 0:20:54.918
752
+ You normally do another feed forward layer
753
+ just like to make things to learn additional
754
+
755
+ 0:20:54.918 --> 0:20:55.574
756
+ things.
757
+
758
+ 0:20:55.492 --> 0:21:02.755
759
+ You're just having also a feed forward layer
760
+ which takes your hidden state and generates
761
+
762
+ 0:21:02.755 --> 0:21:07.129
763
+ a new hidden state because we are making things
764
+ deeper.
765
+
766
+ 0:21:07.747 --> 0:21:15.648
767
+ Then this blue part you can stack over several
768
+ times so you can have layers so that.
769
+
770
+ 0:21:16.336 --> 0:21:30.256
771
+ In addition, there are these residual connections, the blue arrows; we talked
772
+ about this for RNNs: if you are now back
773
+
774
+ 0:21:30.256 --> 0:21:35.883
775
+ propagating your error from the top.
776
+
777
+ 0:21:36.436 --> 0:21:48.578
778
+ In order to prevent that, each layer is not really
779
+ learning how to fully transform the input; instead
780
+
781
+ 0:21:48.578 --> 0:21:51.230
782
+ it only learns what has to change.
783
+
784
+ 0:21:51.671 --> 0:22:00.597
785
+ You're calculating what should be changed
786
+ with this one.
787
+
788
+ 0:22:00.440 --> 0:22:09.368
789
+ The backward pass can skip each layer and the learning
790
+ is easier.
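A tiny sketch of such a residual (skip) connection around one sublayer; layer normalization is left out to keep it minimal:

```python
def residual_sublayer(x, sublayer):
    """Residual connection: the sublayer only has to learn what should change.

    x: the incoming hidden states; sublayer: e.g. self-attention or the feed-forward block.
    The identity path also lets gradients flow directly to lower layers.
    """
    return x + sublayer(x)
```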
791
+
792
+ 0:22:10.750 --> 0:22:21.632
793
+ That is the encoder; before we go to the decoder,
794
+
795
+ 0:22:21.366 --> 0:22:30.663
796
+ do we have any additional questions?
797
+
798
+ 0:22:31.471 --> 0:22:33.220
799
+ That's a Very Good Point.
800
+
801
+ 0:22:33.553 --> 0:22:38.709
802
+ Yeah, you normally take always that at least
803
+ the default architecture to only look at the
804
+
805
+ 0:22:38.709 --> 0:22:38.996
806
+ top.
807
+
808
+ 0:22:40.000 --> 0:22:40.388
809
+ Of the encoder.
810
+
811
+ 0:22:40.332 --> 0:22:42.340
812
+ Of course, you can do other things.
813
+
814
+ 0:22:42.285 --> 0:22:45.040
815
+ We investigated, for example, using the lowest layer.
816
+
817
+ 0:22:44.983 --> 0:22:49.426
818
+ The decoder is looking at the lowest level
819
+ of the encoder and not of the top.
820
+
821
+ 0:22:49.749 --> 0:23:05.342
822
+ You can average or you can even learn theoretically
823
+ that what you can also do is attending to all.
824
+
825
+ 0:23:05.785 --> 0:23:11.180
826
+ Can attend to all possible layers and states.
827
+
828
+ 0:23:11.063 --> 0:23:18.337
829
+ But the default thing is that you
830
+ only have the top.
831
+
832
+ 0:23:20.580 --> 0:23:31.999
833
+ In the decoder we are firstly doing
834
+ the same positional encoding, then we're doing
835
+
836
+ 0:23:31.999 --> 0:23:36.419
837
+ self attention in the decoder side.
838
+
839
+ 0:23:37.837 --> 0:23:43.396
840
+ Of course here it's not important we're doing
841
+ the mask self attention so that we're only
842
+
843
+ 0:23:43.396 --> 0:23:45.708
844
+ attending to the past and we're not.
845
+
846
+ 0:23:47.287 --> 0:24:02.698
847
+ Here you see the difference, so in this case
848
+ the keys and values are from the encoder and
849
+
850
+ 0:24:02.698 --> 0:24:03.554
851
+ the.
852
+
853
+ 0:24:03.843 --> 0:24:12.103
854
+ You're comparing it to all the encoder hidden
855
+ states calculating the similarity and then
856
+
857
+ 0:24:12.103 --> 0:24:13.866
858
+ you do the weight.
859
+
860
+ 0:24:14.294 --> 0:24:17.236
861
+ And that is an edit to what is here.
862
+
863
+ 0:24:18.418 --> 0:24:29.778
864
+ Then you have a linear layer and again this
865
+ green one is stacked several times and then.
866
+
867
+ 0:24:32.232 --> 0:24:36.987
868
+ Question, so each code is off.
869
+
870
+ 0:24:36.834 --> 0:24:46.041
871
+ Every one of those has the last layer of thing,
872
+ so in the.
873
+
874
+ 0:24:46.246 --> 0:24:51.007
875
+ All of them attend only to the last or the top layer
876
+ of the encoder.
877
+
878
+ 0:24:57.197 --> 0:25:00.053
879
+ Designing a translation system: Good, so that
880
+ would be.
881
+
882
+ 0:25:01.501 --> 0:25:12.513
883
+ To sequence models we have looked at attention
884
+ and before we are decoding do you have any
885
+
886
+ 0:25:12.513 --> 0:25:18.020
887
+ more questions to this type of architecture.
888
+
889
+ 0:25:20.480 --> 0:25:30.049
890
+ Transformer was first used in machine translation,
891
+ but now it's a standard thing for doing nearly
892
+
893
+ 0:25:30.049 --> 0:25:32.490
894
+ any type of sequence model.
895
+
896
+ 0:25:33.013 --> 0:25:35.984
897
+ Even large language models.
898
+
899
+ 0:25:35.878 --> 0:25:38.455
900
+ They are a bit similar.
901
+
902
+ 0:25:38.347 --> 0:25:45.114
903
+ They are just throwing away the encoder and the
904
+ cross attention.
905
+
906
+ 0:25:45.505 --> 0:25:59.329
907
+ And that is maybe interesting that it's important
908
+ to have this attention because you cannot store
909
+
910
+ 0:25:59.329 --> 0:26:01.021
911
+ everything.
912
+
913
+ 0:26:01.361 --> 0:26:05.357
914
+ The interesting thing with the attention is
915
+ now we can attend to everything.
916
+
917
+ 0:26:05.745 --> 0:26:13.403
918
+ So you can again go back to your initial model
919
+ and have just a simple sequence model and then
920
+
921
+ 0:26:13.403 --> 0:26:14.055
922
+ target.
923
+
924
+ 0:26:14.694 --> 0:26:24.277
925
+ There would be a more language model style
926
+ or people call it Decoder Only model where
927
+
928
+ 0:26:24.277 --> 0:26:26.617
929
+ you throw this away.
930
+
931
+ 0:26:27.247 --> 0:26:30.327
932
+ The nice thing is because of your self attention.
933
+
934
+ 0:26:30.265 --> 0:26:34.163
935
+ You have the original problem why you introduce
936
+ the attention.
937
+
938
+ 0:26:34.101 --> 0:26:39.639
939
+ You don't have that anymore because it's not
940
+ everything is summarized, but each time you
941
+
942
+ 0:26:39.639 --> 0:26:44.866
943
+ generate, you're looking back at all the previous
944
+ words, the source and the target.
945
+
946
+ 0:26:45.805 --> 0:26:51.734
947
+ And there is a lot of work on is a really
948
+ important to have encoded a decoded model or
949
+
950
+ 0:26:51.734 --> 0:26:54.800
951
+ is a decoded only model as good if you have.
952
+
953
+ 0:26:54.732 --> 0:27:00.049
954
+ But the comparison is not that easy because
955
+ how many parameters do you have?
956
+
957
+ 0:27:00.360 --> 0:27:08.832
958
+ So think the general idea at the moment is,
959
+ at least for machine translation, it's normally
960
+
961
+ 0:27:08.832 --> 0:27:17.765
962
+ a bit better to have an encoded decoder model
963
+ and not a decoder model where you just concatenate
964
+
965
+ 0:27:17.765 --> 0:27:20.252
966
+ the source and the target.
967
+
968
+ 0:27:21.581 --> 0:27:24.073
969
+ But there is not really a big difference anymore.
970
+
971
+ 0:27:24.244 --> 0:27:29.891
972
+ Because this big issue, which we had initially
973
+ with it that everything is stored in the working
974
+
975
+ 0:27:29.891 --> 0:27:31.009
976
+ state, is nothing.
977
+
978
+ 0:27:31.211 --> 0:27:45.046
979
+ Of course, the advantage maybe here is that
980
+ you give it a bias at your same language information.
981
+
982
+ 0:27:45.285 --> 0:27:53.702
983
+ While in an encoder only model this all is
984
+ merged into one thing and sometimes it is good
985
+
986
+ 0:27:53.702 --> 0:28:02.120
987
+ to give models a bit of bias okay you should
988
+ maybe treat things separately and you should
989
+
990
+ 0:28:02.120 --> 0:28:03.617
991
+ look different.
992
+
993
+ 0:28:04.144 --> 0:28:11.612
994
+ And of course one other difference, one other
995
+ disadvantage, maybe of an encoder owning one.
996
+
997
+ 0:28:16.396 --> 0:28:19.634
998
+ Think about the source sentence and how
999
+ it's treated.
1000
+
1001
+ 0:28:21.061 --> 0:28:33.787
1002
+ In this architecture the encoder can look at the whole
1003
+ sentence for every state, and that causes a little
1004
+
1005
+ 0:28:33.787 --> 0:28:35.563
1006
+ difference.
1007
+
1008
+ 0:28:35.475 --> 0:28:43.178
1009
+ If you only have a decoder that has to be
1010
+ unidirectional because for the decoder side
1011
+
1012
+ 0:28:43.178 --> 0:28:51.239
1013
+ for the generation you need it and so your
1014
+ input is read state by state so you don't have
1015
+
1016
+ 0:28:51.239 --> 0:28:54.463
1017
+ this bidirectional information.
1018
+
1019
+ 0:28:56.596 --> 0:29:05.551
1020
+ Again, it receives a sequence of embeddings
1021
+ with position encoding.
1022
+
1023
+ 0:29:05.419 --> 0:29:11.085
1024
+ The piece is like long vector has output.
1025
+
1026
+ 0:29:11.031 --> 0:29:17.148
1027
+ Don't understand how you can set footworks
1028
+ to this part of each other through inputs.
1029
+
1030
+ 0:29:17.097 --> 0:29:20.060
1031
+ Other than cola is the same as the food consume.
1032
+
1033
+ 0:29:21.681 --> 0:29:27.438
1034
+ Okay, it's a very good point: so this encoding
1035
+ is only done on the top layer.
1036
+
1037
+ 0:29:27.727 --> 0:29:32.012
1038
+ So this green one is only repeated.
1039
+
1040
+ 0:29:31.893 --> 0:29:38.511
1041
+ You have the word embedding or the position
1042
+ embedding.
1043
+
1044
+ 0:29:38.390 --> 0:29:42.966
1045
+ You have one layer of decoder which.
1046
+
1047
+ 0:29:43.283 --> 0:29:48.245
1048
+ Then you stack the second one, the third
1049
+ one, the fourth one, and then on the top.
1050
+
1051
+ 0:29:48.208 --> 0:29:55.188
1052
+ Layer: You put this projection layer which
1053
+ takes a one-thousand-dimensional vector and
1054
+
1055
+ 0:29:55.188 --> 0:30:02.089
1056
+ generates, based on your vocabulary of maybe
1057
+ ten thousand words, a softmax layer which gives you
1058
+
1059
+ 0:30:02.089 --> 0:30:04.442
1060
+ the probability of all words.
1061
+
1062
+ 0:30:06.066 --> 0:30:22.369
1063
+ It's a very good part part of the mass tape
1064
+ ladies, but it wouldn't be for the X-rays.
1065
+
1066
+ 0:30:22.262 --> 0:30:27.015
1067
+ Aquarium filters to be like monsoon roding
1068
+ as they get by the river.
1069
+
1070
+ 0:30:27.647 --> 0:30:33.140
1071
+ Yes, there is work on that think we will discuss
1072
+ that in the pre-trained models.
1073
+
1074
+ 0:30:33.493 --> 0:30:39.756
1075
+ It's called where you exactly do that.
1076
+
1077
+ 0:30:39.595 --> 0:30:48.591
1078
+ If you look at the attention matrix, it's like diagonal
1079
+ here.
1080
+
1081
+ 0:30:48.708 --> 0:30:53.018
1082
+ And it's a full matrix, so here everybody's
1083
+ attending to each position.
1084
+
1085
+ 0:30:52.958 --> 0:30:54.696
1086
+ Here you're only attending.
1087
+
1088
+ 0:30:54.975 --> 0:31:05.744
1089
+ Then you can do something in between, where this
1090
+ part is bidirectional but the rest is not.
1091
+
1092
+ 0:31:06.166 --> 0:31:13.961
1093
+ So you have a bit more that is possible, and
1094
+ we'll have that in the lecture on pre-train
1095
+
1096
+ 0:31:13.961 --> 0:31:14.662
1097
+ models.
1098
+
1099
+ 0:31:18.478 --> 0:31:27.440
1100
+ So we now know how to build a translation
1101
+ system, but of course we don't want to have
1102
+
1103
+ 0:31:27.440 --> 0:31:30.774
1104
+ a translation system by itself.
1105
+
1106
+ 0:31:31.251 --> 0:31:40.037
1107
+ Now given this model an input sentence, how
1108
+ can we generate an output?
1109
+
1110
+ 0:31:39.921 --> 0:31:49.455
1111
+ The general idea is still: So what we really
1112
+ want to do is we start with the model.
1113
+
1114
+ 0:31:49.342 --> 0:31:53.894
1115
+ We generate different possible translations.
1116
+
1117
+ 0:31:54.014 --> 0:31:59.754
1118
+ We score them with the log probability that we're
1119
+ getting, so for each input and output pair
1120
+
1121
+ 0:31:59.754 --> 0:32:05.430
1122
+ we can calculate the log probability, which
1123
+ is a product of all probabilities for each
1124
+
1125
+ 0:32:05.430 --> 0:32:09.493
1126
+ word in there, and then we can find what is
1127
+ the most probable.
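In code, scoring one candidate is just summing per-word log-probabilities (a product of probabilities in log space); a small illustrative sketch:

```python
import math

def sequence_log_prob(step_distributions, output_ids):
    """step_distributions[t] is the model's distribution over the vocabulary at step t,
    output_ids[t] is the word chosen at that step; summing logs avoids underflow."""
    return sum(math.log(step_distributions[t][w]) for t, w in enumerate(output_ids))
```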
1128
+
1129
+ 0:32:09.949 --> 0:32:15.410
1130
+ However, that's a bit complicated we will
1131
+ see because we can't look at all possible translations.
1132
+
1133
+ 0:32:15.795 --> 0:32:28.842
1134
+ So there is infinite or a number of possible
1135
+ translations, so we have to do it somehow in
1136
+
1137
+ 0:32:28.842 --> 0:32:31.596
1138
+ a more intelligent way.
1139
+
1140
+ 0:32:32.872 --> 0:32:37.821
1141
+ So what we want to do today in the rest of
1142
+ the lecture?
1143
+
1144
+ 0:32:37.732 --> 0:32:40.238
1145
+ What is the search problem?
1146
+
1147
+ 0:32:40.149 --> 0:32:44.716
1148
+ Then we will look at different search algorithms.
1149
+
1150
+ 0:32:45.825 --> 0:32:56.636
1151
+ Will compare model and search errors, so there
1152
+ can be errors on the model where the model
1153
+
1154
+ 0:32:56.636 --> 0:33:03.483
1155
+ is not giving the highest score to the best
1156
+ translation.
1157
+
1158
+ 0:33:03.903 --> 0:33:21.069
1159
+ This is always like searching the best translation
1160
+ out of one model, which is often also interesting.
1161
+
1162
+ 0:33:24.004 --> 0:33:29.570
1163
+ And how do we do the search?
1164
+
1165
+ 0:33:29.378 --> 0:33:41.856
1166
+ We want to find the translation where the
1167
+ error compared to the reference is minimal.
1168
+
1169
+ 0:33:42.042 --> 0:33:44.041
1170
+ The nice thing is: in SMT
1171
+
1172
+ 0:33:43.964 --> 0:33:51.310
1173
+ it wasn't the case, but in neural machine translation
1174
+ we can generate any possible translation, so
1175
+
1176
+ 0:33:51.310 --> 0:33:53.785
1177
+ at least within our vocabulary.
1178
+
1179
+ 0:33:53.707 --> 0:33:58.116
1180
+ But if we have BPE we can really generate
1181
+ any possible.
1182
+
1183
+ 0:33:58.078 --> 0:34:04.604
1184
+ Translation, and in theory we could always minimize
1185
+ that, but yeah, we can't do it that easy because
1186
+
1187
+ 0:34:04.604 --> 0:34:07.734
1188
+ of course we don't have the reference at hand.
1189
+
1190
+ 0:34:07.747 --> 0:34:10.384
1191
+ If it has a reference, it's not a problem.
1192
+
1193
+ 0:34:10.322 --> 0:34:13.696
1194
+ We know what we are searching for, but we
1195
+ don't know.
1196
+
1197
+ 0:34:14.054 --> 0:34:23.886
1198
+ So how can we then model this by just finding
1199
+ the translation with the highest probability?
1200
+
1201
+ 0:34:23.779 --> 0:34:29.018
1202
+ Looking at it, we want to find the translation.
1203
+
1204
+ 0:34:29.169 --> 0:34:32.525
1205
+ Idea is our model is a good approximation.
1206
+
1207
+ 0:34:32.447 --> 0:34:34.333
1208
+ That's how we train it.
1209
+
1210
+ 0:34:34.254 --> 0:34:36.471
1211
+ What is a good translation?
1212
+
1213
+ 0:34:36.391 --> 0:34:43.665
1214
+ And if we find translation with the highest
1215
+ probability, this should also give us the best
1216
+
1217
+ 0:34:43.665 --> 0:34:44.704
1218
+ translation.
1219
+
1220
+ 0:34:45.265 --> 0:34:56.965
1221
+ And that is then, of course, the difference
1222
+ between the search error is that the model
1223
+
1224
+ 0:34:56.965 --> 0:35:02.076
1225
+ doesn't predict the best translation.
1226
+
1227
+ 0:35:02.622 --> 0:35:08.777
1228
+ How can we do the basic search first of all
1229
+ in basic search that seems to be very easy
1230
+
1231
+ 0:35:08.777 --> 0:35:15.003
1232
+ so what we can do is we can do the forward
1233
+ pass for the whole encoder and that's how it
1234
+
1235
+ 0:35:15.003 --> 0:35:21.724
1236
+ starts the input sentences known you can put
1237
+ the input sentence and calculate all your estates
1238
+
1239
+ 0:35:21.724 --> 0:35:22.573
1240
+ and hidden?
1241
+
1242
+ 0:35:23.083 --> 0:35:35.508
1243
+ Then you can put in your sentence start and
1244
+ you can generate.
1245
+
1246
+ 0:35:35.308 --> 0:35:41.728
1247
+ Here you have the probability.
1248
+
1249
+ 0:35:41.801 --> 0:35:52.624
1250
+ A good idea we would see later that as a typical
1251
+ algorithm is guess what you all would do, you
1252
+
1253
+ 0:35:52.624 --> 0:35:54.788
1254
+ would then select.
1255
+
1256
+ 0:35:55.235 --> 0:36:06.265
1257
+ So if you generate here a probability distribution
1258
+ over all the words in your vocabulary then
1259
+
1260
+ 0:36:06.265 --> 0:36:08.025
1261
+ you can solve.
1262
+
1263
+ 0:36:08.688 --> 0:36:13.147
1264
+ Yeah, this is how autocompletion is done
1265
+ in our system.
1266
+
1267
+ 0:36:14.794 --> 0:36:19.463
1268
+ Yeah, this is also why there you have to have
1269
+ a model of possible extending.
1270
+
1271
+ 0:36:19.403 --> 0:36:24.274
1272
+ It's more of a language model, but then this
1273
+ is one algorithm to do the search.
1274
+
1275
+ 0:36:24.213 --> 0:36:26.726
1276
+ They maybe have also more advanced ones.
1277
+
1278
+ 0:36:26.665 --> 0:36:32.044
1279
+ We will see that, so this search in autocompletion
1280
+ could be exactly the same as the
1281
+
1282
+ 0:36:32.044 --> 0:36:33.775
1283
+ search in machine translation.
1284
+
1285
+ 0:36:34.914 --> 0:36:40.480
1286
+ So we'll see that this is not optimal, so
1287
+ hopefully it's not that this way, but for this
1288
+
1289
+ 0:36:40.480 --> 0:36:41.043
1290
+ problem.
1291
+
1292
+ 0:36:41.941 --> 0:36:47.437
1293
+ And what you can do then you can select this
1294
+ word.
1295
+
1296
+ 0:36:47.329 --> 0:36:50.781
1297
+ This was the best translation.
1298
+
1299
+ 0:36:51.111 --> 0:36:57.675
1300
+ Because the decoder, of course, in the next
1301
+ step needs to know what is the best word
1302
+
1303
+ 0:36:57.675 --> 0:37:02.396
1304
+ here; it inputs it and generates the next probability
1305
+ distribution.
1306
+
1307
+ 0:37:03.423 --> 0:37:14.608
1308
+ And then your new distribution, and you can
1309
+ do the same thing, there's the best word there,
1310
+
1311
+ 0:37:14.608 --> 0:37:15.216
1312
+ and.
1313
+
1314
+ 0:37:15.435 --> 0:37:22.647
1315
+ So you can continue doing that and always
1316
+ get the hopefully the best translation in.
1317
+
1318
+ 0:37:23.483 --> 0:37:30.839
1319
+ The first question is, of course, how long
1320
+ are you doing it?
1321
+
1322
+ 0:37:30.718 --> 0:37:33.859
1323
+ Now we could go forever.
1324
+
1325
+ 0:37:36.476 --> 0:37:52.596
1326
+ We had this token at the input and we put
1327
+ the stop token at the output.
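Putting the greedy procedure together, a minimal sketch; `decoder_step` is a hypothetical stand-in for the model's forward pass and must return a probability distribution over the vocabulary for the next word:

```python
def greedy_decode(decoder_step, encoder_states, bos_id, eos_id, max_len=100):
    """Greedy search: always feed back the single most probable word."""
    prefix = [bos_id]
    while len(prefix) < max_len:
        probs = decoder_step(encoder_states, prefix)           # next-word distribution
        next_id = max(range(len(probs)), key=probs.__getitem__)
        prefix.append(next_id)
        if next_id == eos_id:                                  # stop once </s> is generated
            break
    return prefix[1:]
```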
1328
+
1329
+ 0:37:53.974 --> 0:38:07.217
1330
+ And this is important because if we wouldn't
1331
+ do that then we wouldn't have a good idea.
1332
+
1333
+ 0:38:10.930 --> 0:38:16.193
1334
+ So that seems to be a good idea, but is it
1335
+ really?
1336
+
1337
+ 0:38:16.090 --> 0:38:21.046
1338
+ Do we find the most probable sentence in this?
1339
+
1340
+ 0:38:23.763 --> 0:38:25.154
1341
+ Or my dear healed proverb,.
1342
+
1343
+ 0:38:27.547 --> 0:38:41.823
1344
+ We are always selecting the highest probability
1345
+ one, so it seems to be that this is a very
1346
+
1347
+ 0:38:41.823 --> 0:38:45.902
1348
+ good solution to anybody.
1349
+
1350
+ 0:38:46.406 --> 0:38:49.909
1351
+ Yes, that is actually the problem.
1352
+
1353
+ 0:38:49.809 --> 0:38:56.417
1354
+ You might do early decisions and you don't
1355
+ have the global view.
1356
+
1357
+ 0:38:56.796 --> 0:39:02.813
1358
+ And this problem happens because it is an
1359
+ outer regressive model.
1360
+
1361
+ 0:39:03.223 --> 0:39:13.275
1362
+ So it happens because yeah, the output we
1363
+ generate is the input in the next step.
1364
+
1365
+ 0:39:13.793 --> 0:39:19.493
1366
+ And this, of course, is leading to problems.
1367
+
1368
+ 0:39:19.367 --> 0:39:27.476
1369
+ If we always take the best solution, it doesn't
1370
+ mean you have the best overall sequence.
1371
+
1372
+ 0:39:27.727 --> 0:39:33.941
1373
+ It would be different if you have a problem
1374
+ where the output is not influencing your input.
1375
+
1376
+ 0:39:34.294 --> 0:39:44.079
1377
+ Then this solution will give you the best
1378
+ model, but since the output is influencing
1379
+
1380
+ 0:39:44.079 --> 0:39:47.762
1381
+ your next input and the model,.
1382
+
1383
+ 0:39:48.268 --> 0:39:51.599
1384
+ Because one question might not be why do we
1385
+ have this type of model?
1386
+
1387
+ 0:39:51.771 --> 0:39:58.946
1388
+ So why do we really need to put here in the
1389
+ last source word?
1390
+
1391
+ 0:39:58.831 --> 0:40:05.351
1392
+ You can also put in: And then always predict
1393
+ the word and the nice thing is then you wouldn't
1394
+
1395
+ 0:40:05.351 --> 0:40:11.733
1396
+ need to do beams or a difficult search because
1397
+ then the output here wouldn't influence what
1398
+
1399
+ 0:40:11.733 --> 0:40:12.982
1400
+ is inputted here.
1401
+
1402
+ 0:40:15.435 --> 0:40:20.219
1403
+ Idea whether that might not be the best idea.
1404
+
1405
+ 0:40:20.115 --> 0:40:24.590
1406
+ You'll just be translating each word and.
1407
+
1408
+ 0:40:26.626 --> 0:40:37.815
1409
+ The second one is right, yes, you're not generating
1410
+ a coherent sentence.
1411
+
1412
+ 0:40:38.058 --> 0:40:48.197
1413
+ We'll also see that later it's called non
1414
+ autoregressive translation, so there is work
1415
+
1416
+ 0:40:48.197 --> 0:40:49.223
1417
+ on that.
1418
+
1419
+ 0:40:49.529 --> 0:41:02.142
1420
+ So you might know it roughly because you know
1421
+ it's based on this hidden state, but it can
1422
+
1423
+ 0:41:02.142 --> 0:41:08.588
1424
+ be that in the end you have your probability.
1425
+
1426
+ 0:41:09.189 --> 0:41:14.633
1427
+ And then you're not modeling the dependencies
1428
+ within a work within the target sentence.
1429
+
1430
+ 0:41:14.571 --> 0:41:27.579
1431
+ For example: You can express things in German,
1432
+ then you don't know which one you really select.
1433
+
1434
+ 0:41:27.443 --> 0:41:32.159
1435
+ That influences what you later.
1436
+
1437
+ 0:41:33.393 --> 0:41:46.411
1438
+ Then you try to find a better way not only
1439
+ based on the English sentence and the words
1440
+
1441
+ 0:41:46.411 --> 0:41:48.057
1442
+ that come.
1443
+
1444
+ 0:41:49.709 --> 0:42:00.954
1445
+ Yes, that is more like a two-step decoding,
1446
+ but that is, of course, a lot more like computational.
1447
+
1448
+ 0:42:01.181 --> 0:42:15.978
1449
+ The first thing you can do, which is typically
1450
+ done, is doing not really search.
1451
+
1452
+ 0:42:16.176 --> 0:42:32.968
1453
+ So first look at what the problem of research
1454
+ is to make it a bit more clear.
1455
+
1456
+ 0:42:34.254 --> 0:42:53.163
1457
+ And now you can extend them and you can extend
1458
+ these and the joint probabilities.
1459
+
1460
+ 0:42:54.334 --> 0:42:59.063
1461
+ The other thing is the second word.
1462
+
1463
+ 0:42:58.931 --> 0:43:03.336
1464
+ You can do the second word "das".
1465
+
1466
+ 0:43:03.202 --> 0:43:07.345
1467
+ Now you see the problem here.
1468
+
1469
+ 0:43:07.707 --> 0:43:17.507
1470
+ It is true that these have the highest probability,
1471
+ but for these you have an extension.
1472
+
1473
+ 0:43:18.078 --> 0:43:31.585
1474
+ So the problem is just because in one position
1475
+ one hypothesis, so you can always call this
1476
+
1477
+ 0:43:31.585 --> 0:43:34.702
1478
+ partial translation.
1479
+
1480
+ 0:43:34.874 --> 0:43:41.269
1481
+ The blue one begin is higher, but the green
1482
+ one can be better extended and it will overtake.
1483
+
1484
+ 0:43:45.525 --> 0:43:54.672
1485
+ So the problem is if we are doing this greedy
1486
+ search is that we might not end up in really
1487
+
1488
+ 0:43:54.672 --> 0:43:55.275
1489
+ good.
1490
+
1491
+ 0:43:55.956 --> 0:44:00.916
1492
+ So the first thing we could not do is like
1493
+ yeah, we can just try.
1494
+
1495
+ 0:44:00.880 --> 0:44:06.049
1496
+ All combinations that are there, so there
1497
+ is the other direction.
1498
+
1499
+ 0:44:05.971 --> 0:44:12.988
1500
+ So if the solution to to check the first one
1501
+ is to just try all and it doesn't give us a
1502
+
1503
+ 0:44:12.988 --> 0:44:17.876
1504
+ good result, maybe what we have to do is just
1505
+ try everything.
1506
+
1507
+ 0:44:18.318 --> 0:44:23.120
1508
+ The nice thing is if we try everything, we'll
1509
+ definitely find the best translation.
1510
+
1511
+ 0:44:23.463 --> 0:44:26.094
1512
+ So we won't have a search error.
1513
+
1514
+ 0:44:26.014 --> 0:44:28.113
1515
+ We'll come to that later.
1516
+
1517
+ 0:44:28.032 --> 0:44:32.474
1518
+ The interesting thing is our translation performance.
1519
+
1520
+ 0:44:33.353 --> 0:44:37.039
1521
+ But we will definitely find the most probable
1522
+ translation.
1523
+
1524
+ 0:44:38.598 --> 0:44:44.552
1525
+ However, it's not really possible because
1526
+ the number of combinations is just too high.
1527
+
1528
+ 0:44:44.764 --> 0:44:57.127
1529
+ So the number of congregations is your vocabulary
1530
+ science times the lengths of your sentences.
1531
+
1532
+ 0:44:57.157 --> 0:45:03.665
1533
+ Ten thousand or so you can imagine that very
1534
+ soon you will have so many possibilities here
1535
+
1536
+ 0:45:03.665 --> 0:45:05.597
1537
+ that you cannot check all.
1538
+
1539
+ 0:45:06.226 --> 0:45:13.460
1540
+ So this is not really an implication or an
1541
+ algorithm that you can use for applying machine
1542
+
1543
+ 0:45:13.460 --> 0:45:14.493
1544
+ translation.
1545
+
1546
+ 0:45:15.135 --> 0:45:24.657
1547
+ So maybe we have to do something in between
1548
+ and yeah, not look at all but only look at
1549
+
1550
+ 0:45:24.657 --> 0:45:25.314
1551
+ some.
1552
+
1553
+ 0:45:26.826 --> 0:45:29.342
1554
+ And the easiest thing for that is okay.
1555
+
1556
+ 0:45:29.279 --> 0:45:34.840
1557
+ Just do sampling, so if we don't know what
1558
+ to look at, maybe it's good to randomly pick
1559
+
1560
+ 0:45:34.840 --> 0:45:35.219
1561
+ some.
1562
+
1563
+ 0:45:35.156 --> 0:45:40.572
1564
+ That's not only a very good algorithm, so
1565
+ the basic idea will always randomly select
1566
+
1567
+ 0:45:40.572 --> 0:45:42.866
1568
+ the word, of course, based on bits.
1569
+
1570
+ 0:45:43.223 --> 0:45:52.434
1571
+ We are doing that or times, and then we are
1572
+ looking which one at the end has the highest.
1573
+
1574
+ 0:45:52.672 --> 0:45:59.060
1575
+ So we are not doing anymore really searching
1576
+ for the best one, but we are more randomly
1577
+
1578
+ 0:45:59.060 --> 0:46:05.158
1579
+ doing selections with the idea that we always
1580
+ select the best one at the beginning.
1581
+
1582
+ 0:46:05.085 --> 0:46:11.758
1583
+ So maybe it's better to do random, but of
1584
+ course one important thing is how do we randomly
1585
+
1586
+ 0:46:11.758 --> 0:46:12.345
1587
+ select?
1588
+
1589
+ 0:46:12.452 --> 0:46:15.756
1590
+ If we just do uniform distribution, it would
1591
+ be very bad.
1592
+
1593
+ 0:46:15.699 --> 0:46:18.036
1594
+ You'll only have very bad translations.
1595
+
1596
+ 0:46:18.398 --> 0:46:23.261
1597
+ Because in each position if you think about
1598
+ it you have ten thousand possibilities.
1599
+
1600
+ 0:46:23.903 --> 0:46:28.729
1601
+ Most of them are really bad decisions and
1602
+ you shouldn't do that.
1603
+
1604
+ 0:46:28.655 --> 0:46:35.190
1605
+ There is always only a very small number,
1606
+ at least compared to the 10 000 translation.
1607
+
1608
+ 0:46:35.395 --> 0:46:43.826
1609
+ So if you have the sentence here, this is
1610
+ an English sentence.
1611
+
1612
+ 0:46:43.692 --> 0:46:47.846
1613
+ You can start with these and.
1614
+
1615
+ 0:46:48.408 --> 0:46:58.345
1616
+ You're thinking about setting legal documents
1617
+ in a legal document.
1618
+
1619
+ 0:46:58.197 --> 0:47:02.356
1620
+ You should not change the.
1621
+
1622
+ 0:47:03.603 --> 0:47:11.032
1623
+ The problem is we have a neural network, we
1624
+ have a black box, so it's anyway a bit random.
1625
+
1626
+ 0:47:12.092 --> 0:47:24.341
1627
+ It is considered, but you will see that if
1628
+ you make it intelligent for clear sentences,
1629
+
1630
+ 0:47:24.341 --> 0:47:26.986
1631
+ there is not that.
1632
+
1633
+ 0:47:27.787 --> 0:47:35.600
1634
+ Is an issue we should consider that this one
1635
+ might lead to more randomness, but it might
1636
+
1637
+ 0:47:35.600 --> 0:47:39.286
1638
+ also be positive for machine translation.
1639
+
1640
+ 0:47:40.080 --> 0:47:46.395
1641
+ Least can't directly think of a good implication
1642
+ where it's positive, but if you most think
1643
+
1644
+ 0:47:46.395 --> 0:47:52.778
1645
+ about dialogue systems, for example, whereas
1646
+ the similar architecture is nowadays also used,
1647
+
1648
+ 0:47:52.778 --> 0:47:55.524
1649
+ you predict what the system should say.
1650
+
1651
+ 0:47:55.695 --> 0:48:00.885
1652
+ Then you want to have randomness because it's
1653
+ not always saying the same thing.
1654
+
1655
+ 0:48:01.341 --> 0:48:08.370
1656
+ Machine translation is typically not you want
1657
+ to have consistency, so if you have the same
1658
+
1659
+ 0:48:08.370 --> 0:48:09.606
1660
+ input normally.
1661
+
1662
+ 0:48:09.889 --> 0:48:14.528
1663
+ Therefore, sampling is not the method of choice here.
1664
+
1665
+ 0:48:14.406 --> 0:48:22.565
1666
+ Preprocessing: There are some things you will
1667
+ later see as a preprocessing step.
1668
+
1669
+ 0:48:23.003 --> 0:48:27.832
1670
+ But of course it's important how you can make
1671
+ this process not too random.
1672
+
1673
+ 0:48:29.269 --> 0:48:41.619
1674
+ Therefore, the first thing is don't take a
1675
+ uniform distribution, but we have a very nice
1676
+
1677
+ 0:48:41.619 --> 0:48:43.562
1678
+ distribution.
1679
+
1680
+ 0:48:43.843 --> 0:48:46.621
1681
+ So I'm like randomly taking a word.
1682
+
1683
+ 0:48:46.544 --> 0:48:51.329
1684
+ We are looking at output distribution and
1685
+ now taking a word.
1686
+
1687
+ 0:48:51.731 --> 0:49:03.901
1688
+ So that means we are taking the word these,
1689
+ we are taking the word does, and all these.
1690
+
1691
+ 0:49:04.444 --> 0:49:06.095
1692
+ How can you do that?
1693
+
1694
+ 0:49:06.016 --> 0:49:09.950
1695
+ You randomly draw a number between zero and
1696
+ one.
1697
+
1698
+ 0:49:10.390 --> 0:49:23.686
1699
+ And then you have ordered your words in some
1700
+ way, and then you take the words before the
1701
+
1702
+ 0:49:23.686 --> 0:49:26.375
1703
+ sum of the words.
1704
+
1705
+ 0:49:26.806 --> 0:49:34.981
1706
+ So the easiest thing is you have zero point
1707
+ five, zero point two five, and zero point two
1708
+
1709
+ 0:49:34.981 --> 0:49:35.526
1710
+ five.
1711
+
1712
+ 0:49:35.435 --> 0:49:43.411
1713
+ If you have a number smaller than you take
1714
+ the first word, it takes a second word, and
1715
+
1716
+ 0:49:43.411 --> 0:49:45.336
1717
+ if it's higher than.
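A minimal sketch of this way of drawing a word: one uniform random number and the cumulative probabilities decide which word is taken (illustrative only):

```python
import random

def sample_word(probs):
    """Inverse-CDF sampling: probs = [0.5, 0.25, 0.25] means a draw below 0.5 picks
    word 0, between 0.5 and 0.75 picks word 1, and above 0.75 picks word 2."""
    r = random.random()              # uniform number between 0 and 1
    cumulative = 0.0
    for idx, p in enumerate(probs):
        cumulative += p
        if r < cumulative:
            return idx
    return len(probs) - 1            # guard against floating-point rounding
```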
1718
+
1719
+ 0:49:45.845 --> 0:49:57.707
1720
+ Therefore, you can very easily get a distribution
1721
+ distributed according to this probability mass
1722
+
1723
+ 0:49:57.707 --> 0:49:59.541
1724
+ and no longer.
1725
+
1726
+ 0:49:59.799 --> 0:50:12.479
1727
+ You can't even do that a bit more and more
1728
+ focus on the important part if we are not randomly
1729
+
1730
+ 0:50:12.479 --> 0:50:19.494
1731
+ drawing from all words, but we are looking
1732
+ only at.
1733
+
1734
+ 0:50:21.361 --> 0:50:24.278
1735
+ You have an idea why this is an important
1736
+ step.
1737
+
1738
+ 0:50:24.219 --> 0:50:29.427
1739
+ Although we say I'm only throwing away the
1740
+ words which have a very low probability, so
1741
+
1742
+ 0:50:29.427 --> 0:50:32.541
1743
+ anyway the probability of taking them is quite
1744
+ low.
1745
+
1746
+ 0:50:32.481 --> 0:50:35.236
1747
+ So normally that shouldn't matter that much.
1748
+
1749
+ 0:50:36.256 --> 0:50:38.830
1750
+ There's ten thousand words.
1751
+
1752
+ 0:50:40.300 --> 0:50:42.074
1753
+ Of course, they admire thousand nine hundred.
1754
+
1755
+ 0:50:42.035 --> 0:50:44.003
1756
+ They're going to build a good people steal
1757
+ it up.
1758
+
1759
+ 0:50:45.085 --> 0:50:47.425
1760
+ Hi, I'm Sarah Hauer and I'm Sig Hauer and
1761
+ We're Professional.
1762
+
1763
+ 0:50:47.867 --> 0:50:55.299
1764
+ Yes, that's exactly why you do this most sampling
1765
+ or so that you don't take the lowest.
1766
+
1767
+ 0:50:55.415 --> 0:50:59.694
1768
+ Probability words, but you only look at the
1769
+ most probable ones and then like.
1770
+
1771
+ 0:50:59.639 --> 0:51:04.594
1772
+ Of course you have to rescale your probability
1773
+ mass then so that it's still a probability
1774
+
1775
+ 0:51:04.594 --> 0:51:08.393
1776
+ because now it's a probability distribution
1777
+ over ten thousand words.
1778
+
1779
+ 0:51:08.338 --> 0:51:13.332
1780
+ If you only take ten of them or so it's no
1781
+ longer a probability distribution, you rescale
1782
+
1783
+ 0:51:13.332 --> 0:51:15.330
1784
+ them and you can still do that and.
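A hedged sketch of such top-k sampling: keep only the k most probable words, rescale their mass so it sums to one again, then draw (k and the plain-list representation are illustrative):

```python
import random

def top_k_sample(probs, k=10):
    """Sample only among the k most probable words, with renormalization."""
    top = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:k]
    kept_mass = sum(probs[i] for i in top)   # rescale so the kept probabilities sum to 1
    r, cumulative = random.random(), 0.0
    for i in top:
        cumulative += probs[i] / kept_mass
        if r < cumulative:
            return i
    return top[-1]
```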
1785
+
1786
+ 0:51:16.756 --> 0:51:20.095
1787
+ That is what is done assembling.
1788
+
1789
+ 0:51:19.994 --> 0:51:26.269
1790
+ It's not the most common thing, but it's done
1791
+ several times.
1792
+
1793
+ 0:51:28.088 --> 0:51:40.625
1794
+ Then there is beam search, which is somehow the standard
1795
+ if you're doing some type of machine translation.
1796
+
1797
+ 0:51:41.181 --> 0:51:50.162
1798
+ And the basic idea is that in beam search we
1799
+ select for the most probable and only continue
1800
+
1801
+ 0:51:50.162 --> 0:51:51.171
1802
+ with the.
1803
+
1804
+ 0:51:51.691 --> 0:51:53.970
1805
+ You can easily generalize this.
1806
+
1807
+ 0:51:53.899 --> 0:52:00.452
1808
+ We are not only continuing the most probable
1809
+ one, but we are continuing the most probable.
1810
+
1811
+ 0:52:00.880 --> 0:52:21.376
1812
+ The.
1813
+
1814
+ 0:52:17.697 --> 0:52:26.920
1815
+ You should say we are sampling how many examples
1816
+ it makes sense to take the one with the highest.
1817
+
1818
+ 0:52:27.127 --> 0:52:33.947
1819
+ But that is important that once you do a mistake
1820
+ you might want to not influence that much.
1821
+
1822
+ 0:52:39.899 --> 0:52:45.815
1823
+ So the idea is if we're keeping the end best
1824
+ hypotheses and not only the first fact.
1825
+
1826
+ 0:52:46.586 --> 0:52:51.558
1827
+ And the nice thing is in statistical machine
1828
+ translation.
1829
+
1830
+ 0:52:51.473 --> 0:52:54.408
1831
+ We have exactly the same problem.
1832
+
1833
+ 0:52:54.322 --> 0:52:57.635
1834
+ You would do the same thing, however.
1835
+
1836
+ 0:52:57.548 --> 0:53:03.391
1837
+ Since the model wasn't that strong you needed
1838
+ a quite large beam.
1839
+
1840
+ 0:53:03.984 --> 0:53:18.944
1841
+ Machine translation models are really strong
1842
+ and you get already a very good performance.
1843
+
1844
+ 0:53:19.899 --> 0:53:22.835
1845
+ So how does it work?
1846
+
1847
+ 0:53:22.695 --> 0:53:35.136
1848
+ It is very similar to what we had before, but now
1849
+ we are not storing only the single most probable one.
1850
+
1851
+ 0:53:36.156 --> 0:53:45.163
1852
+ Done that we extend all these hypothesis and
1853
+ of course there is now a bit difficult because
1854
+
1855
+ 0:53:45.163 --> 0:53:54.073
1856
+ now we always have to switch what is the input
1857
+ so the search gets more complicated and the
1858
+
1859
+ 0:53:54.073 --> 0:53:55.933
1860
+ first one is easy.
1861
+
1862
+ 0:53:56.276 --> 0:54:09.816
1863
+ In this case we have to once put in here these
1864
+ and then somehow delete this one and instead
1865
+
1866
+ 0:54:09.816 --> 0:54:12.759
1867
+ put that into that.
1868
+
1869
+ 0:54:13.093 --> 0:54:24.318
1870
+ Otherwise you could only store your current
1871
+ network states here and just continue by going
1872
+
1873
+ 0:54:24.318 --> 0:54:25.428
1874
+ forward.
1875
+
1876
+ 0:54:26.766 --> 0:54:34.357
1877
+ So now you have done the first two, and then
1878
+ you have known the best.
1879
+
1880
+ 0:54:34.249 --> 0:54:37.289
1881
+ Can you now just continue?
1882
+
1883
+ 0:54:39.239 --> 0:54:53.511
1884
+ Yes, that's very important, otherwise all
1885
+ your beam search doesn't really help because
1886
+
1887
+ 0:54:53.511 --> 0:54:57.120
1888
+ you would still have.
1889
+
1890
+ 0:54:57.317 --> 0:55:06.472
1891
+ So now you have to do one important step and
1892
+ then reduce again to end.
1893
+
1894
+ 0:55:06.343 --> 0:55:13.824
1895
+ So in our case to make things easier we have
1896
+ the inputs.
1897
+
1898
+ 0:55:14.014 --> 0:55:19.072
1899
+ Otherwise you will have two to the power of
1900
+ length possibilities, so it is still exponential.
1901
+
1902
+ 0:55:19.559 --> 0:55:26.637
1903
+ But by always throwing them away you keep
1904
+ your beam size fixed.
1905
+
1906
+ 0:55:26.519 --> 0:55:31.712
1907
+ The items now differ in the last position.
1908
+
1909
+ 0:55:32.492 --> 0:55:42.078
1910
+ They are completely different, but you are
1911
+ always searching what is the best one.
1912
+
1913
+ 0:55:44.564 --> 0:55:50.791
1914
+ So another way of hearing it is like this,
1915
+ so just imagine you start with the empty sentence.
1916
+
1917
+ 0:55:50.725 --> 0:55:55.266
1918
+ Then you have three possible extensions: A,
1919
+ B, and end of sentence.
1920
+
1921
+ 0:55:55.199 --> 0:55:59.207
1922
+ It's throwing away the worst one, continuing
1923
+ with the two.
1924
+
1925
+ 0:55:59.699 --> 0:56:13.136
1926
+ Then you want to stay too, so in this state
1927
+ it's either or and then you continue.
1928
+
1929
+ 0:56:13.293 --> 0:56:24.924
1930
+ So you always have this exponential growing
1931
+ tree by destroying most of them away and only
1932
+
1933
+ 0:56:24.924 --> 0:56:26.475
1934
+ continuing.
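A compact sketch of beam search as described: extend every kept hypothesis, prune back to the beam size, and set finished hypotheses aside. `decoder_step` is again a hypothetical stand-in for the model; scores are raw sums of log-probabilities, so the length bias discussed further below is still present.

```python
import math

def beam_search(decoder_step, encoder_states, bos_id, eos_id, beam_size=4, max_len=100):
    beams = [([bos_id], 0.0)]                  # (prefix, summed log-probability)
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            probs = decoder_step(encoder_states, prefix)
            best = sorted(range(len(probs)), key=probs.__getitem__, reverse=True)[:beam_size]
            for w in best:                     # extend each hypothesis with its best next words
                candidates.append((prefix + [w], score + math.log(probs[w])))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_size]:   # prune back to the beam size
            (finished if prefix[-1] == eos_id else beams).append((prefix, score))
        if not beams:                          # every surviving hypothesis has ended
            break
    finished.extend(beams)                     # fall back to unfinished ones at max_len
    return max(finished, key=lambda c: c[1])[0][1:]
```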
1935
+
1936
+ 0:56:26.806 --> 0:56:42.455
1937
+ And thereby you can hopefully do less errors
1938
+ because in these examples you always see this
1939
+
1940
+ 0:56:42.455 --> 0:56:43.315
1941
+ one.
1942
+
1943
+ 0:56:43.503 --> 0:56:47.406
1944
+ So you're preventing some errors, but of course
1945
+ it's not perfect.
1946
+
1947
+ 0:56:47.447 --> 0:56:56.829
1948
+ You can still do errors because it could be
1949
+ not the second one but the fourth one.
1950
+
1951
+ 0:56:57.017 --> 0:57:03.272
1952
+ Now just the idea is that you make yeah less
1953
+ errors and prevent that.
1954
+
1955
+ 0:57:07.667 --> 0:57:11.191
1956
+ Then the question is how much does it help?
1957
+
1958
+ 0:57:11.111 --> 0:57:14.012
1959
+ And here is some examples for that.
1960
+
1961
+ 0:57:13.932 --> 0:57:16.614
1962
+ So for SMT it was really like:
1963
+
1964
+ 0:57:16.533 --> 0:57:23.525
1965
+ Typically the larger beam you have a larger
1966
+ third space and you have a better score.
1967
+
1968
+ 0:57:23.763 --> 0:57:27.370
1969
+ So the larger you get, the bigger your beams,
1970
+ the better you will.
1971
+
1972
+ 0:57:27.317 --> 0:57:30.024
1973
+ Typically maybe use something like three hundred.
1974
+
1975
+ 0:57:30.250 --> 0:57:38.777
1976
+ And it's mainly a trade-off between quality
1977
+ and speed because the larger your beams, the
1978
+
1979
+ 0:57:38.777 --> 0:57:43.184
1980
+ more time it takes and you want to finish it.
1981
+
1982
+ 0:57:43.088 --> 0:57:49.126
1983
+ So your quality improvements are getting smaller
1984
+ and smaller.
1985
+
1986
+ 0:57:49.349 --> 0:57:57.164
1987
+ So the difference between a beam of one and
1988
+ ten is bigger than the difference between a.
1989
+
1990
+ 0:57:58.098 --> 0:58:14.203
1991
+ And the interesting thing is we're seeing
1992
+ a bit of a different view, and we're seeing
1993
+
1994
+ 0:58:14.203 --> 0:58:16.263
1995
+ typically.
1996
+
1997
+ 0:58:16.776 --> 0:58:24.376
1998
+ And then especially if you look at the green
1999
+ ones, this is unnormalized.
2000
+
2001
+ 0:58:24.272 --> 0:58:26.775
2002
+ You're seeing a sharp.
2003
+
2004
+ 0:58:27.207 --> 0:58:32.284
2005
+ So your translation quality here measured
2006
+ in blue will go down again.
2007
+
2008
+ 0:58:33.373 --> 0:58:35.663
2009
+ That is now a question.
2010
+
2011
+ 0:58:35.568 --> 0:58:37.692
2012
+ Why is that the case?
2013
+
2014
+ 0:58:37.596 --> 0:58:43.681
2015
+ Why should we are seeing more and more possible
2016
+ translations?
2017
+
2018
+ 0:58:46.226 --> 0:58:48.743
2019
+ If we have a bigger stretch and we are going.
2020
+
2021
+ 0:58:52.612 --> 0:58:56.312
2022
+ I'm going to be using my examples before we
2023
+ also look at the bar.
2024
+
2025
+ 0:58:56.656 --> 0:58:59.194
2026
+ A good idea.
2027
+
2028
+ 0:59:00.000 --> 0:59:18.521
2029
+ But it's not everything because we in the
2030
+ end always in this list we're selecting.
2031
+
2032
+ 0:59:18.538 --> 0:59:19.382
2033
+ So this is here.
2034
+
2035
+ 0:59:19.333 --> 0:59:21.172
2036
+ We don't do any regions to do that.
2037
+
2038
+ 0:59:21.601 --> 0:59:29.287
2039
+ So the probabilities at the end we always
2040
+ give out the hypothesis with the highest probabilities.
2041
+
2042
+ 0:59:30.250 --> 0:59:33.623
2043
+ That is always the case.
2044
+
2045
+ 0:59:33.488 --> 0:59:43.340
2046
+ If you have a beam of this should be a subset
2047
+ of the items you look at.
2048
+
2049
+ 0:59:44.224 --> 0:59:52.571
2050
+ So if you increase your beam size you're just
2051
+ looking at more and you're always taking the
2052
+
2053
+ 0:59:52.571 --> 0:59:54.728
2054
+ one with the highest.
2055
+
2056
+ 0:59:57.737 --> 1:00:07.014
2057
+ Maybe they are all the probability that they
2058
+ will be comparable to don't really have.
2059
+
2060
+ 1:00:08.388 --> 1:00:14.010
2061
+ But the probabilities are the same, not that
2062
+ easy.
2063
+
2064
+ 1:00:13.900 --> 1:00:23.910
2065
+ One morning maybe you will have more examples
2066
+ where we look at some stuff that's not seen
2067
+
2068
+ 1:00:23.910 --> 1:00:26.357
2069
+ in the trading space.
2070
+
2071
+ 1:00:28.428 --> 1:00:36.478
2072
+ That's mainly the answer why we give a hyperability
2073
+ math we will see, but that is first of all
2074
+
2075
+ 1:00:36.478 --> 1:00:43.087
2076
+ the biggest issues, so here is a blue score,
2077
+ so that is somewhat translation.
2078
+
2079
+ 1:00:43.883 --> 1:00:48.673
2080
+ This will go down by the probability of the
2081
+ highest one that only goes out where stays
2082
+
2083
+ 1:00:48.673 --> 1:00:49.224
2084
+ at least.
2085
+
2086
+ 1:00:49.609 --> 1:00:57.971
2087
+ The problem is if we are searching more, we
2088
+ are finding high processes which have a high
2089
+
2090
+ 1:00:57.971 --> 1:00:59.193
2091
+ translation.
2092
+
2093
+ 1:00:59.579 --> 1:01:10.375
2094
+ So we are finding these things which we wouldn't
2095
+ find and we'll see why this is happening.
2096
+
2097
+ 1:01:10.256 --> 1:01:15.716
2098
+ So somehow we are reducing our search error.
2099
+
2100
+ 1:01:16.336 --> 1:01:25.300
2101
+ However, we also have a model error and we
2102
+ don't assign the highest probability to translation
2103
+
2104
+ 1:01:25.300 --> 1:01:27.942
2105
+ quality to the really best.
2106
+
2107
+ 1:01:28.548 --> 1:01:31.460
2108
+ They don't always add up.
2109
+
2110
+ 1:01:31.348 --> 1:01:34.859
2111
+ Of course somehow they add up.
2112
+
2113
+ 1:01:34.746 --> 1:01:41.656
2114
+ If your bottle is worse then your performance
2115
+ will even go.
2116
+
2117
+ 1:01:42.202 --> 1:01:49.718
2118
+ But sometimes it's happening that by increasing
2119
+ search errors we are missing out the really
2120
+
2121
+ 1:01:49.718 --> 1:01:57.969
2122
+ bad translations which have a high probability
2123
+ and we are only finding the decently good probability
2124
+
2125
+ 1:01:57.969 --> 1:01:58.460
2126
+ mass.
2127
+
2128
+ 1:01:59.159 --> 1:02:03.859
2129
+ So they are a bit independent of each other
2130
+ and you can make those types of arrows.
2131
+
2132
+ 1:02:04.224 --> 1:02:09.858
2133
+ That's why, for example, doing exact search
2134
+ will give you the translation with the highest
2135
+
2136
+ 1:02:09.858 --> 1:02:15.245
2137
+ probability, but there has been work on it
2138
+ that you then even have a lower translation
2139
+
2140
+ 1:02:15.245 --> 1:02:21.436
2141
+ quality because then you find some random translation
2142
+ which has a very high translation probability
2143
+
2144
+ 1:02:21.436 --> 1:02:22.984
2145
+ by which I'm really bad.
2146
+
2147
+ 1:02:23.063 --> 1:02:29.036
2148
+ Because our model is not perfect and giving
2149
+ a perfect translation probability everywhere.
2150
+
2151
+ 1:02:31.431 --> 1:02:34.537
2152
+ So why is this happening?
2153
+
2154
+ 1:02:34.417 --> 1:02:42.303
2155
+ And one issue with this is the so called label
2156
+ or length bias.
2157
+
2158
+ 1:02:42.782 --> 1:02:47.115
2159
+ And we are in each step of decoding.
2160
+
2161
+ 1:02:46.998 --> 1:02:55.313
2162
+ We are modeling the probability of the next
2163
+ word given the input and.
2164
+
2165
+ 1:02:55.895 --> 1:03:06.037
2166
+ So if you have this picture, so you always
2167
+ hear you have the probability of the next word.
2168
+
2169
+ 1:03:06.446 --> 1:03:16.147
2170
+ That's that's what your modeling, and of course
2171
+ the model is not perfect.
2172
+
2173
+ 1:03:16.576 --> 1:03:22.765
2174
+ So it can be that if we at one time do a bitter
2175
+ wrong prediction not for the first one but
2176
+
2177
+ 1:03:22.765 --> 1:03:28.749
2178
+ maybe for the 5th or 6th thing, then we're
2179
+ giving it an exceptional high probability we
2180
+
2181
+ 1:03:28.749 --> 1:03:30.178
2182
+ cannot recover from.
2183
+
2184
+ 1:03:30.230 --> 1:03:34.891
2185
+ Because this high probability will stay there
2186
+ forever and we just multiply other things to
2187
+
2188
+ 1:03:34.891 --> 1:03:39.910
2189
+ it, but we cannot like later say all this probability
2190
+ was a bit too high, we shouldn't have done.
2191
+
2192
+ 1:03:41.541 --> 1:03:48.984
2193
+ And this leads to that the more the longer
2194
+ your translation is, the more often you use
2195
+
2196
+ 1:03:48.984 --> 1:03:51.637
2197
+ this probability distribution.
2198
+
2199
+ 1:03:52.112 --> 1:04:03.321
2200
+ The typical example is this one, so you have
2201
+ the probability of the translation.
2202
+
2203
+ 1:04:04.104 --> 1:04:12.608
2204
+ And this probability is quite low as you see,
2205
+ and maybe there are a lot of other things.
2206
+
2207
+ 1:04:13.053 --> 1:04:25.658
2208
+ However, it might still be overestimated that
2209
+ it's still a bit too high.
2210
+
2211
+ 1:04:26.066 --> 1:04:33.042
2212
+ The problem is if you know the project translation
2213
+ is a very long one, but probability mask gets
2214
+
2215
+ 1:04:33.042 --> 1:04:33.545
2216
+ lower.
2217
+
2218
+ 1:04:34.314 --> 1:04:45.399
2219
+ Because each time you multiply your probability
2220
+ to it, so your sequence probability gets lower
2221
+
2222
+ 1:04:45.399 --> 1:04:46.683
2223
+ and lower.
2224
+
2225
+ 1:04:48.588 --> 1:04:59.776
2226
+ And this means that at some point you might
2227
+ get over this, and it might be a lower probability.
2228
+
2229
+ 1:05:00.180 --> 1:05:09.651
2230
+ And if you then have this probability at the
2231
+ beginning away, but it wasn't your beam, then
2232
+
2233
+ 1:05:09.651 --> 1:05:14.958
2234
+ at this point you would select the empty sentence.
2235
+
2236
+ 1:05:15.535 --> 1:05:25.379
2237
+ So this has happened because this short translation
2238
+ is seen and it's not thrown away.
2239
+
2240
+ 1:05:28.268 --> 1:05:31.121
2241
+ So,.
2242
+
2243
+ 1:05:31.151 --> 1:05:41.256
2244
+ If you have a very small beam that can be prevented,
2245
+ but if you have a large beam, this one is in
2246
+
2247
+ 1:05:41.256 --> 1:05:41.986
2248
+ there.
2249
+
2250
+ 1:05:42.302 --> 1:05:52.029
2251
+ This in general seems reasonable that shorter
2252
+ translations are preferred over longer sentences
2253
+
2254
+ 1:05:52.029 --> 1:05:54.543
2255
+ because non-religious.
2256
+
2257
+ 1:05:56.376 --> 1:06:01.561
2258
+ It's a bit depending on whether the translation
2259
+ should be a bit related to your input.
2260
+
2261
+ 1:06:02.402 --> 1:06:18.053
2262
+ And since we are always multiplying things,
2263
+ the longer the sequences we are getting smaller,
2264
+
2265
+ 1:06:18.053 --> 1:06:18.726
2266
+ it.
2267
+
2268
+ 1:06:19.359 --> 1:06:29.340
2269
+ It's somewhat right for humans too, but
2270
+ the models tend to overestimate because of
2271
+
2272
+ 1:06:29.340 --> 1:06:34.388
2273
+ this, preferring short translations over long translations.
2274
+
2275
+ 1:06:35.375 --> 1:06:46.474
2276
+ Then, of course, that means that it's not
2277
+ easy to stay on a computer because eventually
2278
+
2279
+ 1:06:46.474 --> 1:06:48.114
2280
+ it suggests.
2281
+
2282
+ 1:06:51.571 --> 1:06:59.247
2283
+ First of all there is another way and that's
2284
+ typically used but you don't have to do really
2285
+
2286
+ 1:06:59.247 --> 1:07:07.089
2287
+ because this is normally not a second position
2288
+ and if it's like on the 20th position you only
2289
+
2290
+ 1:07:07.089 --> 1:07:09.592
2291
+ have to have some bean lower.
2292
+
2293
+ 1:07:10.030 --> 1:07:17.729
2294
+ But you are right because these issues get
2295
+ larger, the larger your input is, and then
2296
+
2297
+ 1:07:17.729 --> 1:07:20.235
2298
+ you might make more errors.
2299
+
2300
+ 1:07:20.146 --> 1:07:27.578
2301
+ So therefore this is true, but it's not as
2302
+ simple that this one is always in the.
2303
+
2304
+ 1:07:28.408 --> 1:07:45.430
2305
+ That the translation for it goes down with
2306
+ higher insert sizes has there been more control.
2307
+
2308
+ 1:07:47.507 --> 1:07:51.435
2309
+ In this work you see a dozen knocks.
2310
+
2311
+ 1:07:51.329 --> 1:07:52.940
2312
+ Knots go down.
2313
+
2314
+ 1:07:52.833 --> 1:08:00.249
2315
+ That's light green here, but at least you
2316
+ don't see the sharp drop.
2317
+
2318
+ 1:08:00.820 --> 1:08:07.897
2319
+ So if you do some type of normalization, at
2320
+ least you can assess this probability and limit
2321
+
2322
+ 1:08:07.897 --> 1:08:08.204
2323
+ it.
2324
+
2325
+ 1:08:15.675 --> 1:08:24.828
2326
+ There is other reasons why, like initial,
2327
+ it's not only the length, but there can be
2328
+
2329
+ 1:08:24.828 --> 1:08:26.874
2330
+ other reasons why.
2331
+
2332
+ 1:08:27.067 --> 1:08:37.316
2333
+ And if you just take it too large, you're
2334
+ looking too often at ways in between, but it's
2335
+
2336
+ 1:08:37.316 --> 1:08:40.195
2337
+ better to ignore things.
2338
+
2339
+ 1:08:41.101 --> 1:08:44.487
2340
+ But that's more a hand gravy argument.
2341
+
2342
+ 1:08:44.401 --> 1:08:47.876
2343
+ Agree so don't know if the exact word.
2344
+
2345
+ 1:08:48.648 --> 1:08:53.223
2346
+ You need to do the normalization and there
2347
+ are different ways of doing it.
2348
+
2349
+ 1:08:53.162 --> 1:08:54.142
2350
+ It's mainly OK.
2351
+
2352
+ 1:08:54.142 --> 1:08:59.410
2353
+ We're just now not taking the translation
2354
+ with the highest probability, but we during
2355
+
2356
+ 1:08:59.410 --> 1:09:04.922
2357
+ the coding have another feature saying not
2358
+ only take the one with the highest probability
2359
+
2360
+ 1:09:04.922 --> 1:09:08.169
2361
+ but also prefer translations which are a bit
2362
+ longer.
2363
+
2364
+ 1:09:08.488 --> 1:09:16.933
2365
+ You can do that different in a way to divide
2366
+ by the center length.
2367
+
2368
+ 1:09:16.807 --> 1:09:23.111
2369
+ We take not the highest but the highest average.
2370
+
2371
+ 1:09:23.563 --> 1:09:28.841
2372
+ Of course, if both are the same lengths, it
2373
+ doesn't matter if M is the same lengths in
2374
+
2375
+ 1:09:28.841 --> 1:09:34.483
2376
+ all cases, but if you compare a translation
2377
+ with seven or eight words, there is a difference
2378
+
2379
+ 1:09:34.483 --> 1:09:39.700
2380
+ if you want to have the one with the highest
2381
+ probability or with the highest average.
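A small sketch of such length normalization: compare the average log-probability per word rather than the raw sum (the exponent alpha is an illustrative knob; alpha = 1 is the plain average):

```python
def length_normalized_score(sum_log_prob, length, alpha=1.0):
    """Counteracts the bias towards short translations."""
    return sum_log_prob / (length ** alpha)

# A 7-word hypothesis with log-prob -7.7 scores -1.10 per word; an 8-word one with
# log-prob -8.4 scores -1.05 per word, so the longer one is now preferred.
short_score = length_normalized_score(-7.7, 7)
long_score = length_normalized_score(-8.4, 8)
```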
2382
+
2383
+ 1:09:41.021 --> 1:09:50.993
2384
+ So that is the first one can have some reward
2385
+ model for each word, add a bit of the score,
2386
+
2387
+ 1:09:50.993 --> 1:09:51.540
2388
+ and.
2389
+
2390
+ 1:09:51.711 --> 1:10:03.258
2391
+ And then, of course, you have to find you
2392
+ that there is also more complex ones here.
2393
+
2394
+ 1:10:03.903 --> 1:10:08.226
2395
+ So there is different ways of doing that,
2396
+ and of course that's important.
2397
+
2398
+ 1:10:08.428 --> 1:10:11.493
2399
+ But in all of that, the main idea is OK.
2400
+
2401
+ 1:10:11.493 --> 1:10:18.520
2402
+ We are like knowing of the arrow that the
2403
+ model seems to prevent or prefer short translation.
2404
+
2405
+ 1:10:18.445 --> 1:10:24.800
2406
+ We circumvent that by OK we are adding we
2407
+ are no longer searching for the best one.
2408
+
2409
+ 1:10:24.764 --> 1:10:30.071
2410
+ But we're searching for the one best one and
2411
+ some additional constraints, so mainly you
2412
+
2413
+ 1:10:30.071 --> 1:10:32.122
2414
+ are doing here during the coding.
2415
+
2416
+ 1:10:32.061 --> 1:10:37.411
2417
+ You're not completely trusting your model,
2418
+ but you're adding some buyers or constraints
2419
+
2420
+ 1:10:37.411 --> 1:10:39.600
2421
+ into what should also be fulfilled.
2422
+
2423
+ 1:10:40.000 --> 1:10:42.543
2424
+ That can be, for example, that the length
2425
+ should be recently.
2426
+
2427
+ 1:10:49.369 --> 1:10:51.071
2428
+ Any More Questions to That.
2429
+
2430
+ 1:10:56.736 --> 1:11:04.001
2431
+ Last idea which gets recently quite a bit
2432
+ more interest also is what is called minimum
2433
+
2434
+ 1:11:04.001 --> 1:11:11.682
2435
+ base risk decoding and there is maybe not the
2436
+ one correct translation but there are several
2437
+
2438
+ 1:11:11.682 --> 1:11:13.937
2439
+ good correct translations.
2440
+
2441
+ 1:11:14.294 --> 1:11:21.731
2442
+ And the idea is now we don't want to find
2443
+ the one translation, which is maybe the highest
2444
+
2445
+ 1:11:21.731 --> 1:11:22.805
2446
+ probability.
2447
+
2448
+ 1:11:23.203 --> 1:11:31.707
2449
+ Instead we are looking at all the high translation,
2450
+ all translation with high probability and then
2451
+
2452
+ 1:11:31.707 --> 1:11:39.524
2453
+ we want to take one representative out of this
2454
+ so we're just most similar to all the other
2455
+
2456
+ 1:11:39.524 --> 1:11:42.187
2457
+ high-probability translations again.
2458
+
2459
+ 1:11:43.643 --> 1:11:46.642
2460
+ So how does it work?
2461
+
2462
+ 1:11:46.499 --> 1:11:55.640
2463
+ First you could have imagined you have reference
2464
+ translations.
2465
+
2466
+ 1:11:55.996 --> 1:12:13.017
2467
+ You have a set of reference translations and
2468
+ then what you want to get is you want to have.
2469
+
2470
+ 1:12:13.073 --> 1:12:28.641
2471
+ As a probability distribution you measure
2472
+ the similarity of reference and the hypothesis.
2473
+
2474
+ 1:12:28.748 --> 1:12:31.408
2475
+ So you have two sets of translation.
2476
+
2477
+ 1:12:31.336 --> 1:12:34.788
2478
+ You have the human translations of a sentence.
2479
+
2480
+ 1:12:35.675 --> 1:12:39.251
2481
+ That's of course not realistic, but first
2482
+ from the idea.
2483
+
2484
+ 1:12:39.188 --> 1:12:42.326
2485
+ Then you have your set of possible translations.
2486
+
2487
+ 1:12:42.622 --> 1:12:52.994
2488
+ And now you're not saying okay, we have only
2489
+ one human, but we have several humans with
2490
+
2491
+ 1:12:52.994 --> 1:12:56.294
2492
+ different types of quality.
2493
+
2494
+ 1:12:56.796 --> 1:13:07.798
2495
+ You have to have two metrics here, the similarity
2496
+ between the automatic translation and the quality
2497
+
2498
+ 1:13:07.798 --> 1:13:09.339
2499
+ of the human.
2500
+
2501
+ 1:13:10.951 --> 1:13:17.451
2502
+ Of course, we have the same problem that we
2503
+ don't have the human reference, so we have.
2504
+
2505
+ 1:13:18.058 --> 1:13:29.751
2506
+ So when we are doing it, instead of estimating
2507
+ the quality based on the human, we use our
2508
+
2509
+ 1:13:29.751 --> 1:13:30.660
2510
+ model.
2511
+
2512
+ 1:13:31.271 --> 1:13:37.612
2513
+ So we can't be like humans, so we take the
2514
+ model probability.
2515
+
2516
+ 1:13:37.510 --> 1:13:40.786
2517
+ We take the set here first of.
2518
+
2519
+ 1:13:41.681 --> 1:13:48.755
2520
+ Then we are comparing each hypothesis to this
2521
+ one, so you have two sets.
2522
+
2523
+ 1:13:48.658 --> 1:13:53.942
2524
+ Just imagine here you take all possible translations.
2525
+
2526
+ 1:13:53.844 --> 1:13:58.738
2527
+ Here you take your hypothesis in comparing
2528
+ them.
2529
+
2530
+ 1:13:58.678 --> 1:14:03.798
2531
+ And then you're taking estimating the quality
2532
+ based on the outcome.
2533
+
2534
+ 1:14:04.304 --> 1:14:06.874
2535
+ So the overall idea is okay.
2536
+
2537
+ 1:14:06.785 --> 1:14:14.652
2538
+ We are not finding the best hypothesis but
2539
+ finding the hypothesis which is most similar
2540
+
2541
+ 1:14:14.652 --> 1:14:17.066
2542
+ to many good translations.
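A minimal sketch of minimum Bayes risk decoding over a sampled candidate list: the candidates double as equally weighted pseudo references, and `similarity` is any sentence-level metric you trust (chrF, a neural metric, and so on):

```python
def mbr_decode(candidates, similarity):
    """Return the candidate with the highest average similarity to all the others."""
    def expected_utility(hyp):
        others = [ref for ref in candidates if ref is not hyp]
        return sum(similarity(hyp, ref) for ref in others) / max(len(others), 1)
    return max(candidates, key=expected_utility)
```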
2543
+
2544
+ 1:14:19.599 --> 1:14:21.826
2545
+ Why would you do that?
2546
+
2547
+ 1:14:21.730 --> 1:14:25.070
2548
+ It's a bit like a smoothing idea.
2549
+
2550
+ 1:14:24.971 --> 1:14:28.609
2551
+ Imagine this is the probability of.
2552
+
2553
+ 1:14:29.529 --> 1:14:36.634
2554
+ So if you would do beam search or mini search
2555
+ or anything, if you just take the highest probability
2556
+
2557
+ 1:14:36.634 --> 1:14:39.049
2558
+ one, you would take this red one.
2559
+
2560
+ 1:14:39.799 --> 1:14:45.686
2561
+ Has this type of probability distribution.
2562
+
2563
+ 1:14:45.549 --> 1:14:58.556
2564
+ Then it might be better to take some of these
2565
+ models because it's a bit lower in probability.
2566
+
2567
+ 1:14:58.618 --> 1:15:12.501
2568
+ So what you're mainly doing is you're doing
2569
+ some smoothing of your probability distribution.
2570
+
2571
+ 1:15:15.935 --> 1:15:17.010
2572
+ How can you do that?
2573
+
2574
+ 1:15:16.959 --> 1:15:20.132
2575
+ Of course, we cannot do this again compared
2576
+ to all the hype.
2577
+
2578
+ 1:15:21.141 --> 1:15:29.472
2579
+ But what we can do is we have just two sets
2580
+ and we're just taking them the same.
2581
+
2582
+ 1:15:29.369 --> 1:15:38.422
2583
+ So we're having our set of hypotheses
2584
+ and the set of pseudo-references.
2585
+
2586
+ 1:15:39.179 --> 1:15:55.707
2587
+ And we can just do the same, so we can
2588
+ just compare the utility of the hypothesis.
2589
+
2590
+ 1:15:56.656 --> 1:16:16.182
2591
+ And then, of course, the question is how do
2592
+ we measure the quality of the hypothesis?
2593
+
2594
+ 1:16:16.396 --> 1:16:28.148
2595
+ Course: You could also take here the probability
2596
+ of this p(y given x), but you can also say
2597
+
2598
+ 1:16:28.148 --> 1:16:30.958
2599
+ we only take the top.
2600
+
2601
+ 1:16:31.211 --> 1:16:39.665
2602
+ And where we don't want to really rely on
2603
+ how good they are, we filtered out all the
2604
+
2605
+ 1:16:39.665 --> 1:16:40.659
2606
+ bad ones.
2607
+
2608
+ 1:16:40.940 --> 1:16:50.109
2609
+ Q&A: How do you set the quality of the pseudo-references? So
2610
+ that is the first question for the minimum
2611
+
2612
+ 1:16:50.109 --> 1:16:54.604
2613
+ Bayes risk algorithm, and what are your pseudo-references?
2614
+
2615
+ 1:16:55.255 --> 1:17:06.968
2616
+ So how do you set the quality of all these
2617
+ references here in the independent sampling?
2618
+
2619
+ 1:17:06.835 --> 1:17:10.168
2620
+ They all have the same.
2621
+
2622
+ 1:17:10.750 --> 1:17:12.308
2623
+ There's Also Work Where You Can Take That.
2624
+
2625
+ 1:17:13.453 --> 1:17:17.952
2626
+ And then the second question you have to do
2627
+ is, of course,.
2628
+
2629
+ 1:17:17.917 --> 1:17:26.190
2630
+ How do you compare now two hypotheses, so
2631
+ you have now Y and H which are both generated
2632
+
2633
+ 1:17:26.190 --> 1:17:34.927
2634
+ by the system and you want to find the H which
2635
+ is most similar to all the other translations.
2636
+
2637
+ 1:17:35.335 --> 1:17:41.812
2638
+ So it's mainly like this model here, which
2639
+ says how similar is H to all the other Y's.
2640
+
2641
+ 1:17:42.942 --> 1:17:50.127
2642
+ So you have to again use some type of similarity
2643
+ metric, which says how similar to possible.
2644
+
2645
+ 1:17:52.172 --> 1:17:53.775
2646
+ How can you do that?
2647
+
2648
+ 1:17:53.699 --> 1:17:58.315
2649
+ We luckily knew how to compare a reference
2650
+ to a hypothesis.
2651
+
2652
+ 1:17:58.238 --> 1:18:00.423
2653
+ We have evaluation metrics.
2654
+
2655
+ 1:18:00.345 --> 1:18:03.703
2656
+ You can do something like sentence level.
2657
+
2658
+ 1:18:04.044 --> 1:18:13.501
2659
+ But especially if you're looking into neural models
2660
+ you should have a strong metric, so you can use
2661
+
2662
+ 1:18:13.501 --> 1:18:17.836
2663
+ a neural metric which directly compares to.
2664
+
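To make the procedure concrete, here is a minimal Python sketch of minimum Bayes risk decoding. The unigram-overlap utility is only a stand-in for a real sentence-level metric such as BLEU or a neural metric, and the uniform weighting over the pseudo-references is just one of the options discussed above; treat it as an illustration rather than the lecture's actual setup.

# Minimal sketch of minimum Bayes risk (MBR) decoding, assuming the candidate
# hypotheses and the pseudo-references have already been sampled from the model.
# The utility below is a crude unigram-overlap score standing in for a real
# metric such as sentence-BLEU or a neural metric.

def utility(hyp: str, ref: str) -> float:
    """Crude similarity between two sentences (placeholder for a real metric)."""
    h, r = set(hyp.split()), set(ref.split())
    if not h or not r:
        return 0.0
    return len(h & r) / len(h | r)

def mbr_decode(hypotheses, pseudo_references):
    """Return the hypothesis with the highest average utility against the
    pseudo-references, i.e. the candidate most similar to all likely translations."""
    best, best_score = None, float("-inf")
    for h in hypotheses:
        # uniform weights over pseudo-references; one could also weight by model probability
        score = sum(utility(h, r) for r in pseudo_references) / len(pseudo_references)
        if score > best_score:
            best, best_score = h, score
    return best, best_score

# toy usage: in practice the same sampled set often serves as both candidates
# and pseudo-references
samples = ["the cat sat on the mat",
           "the cat sits on the mat",
           "a dog ran in the park"]
print(mbr_decode(samples, samples))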
2665
+ 1:18:22.842 --> 1:18:29.292
2666
+ Yes, so that is, is the main idea of minimum
2667
+ base risk to, so the important idea you should
2668
+
2669
+ 1:18:29.292 --> 1:18:35.743
2670
+ keep in mind is that it's doing somehow the
2671
+ smoothing by not taking the highest probability
2672
+
2673
+ 1:18:35.743 --> 1:18:40.510
2674
+ one, but by comparing like by taking a set
2675
+ of high probability one.
2676
+
2677
+ 1:18:40.640 --> 1:18:45.042
2678
+ And then looking for the translation, which
2679
+ is most similar to all of that.
2680
+
2681
+ 1:18:45.445 --> 1:18:49.888
2682
+ And thereby doing a bit more smoothing because
2683
+ you look at this one.
2684
+
2685
+ 1:18:49.824 --> 1:18:55.135
2686
+ If you have this one, for example, it would
2687
+ be more similar to all of these ones.
2688
+
2689
+ 1:18:55.071 --> 1:19:00.966
2690
+ But if you take this one, it's higher probability,
2691
+ but it's very dissimilar to all these.
2692
+
2693
+ 1:19:05.445 --> 1:19:17.609
2694
+ Hey, that is all for decoding before we finish
2695
+ with your combination of models.
2696
+
2697
+ 1:19:18.678 --> 1:19:20.877
2698
+ Sort of set of pseudo-references.
2699
+
2700
+ 1:19:20.812 --> 1:19:24.370
2701
+ Thomas Brown writes a little bit of type research
2702
+ or.
2703
+
2704
+ 1:19:24.944 --> 1:19:27.087
2705
+ For example, you can do beam search.
2706
+
2707
+ 1:19:27.029 --> 1:19:28.774
2708
+ You can do sampling for that.
2709
+
2710
+ 1:19:28.716 --> 1:19:31.172
2711
+ Oh yeah, we had mentioned sampling there.
2712
+
2713
+ 1:19:31.113 --> 1:19:34.409
2714
+ I don't know somebody asking for what sampling
2715
+ is good.
2716
+
2717
+ 1:19:34.351 --> 1:19:37.205
2718
+ So there's, of course, another important issue.
2719
+
2720
+ 1:19:37.146 --> 1:19:40.120
2721
+ How do you get a good representative set of
2722
+ H?
2723
+
2724
+ 1:19:40.620 --> 1:19:47.147
2725
+ If you do beam search, it might be that you
2726
+ end up with two similar ones, and maybe it's
2727
+
2728
+ 1:19:47.147 --> 1:19:49.274
2729
+ prevented by doing sampling.
2730
+
2731
+ 1:19:49.201 --> 1:19:55.289
2732
+ But maybe in sampling you find worse ones,
2733
+ but yet some type of model is helpful.
2734
+
2735
+ 1:19:56.416 --> 1:20:04.863
2736
+ Search method use more transformed based translation
2737
+ points.
2738
+
2739
+ 1:20:04.724 --> 1:20:09.852
2740
+ Nowadays beam search is definitely.
2741
+
2742
+ 1:20:10.130 --> 1:20:13.749
2743
+ There is work on this.
2744
+
2745
+ 1:20:13.592 --> 1:20:27.262
2746
+ The problem is that the MBR is often a lot
2747
+ more compute-heavy because you have to sample
2748
+
2749
+ 1:20:27.262 --> 1:20:29.488
2750
+ translations.
2751
+
2752
+ 1:20:31.871 --> 1:20:40.946
2753
+ If you are sampling, then do we take an equal weight
2754
+ or a higher weight for the most probable one?
2755
+
2756
+ 1:20:40.825 --> 1:20:43.011
2757
+ Now we put them.
2758
+
2759
+ 1:20:43.623 --> 1:20:46.262
2760
+ Bit and then we say okay, you don't have to
2761
+ be fine.
2762
+
2763
+ 1:20:46.213 --> 1:20:47.659
2764
+ I'm going to put it to you.
2765
+
2766
+ 1:20:48.428 --> 1:20:52.690
2767
+ Yes, so that is what you can also do.
2768
+
2769
+ 1:20:52.577 --> 1:21:00.093
2770
+ Instead of taking uniform probability, you
2771
+ could take the model's.
2772
+
2773
+ 1:21:01.041 --> 1:21:14.303
2774
+ The uniform is a bit more robust because if
2775
+ you had this one it might be that there is
2776
+
2777
+ 1:21:14.303 --> 1:21:17.810
2778
+ some crazy exceptions.
2779
+
2780
+ 1:21:17.897 --> 1:21:21.088
2781
+ And then it would still relax.
2782
+
2783
+ 1:21:20.986 --> 1:21:28.261
2784
+ So if you look at this picture, the probability
2785
+ here would be higher.
2786
+
2787
+ 1:21:28.157 --> 1:21:31.798
2788
+ But yeah, that's a bit of tuning.
2789
+
2790
+ 1:21:33.073 --> 1:21:42.980
2791
+ In this case, and yes, it is like modeling
2792
+ also the ants that.
2793
+
2794
+ 1:21:49.169 --> 1:21:56.265
2795
+ The last thing is now we always have considered
2796
+ one model.
2797
+
2798
+ 1:21:56.145 --> 1:22:04.086
2799
+ It's also some prints helpful to not only
2800
+ look at one model but.
2801
+
2802
+ 1:22:04.384 --> 1:22:10.453
2803
+ So in general there's many ways of how you
2804
+ can make several models and with it's even
2805
+
2806
+ 1:22:10.453 --> 1:22:17.370
2807
+ easier you can just start three different random
2808
+ initializations you get three different models
2809
+
2810
+ 1:22:17.370 --> 1:22:18.428
2811
+ and typically.
2812
+
2813
+ 1:22:19.019 --> 1:22:27.299
2814
+ And then the question is, can we combine their
2815
+ strength into one model and use that then?
2816
+
2817
+ 1:22:29.669 --> 1:22:39.281
2818
+ And that can be done and it can be either
2819
+ online or ensemble, and the more offline thing
2820
+
2821
+ 1:22:39.281 --> 1:22:41.549
2822
+ is called reranking.
2823
+
2824
+ 1:22:42.462 --> 1:22:52.800
2825
+ So the idea is, for example, an ensemble that
2826
+ you combine different initializations.
2827
+
2828
+ 1:22:52.678 --> 1:23:02.045
2829
+ Of course, you can also do other things like
2830
+ having different architecture.
2831
+
2832
+ 1:23:02.222 --> 1:23:08.922
2833
+ But the easiest thing you can change always
2834
+ in generating two motors is to have different.
2835
+
2836
+ 1:23:09.209 --> 1:23:24.054
2837
+ And then the question is how can you combine
2838
+ that?
2839
+
2840
+ 1:23:26.006 --> 1:23:34.245
2841
+ And the easiest thing, as said, is the model
2842
+ ensemble.
2843
+
2844
+ 1:23:34.095 --> 1:23:39.422
2845
+ What you mainly do is in parallel.
2846
+
2847
+ 1:23:39.270 --> 1:23:43.841
2848
+ You decode with all of the models.
2849
+
2850
+ 1:23:44.444 --> 1:23:59.084
2851
+ So the probability of the output and you can
2852
+ join this one to a joint one by just summing
2853
+
2854
+ 1:23:59.084 --> 1:24:04.126
2855
+ up over your key models again.
2856
+
2857
+ 1:24:04.084 --> 1:24:10.374
2858
+ So you still have a probability distribution,
2859
+ but you are not taking only one output here,
2860
+
2861
+ 1:24:10.374 --> 1:24:10.719
2862
+ but.
2863
+
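A minimal sketch of what this ensembling step looks like, assuming each model has already produced its next-token distribution over a shared output vocabulary; in a real decoder this average would be computed at every step of the search.

# Minimal sketch of ensembling at decoding time: every model proposes a
# probability distribution over the next token, and the ensemble prediction is
# the (uniform) average of those distributions. In a real system each "model"
# would be a forward pass of a trained network over the same prefix.

def ensemble_step(distributions):
    """Average K next-token distributions (lists of equal length) into one."""
    k = len(distributions)
    vocab_size = len(distributions[0])
    return [sum(d[i] for d in distributions) / k for i in range(vocab_size)]

# toy usage with a vocabulary of 4 tokens and 3 models
model_outputs = [
    [0.70, 0.10, 0.10, 0.10],
    [0.60, 0.20, 0.10, 0.10],
    [0.40, 0.40, 0.10, 0.10],
]
combined = ensemble_step(model_outputs)
next_token = max(range(len(combined)), key=combined.__getitem__)
print(combined, next_token)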
2864
+ 1:24:11.491 --> 1:24:20.049
2865
+ So that's one you can easily combine different
2866
+ models, and the nice thing is it typically
2867
+
2868
+ 1:24:20.049 --> 1:24:20.715
2869
+ works.
2870
+
2871
+ 1:24:21.141 --> 1:24:27.487
2872
+ You additional improvement with only more
2873
+ calculation but not more human work.
2874
+
2875
+ 1:24:27.407 --> 1:24:33.754
2876
+ You just do the same thing for times and you're
2877
+ getting a better performance.
2878
+
2879
+ 1:24:33.793 --> 1:24:41.623
2880
+ Like having more layers and so on, the advantage
2881
+ of bigger models is of course you have to have
2882
+
2883
+ 1:24:41.623 --> 1:24:46.272
2884
+ the big models only joint and decoding during
2885
+ inference.
2886
+
2887
+ 1:24:46.190 --> 1:24:52.635
2888
+ There you have to load models in parallel
2889
+ because you have to do your search.
2890
+
2891
+ 1:24:52.672 --> 1:24:57.557
2892
+ Normally there is more memory resources for
2893
+ training than you need for inference.
2894
+
2895
+ 1:25:00.000 --> 1:25:12.637
2896
+ You have to train four models and the decoding
2897
+ speed is also slower because you need to decode
2898
+
2899
+ 1:25:12.637 --> 1:25:14.367
2900
+ four models.
2901
+
2902
+ 1:25:14.874 --> 1:25:25.670
2903
+ There is one other very important thing and
2904
+ the models have to be very similar, at least
2905
+
2906
+ 1:25:25.670 --> 1:25:27.368
2907
+ in some ways.
2908
+
2909
+ 1:25:27.887 --> 1:25:28.506
2910
+ Course.
2911
+
2912
+ 1:25:28.428 --> 1:25:34.612
2913
+ You can only combine this one if you have
2914
+ the same vocabulary because you are just.
2915
+
2916
+ 1:25:34.874 --> 1:25:43.110
2917
+ So just imagine you have two different vocabulary sizes
2918
+ because you want to compare them, or a character-
2919
+ based model.
2920
+ 1:25:43.110 --> 1:25:44.273
2921
+ based model.
2922
+
2923
+ 1:25:44.724 --> 1:25:53.327
2924
+ That's at least not easily possible here because
2925
+ once your output would be here a word and the
2926
+
2927
+ 1:25:53.327 --> 1:25:56.406
2928
+ other one would have to sum over.
2929
+
2930
+ 1:25:56.636 --> 1:26:07.324
2931
+ So this ensemble typically only works if you
2932
+ have the same output vocabulary.
2933
+
2934
+ 1:26:07.707 --> 1:26:16.636
2935
+ Your input can be different because that is
2936
+ only done once and then.
2937
+
2938
+ 1:26:16.506 --> 1:26:23.755
2939
+ Your output vocabulary has to be the same
2940
+ otherwise.
2941
+
2942
+ 1:26:27.507 --> 1:26:41.522
2943
+ There's even a surprising effect of improving
2944
+ your performance and it's again some kind of
2945
+
2946
+ 1:26:41.522 --> 1:26:43.217
2947
+ smoothing.
2948
+
2949
+ 1:26:43.483 --> 1:26:52.122
2950
+ So normally during training what we are doing
2951
+ is we can save the checkpoints after each epoch.
2952
+
2953
+ 1:26:52.412 --> 1:27:01.774
2954
+ And you have this type of curve where your
2955
+ error normally should go down, and
2956
+
2957
+ 1:27:01.774 --> 1:27:09.874
2958
+ if you do early stopping it means that at the
2959
+ end you select not the lowest.
2960
+
2961
+ 1:27:11.571 --> 1:27:21.467
2962
+ However, some type of smoothing is there again.
2963
+
2964
+ 1:27:21.261 --> 1:27:31.161
2965
+ Sometimes what you can do is take an ensemble.
2966
+
2967
+ 1:27:31.491 --> 1:27:38.798
2968
+ That is not as good, but you still have four
2969
+ different models, and they give you a little.
2970
+
2971
+ 1:27:39.259 --> 1:27:42.212
2972
+ So,.
2973
+
2974
+ 1:27:43.723 --> 1:27:48.340
2975
+ It's some are helping you, so now they're
2976
+ supposed to be something different, you know.
2977
+
2978
+ 1:27:49.489 --> 1:27:53.812
2979
+ Oh, we didn't do that, so that is a checkpoint ensemble.
2980
+
2981
+ 1:27:53.713 --> 1:27:59.119
2982
+ There is one thing interesting, which is even
2983
+ faster.
2984
+
2985
+ 1:27:59.419 --> 1:28:12.255
2986
+ Normally let's give you better performance
2987
+ because this one might be again like a smooth
2988
+
2989
+ 1:28:12.255 --> 1:28:13.697
2990
+ ensemble.
2991
+
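A minimal sketch of the checkpoint-averaging idea mentioned here, assuming the last few checkpoints share identical parameter names and shapes; parameters are shown as plain Python lists standing in for the tensors of a real state dict.

# Minimal sketch of checkpoint averaging: instead of keeping the last K
# checkpoints as an ensemble, average their parameters element-wise into a
# single model, which is as cheap to run as one checkpoint.

def average_checkpoints(checkpoints):
    """Element-wise average of K parameter dictionaries with identical keys/shapes."""
    k = len(checkpoints)
    averaged = {}
    for name in checkpoints[0]:
        values = [ckpt[name] for ckpt in checkpoints]
        averaged[name] = [sum(v) / k for v in zip(*values)]
    return averaged

# toy usage: three checkpoints of a "model" with one weight vector
ckpts = [{"w": [1.0, 2.0]}, {"w": [1.2, 1.8]}, {"w": [0.8, 2.2]}]
print(average_checkpoints(ckpts))   # {'w': [1.0, 2.0]}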
2992
+ 1:28:16.736 --> 1:28:22.364
2993
+ Of course, there is also some problems with
2994
+ this, so I said.
2995
+
2996
+ 1:28:22.272 --> 1:28:30.023
2997
+ For example, maybe you want to do different
2998
+ word representations, with characters and.
2999
+
3000
+ 1:28:30.590 --> 1:28:37.189
3001
+ You want to do right to left decoding so you
3002
+ normally do like I go home but then your translation
3003
+
3004
+ 1:28:37.189 --> 1:28:39.613
3005
+ depends only on the previous words.
3006
+
3007
+ 1:28:39.545 --> 1:28:45.926
3008
+ If you want to model on the future you could
3009
+ do the inverse direction and generate the target
3010
+
3011
+ 1:28:45.926 --> 1:28:47.895
3012
+ sentence from right to left.
3013
+
3014
+ 1:28:48.728 --> 1:28:50.839
3015
+ But it's not easy to combine these things.
3016
+
3017
+ 1:28:51.571 --> 1:28:56.976
3018
+ In order to do this, or what is also sometimes
3019
+ interesting is doing in verse translation.
3020
+
3021
+ 1:28:57.637 --> 1:29:07.841
3022
+ You can combine these types of models in the
3023
+ next election.
3024
+
3025
+ 1:29:07.671 --> 1:29:13.968
3026
+ That is only a bit which we can do.
3027
+
3028
+ 1:29:14.494 --> 1:29:29.593
3029
+ Next time what you should remember is how
3030
+ search works and do you have any final questions.
3031
+
3032
+ 1:29:33.773 --> 1:29:43.393
3033
+ Then I wish you a happy holiday for next week
3034
+ and then Monday there is another practical
3035
+
3036
+ 1:29:43.393 --> 1:29:50.958
3037
+ and then Thursday in two weeks so we'll have
3038
+ the next lecture Monday.
3039
+
demo_data/lectures/Lecture-09-25.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb17280ddd03304eacdea7239b8a65b617c0c5bc9a4ab92e07100370c09187af
3
+ size 119262060
demo_data/lectures/Lecture-10-13.06.2023/English.vtt ADDED
@@ -0,0 +1,2458 @@
 
 
1
+ WEBVTT
2
+
3
+ 0:00:00.860 --> 0:00:04.146
4
+ Introduction: Okay, again, welcome.
5
+
6
+ 0:00:04.524 --> 0:00:09.256
7
+ So today I'll be doing the lecture.
8
+
9
+ 0:00:09.124 --> 0:00:12.201
10
+ My name is Danny Liro.
11
+
12
+ 0:00:12.067 --> 0:00:16.754
13
+ I'm one of the PhD students with.
14
+
15
+ 0:00:17.137 --> 0:00:25.942
16
+ And specifically how to learn representations
17
+ that are common across languages and use that
18
+
19
+ 0:00:25.942 --> 0:00:29.004
20
+ to help low resource languages.
21
+
22
+ 0:00:29.689 --> 0:00:39.445
23
+ So hope today we can explore a little bit
24
+ about multilingual machine translation and hopefully.
25
+
26
+ 0:00:40.100 --> 0:00:50.940
27
+ So today what we are going to do first we
28
+ are going to look at.
29
+
30
+ 0:00:52.152 --> 0:01:02.491
31
+ Second, we will be looking into more details
32
+ as in how we achieve multilingual machine translation
33
+
34
+ 0:01:02.491 --> 0:01:06.183
35
+ and what are the techniques there.
36
+
37
+ 0:01:06.078 --> 0:01:12.199
38
+ At last, we are going to look at the current
39
+ challenges.
40
+
41
+ 0:01:13.573 --> 0:01:15.976
42
+ Alright, so some definitions.
43
+
44
+ 0:01:15.895 --> 0:01:19.821
45
+ First, what is multilingual machine translation?
46
+
47
+ 0:01:21.201 --> 0:01:28.637
48
+ So for a multilingual machine translation
49
+ system, it's basically a system that is able
50
+
51
+ 0:01:28.637 --> 0:01:34.279
52
+ to handle multiple source languages or multiple
53
+ target languages.
54
+
55
+ 0:01:34.254 --> 0:01:44.798
56
+ You see here you've got source on the source
57
+ side, some German Chinese, Spanish and English.
58
+
59
+ 0:01:45.485 --> 0:01:50.615
60
+ Physically, it's also a quite interesting
61
+ machine learning challenge actually.
62
+
63
+ 0:01:51.031 --> 0:02:05.528
64
+ So if you consider each translation pair as
65
+ a different task in machine learning, then
66
+
67
+ 0:02:05.528 --> 0:02:08.194
68
+ a multilingual.
69
+
70
+ 0:02:08.628 --> 0:02:17.290
71
+ Where it has to specialize in all these different
72
+ translation directions and try to be good.
73
+
74
+ 0:02:17.917 --> 0:02:26.890
75
+ So this is basically about multi-task learning,
76
+ and here when translation direction being one
77
+
78
+ 0:02:26.890 --> 0:02:27.462
79
+ task.
80
+
81
+ 0:02:28.428 --> 0:02:35.096
82
+ Interesting question to ask here is like do
83
+ we get synergy like different tasks helping
84
+
85
+ 0:02:35.096 --> 0:02:39.415
86
+ each other, the knowledge of one task helping
87
+ the other?
88
+
89
+ 0:02:39.539 --> 0:02:48.156
90
+ Or do we get more interference in English
91
+ to German, and now I get worse at English to
92
+
93
+ 0:02:48.156 --> 0:02:49.047
94
+ Chinese.
95
+
96
+ 0:02:49.629 --> 0:02:55.070
97
+ So this is also a very interesting question
98
+ that we'll look into later.
99
+
100
+ 0:02:56.096 --> 0:02:58.605
101
+ Now a little bit of context.
102
+
103
+ 0:02:59.519 --> 0:03:04.733
104
+ We care about multilingual machine translation.
105
+
106
+ 0:03:04.624 --> 0:03:10.601
107
+ Part of the thing is that machine translation
108
+ models.
109
+
110
+ 0:03:11.291 --> 0:03:22.659
111
+ If you consider all the languages in the world,
112
+ there are a read it here roughly seven thousand
113
+
114
+ 0:03:22.659 --> 0:03:23.962
115
+ languages.
116
+
117
+ 0:03:24.684 --> 0:03:37.764
118
+ So consider this number, and if you think
119
+ about this many languages out there, how many
120
+
121
+ 0:03:37.764 --> 0:03:39.548
122
+ directions.
123
+
124
+ 0:03:40.220 --> 0:03:46.897
125
+ So this means to cover end languages.
126
+
127
+ 0:03:46.722 --> 0:03:59.377
128
+ We're going to end up with a quadratic, an n-
129
+ square number of directions.
130
+
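A quick back-of-the-envelope check of this quadratic growth; the numbers below are illustrative only.

# Number of translation directions for n languages grows quadratically:
# n * (n - 1) ordered pairs.
def num_directions(n: int) -> int:
    return n * (n - 1)

print(num_directions(10))      # 90 directions for 10 languages
print(num_directions(7000))    # 48,993,000 directions for ~7000 languages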
131
+ 0:03:59.779 --> 0:04:02.290
132
+ This is very bad; quadratic is very bad.
133
+
134
+ 0:04:03.203 --> 0:04:14.078
135
+ The quadratic situation going on means that
136
+ for a lot of translation directions, if you
137
+
138
+ 0:04:14.078 --> 0:04:16.278
139
+ consider all the.
140
+
141
+ 0:04:17.177 --> 0:04:34.950
142
+ For many of them we aren't going to have any
143
+ parallel data as in existing translated data.
144
+
145
+ 0:04:35.675 --> 0:04:40.001
146
+ So this is a very data scarce situation.
147
+
148
+ 0:04:39.896 --> 0:04:49.685
149
+ We're not going to get parallel data everywhere,
150
+ especially likely when you have a system
151
+
152
+ 0:04:49.685 --> 0:04:52.558
153
+ that covers ten languages.
154
+
155
+ 0:04:52.912 --> 0:05:04.437
156
+ If this axis actually goes towards thousands
157
+ that are realistic, we are going to end up
158
+
159
+ 0:05:04.437 --> 0:05:06.614
160
+ with some holes.
161
+
162
+ 0:05:07.667 --> 0:05:15.400
163
+ So now we are going to ask: Can we use multi-
164
+ linguality to help this kind of low resource?
165
+
166
+ 0:05:15.875 --> 0:05:22.858
167
+ So when useful concept there is mutual intelligibility,
168
+ don't know if you've heard of this.
169
+
170
+ 0:05:23.203 --> 0:05:30.264
171
+ Basically isn't linguistic when you say somebody
172
+ who's speaking one language can directly without
173
+
174
+ 0:05:30.264 --> 0:05:33.218
175
+ learning understands the other language.
176
+
177
+ 0:05:33.146 --> 0:05:39.340
178
+ So if you're a German speaker maybe Dutch
179
+ or Danish and all that kind of stuff would
180
+
181
+ 0:05:39.340 --> 0:05:39.632
182
+ be.
183
+
184
+ 0:05:40.000 --> 0:05:45.990
185
+ Useful or like directly understandable partially
186
+ to you.
187
+
188
+ 0:05:46.586 --> 0:05:52.082
189
+ That is, thanks to this kind of mutual intelligibility
190
+ that is basically based on language
191
+
192
+ 0:05:52.082 --> 0:05:52.791
193
+ similarity.
194
+
195
+ 0:05:53.893 --> 0:05:57.105
196
+ And then there's knowledge sharing this concept.
197
+
198
+ 0:05:57.039 --> 0:06:01.188
199
+ I mean, it's quite intuitive, basically a
200
+ very German speaker.
201
+
202
+ 0:06:01.122 --> 0:06:06.756
203
+ If you start learning Dutch or Danish and
204
+ all these Nordic languages, I think you're
205
+
206
+ 0:06:06.756 --> 0:06:11.197
207
+ going to be faster than just a native English
208
+ speaker or anything.
209
+
210
+ 0:06:11.952 --> 0:06:18.751
211
+ So hopefully our model is also able to do
212
+ this, but we'll see later what the real situation.
213
+
214
+ 0:06:19.799 --> 0:06:27.221
215
+ So we said multilingual is good multilingual
216
+ translation, it's nice and there's a lot of
217
+
218
+ 0:06:27.221 --> 0:06:28.210
219
+ potentials.
220
+
221
+ 0:06:28.969 --> 0:06:32.205
222
+ So it's a long path towards there.
223
+
224
+ 0:06:32.113 --> 0:06:37.571
225
+ Think all the efforts started in so quite
226
+ some years ago.
227
+
228
+ 0:06:37.958 --> 0:06:54.639
229
+ At first people started with models with language
230
+ specific modules.
231
+
232
+ 0:06:54.454 --> 0:06:58.747
233
+ So we talked about the input of the decoder
234
+ architecture in the previous lecturer area.
235
+
236
+ 0:07:00.100 --> 0:07:06.749
237
+ And with this separation of the inputter and
238
+ the decoder, it gives it a natural way to split
239
+
240
+ 0:07:06.749 --> 0:07:07.679
241
+ the modules.
242
+
243
+ 0:07:09.069 --> 0:07:20.805
244
+ So basically what's happening going on here
245
+ is dedicated to each toes language and dedicated.
246
+
247
+ 0:07:21.281 --> 0:07:34.252
248
+ Now given parallel data of body good data
249
+ English German data we just activate this German
250
+
251
+ 0:07:34.252 --> 0:07:39.241
252
+ inputter and activate this and an.
253
+
254
+ 0:07:40.680 --> 0:07:48.236
255
+ So now we are training basically like corresponding
256
+ parts of the encoder decoders.
257
+
258
+ 0:07:48.145 --> 0:07:55.369
259
+ It has some advantages: First, we have a multilingual
260
+ system.
261
+
262
+ 0:07:55.252 --> 0:08:03.887
263
+ Of course, second modularity is also an advantage
264
+ in software engineering.
265
+
266
+ 0:08:03.772 --> 0:08:10.567
267
+ We want to decouple things if the German input
268
+ is broken.
269
+
270
+ 0:08:11.011 --> 0:08:19.313
271
+ So modularity is advantage in this case, but
272
+ again if we think about scalability, if we
273
+
274
+ 0:08:19.313 --> 0:08:27.521
275
+ think about languages out there that we talked
276
+ about, scalability isn't a great thing.
277
+
278
+ 0:08:27.947 --> 0:08:37.016
279
+ We also talked about sharing knowledge or
280
+ sharing representations for different languages.
281
+
282
+ 0:08:37.317 --> 0:08:41.968
283
+ We have a separate thing for each language.
284
+
285
+ 0:08:41.862 --> 0:08:46.453
286
+ How likely is it that we are sharing much?
287
+
288
+ 0:08:46.346 --> 0:08:52.541
289
+ So these are potential disadvantages with
290
+ this approach.
291
+
292
+ 0:08:53.073 --> 0:09:01.181
293
+ So yeah we talked about, we want to have knowledge
294
+ transfer, we want to have similar languages
295
+
296
+ 0:09:01.181 --> 0:09:02.888
297
+ helping each other.
298
+
299
+ 0:09:02.822 --> 0:09:06.095
300
+ This is somehow a more reachable goal.
301
+
302
+ 0:09:06.011 --> 0:09:13.521
303
+ If you have a shared in corner and a shared
304
+ in physically, a full perimeter shared model
305
+
306
+ 0:09:13.521 --> 0:09:21.284
307
+ for all the translation pairs out there, and
308
+ there's also another game, so if you just have
309
+
310
+ 0:09:21.284 --> 0:09:21.705
311
+ one.
312
+
313
+ 0:09:22.582 --> 0:09:26.084
314
+ Lock of model for all the translation directions
315
+ out there.
316
+
317
+ 0:09:26.606 --> 0:09:38.966
318
+ It's easier to deploy in the sense that if
319
+ you are serving a model you don't have a thousand
320
+
321
+ 0:09:38.966 --> 0:09:42.555
322
+ small modules to maintain.
323
+
324
+ 0:09:42.762 --> 0:09:52.262
325
+ So in terms of engineering somehow these kind
326
+ of fully primitive shared models have: So this
327
+
328
+ 0:09:52.262 --> 0:09:59.821
329
+ is also where the parent research has been
330
+ going towards in recent years.
331
+
332
+ 0:10:00.460 --> 0:10:16.614
333
+ So the rest of the electro are also going
334
+ to focus on this kind of model.
335
+
336
+ 0:10:17.037 --> 0:10:30.901
337
+ So the first type of multilinguali is this
338
+ kind of many to one abbreviated kind of situation.
339
+
340
+ 0:10:30.754 --> 0:10:34.448
341
+ Basically what's going.
342
+
343
+ 0:10:35.355 --> 0:10:49.804
344
+ So one news case that you can think of here
345
+ is if you're subtitled for international movies
346
+
347
+ 0:10:49.804 --> 0:10:51.688
348
+ in Germany.
349
+
350
+ 0:10:53.073 --> 0:11:02.863
351
+ Then flipping the situation there is also
352
+ many configurations where we only have when
353
+
354
+ 0:11:02.863 --> 0:11:04.798
355
+ source language.
356
+
357
+ 0:11:06.046 --> 0:11:13.716
358
+ There's also many use cases like if you think
359
+ about the lecture translator here you've seen.
360
+
361
+ 0:11:14.914 --> 0:11:21.842
362
+ So here most of the lecturers are in German
363
+ and now we want to translate it into.
364
+
365
+ 0:11:21.758 --> 0:11:28.433
366
+ I think on the user end we only support English
367
+ but they're also supportable.
368
+
369
+ 0:11:28.608 --> 0:11:38.988
370
+ So in this kind of used case, if you have
371
+ one speaker and you want to serve or expand
372
+
373
+ 0:11:38.988 --> 0:11:41.281
374
+ to many audience,.
375
+
376
+ 0:11:42.802 --> 0:11:50.542
377
+ But of course, combining everything, there's
378
+ the many to many situation here.
379
+
380
+ 0:11:50.443 --> 0:11:53.958
381
+ You can think of Google Translate.
382
+
383
+ 0:11:53.857 --> 0:11:58.781
384
+ They are doing basically any selected language.
385
+
386
+ 0:11:59.159 --> 0:12:03.760
387
+ And this is also more difficult.
388
+
389
+ 0:12:03.620 --> 0:12:14.775
390
+ If you consider the data you need to get and
391
+ concerns, we'll cover this later.
392
+
393
+ 0:12:15.135 --> 0:12:21.008
394
+ Many to One TranslationsBut first we are going
395
+ to start with many to one translations.
396
+
397
+ 0:12:21.741 --> 0:12:30.436
398
+ Say this is the most similar to the bilingual
399
+ translation situation you saw earlier, but
400
+
401
+ 0:12:30.436 --> 0:12:39.423
402
+ now one difference is we need a vocabulary
403
+ or tokens that can represent all these different
404
+
405
+ 0:12:39.423 --> 0:12:40.498
406
+ languages.
407
+
408
+ 0:12:41.301 --> 0:12:44.200
409
+ So we need a joint more telecom global vocabulary.
410
+
411
+ 0:12:44.924 --> 0:12:48.794
412
+ So let's just quickly recall what word embedding
413
+ is to do.
414
+
415
+ 0:12:49.189 --> 0:12:54.561
416
+ Basically we need to represent it.
417
+
418
+ 0:12:54.407 --> 0:13:04.079
419
+ We have to get some vector representation
420
+ for discrete words.
421
+
422
+ 0:13:04.784 --> 0:13:16.911
423
+ And when we embed a token, we are retrieving
424
+ the corresponding vector out of this little.
425
+
426
+ 0:13:17.697 --> 0:13:19.625
427
+ And then we put it.
428
+
429
+ 0:13:19.528 --> 0:13:26.083
430
+ We feed a sequence of vectors into the inputter
431
+ as the next steps.
432
+
433
+ 0:13:26.987 --> 0:13:34.973
434
+ Now if it's motelingual you can imagine that
435
+ vocabulary suddenly gets very, very big because
436
+
437
+ 0:13:34.973 --> 0:13:36.262
438
+ the languages.
439
+
440
+ 0:13:37.877 --> 0:13:46.141
441
+ So what is quite useful here is the by pair
442
+ like subwords you talked about by pairing.
443
+
444
+ 0:13:46.406 --> 0:13:55.992
445
+ So in this case we are still limiting ourselves
446
+ to a finite number of vocabularies that we
447
+
448
+ 0:13:55.992 --> 0:13:59.785
449
+ are exploding the vocabulary table.
450
+
451
+ 0:14:01.181 --> 0:14:11.631
452
+ So when we learn these kinds of subwords,
453
+ what happens basically?
454
+
455
+ 0:14:11.473 --> 0:14:17.020
456
+ We look at all the training data.
457
+
458
+ 0:14:18.558 --> 0:14:20.856
459
+ So think about this.
460
+
461
+ 0:14:20.746 --> 0:14:28.079
462
+ If we do this now on a bunch of Mozilla data,
463
+ are there concerns?
464
+
465
+ 0:14:30.050 --> 0:14:36.811
466
+ Maybe we have an underground status head,
467
+ so we get over English mergers and nocularities.
468
+
469
+ 0:14:37.337 --> 0:14:39.271
470
+ Yeah Exactly Thanks.
471
+
472
+ 0:14:39.539 --> 0:14:46.602
473
+ So what we have to pay attention to here is
474
+ learn this multilingual vocabulary.
475
+
476
+ 0:14:46.513 --> 0:14:52.550
477
+ We should pay attention: All the languages
478
+ are more or less balanced, not that you only
479
+
480
+ 0:14:52.550 --> 0:14:58.862
481
+ learning words is for for English or some bigger
482
+ languages, and then neglecting other other
483
+
484
+ 0:14:58.862 --> 0:15:00.028
485
+ languages, yeah.
486
+
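One common way to keep the languages roughly balanced when learning the joint subword vocabulary is temperature-based sampling of the training sentences; the sketch below uses made-up corpus sizes and is an assumption about the recipe, not taken from the lecture's own setup.

# Minimal sketch of balancing languages before learning a joint subword
# vocabulary. Raw corpus sizes are rescaled with a temperature T so that
# high-resource languages are down-weighted and low-resource languages are
# up-weighted; the resulting proportions decide how many sentences of each
# language go into BPE/SentencePiece training.

def sampling_ratios(sentence_counts, temperature=5.0):
    """Map raw per-language counts to sampling probabilities p_i ~ (n_i/N)^(1/T)."""
    total = sum(sentence_counts.values())
    scaled = {lang: (n / total) ** (1.0 / temperature)
              for lang, n in sentence_counts.items()}
    norm = sum(scaled.values())
    return {lang: s / norm for lang, s in scaled.items()}

counts = {"en": 1_000_000, "de": 300_000, "jv": 30_000}   # hypothetical corpus sizes
print(sampling_ratios(counts, temperature=5.0))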
487
+ 0:15:01.021 --> 0:15:04.068
488
+ Of course, this is not going to solve everything.
489
+
490
+ 0:15:04.007 --> 0:15:09.589
491
+ Even if we get a perfectly uniform distribution
492
+ out of all the languages out, there is not
493
+
494
+ 0:15:09.589 --> 0:15:13.454
495
+ going to mean that we are ending up with a
496
+ perfect vocabulary.
497
+
498
+ 0:15:14.154 --> 0:15:20.068
499
+ There are also language differences read,
500
+ so if you consider more European languages.
501
+
502
+ 0:15:20.180 --> 0:15:27.081
503
+ There will be many shared subcomponents like
504
+ how you write a certain word, somewhat similar.
505
+
506
+ 0:15:27.267 --> 0:15:34.556
507
+ But then there are other languages with completely
508
+ different scripts like Arabic, Cyrillic scripts
509
+
510
+ 0:15:34.556 --> 0:15:40.594
511
+ or Eastern Asian scripts where you get a vocabulary
512
+ like the characters set with.
513
+
514
+ 0:15:40.940 --> 0:15:43.531
515
+ Tens of thousands of characters.
516
+
517
+ 0:15:43.453 --> 0:15:50.356
518
+ So these are also individual concerns that
519
+ one has to think about my building specific
520
+
521
+ 0:15:50.356 --> 0:15:51.070
522
+ systems.
523
+
524
+ 0:15:51.591 --> 0:16:02.660
525
+ But overall, the rule of thumb is that when
526
+ you do a mottling tokenizer vocabulary, there's
527
+
528
+ 0:16:02.660 --> 0:16:04.344
529
+ more or less.
530
+
531
+ 0:16:05.385 --> 0:16:17.566
532
+ And there's actually some paper showing that
533
+ the performance of the final system is going
534
+
535
+ 0:16:17.566 --> 0:16:25.280
536
+ to start to degrade if you have a disproportionate
537
+ data.
538
+
539
+ 0:16:27.207 --> 0:16:33.186
540
+ Of course there is currently the trend of
541
+ using pre-train models.
542
+
543
+ 0:16:33.095 --> 0:16:39.891
544
+ If you take a pre-train model somewhere then
545
+ you don't have this concern.
546
+
547
+ 0:16:40.580 --> 0:16:47.810
548
+ Making sure that you use the same organizers
549
+ that they used so that there is no train test
550
+
551
+ 0:16:47.810 --> 0:16:48.287
552
+ time.
553
+
554
+ 0:16:48.888 --> 0:16:53.634
555
+ Yeah for a pre-trainer, we're going to talk
556
+ about a little bit later as well.
557
+
558
+ 0:16:54.734 --> 0:16:59.928
559
+ Multilingual Vocabulary: Alright, so now where's
560
+ a multilingual vocabulary?
561
+
562
+ 0:17:00.920 --> 0:17:04.187
563
+ There are several good things, obviously.
564
+
565
+ 0:17:04.109 --> 0:17:10.909
566
+ So one thing is that if we have words that
567
+ are in the textful form like we said, there
568
+
569
+ 0:17:10.909 --> 0:17:16.224
570
+ are European languages that share some vocabulary,
571
+ then it's great.
572
+
573
+ 0:17:16.146 --> 0:17:19.899
574
+ Then we have the first step towards knowledge.
575
+
576
+ 0:17:20.000 --> 0:17:30.464
577
+ For example, the word pineapple for some reason
578
+ is also in Eastern European languages.
579
+
580
+ 0:17:30.344 --> 0:17:34.918
581
+ In Cyrillic scripts that's also the.
582
+
583
+ 0:17:36.116 --> 0:17:42.054
584
+ But however, there is also ambiguity if you've
585
+ embracing together or dye.
586
+
587
+ 0:17:41.973 --> 0:17:46.067
588
+ Of course, they mean different things for
589
+ German.
590
+
591
+ 0:17:46.246 --> 0:17:53.276
592
+ Then, of course, that's possible to rely on
593
+ further context.
594
+
595
+ 0:17:53.161 --> 0:17:59.156
596
+ It's not a problem, it's something to think
597
+ about.
598
+
599
+ 0:18:00.200 --> 0:18:11.061
600
+ And when we go higher to cover more vocabulary
601
+ entries, we might need to go bigger in the
602
+
603
+ 0:18:11.061 --> 0:18:13.233
604
+ vocabulary count.
605
+
606
+ 0:18:13.653 --> 0:18:28.561
607
+ So there is always sort of a bottleneck as
608
+ the number of languages increase.
609
+
610
+ 0:18:30.110 --> 0:18:32.836
611
+ Right, so what is the result?
612
+
613
+ 0:18:32.745 --> 0:18:38.290
614
+ What are these crustling over inventings actually
615
+ learning?
616
+
617
+ 0:18:40.160 --> 0:18:44.658
618
+ So normally to inspect them it's quite hard.
619
+
620
+ 0:18:44.558 --> 0:18:53.854
621
+ It's like high dimensional vectors with dimensions,
622
+ but researchers also try to project it.
623
+
624
+ 0:18:54.454 --> 0:19:05.074
625
+ So in this case it is a little bit small,
626
+ but in this case for English and French there
627
+
628
+ 0:19:05.074 --> 0:19:07.367
629
+ are many injuries.
630
+
631
+ 0:19:07.467 --> 0:19:20.014
632
+ My example is like different words with the
633
+ same word in morphological forms.
634
+
635
+ 0:19:19.853 --> 0:19:26.131
636
+ Basically, it's like a morphological.
637
+
638
+ 0:19:26.546 --> 0:19:32.727
639
+ There are also words in different languages
640
+ like think there is research for English and
641
+
642
+ 0:19:32.727 --> 0:19:33.282
643
+ French.
644
+
645
+ 0:19:33.954 --> 0:19:41.508
646
+ So the take away from this plot is that somehow
647
+ we learn a bit of semantic meanings beyond
648
+
649
+ 0:19:41.508 --> 0:19:43.086
650
+ the textual forms.
651
+
652
+ 0:19:45.905 --> 0:19:50.851
653
+ But then this looks good and this gives us
654
+ hope.
655
+
656
+ 0:19:52.252 --> 0:20:05.240
657
+ That if we consider what is the baseline here,
658
+ the baseline we compare to is a bilingual system
659
+
660
+ 0:20:05.240 --> 0:20:09.164
661
+ without any multilinguality.
662
+
663
+ 0:20:10.290 --> 0:20:18.214
664
+ This looks good because if we compare for
665
+ many Central European languages, Eastern and
666
+
667
+ 0:20:18.214 --> 0:20:27.413
668
+ Central European languages to English, we compare:
669
+ And we see that the Mini Two English has actually
670
+
671
+ 0:20:27.413 --> 0:20:30.601
672
+ always gained quite a bit over it.
673
+
674
+ 0:20:31.751 --> 0:20:38.876
675
+ But there is also later investigation on whether
676
+ it is actually out of mountain linguality or
677
+
678
+ 0:20:38.876 --> 0:20:39.254
679
+ not.
680
+
681
+ 0:20:39.639 --> 0:20:46.692
682
+ So this is a spoiler won't tell much about
683
+ it until the second half, but just remember
684
+
685
+ 0:20:46.692 --> 0:20:47.908
686
+ there is this.
687
+
688
+ 0:20:49.449 --> 0:20:53.548
689
+ Many to Many Translations: Now we move on to many-to-many
690
+ translations.
691
+
692
+ 0:20:53.479 --> 0:21:01.785
693
+ Let's recall in a normal transformer or any
694
+ encoder decoder setup.
695
+
696
+ 0:21:02.242 --> 0:21:08.839
697
+ We have an inkluder that creates sort of contextual
698
+ representation for the sort of sentence.
699
+
700
+ 0:21:09.949 --> 0:21:17.787
701
+ Is more or less the context for generating
702
+ the target sentence red.
703
+
704
+ 0:21:17.672 --> 0:21:28.381
705
+ Now on the target side we get the first open,
706
+ then we feed it again and then get the second
707
+
708
+ 0:21:28.381 --> 0:21:29.545
709
+ decoding.
710
+
711
+ 0:21:31.651 --> 0:21:35.039
712
+ And now we have multiple target languages.
713
+
714
+ 0:21:34.960 --> 0:21:39.059
715
+ Does anybody see a problem with this architecture?
716
+
717
+ 0:21:48.268 --> 0:21:57.791
718
+ Specifically, it's in the decoder, so now
719
+ have a German sentiments encoded.
720
+
721
+ 0:21:57.666 --> 0:22:01.930
722
+ It now want to generate Spanish.
723
+
724
+ 0:22:07.367 --> 0:22:11.551
725
+ So the problem is how does the model know
726
+ which language to generate?
727
+
728
+ 0:22:12.112 --> 0:22:24.053
729
+ If you just give it a generic start token,
730
+ there is nowhere where we are telling the model.
731
+
732
+ 0:22:24.944 --> 0:22:30.277
733
+ So that this can only be a guess, and this
734
+ model will definitely not run well.
735
+
736
+ 0:22:32.492 --> 0:22:40.021
737
+ So this comes to the question: How do we indicate
738
+ the one's intended language to the model?
739
+
740
+ 0:22:41.441 --> 0:22:52.602
741
+ One first idea is what people tried is basically
742
+ now in a source where not only including the
743
+
744
+ 0:22:52.602 --> 0:22:53.552
745
+ source.
746
+
747
+ 0:22:53.933 --> 0:23:01.172
748
+ To Spanish things like this, so basically
749
+ the source is already informed.
750
+
751
+ 0:23:01.074 --> 0:23:11.818
752
+ The source sentence is already supplemented
753
+ with: Now this is also called a target forcing
754
+
755
+ 0:23:11.818 --> 0:23:19.257
756
+ in the sense that we try to force it to give
757
+ the right target.
758
+
759
+ 0:23:20.080 --> 0:23:24.622
760
+ This is one approach.
761
+
762
+ 0:23:24.416 --> 0:23:38.047
763
+ Another approach is basically based on the
764
+ idea that if we have.
765
+
766
+ 0:23:38.438 --> 0:23:52.177
767
+ So if we create a context of our world, the
768
+ incode output shouldn't really differ.
769
+
770
+ 0:23:52.472 --> 0:24:02.397
771
+ So out of this motivation people have moved
772
+ this signaling mechanism.
773
+
774
+ 0:24:02.255 --> 0:24:09.914
775
+ They basically replaced the traditional start
776
+ token.
777
+
778
+ 0:24:10.330 --> 0:24:17.493
779
+ So here we are not kids starting into the
780
+ generic start talking anymore instead language
781
+
782
+ 0:24:17.493 --> 0:24:18.298
783
+ specific.
784
+
785
+ 0:24:18.938 --> 0:24:21.805
786
+ So this is also another way to achieve this.
787
+
788
+ 0:24:23.283 --> 0:24:27.714
789
+ But there are still more challenging cases.
790
+
791
+ 0:24:27.614 --> 0:24:35.536
792
+ Sometimes here it can be called as General
793
+ English or German when it's there.
794
+
795
+ 0:24:35.435 --> 0:24:39.703
796
+ Later on it goes further and further on.
797
+
798
+ 0:24:40.320 --> 0:24:46.752
799
+ Basically this information is not strong enough
800
+ to always enforce the target language, especially
801
+
802
+ 0:24:46.752 --> 0:24:48.392
803
+ in zero shot conditions.
804
+
805
+ 0:24:48.327 --> 0:24:54.142
806
+ We'll look into this later so we'll get this
807
+ kind of target translation into generating
808
+
809
+ 0:24:54.142 --> 0:24:57.843
810
+ and generating and then going into some wrong
811
+ language.
812
+
813
+ 0:24:59.219 --> 0:25:12.542
814
+ So another technique actually developed here
815
+ some years ago was to inject this language.
816
+
817
+ 0:25:12.872 --> 0:25:19.834
818
+ So when we are feeding doing the auto-aggressive
819
+ decoding normally, we only feed the upherb.
820
+
821
+ 0:25:20.000 --> 0:25:22.327
822
+ Into the depoter.
823
+
824
+ 0:25:22.197 --> 0:25:33.676
825
+ But if we also add a language embedding for
826
+ the target language, on top of that we have
827
+
828
+ 0:25:33.676 --> 0:25:37.067
829
+ the language information.
830
+
831
+ 0:25:37.397 --> 0:25:44.335
832
+ And this has shown to perform quite a bit
833
+ better, especially in conditions where the
834
+
835
+ 0:25:44.335 --> 0:25:44.906
836
+ model.
837
+
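A minimal sketch of the three signalling options just described: a target-language token on the source side, a language-specific start token, and a language embedding injected at every decoder step. The token spellings and vector sizes below are illustrative assumptions, not a fixed convention of any particular toolkit.

# Minimal sketch of the three ways to tell the model which target language to
# produce. Token strings like "<2es>" and "<bos_es>" are illustrative choices.

def source_with_target_token(src_tokens, tgt_lang):
    """(1) Target forcing: prepend a target-language token to the source."""
    return [f"<2{tgt_lang}>"] + src_tokens

def decoder_start_token(tgt_lang):
    """(2) Replace the generic <bos> with a language-specific start token."""
    return f"<bos_{tgt_lang}>"

def decoder_step_input(prev_token_embedding, lang_embedding):
    """(3) Inject a target-language embedding at every decoder step
    (here simply added element-wise to the previous token's embedding)."""
    return [p + l for p, l in zip(prev_token_embedding, lang_embedding)]

print(source_with_target_token(["ich", "gehe", "heim"], "es"))
print(decoder_start_token("es"))
print(decoder_step_input([0.1, 0.2, 0.3], [0.01, 0.0, -0.02]))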
838
+ 0:25:46.126 --> 0:25:55.015
839
+ So yeah, we introduced three ways to enforce
840
+ the target language. And now with this we're
841
+
842
+ 0:25:55.015 --> 0:26:02.621
843
+ going to move on to the more interesting case
844
+ of many-to-many translations.
845
+
846
+ 0:26:03.503 --> 0:26:13.183
847
+ Zero-Shot Translation: So here we just consider
848
+ a system that translates two directions: English
849
+
850
+ 0:26:13.183 --> 0:26:15.554
851
+ to English and English.
852
+
853
+ 0:26:16.676 --> 0:26:21.416
854
+ Now we have target languages read.
855
+
856
+ 0:26:21.280 --> 0:26:29.498
857
+ Can you see where we're enforcing the target
858
+ language here?
859
+
860
+ 0:26:29.361 --> 0:26:33.475
861
+ In this case what technique?
862
+
863
+ 0:26:34.934 --> 0:26:45.338
864
+ So here we are enforcing the characteristic
865
+ language with the yelling we train this system.
866
+
867
+ 0:26:46.526 --> 0:26:59.567
868
+ And at the inference time we are able to generate
869
+ English to French, but in addition to this
870
+
871
+ 0:26:59.567 --> 0:27:12.048
872
+ we are also able to: We will be able to do
873
+ zero shot inference that basically translates
874
+
875
+ 0:27:12.048 --> 0:27:17.937
876
+ a direction that is not seen in training.
877
+
878
+ 0:27:19.319 --> 0:27:25.489
879
+ So this is so called zero shot translation
880
+ using a modeling wall system.
881
+
882
+ 0:27:26.606 --> 0:27:34.644
883
+ Of course, we have to reach several things
884
+ before we are able to control the language,
885
+
886
+ 0:27:34.644 --> 0:27:36.769
887
+ otherwise it's no use.
888
+
889
+ 0:27:37.317 --> 0:27:51.087
890
+ Second, we should also have some kind of language
891
+ independent representation.
892
+
893
+ 0:27:51.731 --> 0:27:53.196
894
+ Why is this?
895
+
896
+ 0:27:53.083 --> 0:27:55.028
897
+ Why is this big?
898
+
899
+ 0:27:54.914 --> 0:28:00.637
900
+ Because if women drink generally French up
901
+ here?
902
+
903
+ 0:28:00.940 --> 0:28:05.870
904
+ It was trained to translate from some English.
905
+
906
+ 0:28:07.187 --> 0:28:15.246
907
+ But now we use Anchored Germans in the French,
908
+ so intuitively we need these representations
909
+
910
+ 0:28:15.246 --> 0:28:22.429
911
+ to be similar enough, not that they are so
912
+ far attracted that we cannot use this.
913
+
914
+ 0:28:25.085 --> 0:28:32.059
915
+ So there are several works out there showing
916
+ that if you do a standard transformer architecture
917
+
918
+ 0:28:32.059 --> 0:28:39.107
919
+ this language independent property is not really
920
+ there and you need to add additional approaches
921
+
922
+ 0:28:39.107 --> 0:28:40.633
923
+ in order to enforce.
924
+
925
+ 0:28:41.201 --> 0:28:50.863
926
+ So you can, for example, add an additional
927
+ training objective: That says, we invoked SARSN,
928
+
929
+ 0:28:50.863 --> 0:29:00.211
930
+ be invoked by German, and the invoked English
931
+ have to be the same or be as close to each
932
+
933
+ 0:29:00.211 --> 0:29:02.207
934
+ other as possible.
935
+
936
+ 0:29:02.882 --> 0:29:17.576
937
+ So if we take the output and the output for
938
+ another language, how can we formulate this
939
+
940
+ 0:29:17.576 --> 0:29:18.745
941
+ as an.
942
+
943
+ 0:29:20.981 --> 0:29:27.027
944
+ We can take the translation to the encoder
945
+ and whatever you translate.
946
+
947
+ 0:29:26.942 --> 0:29:32.819
948
+ The embeddings also must be similar and that's
949
+ the great direction.
950
+
951
+ 0:29:33.253 --> 0:29:42.877
952
+ So one thing to take care of here is the length
953
+ for the same sentence in German and English
954
+
955
+ 0:29:42.877 --> 0:29:44.969
956
+ is not necessarily.
957
+
958
+ 0:29:45.305 --> 0:30:00.858
959
+ So if we just do a word to word matching,
960
+ we can always do pulling to a fixed length
961
+
962
+ 0:30:00.858 --> 0:30:03.786
963
+ representation.
964
+
965
+ 0:30:04.004 --> 0:30:08.392
966
+ Or there are more advanced techniques that
967
+ involve some alignments.
968
+
969
+ 0:30:08.848 --> 0:30:23.456
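A minimal sketch of such a similarity objective, assuming mean pooling over the encoder states of the source sentence and its translation; in training this term would be added to the normal translation loss with some weight. Everything here is plain Python for illustration.

# Auxiliary objective pushing the encoder outputs of a sentence and its
# translation towards each other. Because the two sides have different lengths,
# both are mean-pooled to a single vector before taking a distance.

def mean_pool(states):
    """Average a list of hidden-state vectors (length x dim) into one vector."""
    length, dim = len(states), len(states[0])
    return [sum(vec[i] for vec in states) / length for i in range(dim)]

def similarity_loss(src_states, tgt_states):
    """Squared L2 distance between the pooled source and target representations."""
    s, t = mean_pool(src_states), mean_pool(tgt_states)
    return sum((a - b) ** 2 for a, b in zip(s, t))

# toy usage: a 3-token source and a 2-token target with 2-dimensional states
src = [[0.2, 0.1], [0.4, 0.3], [0.0, 0.2]]
tgt = [[0.3, 0.2], [0.1, 0.2]]
print(similarity_loss(src, tgt))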
970
+ So this is useful in the sense that in this
971
+ part in experiments we have shown it improves
972
+
973
+ 0:30:23.456 --> 0:30:27.189
974
+ zero shot translation.
975
+
976
+ 0:30:27.447 --> 0:30:36.628
977
+ This is on the data condition of English to
978
+ Malay, Java and Filipino, so kind of made to
979
+
980
+ 0:30:36.628 --> 0:30:39.722
981
+ low resource language family.
982
+
983
+ 0:30:40.100 --> 0:30:50.876
984
+ And there we assume that we get parallel English
985
+ to all of them, but among all these.
986
+
987
+ 0:30:51.451 --> 0:31:03.592
988
+ So the blue bar is a Vanilla Transformer model,
989
+ and the purple bar is when we add a language.
990
+
991
+ 0:31:04.544 --> 0:31:12.547
992
+ You see that in supervised conditions it's
993
+ not changing much, but in zero shots there's
994
+
995
+ 0:31:12.547 --> 0:31:13.183
996
+ quite.
997
+
998
+ 0:31:15.215 --> 0:31:22.649
999
+ Yeah, so far we said zero shots is doable
1000
+ and it's even more achievable if we enforce
1001
+
1002
+ 0:31:22.649 --> 0:31:26.366
1003
+ some language independent representations.
1004
+
1005
+ 0:31:26.279 --> 0:31:29.778
1006
+ However, there's one practical concern.
1007
+
1008
+ 0:31:29.690 --> 0:31:33.803
1009
+ Don't know if you also had the same question.
1010
+
1011
+ 0:31:34.514 --> 0:31:39.835
1012
+ If you have two languages, you don't have
1013
+ direct parallel.
1014
+
1015
+ 0:31:39.745 --> 0:31:43.895
1016
+ One's into English and one's out of English.
1017
+
1018
+ 0:31:45.685 --> 0:31:52.845
1019
+ It's actually this kind of approach is called
1020
+ pivoting as in pivoting over an intermediate
1021
+
1022
+ 0:31:52.845 --> 0:31:53.632
1023
+ language.
1024
+
1025
+ 0:31:55.935 --> 0:32:00.058
1026
+ Yeah, that it definitely has advantages in
1027
+ the sense that we're going.
1028
+
1029
+ 0:32:00.440 --> 0:32:11.507
1030
+ Now if we go over these two steps every direction
1031
+ was trained with supervised data so you could
1032
+
1033
+ 0:32:11.507 --> 0:32:18.193
1034
+ always assume that when we are working with
1035
+ a supervised.
1036
+
1037
+ 0:32:18.718 --> 0:32:26.868
1038
+ So in this case we can expect more robust
1039
+ inference time behavior.
1040
+
1041
+ 0:32:26.747 --> 0:32:31.616
1042
+ However, there are also disadvantages.
1043
+
1044
+ 0:32:31.531 --> 0:32:38.860
1045
+ An inference where passing through the model
1046
+ ties so that's doubling the inference time
1047
+
1048
+ 0:32:38.860 --> 0:32:39.943
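A minimal sketch of this two-step pivoting, with stub translation functions standing in for the two supervised models; the doubled decoding cost and the information loss through the pivot are visible directly in the structure.

# Pivoting: to translate German into French without direct parallel data,
# first decode German -> English, then English -> French. The translate_*
# functions are placeholder stubs, not real models; anything the pivot
# language cannot express (such as gender marking) is lost in the middle.

def translate_de_en(text: str) -> str:
    return {"ich gehe heim": "i am going home"}.get(text, text)   # stub model

def translate_en_fr(text: str) -> str:
    return {"i am going home": "je rentre à la maison"}.get(text, text)   # stub model

def pivot_translate(text_de: str) -> str:
    english = translate_de_en(text_de)      # first pass: source -> pivot
    return translate_en_fr(english)         # second pass: pivot -> target

print(pivot_translate("ich gehe heim"))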
1049
+ computation.
1050
+
1051
+ 0:32:40.500 --> 0:32:47.878
1052
+ You might think okay doubling then what, but
1053
+ if you consider if your company like Google,
1054
+
1055
+ 0:32:47.878 --> 0:32:54.929
1056
+ Google Translate and all your life traffic
1057
+ suddenly becomes twice as big, this is not
1058
+
1059
+ 0:32:54.929 --> 0:33:00.422
1060
+ something scalable that you want to see, especially
1061
+ in production.
1062
+
1063
+ 0:33:01.641 --> 0:33:11.577
1064
+ A problem with this is making information
1065
+ loss because if we go over these games when
1066
+
1067
+ 0:33:11.577 --> 0:33:20.936
1068
+ a chain of kids pass the word to each other,
1069
+ in the end it's losing information.
1070
+
1071
+ 0:33:22.082 --> 0:33:24.595
1072
+ Can give it an example here.
1073
+
1074
+ 0:33:24.509 --> 0:33:27.765
1075
+ It's also from a master thesis here.
1076
+
1077
+ 0:33:27.677 --> 0:33:30.321
1078
+ It's on gender preservation.
1079
+
1080
+ 0:33:30.770 --> 0:33:39.863
1081
+ Basically, some languages like Italian and
1082
+ French have different word forms based on the
1083
+
1084
+ 0:33:39.863 --> 0:33:40.782
1085
+ speaker.
1086
+
1087
+ 0:33:41.001 --> 0:33:55.987
1088
+ So if a male person says feel alienated, this
1089
+ word for alienated would be exclusive and a
1090
+
1091
+ 0:33:55.987 --> 0:33:58.484
1092
+ female person.
1093
+
1094
+ 0:34:00.620 --> 0:34:05.730
1095
+ Now imagine that we pivot through anguish.
1096
+
1097
+ 0:34:05.611 --> 0:34:08.641
1098
+ The information is lost.
1099
+
1100
+ 0:34:08.520 --> 0:34:11.917
1101
+ We don't know what gender.
1102
+
1103
+ 0:34:12.492 --> 0:34:19.626
1104
+ When we go out into branch again, there are
1105
+ different forms.
1106
+
1107
+ 0:34:19.509 --> 0:34:29.177
1108
+ Depending on the speaker gender, we can: So
1109
+ this is one problem.
1110
+
1111
+ 0:34:31.871 --> 0:34:44.122
1112
+ This is especially the case because English
1113
+ compared to many other languages is relatively
1114
+
1115
+ 0:34:44.122 --> 0:34:45.199
1116
+ simple.
1117
+
1118
+ 0:34:45.205 --> 0:34:53.373
1119
+ Gendered where it forms like this, it also
1120
+ doesn't have many cases, so going through English
1121
+
1122
+ 0:34:53.373 --> 0:34:56.183
1123
+ many information would be lost.
1124
+
1125
+ 0:34:57.877 --> 0:35:12.796
1126
+ And another thing is if you have similar languages
1127
+ that you are translating out of my systems
1128
+
1129
+ 0:35:12.796 --> 0:35:15.494
1130
+ that translates.
1131
+
1132
+ 0:35:16.496 --> 0:35:24.426
1133
+ This is the output of going from Dutch to
1134
+ German again.
1135
+
1136
+ 0:35:24.284 --> 0:35:30.235
1137
+ If you read the German, how many of you?
1138
+
1139
+ 0:35:32.552 --> 0:35:51.679
1140
+ Good and the problem here is that we are going
1141
+ over English and then the English to German.
1142
+
1143
+ 0:35:51.831 --> 0:36:06.332
1144
+ However, if we go direct in this case zero
1145
+ shot translation you see that word forgive.
1146
+
1147
+ 0:36:06.546 --> 0:36:09.836
1148
+ In this case, the outward translation is better.
1149
+
1150
+ 0:36:10.150 --> 0:36:20.335
1151
+ And we believe this has to do with using the
1152
+ language similarity between the two languages.
1153
+
1154
+ 0:36:20.225 --> 0:36:26.759
1155
+ There is also quantitative results we found
1156
+ when born in.
1157
+
1158
+ 0:36:27.988 --> 0:36:33.780
1159
+ The models are always doing better when translating
1160
+ similar languages compared to the.
1161
+
1162
+ 0:36:35.535 --> 0:36:42.130
1163
+ Summary: Yeah, so in this first half what we
1164
+ talked about basically first, we started with
1165
+
1166
+ 0:36:42.130 --> 0:36:49.838
1167
+ how motilinguality or motilingual machine translation
1168
+ could enable knowledge transfer between languages
1169
+
1170
+ 0:36:49.838 --> 0:36:53.987
1171
+ and help with conditions where we don't have
1172
+ much data.
1173
+
1174
+ 0:36:55.235 --> 0:37:02.826
1175
+ Now it looks at three types of multilingual
1176
+ translation, so one is many to one, one to
1177
+
1178
+ 0:37:02.826 --> 0:37:03.350
1179
+ many.
1180
+
1181
+ 0:37:05.285 --> 0:37:13.397
1182
+ We got there first about a shared vocabulary
1183
+ based on different languages and how these
1184
+
1185
+ 0:37:13.397 --> 0:37:22.154
1186
+ cross lingual word embeddings capture semantic
1187
+ meanings rather than just on a text proof form.
1188
+
1189
+ 0:37:25.505 --> 0:37:37.637
1190
+ Then we looked at how to signal the target
1191
+ language, how to ask for the model to generate,
1192
+
1193
+ 0:37:37.637 --> 0:37:43.636
1194
+ and then we looked at zero shot translation.
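As a quick, concrete illustration of that target-language signal: many multilingual systems simply prepend a tag for the desired output language to the source sentence. This is a minimal sketch; the `<2xx>` tag format is just an assumed example convention.

```python
# Minimal sketch of signaling the target language with a prepended tag.
# The "<2xx>" format is an assumed convention for illustration only.
def add_target_tag(source_sentence: str, target_lang: str) -> str:
    return f"<2{target_lang}> {source_sentence}"

print(add_target_tag("How are you?", "de"))  # -> "<2de> How are you?"
```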
1195
+
1196
+ 0:37:45.325 --> 0:37:57.395
1197
+ MultilingualityNow before we go into the second
1198
+ half, are there questions about the first part? Okay,
1199
+
1200
+ 0:37:57.395 --> 0:37:58.166
1201
+ good.
1202
+
1203
+ 0:38:00.140 --> 0:38:10.932
1204
+ In the second half of this lecture we'll be
1205
+ looking into challenges like what is still
1206
+
1207
+ 0:38:10.932 --> 0:38:12.916
1208
+ unsolved about.
1209
+
1210
+ 0:38:13.113 --> 0:38:18.620
1211
+ There are some aspects to look at it.
1212
+
1213
+ 0:38:18.475 --> 0:38:26.593
1214
+ The first is modeling, the second is more
1215
+ engineering.
1216
+
1217
+ 0:38:28.248 --> 0:38:33.002
1218
+ Okay, so we talked about this question several
1219
+ times.
1220
+
1221
+ 0:38:32.914 --> 0:38:35.610
1222
+ How does multilinguality help?
1223
+
1224
+ 0:38:35.520 --> 0:38:37.411
1225
+ Where does it help?
1226
+
1227
+ 0:38:38.298 --> 0:38:45.416
1228
+ Here want to show results of an experiment
1229
+ based on over a hundred languages.
1230
+
1231
+ 0:38:46.266 --> 0:38:58.603
1232
+ Here you can see the data amount so they use
1233
+ parallel data to English and it's very.
1234
+
1235
+ 0:38:58.999 --> 0:39:00.514
1236
+ This is already log scale.
1237
+
1238
+ 0:39:00.961 --> 0:39:12.982
1239
+ So for higher resource languages like English
1240
+ to French, German to Spanish you get over billion
1241
+
1242
+ 0:39:12.982 --> 0:39:14.359
1243
+ sentences.
1244
+
1245
+ 0:39:14.254 --> 0:39:21.003
1246
+ In parallel, and when we go more to the right
1247
+ to the more low resource spectrum on the other
1248
+
1249
+ 0:39:21.003 --> 0:39:26.519
1250
+ hand, there are languages that maybe many of
1251
+ us have never heard of, like.
1252
+
1253
+ 0:39:26.466 --> 0:39:29.589
1254
+ Do You Want to Move Back?
1255
+
1256
+ 0:39:30.570 --> 0:39:33.270
1257
+ Hawaiian Indians have heard of it.
1258
+
1259
+ 0:39:34.414 --> 0:39:39.497
1260
+ So on that spectrum we only have like thirty
1261
+ thousand sentences.
1262
+
1263
+ 0:39:40.400 --> 0:39:48.389
1264
+ So what this means is when we train, we have
1265
+ to up sample these guys.
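To make the up-sampling concrete, here is a small sketch of temperature-based sampling, one common way to do it; the data sizes and the temperature value are made-up numbers, not the ones from the slide.

```python
# Temperature-based sampling: raise each language's data share to the power 1/T.
# With T > 1 the distribution is flattened, so low-resource languages are sampled
# far more often than their raw share of the data. All numbers are illustrative.
sizes = {"fr": 1_000_000_000, "hi": 5_000_000, "haw": 30_000}
T = 5.0
weights = {lang: n ** (1.0 / T) for lang, n in sizes.items()}
total = sum(weights.values())
probs = {lang: w / total for lang, w in weights.items()}
print(probs)
```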
1266
+
1267
+ 0:39:48.275 --> 0:39:51.589
1268
+ The model didn't even know.
1269
+
1270
+ 0:39:52.732 --> 0:40:05.777
1271
+ Yeah, so on this graph on how we read it is
1272
+ this horizontal line and zero is basically
1273
+
1274
+ 0:40:05.777 --> 0:40:07.577
1275
+ indicating.
1276
+
1277
+ 0:40:07.747 --> 0:40:14.761
1278
+ Because we want to see where multilinguality
1279
+ helps, we compare to what happens when there
1280
+
1281
+ 0:40:14.761 --> 0:40:15.371
1282
+ is not.
1283
+
1284
+ 0:40:16.356 --> 0:40:29.108
1285
+ So upper like higher than the zero line it
1286
+ means we're gaining.
1287
+
1288
+ 0:40:29.309 --> 0:40:34.154
1289
+ The same like for these languages.
1290
+
1291
+ 0:40:34.015 --> 0:40:40.802
1292
+ This side means we are a high resource for
1293
+ the.
1294
+
1295
+ 0:40:40.981 --> 0:40:46.675
1296
+ Yeah sorry, think I've somehow removed the
1297
+ the x-axis labels.
1298
+
1299
+ 0:40:48.008 --> 0:40:58.502
1300
+ Yeah alright, what happens now if we look
1301
+ at many into English?
1302
+
1303
+ 0:40:58.698 --> 0:41:08.741
1304
+ On the low resource spectrum, by going multilingual
1305
+ we gain a lot over the bilingual system.
1306
+
1307
+ 0:41:10.010 --> 0:41:16.658
1308
+ Overall, if you consider the average for all
1309
+ of the languages, it's still a gain.
1310
+
1311
+ 0:41:17.817 --> 0:41:27.301
1312
+ Now we're looking at the green line so you
1313
+ can ignore the blue line.
1314
+
1315
+ 0:41:27.164 --> 0:41:32.253
1316
+ Basically we have to do upsampling.
1317
+
1318
+ 0:41:33.753 --> 0:41:41.188
1319
+ Yeah, so if you just even consider the average,
1320
+ it's still a gain over the bilingual systems.
1321
+
1322
+ 0:41:42.983 --> 0:41:57.821
1323
+ However, if we go to the English to many systems
1324
+ looking at the gains, we only get minor improvements.
1325
+
1326
+ 0:41:59.039 --> 0:42:12.160
1327
+ So why is it the case that going multilingual
1328
+ isn't really helping universally?
1329
+
1330
+ 0:42:16.016 --> 0:42:18.546
1331
+ Do you have some intuitions on yeah?
1332
+
1333
+ 0:42:18.698 --> 0:42:38.257
1334
+ It's easier to understand something that generates
1335
+ if we consider what the model has to generate.
1336
+
1337
+ 0:42:38.718 --> 0:42:40.091
1338
+ I See It Like.
1339
+
1340
+ 0:42:40.460 --> 0:42:49.769
1341
+ Generating is a bit like writing or speaking,
1342
+ while inputing on the source side is more like
1343
+
1344
+ 0:42:49.769 --> 0:42:50.670
1345
+ reading.
1346
+
1347
+ 0:42:50.650 --> 0:42:57.971
1348
+ So one is more passive and the other is more
1349
+ active and don't know if you have similar experience.
1350
+
1351
+ 0:42:57.897 --> 0:43:05.116
1352
+ I think speaking and writing is always a little
1353
+ bit more difficult than just passively listening
1354
+
1355
+ 0:43:05.116 --> 0:43:06.009
1356
+ or reading.
1357
+
1358
+ 0:43:05.934 --> 0:43:09.805
1359
+ But this is a very hand-wavy kind of understanding.
1360
+
1361
+ 0:43:10.390 --> 0:43:11.854
1362
+ And fed.
1363
+
1364
+ 0:43:12.032 --> 0:43:20.718
1365
+ In terms of the model, if we consider what
1366
+ is the difference for the target side for many
1367
+
1368
+ 0:43:20.718 --> 0:43:26.703
1369
+ to English: One difference is that there's
1370
+ a data difference.
1371
+
1372
+ 0:43:27.167 --> 0:43:33.438
1373
+ So if you just consider a many-to-English system
1374
+ with German to English and Spanish to English,.
1375
+
1376
+ 0:43:34.975 --> 0:43:44.321
1377
+ One thing we have to keep in mind is that
1378
+ the parallel data is not all the same, so on
1379
+
1380
+ 0:43:44.321 --> 0:43:49.156
1381
+ the target side there are different English.
1382
+
1383
+ 0:43:49.769 --> 0:43:54.481
1384
+ So the situation rather looks like this.
1385
+
1386
+ 0:43:54.366 --> 0:43:59.196
1387
+ What this means is that we are going to.
1388
+
1389
+ 0:44:00.820 --> 0:44:04.635
1390
+ We also add more data on the target side for
1391
+ English.
1392
+
1393
+ 0:44:06.967 --> 0:44:18.581
1394
+ Now since the target side data is not identical,
1395
+ how do we do a controlled experiment to remove
1396
+
1397
+ 0:44:18.581 --> 0:44:21.121
1398
+ the multilinguality?
1399
+
1400
+ 0:44:24.644 --> 0:44:42.794
1401
+ So what people tried as a control experiment
1402
+ is to keep all the English same as the above
1403
+
1404
+ 0:44:42.794 --> 0:44:44.205
1405
+ setup.
1406
+
1407
+ 0:44:44.684 --> 0:44:49.700
1408
+ So they take the English on English data of
1409
+ the same branch to German.
1410
+
1411
+ 0:44:50.090 --> 0:44:55.533
1412
+ And then the general synthetic data for Germans.
1413
+
1414
+ 0:44:55.422 --> 0:45:05.843
1415
+ So now we have a bilingual system again, but
1416
+ on the target side we still have the previously
1417
+
1418
+ 0:45:05.843 --> 0:45:08.420
1419
+ enriched English data.
1420
+
1421
+ 0:45:10.290 --> 0:45:25.092
1422
+ Now back to this picture that we've seen before,
1423
+ this mysterious orange line here is basically
1424
+
1425
+ 0:45:25.092 --> 0:45:26.962
1426
+ the result.
1427
+
1428
+ 0:45:27.907 --> 0:45:36.594
1429
+ And somewhat strikingly, and perhaps sadly for
1430
+ believers of multilinguality.
1431
+
1432
+ 0:45:36.476 --> 0:45:39.182
1433
+ This is also gaining.
1434
+
1435
+ 0:45:41.001 --> 0:45:52.775
1436
+ So what this means is for the many English
1437
+ is gaining not really because of multilinguality
1438
+
1439
+ 0:45:52.775 --> 0:45:55.463
1440
+ but just because of.
1441
+
1442
+ 0:45:55.976 --> 0:46:10.650
1443
+ And this means that there is still quite a
1444
+ lot to do if we really want to gain from just
1445
+
1446
+ 0:46:10.650 --> 0:46:13.618
1447
+ shared knowledge.
1448
+
1449
+ 0:46:14.514 --> 0:46:27.599
1450
+ But this also gives hope because there are
1451
+ still many things to research in this area
1452
+
1453
+ 0:46:27.599 --> 0:46:28.360
1454
+ now.
1455
+
1456
+ 0:46:28.708 --> 0:46:40.984
1457
+ So we've seen adding more languages helps
1458
+ with a somewhat data-related side effect, but can it also hurt?
1459
+
1460
+ 0:46:40.848 --> 0:46:45.626
1461
+ So if we just add more languages.
1462
+
1463
+ 0:46:47.007 --> 0:46:48.408
1464
+ We've seen this.
1465
+
1466
+ 0:46:48.325 --> 0:46:52.696
1467
+ This is the picture for the many-to-English
1468
+ system.
1469
+
1470
+ 0:46:53.793 --> 0:47:09.328
1471
+ Comparing to this bilingual baseline, we see
1472
+ that for these high resource languages we are
1473
+
1474
+ 0:47:09.328 --> 0:47:12.743
1475
+ not doing as great.
1476
+
1477
+ 0:47:15.956 --> 0:47:18.664
1478
+ So why are we losing here?
1479
+
1480
+ 0:47:18.564 --> 0:47:25.287
1481
+ It's been shown that this performance loss
1482
+ is somewhat related.
1483
+
1484
+ 0:47:26.026 --> 0:47:37.373
1485
+ In the sense that the model has to learn so
1486
+ much that at some point it has to sacrifice
1487
+
1488
+ 0:47:37.373 --> 0:47:39.308
1489
+ capacity from.
1490
+
1491
+ 0:47:41.001 --> 0:47:57.081
1492
+ So what to do to basically grow a bigger brain
1493
+ to tackle this is to add some dedicated capacity
1494
+
1495
+ 0:47:57.081 --> 0:47:59.426
1496
+ per language.
1497
+
1498
+ 0:48:00.100 --> 0:48:15.600
1499
+ Here it's like a simplified graph of a transformer
1500
+ architecture, so this is the encoder within
1501
+
1502
+ 0:48:15.600 --> 0:48:16.579
1503
+ time.
1504
+
1505
+ 0:48:17.357 --> 0:48:27.108
1506
+ But additionally here these little colorful
1507
+ blocks are now the language-specific kind
1508
+
1509
+ 0:48:27.108 --> 0:48:28.516
1510
+ of capacity.
1511
+
1512
+ 0:48:29.169 --> 0:48:42.504
1513
+ They are language specific in the sense that
1514
+ if you get the Chinese to English, the pattern.
1515
+
1516
+ 0:48:43.103 --> 0:48:54.900
1517
+ We are also going to language specific parts
1518
+ that in this case consists of a down projection.
1519
+
1520
+ 0:48:56.416 --> 0:49:07.177
1521
+ So this is also called adaptors, something
1522
+ that is plugged into an existing model and
1523
+
1524
+ 0:49:07.177 --> 0:49:11.556
1525
+ it adapts towards a specific task.
1526
+
1527
+ 0:49:12.232 --> 0:49:22.593
1528
+ And this is conditionally activated in the
1529
+ sense that if you get a different input sentence.
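To make this concrete, here is a minimal sketch of such a bottleneck adapter in PyTorch; the hidden sizes, language codes, and where exactly it is plugged in are assumptions for illustration, not the exact recipe from the paper.

```python
# Sketch of a language-specific bottleneck adapter: a down projection, a non-linearity,
# an up projection, and a residual connection, selected by the input's language.
import torch
import torch.nn as nn

class Adapter(nn.Module):
    def __init__(self, d_model=512, bottleneck=64):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.down = nn.Linear(d_model, bottleneck)  # down projection
        self.up = nn.Linear(bottleneck, d_model)    # up projection back to model size

    def forward(self, x):
        # residual connection keeps the adapter close to the identity at the start
        return x + self.up(torch.relu(self.down(self.norm(x))))

# one adapter per language, conditionally activated based on the input sentence's language
adapters = nn.ModuleDict({lang: Adapter() for lang in ["zh", "en", "de"]})

def apply_adapter(hidden_states, lang):
    return adapters[lang](hidden_states)
```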
1530
+
1531
+ 0:49:27.307 --> 0:49:34.173
1532
+ So this was first proposed by some folks
1533
+ at Google.
1534
+
1535
+ 0:49:34.058 --> 0:49:36.696
1536
+ Does this scale well?
1537
+
1538
+ 0:49:39.619 --> 0:49:56.621
1539
+ Yes exactly, so this is a translation-pair-specific
1540
+ kind of adapter, and this is not going to scale
1541
+
1542
+ 0:49:56.621 --> 0:49:57.672
1543
+ well.
1544
+
1545
+ 0:49:58.959 --> 0:50:13.676
1546
+ So this also brought people to try some more
1547
+ simple architecture.
1548
+
1549
+ 0:50:16.196 --> 0:50:22.788
1550
+ Yeah, this is also an alternative, in this
1551
+ case called monolingual adapters.
1552
+
1553
+ 0:50:24.184 --> 0:50:32.097
1554
+ Any of these adapters so again have this low
1555
+ resource.
1556
+
1557
+ 0:50:31.953 --> 0:50:42.027
1558
+ The zero line is bilingual baseline, but the
1559
+ lines are interpolated.
1560
+
1561
+ 0:50:43.783 --> 0:50:48.767
1562
+ The red one is the original
1563
+ multilingual model.
1564
+
1565
+ 0:50:49.929 --> 0:50:57.582
1566
+ And if we put the adapters in like a basic
1567
+ version of the adapter, that is the blue line.
1568
+
1569
+ 0:50:58.078 --> 0:51:08.582
1570
+ You see that it is gaining performance for the
1571
+ high resource languages.
1572
+
1573
+ 0:51:08.432 --> 0:51:16.089
1574
+ If they even scale a lot, this further increases.
1575
+
1576
+ 0:51:16.556 --> 0:51:22.770
1577
+ So this is also a side kind of this.
1578
+
1579
+ 0:51:23.103 --> 0:51:27.807
1580
+ From the side shows that it's really a capacity
1581
+ bottleneck.
1582
+
1583
+ 0:51:28.488 --> 0:51:30.590
1584
+ Like If You Eleanor.
1585
+
1586
+ 0:51:31.151 --> 0:51:34.313
1587
+ Resource they regain their performance.
1588
+
1589
+ 0:51:38.959 --> 0:51:50.514
1590
+ For smaller languages, but it's just.
1591
+
1592
+ 0:51:50.770 --> 0:52:03.258
1593
+ Think in the original multilingual model, the smaller
1594
+ languages they weren't constrained by capacity.
1595
+
1596
+ 0:52:05.445 --> 0:52:13.412
1597
+ So guess for the smaller languages, the difficulty
1598
+ is more the data rather than the model capacity.
1599
+
1600
+ 0:52:13.573 --> 0:52:26.597
1601
+ So in general you always want to have more
1602
+ or less data matching your model capacity.
1603
+
1604
+ 0:52:27.647 --> 0:52:33.255
1605
+ Yeah, here think the bigger challenge for
1606
+ lower roots was the data.
1607
+
1608
+ 0:52:34.874 --> 0:52:39.397
1609
+ You also mention it a little bit.
1610
+
1611
+ 0:52:39.264 --> 0:52:46.982
1612
+ Are these adapters per language or how many
1613
+ adapters do?
1614
+
1615
+ 0:52:47.267 --> 0:52:55.378
1616
+ And do we have to design them differently
1617
+ so that we learn to share more like a language
1618
+
1619
+ 0:52:55.378 --> 0:52:56.107
1620
+ family?
1621
+
1622
+ 0:52:56.576 --> 0:53:15.680
1623
+ So one downside of the adaptor we talked about
1624
+ is that basically there is no way to go over.
1625
+
1626
+ 0:53:16.516 --> 0:53:29.862
1627
+ Routing or LearningSo then a recent kind of
1628
+ additional approach for these language specific
1629
+
1630
+ 0:53:29.862 --> 0:53:36.100
1631
+ capacity is so-called routing, or learning where to route.
1632
+
1633
+ 0:53:36.256 --> 0:53:42.438
1634
+ Basically, we have these language specific
1635
+ components.
1636
+
1637
+ 0:53:42.326 --> 0:53:45.875
1638
+ We also have a shared adapter.
1639
+
1640
+ 0:53:45.760 --> 0:53:52.148
1641
+ The model should learn: So in this case maybe
1642
+ we could imagine for the lower resource case
1643
+
1644
+ 0:53:52.148 --> 0:53:54.044
1645
+ that we just talked about.
1646
+
1647
+ 0:53:54.094 --> 0:54:04.838
1648
+ It makes sense to go to the shared part because there's not much
1649
+ language-specific to learn anyway, and then it's
1650
+
1651
+ 0:54:04.838 --> 0:54:10.270
1652
+ better to make use of similarity with other.
1653
+
1654
+ 0:54:11.111 --> 0:54:30.493
1655
+ So this architecture is more data driven instead
1656
+ of what we specify prior to training.
1657
+
1658
+ 0:54:31.871 --> 0:54:33.998
1659
+ So how do we learn this?
1660
+
1661
+ 0:54:35.095 --> 0:54:49.286
1662
+ Basically, in terms of the mask, we want to
1663
+ basically have a binary value that routes either
1664
+
1665
+ 0:54:49.286 --> 0:54:50.548
1666
+ to the.
1667
+
1668
+ 0:54:51.311 --> 0:54:56.501
1669
+ But how do we get a value of zero or one? What
1670
+ can we do?
1671
+
1672
+ 0:54:56.402 --> 0:54:58.503
1673
+ We can use a sigmoid.
1674
+
1675
+ 0:54:58.999 --> 0:55:13.376
1676
+ However, one thing is we don't want to get
1677
+ stuck in the middle, so we don't want values in between.
1678
+
1679
+ 0:55:14.434 --> 0:55:28.830
1680
+ It is also bad because it is not going to
1681
+ be the same at training and test time, by the way.
1682
+
1683
+ 0:55:31.151 --> 0:55:50.483
1684
+ So here the question is how do we force basically
1685
+ the model to always go there prior to activation?
1686
+
1687
+ 0:55:54.894 --> 0:56:02.463
1688
+ Found it interesting because it sounds like
1689
+ a trick for me.
1690
+
1691
+ 0:56:02.337 --> 0:56:05.497
1692
+ This approach has been.
1693
+
1694
+ 0:56:06.026 --> 0:56:15.844
1695
+ So what they do is prior to going through
1696
+ this activation, they add some Gaussian noise.
1697
+
1698
+ 0:56:17.257 --> 0:56:31.610
1699
+ If there is always noise prior to activation
1700
+ then the model will be encouraged to preserve
1701
+
1702
+ 0:56:31.610 --> 0:56:34.291
1703
+ the information.
1704
+
1705
+ 0:56:36.356 --> 0:56:44.067
1706
+ Was a very interesting thing that found out
1707
+ while preparing this, so wanted to share this
1708
+
1709
+ 0:56:44.067 --> 0:56:44.410
1710
+ as.
1711
+
1712
+ 0:56:44.544 --> 0:56:48.937
1713
+ So basically you can create a binary gate
1714
+ with this technique.
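Here is a small sketch of that trick; the exact noise distribution and the module names are assumptions, but the idea is the same: noise before the sigmoid during training, a hard zero/one decision at test time.

```python
# Learned routing between a shared and a language-specific branch with a "binary" gate.
# Adding noise before the sigmoid pushes the gate logit away from zero, so the gate
# saturates towards 0 or 1 and the hard threshold used at test time matches training.
import torch

def routed_output(x, shared_module, lang_module, gate_logit, training=True):
    if training:
        g = torch.sigmoid(gate_logit + torch.randn_like(gate_logit))  # noisy soft gate
    else:
        g = (gate_logit > 0).float()                                  # hard binary gate
    return g * lang_module(x) + (1 - g) * shared_module(x)
```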
1715
+
1716
+ 0:56:50.390 --> 0:57:01.747
1717
+ And if you add these language specific routing:
1718
+ Here they also have some that can control how
1719
+
1720
+ 0:57:01.747 --> 0:57:07.788
1721
+ much is shared and how much is language specific.
1722
+
1723
+ 0:57:07.727 --> 0:57:16.374
1724
+ Here the results of the routing are shown with
1725
+ the red and orange lines, so.
1726
+
1727
+ 0:57:16.576 --> 0:57:22.752
1728
+ So you can see that for one-to-many and many
1729
+ to one there are in both cases quite some gains.
1730
+
1731
+ 0:57:23.063 --> 0:57:30.717
1732
+ So that is the overall picture and just find
1733
+ the idea of the routing quite interesting.
1734
+
1735
+ 0:57:30.991 --> 0:57:32.363
1736
+ And UM.
1737
+
1738
+ 0:57:32.212 --> 0:57:38.348
1739
+ It's also getting a bit more increasingly
1740
+ used as there are the so called mixture of
1741
+
1742
+ 0:57:38.348 --> 0:57:39.431
1743
+ expert models.
1744
+
1745
+ 0:57:39.499 --> 0:57:51.801
1746
+ The model learns where to route the input
1747
+ so they are all conditionally activated when
1748
+
1749
+ 0:57:51.801 --> 0:57:53.074
1750
+ you are.
1751
+
1752
+ 0:57:53.213 --> 0:57:59.089
1753
+ But this is not really something specific
1754
+ to multilinguality, so won't talk too much
1755
+
1756
+ 0:57:59.089 --> 0:57:59.567
1757
+ about.
1758
+
1759
+ 0:58:00.620 --> 0:58:02.115
1760
+ No.
1761
+
1762
+ 0:58:01.761 --> 0:58:09.640
1763
+ From this part, the takeaway is first that we talked about
1764
+ the existence of the capacity bottleneck.
1765
+
1766
+ 0:58:10.570 --> 0:58:19.808
1767
+ Where we can partly compensate by adapters
1768
+ or adding language specific capacity, there's
1769
+
1770
+ 0:58:19.808 --> 0:58:23.026
1771
+ the idea of negative transfer.
1772
+
1773
+ 0:58:24.844 --> 0:58:35.915
1774
+ When we add any additional capacity, how can
1775
+ we improve the knowledge sharing?
1776
+
1777
+ 0:58:38.318 --> 0:58:46.662
1778
+ Also, for this one too many directions that
1779
+ seem to be hopeless for multilinguality, can
1780
+
1781
+ 0:58:46.662 --> 0:58:47.881
1782
+ we actually?
1783
+
1784
+ 0:58:49.129 --> 0:58:52.171
1785
+ Yeah, these are all open things still in the
1786
+ area.
1787
+
1788
+ 0:58:53.673 --> 0:59:04.010
1789
+ Data ScarcityNow next part, I'm going to talk
1790
+ about some data challenges for multilingual models.
1791
+
1792
+ 0:59:03.895 --> 0:59:07.667
1793
+ We talked about modeling.
1794
+
1795
+ 0:59:08.488 --> 0:59:14.967
1796
+ But there are these lower resource languages
1797
+ that don't have well curated parallel data.
1798
+
1799
+ 0:59:16.216 --> 0:59:27.539
1800
+ As an alternative, people resort to crawled data
1801
+ from the Internet, there's a lot of noise.
1802
+
1803
+ 0:59:27.927 --> 0:59:36.244
1804
+ And in this paper last year they did some
1805
+ manual analyses of several popular crawled data
1806
+
1807
+ 0:59:36.244 --> 0:59:36.811
1808
+ sets.
1809
+
1810
+ 0:59:37.437 --> 0:59:55.262
1811
+ And you'll see that there are a lot of wrong
1812
+ translations, non-linguistic contents, pornographic
1813
+
1814
+ 0:59:55.262 --> 0:59:57.100
1815
+ contents.
1816
+
1817
+ 0:59:57.777 --> 1:00:04.661
1818
+ So as you can imagine, they say you are what you eat.
1819
+
1820
+ 1:00:04.512 --> 1:00:20.028
1821
+ If you use this kind of data to train a model,
1822
+ you can: So there are also many techniques
1823
+
1824
+ 1:00:20.028 --> 1:00:28.820
1825
+ for filtering and filtering these noisy data
1826
+ sets.
1827
+
1828
+ 1:00:29.809 --> 1:00:36.982
1829
+ So to filter these out we can use an additional
1830
+ classifier that is basically trained to classify
1831
+
1832
+ 1:00:36.982 --> 1:00:43.496
1833
+ which language the sentences are in, and then kick out
1834
+ all the sentences with the wrong language.
1835
+
1836
+ 1:00:45.105 --> 1:00:49.331
1837
+ Another thing is the length ratio.
1838
+
1839
+ 1:00:49.211 --> 1:01:00.202
1840
+ Basically, the assumption there is that if
1841
+ two sentences are translations of each other,.
1842
+
1843
+ 1:01:01.901 --> 1:01:08.718
1844
+ So often people use maybe a ratio of three
1845
+ and then it eliminates the rest.
1846
+
1847
+ 1:01:09.909 --> 1:01:20.187
1848
+ Also, the other idea maybe similar to the
1849
+ language classifier is basically to have an
1850
+
1851
+ 1:01:20.187 --> 1:01:24.540
1852
+ allowed character set per language.
1853
+
1854
+ 1:01:24.419 --> 1:01:28.293
1855
+ So if you're trying to filter.
1856
+
1857
+ 1:01:28.568 --> 1:01:34.622
1858
+ Don't know, Cyrillic scripts or Arabic scripts,
1859
+ then it's maybe a good idea to remove them.
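Putting the heuristics above together, a toy filter could look like the following; `detect_language` stands in for whatever language-ID classifier is used, and the ratio and script rules are example values only.

```python
# Toy parallel-data filter: language check, length ratio, and allowed character set.
import re

LATIN_ONLY = re.compile(r"^[\x00-\x7F\u00C0-\u024F\s]+$")  # rough Latin-script check

def keep_pair(src, tgt, src_lang, tgt_lang, detect_language, max_ratio=3.0):
    if detect_language(src) != src_lang or detect_language(tgt) != tgt_lang:
        return False                                   # wrong language on either side
    ratio = max(len(src), len(tgt)) / max(1, min(len(src), len(tgt)))
    if ratio > max_ratio:
        return False                                   # unlikely to be mutual translations
    if src_lang in {"en", "de", "fr"} and not LATIN_ONLY.match(src):
        return False                                   # unexpected script for this language
    return True
```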
1860
+
1861
+ 1:01:35.775 --> 1:01:43.123
1862
+ This is not all there are many other ideas
1863
+ using some pre-trained neural networks to compare
1864
+
1865
+ 1:01:43.123 --> 1:01:50.629
1866
+ the representations, but just to give you an
1867
+ idea of what our basic techniques were filtering.
1868
+
1869
+ 1:01:50.991 --> 1:01:53.458
1870
+ Is quite important.
1871
+
1872
+ 1:01:53.335 --> 1:02:02.467
1873
+ We have seen in our experience that if you
1874
+ do these thoroughly there is.
1875
+
1876
+ 1:02:03.883 --> 1:02:17.814
1877
+ So after all, even if we do web crawling,
1878
+ there is still a bit of data scarcity problem.
1879
+
1880
+ 1:02:18.118 --> 1:02:30.760
1881
+ So there are many bad things that can happen
1882
+ when there's too little training data.
1883
+
1884
+ 1:02:30.609 --> 1:02:35.430
1885
+ The first is low performances.
1886
+
1887
+ 1:02:35.735 --> 1:02:49.859
1888
+ So they did it on many English system index
1889
+ languages, all together with here means: So
1890
+
1891
+ 1:02:49.859 --> 1:03:04.144
1892
+ we really need to get that area of a lot of
1893
+ data in order to get that ideal performance.
1894
+
1895
+ 1:03:04.884 --> 1:03:20.639
1896
+ There are also many horrible things that can
1897
+ happen in general when you train a model across
1898
+
1899
+ 1:03:20.639 --> 1:03:24.874
1900
+ different training runs.
1901
+
1902
+ 1:03:26.946 --> 1:03:36.733
1903
+ So one solution to tackle this problem, the
1904
+ data scarcity problem, is by fine tuning some
1905
+
1906
+ 1:03:36.733 --> 1:03:38.146
1907
+ pre-trained.
1908
+
1909
+ 1:03:38.979 --> 1:03:46.245
1910
+ And basically the idea is you've got the pre-trained
1911
+ model that can already do translation.
1912
+
1913
+ 1:03:46.846 --> 1:03:54.214
1914
+ Then you fine-tune it on your own training data
1915
+ and you end up with a more specialized model.
1916
+
1917
+ 1:03:55.155 --> 1:03:59.369
1918
+ So why does pretraining help?
1919
+
1920
+ 1:03:59.228 --> 1:04:11.436
1921
+ One argument is that if you do pretraining
1922
+ then the model has seen more data and
1923
+
1924
+ 1:04:11.436 --> 1:04:12.714
1925
+ learned.
1926
+
1927
+ 1:04:13.313 --> 1:04:19.135
1928
+ Say more generalizable representations that
1929
+ can help more downstream tasks.
1930
+
1931
+ 1:04:19.719 --> 1:04:28.063
1932
+ So in this case we are basically trying to
1933
+ make use of the more meaningful and generalizable
1934
+
1935
+ 1:04:28.063 --> 1:04:29.499
1936
+ representation.
1937
+
1938
+ 1:04:30.490 --> 1:04:45.103
1939
+ So for machine translation there are several
1940
+ open source models out there that can handle
1941
+
1942
+ 1:04:45.103 --> 1:04:46.889
1943
+ languages.
1944
+
1945
+ 1:04:48.188 --> 1:04:49.912
1946
+ Two hundred model.
1947
+
1948
+ 1:04:49.822 --> 1:04:53.404
1949
+ They also cover two hundred languages.
1950
+
1951
+ 1:04:53.312 --> 1:04:57.631
1952
+ That means that's quite a lot of translation.
1953
+
1954
+ 1:04:57.978 --> 1:05:06.218
1955
+ However, one thing to remember is that these
1956
+ models are more like a, how do you call it.
1957
+
1958
+ 1:05:06.146 --> 1:05:12.812
1959
+ Jack of all trades is a master of none, in the
1960
+ sense that they are very good as coverage,
1961
+
1962
+ 1:05:12.812 --> 1:05:20.498
1963
+ but if you look at specific translation directions
1964
+ they might be not as good as dedicated models.
1965
+
1966
+ 1:05:21.521 --> 1:05:34.170
1967
+ So here I'm going to have some results by
1968
+ comparing random initialization versus the
1969
+
1970
+ 1:05:34.170 --> 1:05:36.104
1971
+ first thing.
1972
+
1973
+ 1:05:36.396 --> 1:05:46.420
1974
+ The third line is the result of basically
1975
+ fine-tuning a pre-trained model that is one of the
1976
+
1977
+ 1:05:46.420 --> 1:05:47.342
1978
+ family.
1979
+
1980
+ 1:05:47.947 --> 1:05:51.822
1981
+ So in this case you could see the.
1982
+
1983
+ 1:05:51.831 --> 1:05:58.374
1984
+ If we just look at the second line, that is
1985
+ the pre trade model out of the box, you see
1986
+
1987
+ 1:05:58.374 --> 1:06:04.842
1988
+ that if we just use it out of the box, the
1989
+ performance everywhere isn't super great as
1990
+
1991
+ 1:06:04.842 --> 1:06:06.180
1992
+ dedicated models.
1993
+
1994
+ 1:06:07.867 --> 1:06:22.305
1995
+ But then, here the X means English:
1996
+ So the first takeaway here is that if we do
1997
+
1998
+ 1:06:22.305 --> 1:06:31.539
1999
+ pre-train fine-tuning, we gain again when we do it into
2000
+ English,.
2001
+
2002
+ 1:06:33.433 --> 1:06:40.438
2003
+ Here is that we are forgetting.
2004
+
2005
+ 1:06:40.219 --> 1:06:50.514
2006
+ When we do further training there is no data.
2007
+
2008
+ 1:06:50.770 --> 1:07:04.865
2009
+ So even if we initialize from the pre-trained model
2010
+ and continue training, if we don't see translation.
2011
+
2012
+ 1:07:05.345 --> 1:07:13.826
2013
+ So this is bad; machine learning people termed
2014
+ it catastrophic forgetting, in the sense that
2015
+
2016
+ 1:07:13.826 --> 1:07:20.115
2017
+ if you have a model that is trained to do some
2018
+ task and then you.
2019
+
2020
+ 1:07:20.860 --> 1:07:22.487
2021
+ This Is Also Pretty Bad.
2022
+
2023
+ 1:07:24.244 --> 1:07:32.341
2024
+ Is especially bad if you consider training
2025
+ data actually grows over time.
2026
+
2027
+ 1:07:32.231 --> 1:07:35.408
2028
+ It's not like you have one.
2029
+
2030
+ 1:07:36.336 --> 1:07:46.756
2031
+ So in practice we do not always train systems
2032
+ from scratch, so it's more like you have an
2033
+
2034
+ 1:07:46.756 --> 1:07:54.951
2035
+ existing system and later we want to expand
2036
+ the translation coverage.
2037
+
2038
+ 1:07:57.277 --> 1:08:08.932
2039
+ Here and the key question is how do we continue
2040
+ training from an existing system in doing so?
2041
+
2042
+ 1:08:09.909 --> 1:08:12.288
2043
+ Approaches.
2044
+
2045
+ 1:08:12.090 --> 1:08:27.948
2046
+ One very simple one is to include a portion
2047
+ of your previous training so that.
2048
+
2049
+ 1:08:28.148 --> 1:08:34.333
2050
+ So if you consider you have an English German
2051
+ system and now you want to expand it to English
2052
+
2053
+ 1:08:34.333 --> 1:08:34.919
2054
+ French,.
2055
+
2056
+ 1:08:36.036 --> 1:08:42.308
2057
+ Like so nice going English, French and English
2058
+ German, so when you train it you still include
2059
+
2060
+ 1:08:42.308 --> 1:08:45.578
2061
+ a small proportion of your previous German
2062
+ data.
2063
+
2064
+ 1:08:45.512 --> 1:08:51.118
2065
+ Hopefully your model is not forgetting that
2066
+ much about the previously learned German.
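A minimal sketch of this replay idea, assuming we extend an English-German model to English-French; the sampling rate is an arbitrary example.

```python
# Mix a small fraction of the old (English-German) data into every batch of new
# (English-French) data so the model is reminded of what it already learned.
import random

def mixed_batches(new_data, old_data, old_fraction=0.1, batch_size=32):
    while True:
        batch = []
        for _ in range(batch_size):
            if old_data and random.random() < old_fraction:
                batch.append(random.choice(old_data))   # replayed old pair
            else:
                batch.append(random.choice(new_data))   # new language pair
        yield batch
```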
2067
+
2068
+ 1:08:53.073 --> 1:08:58.876
2069
+ Idea here is what we saw earlier.
2070
+
2071
+ 1:08:58.705 --> 1:09:09.803
2072
+ We can also add adaptors and only train them
2073
+ while keeping the.
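In code, this amounts to freezing everything except the newly added adapter weights; a sketch, assuming a PyTorch model whose adapter parameters carry "adapter" in their names (a naming convention we assume here):

```python
# Train only the adapters, keep the generic multilingual model untouched.
def freeze_all_but_adapters(model):
    for name, param in model.named_parameters():
        param.requires_grad = "adapter" in name  # only adapter weights stay trainable
```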
2074
+
2075
+ 1:09:10.170 --> 1:09:26.860
2076
+ So this means we're going to end up with a
2077
+ generic model that was not anyhow changed.
2078
+
2079
+ 1:09:27.447 --> 1:09:37.972
2080
+ So in this way it's also more module and more
2081
+ suitable to the incremental learning kind of.
2082
+
2083
+ 1:09:38.758 --> 1:09:49.666
2084
+ Right in this part, the takeaways guess are
2085
+ first data filtering.
2086
+
2087
+ 1:09:49.501 --> 1:09:55.125
2088
+ As Internet data is very noisy.
2089
+
2090
+ 1:09:56.496 --> 1:10:05.061
2091
+ Second, it's about fine-tuning pre-trained models
2092
+ and how we can or cannot avoid catastrophic
2093
+
2094
+ 1:10:05.061 --> 1:10:06.179
2095
+ forgetting.
2096
+
2097
+ 1:10:07.247 --> 1:10:15.866
2098
+ And of course open questions would include
2099
+ how can we do incremental learning with these
2100
+
2101
+ 1:10:15.866 --> 1:10:19.836
2102
+ multilingual machine translation models?
2103
+
2104
+ 1:10:20.860 --> 1:10:30.247
2105
+ Engineering ChallengesSo with this in mind
2106
+ would like to briefly cover several engineering
2107
+
2108
+ 1:10:30.247 --> 1:10:39.531
2109
+ challenges when we talk about: Yeah, earlier
2110
+ we also briefly talked about how being multilingual
2111
+
2112
+ 1:10:39.531 --> 1:10:49.021
2113
+ means sometimes you have to scale up, you have
2114
+ to make your models bigger just to have that
2115
+
2116
+ 1:10:49.021 --> 1:10:51.394
2117
+ capacity to deal with.
2118
+
2119
+ 1:10:52.472 --> 1:10:59.262
2120
+ This means the model sizes are getting bigger
2121
+ and sometimes having one single is not enough
2122
+
2123
+ 1:10:59.262 --> 1:11:00.073
2124
+ to handle.
2125
+
2126
+ 1:11:00.400 --> 1:11:08.914
2127
+ Here wanted to introduce ideas of going parallel
2128
+ and scaling up.
2129
+
2130
+ 1:11:08.783 --> 1:11:12.848
2131
+ The first is so called model.
2132
+
2133
+ 1:11:14.434 --> 1:11:18.859
2134
+ Don't know if you also had this in other like
2135
+ maury cue related courses.
2136
+
2137
+ 1:11:20.220 --> 1:11:30.639
2138
+ Okay, so the idea of data parallel is basically
2139
+ we train in parallel.
2140
+
2141
+ 1:11:30.790 --> 1:11:35.852
2142
+ We put our model onto several GPUs.
2143
+
2144
+ 1:11:35.707 --> 1:11:47.133
2145
+ We send the same model there and then when
2146
+ we get the training data we split.
2147
+
2148
+ 1:11:48.108 --> 1:11:54.594
2149
+ So each on each of these we are doing the
2150
+ forward and backward pass in parallel.
2151
+
2152
+ 1:11:55.355 --> 1:12:07.779
2153
+ Then after we get the gradients, all these GPUs
2154
+ will be synchronized and the gradients will
2155
+
2156
+ 1:12:07.779 --> 1:12:09.783
2157
+ be aggregated.
2158
+
2159
+ 1:12:11.691 --> 1:12:27.127
2160
+ We are having a bigger batch size in effect,
2161
+ so this would be much faster than, for example,
2162
+
2163
+ 1:12:27.127 --> 1:12:31.277
2164
+ doing all these smaller.
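Conceptually, a data-parallel step looks like the sketch below. The `compute_gradients` / `apply_gradients` calls are placeholders; real frameworks (for example PyTorch's DistributedDataParallel) handle the gradient synchronization for you.

```python
# One data-parallel training step: split the batch over replicas, compute gradients
# in parallel, average ("all-reduce") them, then do a single synchronized update.
def data_parallel_step(replicas, optimizer, batch):
    shards = [batch[i::len(replicas)] for i in range(len(replicas))]
    grads = [rep.compute_gradients(shard) for rep, shard in zip(replicas, shards)]
    averaged = [sum(per_param) / len(grads) for per_param in zip(*grads)]
    optimizer.apply_gradients(averaged)
```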
2165
+
2166
+ 1:12:32.772 --> 1:12:45.252
2167
+ That is, if your model itself is too big to
2168
+ fit onto a single GPU, so you have to split
2169
+
2170
+ 1:12:45.252 --> 1:12:46.084
2171
+ this.
2172
+
2173
+ 1:12:46.486 --> 1:12:51.958
2174
+ And honestly, the model itself, unless you're
2175
+ going for those.
2176
+
2177
+ 1:12:51.891 --> 1:12:55.500
2178
+ Huge models the industry made these days.
2179
+
2180
+ 1:12:55.414 --> 1:13:03.198
2181
+ I've never run into a situation where the
2182
+ single model itself does not fit onto one GPU
2183
+
2184
+ 1:13:03.198 --> 1:13:03.717
2185
+ here.
2186
+
2187
+ 1:13:03.631 --> 1:13:08.476
2188
+ Realistically, it's more the what is memory
2189
+ consuming.
2190
+
2191
+ 1:13:08.528 --> 1:13:14.871
2192
+ It is more the backward pass and the optimizer
2193
+ states that need to be stored.
2194
+
2195
+ 1:13:15.555 --> 1:13:22.193
2196
+ So but still there are people training gigantic
2197
+ models where they have to go model parallel.
2198
+
2199
+ 1:13:22.602 --> 1:13:35.955
2200
+ This means you have a model consisting of
2201
+ all those orange parts, but it doesn't fit on one GPU, so you
2202
+
2203
+ 1:13:35.955 --> 1:13:40.714
2204
+ split it, for example the next several layers onto another device.
2205
+
2206
+ 1:13:41.581 --> 1:13:51.787
2207
+ So this means when you do the forward pass
2208
+ you have to wait and to finish before doing.
2209
+
2210
+ 1:13:52.532 --> 1:14:11.193
2211
+ And this kind of implementation is sometimes
2212
+ a bit architecture-specific.
2213
+
2214
+ 1:14:12.172 --> 1:14:17.177
2215
+ Right, so there's one more thing when scaling
2216
+ up.
2217
+
2218
+ 1:14:17.077 --> 1:14:19.184
2219
+ Want it to mention.
2220
+
2221
+ 1:14:20.080 --> 1:14:25.687
2222
+ We also talked about it briefly earlier.
2223
+
2224
+ 1:14:25.550 --> 1:14:34.032
2225
+ We said that when we go multilingual we need
2226
+ a vocabulary that.
2227
+
2228
+ 1:14:34.614 --> 1:14:40.867
2229
+ And can give you some numbers.
2230
+
2231
+ 1:14:40.665 --> 1:14:53.578
2232
+ Most of the pre-trained multilingual models here
2233
+ use a vocabulary.
2234
+
2235
+ 1:14:53.933 --> 1:14:58.454
2236
+ Normally each vector is.
2237
+
2238
+ 1:14:58.273 --> 1:15:10.754
2239
+ This means just the word embedding table alone
2240
+ is times parameters.
2241
+
2242
+ 1:15:11.011 --> 1:15:18.620
2243
+ This means just for the embedding table alone
2244
+ it's already taking million parameters of the.
2245
+
2246
+ 1:15:19.859 --> 1:15:28.187
2247
+ And this is often one of the largest parts
2248
+ of the machine.
2249
+
2250
+ 1:15:28.046 --> 1:15:31.299
2251
+ This also comes with.
2252
+
2253
+ 1:15:31.651 --> 1:15:43.891
2254
+ So one question is how can we efficiently
2255
+ represent a multilingual vocabulary?
2256
+
2257
+ 1:15:43.736 --> 1:15:49.008
2258
+ Are there better ways than just?
2259
+
2260
+ 1:15:50.750 --> 1:16:00.526
2261
+ There are many ideas out there people tried, maybe
2262
+ not all targeted at multilinguality, but think.
2263
+
2264
+ 1:16:00.840 --> 1:16:03.635
2265
+ So one is byte-level representation.
2266
+
2267
+ 1:16:03.743 --> 1:16:11.973
2268
+ So the idea there is if we train with data
2269
+ they're all stored on computers, so all their
2270
+
2271
+ 1:16:11.973 --> 1:16:15.579
2272
+ characters must be represented in bytes.
2273
+
2274
+ 1:16:15.486 --> 1:16:23.717
2275
+ So they want to then not using subwords, not
2276
+ using characters, but using bytes instead.
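A tiny example of what byte-level representation means in practice (the text is arbitrary):

```python
# Every string becomes a sequence of UTF-8 byte values (0-255), so one small vocabulary
# covers all languages, but sequences get longer, especially for non-Latin scripts.
text = "Übersetzung 翻译"
byte_ids = list(text.encode("utf-8"))
print(len(text), len(byte_ids))  # 14 characters vs. 19 byte tokens
```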
2277
+
2278
+ 1:16:25.905 --> 1:16:27.693
2279
+ Do You See Some Downsides?
2280
+
2281
+ 1:16:31.791 --> 1:16:38.245
2282
+ There are some languages that are easier to
2283
+ represent than others.
2284
+
2285
+ 1:16:38.148 --> 1:16:40.561
2286
+ That's definitely true.
2287
+
2288
+ 1:16:41.081 --> 1:16:44.981
2289
+ So if you have a sentence normally of five
2290
+ words,.
2291
+
2292
+ 1:16:46.246 --> 1:16:59.899
2293
+ You think about if we split it into characters,
2294
+ how many characters we have, and each character
2295
+
2296
+ 1:16:59.899 --> 1:17:04.166
2297
+ that would be how many bites.
2298
+
2299
+ 1:17:04.424 --> 1:17:15.749
2300
+ And then it's more to model, it's more for
2301
+ the model to learn, and it's also a bigger
2302
+
2303
+ 1:17:15.749 --> 1:17:19.831
2304
+ sequence to give to the model.
2305
+
2306
+ 1:17:20.260 --> 1:17:22.038
2307
+ Yeah.
2308
+
2309
+ 1:17:21.941 --> 1:17:31.232
2310
+ Visual representation is also quite interesting,
2311
+ so some people argued that we don't want to
2312
+
2313
+ 1:17:31.232 --> 1:17:35.428
2314
+ have a fixed discrete vocabulary anymore.
2315
+
2316
+ 1:17:35.328 --> 1:17:41.923
2317
+ Instead, we want to do it like OCR, like reading
2318
+ them as images.
2319
+
2320
+ 1:17:42.942 --> 1:17:55.403
2321
+ We'll look at one example for this next: Then
2322
+ another idea is how if you can distill the
2323
+
2324
+ 1:17:55.403 --> 1:18:03.943
2325
+ vocabulary as in learning some more compact
2326
+ representation,.
2327
+
2328
+ 1:18:04.284 --> 1:18:12.554
2329
+ But next wanted to show you an example of
2330
+ pixel inputs for multilingual machine translation.
2331
+
2332
+ 1:18:12.852 --> 1:18:29.757
2333
+ If you look at the picture, all the characters
2334
+ that are marked with red are actually not.
2335
+
2336
+ 1:18:32.772 --> 1:18:48.876
2337
+ They are actually from a different script
2338
+ for the model and let it do the subword tokenization.
2339
+
2340
+ 1:18:52.852 --> 1:19:04.373
2341
+ You would get maybe mostly characters out
2342
+ of it because I guess in the pre existing vocabulary
2343
+
2344
+ 1:19:04.373 --> 1:19:07.768
2345
+ there won't be Latin H and.
2346
+
2347
+ 1:19:07.707 --> 1:19:16.737
2348
+ So you'll get characters out of it, which
2349
+ means it's probably going to be more difficult
2350
+
2351
+ 1:19:16.737 --> 1:19:18.259
2352
+ for the model.
2353
+
2354
+ 1:19:20.140 --> 1:19:28.502
2355
+ Yeah, so the motivation for pixel inputs is
2356
+ that there is more sharing across languages.
2357
+
2358
+ 1:19:30.010 --> 1:19:37.773
2359
+ Here basically illustrates an embedding table
2360
+ for subwords and saying if you have sentences
2361
+
2362
+ 1:19:37.773 --> 1:19:45.705
2363
+ in the Latin script like French and English
2364
+ then it's going to take certain proportions
2365
+
2366
+ 1:19:45.705 --> 1:19:48.152
2367
+ of this big embedding table.
2368
+
2369
+ 1:19:48.328 --> 1:19:56.854
2370
+ While for Arabic and Chinese it's yet again
2371
+ another,.
2372
+
2373
+ 1:19:56.796 --> 1:20:09.037
2374
+ That is not joined with the previous one if
2375
+ we want to have shared representations for
2376
+
2377
+ 1:20:09.037 --> 1:20:11.992
2378
+ different languages.
2379
+
2380
+ 1:20:12.692 --> 1:20:18.531
2381
+ On the other hand, if we're going with pixels,
2382
+ there's definitely more sharing.
2383
+
2384
+ 1:20:22.362 --> 1:20:30.911
2385
+ There's a difference though to a standard
2386
+ kind of normal machine translation pipeline.
2387
+
2388
+ 1:20:32.252 --> 1:20:47.581
2389
+ If you have this brace then how do we go with
2390
+ images into a translation model?
2391
+
2392
+ 1:20:50.690 --> 1:20:58.684
2393
+ We still have to tokenize it somehow, so in
2394
+ this case they do an overlapping sliding window.
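A small sketch of that overlapping sliding window over the rendered text image; the window and stride sizes are made-up values.

```python
# Cut a rendered text image of width `image_width` pixels into overlapping patches;
# each patch would then be embedded, e.g. by a small convolutional block.
def window_offsets(image_width, window=32, stride=24):   # stride < window => overlap
    return list(range(0, max(1, image_width - window + 1), stride))

print(window_offsets(200))  # start positions of the patches for a 200-pixel rendering
```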
2395
+
2396
+ 1:20:59.259 --> 1:21:13.636
2397
+ Since it's more visual, we're using some kind
2398
+ of convolution blocks before going into these
2399
+
2400
+ 1:21:13.636 --> 1:21:14.730
2401
+ black.
2402
+
2403
+ 1:21:15.035 --> 1:21:25.514
2404
+ So here wanted to show that if you go with
2405
+ these more specialist architectures we get
2406
+
2407
+ 1:21:25.514 --> 1:21:27.829
2408
+ pixels and that's.
2409
+
2410
+ 1:21:30.050 --> 1:21:31.310
2411
+ There's also one downside.
2412
+
2413
+ 1:21:31.431 --> 1:21:51.380
2414
+ If we go with pixels and present teachings,
2415
+ what are our challenges?
2416
+
2417
+ 1:21:52.993 --> 1:22:00.001
2418
+ Exactly so as they beat us others here, also
2419
+ pointing out here for their experiments.
2420
+
2421
+ 1:22:01.061 --> 1:22:08.596
2422
+ They only consider a one target language,
2423
+ and this is also on their target side.
2424
+
2425
+ 1:22:08.503 --> 1:22:10.648
2426
+ It's not pixel based.
2427
+
2428
+ 1:22:11.131 --> 1:22:31.033
2429
+ So this is definitely, in my opinion, very
2430
+ interesting steps towards more shared representations.
2431
+
2432
+ 1:22:31.831 --> 1:22:40.574
2433
+ Yeah, so with this kind of out of the box
2434
+ approach just wanted to summarize today's lecture.
2435
+
2436
+ 1:22:41.962 --> 1:22:53.158
2437
+ First, I think we saw why multilinguality is cool,
2438
+ why there are several open challenges out there
2439
+
2440
+ 1:22:53.158 --> 1:22:53.896
2441
+ that.
2442
+
2443
+ 1:22:55.355 --> 1:23:03.601
2444
+ We also saw, like several approaches, how
2445
+ to realize and implement a multilingual machine translation
2446
+
2447
+ 1:23:03.601 --> 1:23:11.058
2448
+ system, and yeah, lastly, we've seen quite
2449
+ some over challenges on what is unsolved.
2450
+
2451
+ 1:23:11.691 --> 1:23:22.403
2452
+ Yeah, so with this want to thank you for being
2453
+ here today and I'm up there if you want.
2454
+
2455
+ 1:23:26.106 --> 1:23:29.727
2456
+ If you have questions, how will we also share
2457
+ with the moment?
2458
+
demo_data/lectures/Lecture-10-13.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8dc282db3512e8731326f1898c8dd757c40f33bd1468ffae249a9374f76fe28
3
+ size 122197601
demo_data/lectures/Lecture-11-15.06.2023/English.vtt ADDED
The diff for this file is too large to render. See raw diff
 
demo_data/lectures/Lecture-11-15.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:018f7b42f2225e9ea6d68c39e22111b3d3e172c045fde57e3dfd6b2ca3df4198
3
+ size 123175586
demo_data/lectures/Lecture-12-20.06.2023/English.vtt ADDED
The diff for this file is too large to render. See raw diff
 
demo_data/lectures/Lecture-12-20.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e86b4df900483ac17cf6e78c131d83ab5f7df2a0790c7ae034502bdce61554f3
3
+ size 158173841
demo_data/lectures/Lecture-13-04.07.2023/English.vtt ADDED
@@ -0,0 +1,2699 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.641 --> 0:00:06.289
4
+ IntroductionHey, so welcome again to today's lecture
5
+ on machine translation.
6
+
7
+ 0:00:07.968 --> 0:00:15.152
8
+ This week we'll have a bit of different focus,
9
+ so the last two weeks or so we have been looking into.
10
+
11
+ 0:00:15.655 --> 0:00:28.073
12
+ How we can improve our system by having more
13
+ data, other data sources, or using them to
14
+
15
+ 0:00:28.073 --> 0:00:30.331
16
+ more efficient.
17
+
18
+ 0:00:30.590 --> 0:00:38.046
19
+ And we'll have a bit more of that next week
20
+ with the anti-travised and the context.
21
+
22
+ 0:00:38.338 --> 0:00:47.415
23
+ So that we are shifting from this idea of
24
+ we treat each sentence independently, but treat
25
+
26
+ 0:00:47.415 --> 0:00:49.129
27
+ the translation.
28
+
29
+ 0:00:49.129 --> 0:00:58.788
30
+ Because maybe you can remember from the beginning,
31
+ there are phenomena in machine translation
32
+
33
+ 0:00:58.788 --> 0:01:02.143
34
+ that you cannot correctly check.
35
+
36
+ 0:01:03.443 --> 0:01:14.616
37
+ However, today we want to more look into what
38
+ challenges arise, specifically when we're practically
39
+
40
+ 0:01:14.616 --> 0:01:16.628
41
+ applying machine.
42
+
43
+ 0:01:17.017 --> 0:01:23.674
44
+ And this block will be a total of four different
45
+ lectures.
46
+
47
+ 0:01:23.561 --> 0:01:29.495
48
+ Types of Biases in Machine TranslationWhat
49
+ type of biases are in machine translation can.
50
+
51
+ 0:01:29.729 --> 0:01:37.646
52
+ Just then can we try to improve this, but
53
+ of course the first focus can be at least the.
54
+
55
+ 0:01:37.717 --> 0:01:41.375
56
+ And this, of course, gets more and more important.
57
+
58
+ 0:01:41.304 --> 0:01:48.302
59
+ The more often you apply this type of technology,
60
+ when it was mainly a basic research tool which
61
+
62
+ 0:01:48.302 --> 0:01:53.785
63
+ you were using in a research environment, it's
64
+ not directly that important.
65
+
66
+ 0:01:54.054 --> 0:02:00.370
67
+ But once you apply it to the question, is
68
+ it performed the same for everybody or is it
69
+
70
+ 0:02:00.370 --> 0:02:04.436
71
+ performance of some people less good than other
72
+ people?
73
+
74
+ 0:02:04.364 --> 0:02:10.463
75
+ Does it have specific challenges and we are
76
+ seeing that especially in translation?
77
+
78
+ 0:02:10.710 --> 0:02:13.420
79
+ We have the major challenge.
80
+
81
+ 0:02:13.326 --> 0:02:20.334
82
+ We have the grammatical gender and this is
83
+ not the same in all languages.
84
+
85
+ 0:02:20.520 --> 0:02:35.431
86
+ In English, it's not clear if you talk about
87
+ some person, if it's male or female, and so
88
+
89
+ 0:02:35.431 --> 0:02:39.787
90
+ hopefully you've learned.
91
+
92
+ 0:02:41.301 --> 0:02:49.419
93
+ Just as a brief view, so based on this one
94
+ aspect of application will then have two other
95
+
96
+ 0:02:49.419 --> 0:02:57.807
97
+ aspects: On Thursday we'll look into adaptation,
98
+ so how can we adapt to specific situations?
99
+
100
+ 0:02:58.718 --> 0:03:09.127
101
+ Because we have seen that your systems perform
102
+ well when the test case is similar to the training
103
+
104
+ 0:03:09.127 --> 0:03:15.181
105
+ case, it's always the case you should get training
106
+ data.
107
+
108
+ 0:03:16.036 --> 0:03:27.577
109
+ However, in practical applications, it's not
110
+ always possible to collect really the best
111
+
112
+ 0:03:27.577 --> 0:03:31.642
113
+ fitting data, so in that case.
114
+
115
+ 0:03:32.092 --> 0:03:39.269
116
+ And then the third larger group of applications
117
+ will then be speech translation.
118
+
119
+ 0:03:39.181 --> 0:03:42.993
120
+ What do we have to change in our machine?
121
+
122
+ 0:03:43.323 --> 0:03:53.569
123
+ If we are now not translating text, but if
124
+ we want to translate speech, that will be more
125
+
126
+ 0:03:53.569 --> 0:03:54.708
127
+ lectures.
128
+
129
+ 0:04:00.180 --> 0:04:12.173
130
+ So what are we talking about when we are talking
131
+ about bias from a definition point?
132
+
133
+ 0:04:12.092 --> 0:04:21.799
134
+ Means we are introducing systematic errors
135
+ when testing, and then we encourage the selection
136
+
137
+ 0:04:21.799 --> 0:04:24.408
138
+ of the specific answers.
139
+
140
+ 0:04:24.804 --> 0:04:36.862
141
+ The most prominent case, which is analyzed
142
+ most in the research community, is a bias based
143
+
144
+ 0:04:36.862 --> 0:04:38.320
145
+ on gender.
146
+
147
+ 0:04:38.187 --> 0:04:43.359
148
+ One example: she works in a hospital.
149
+
150
+ 0:04:43.523 --> 0:04:50.787
151
+ It is not directly able to assess whether
152
+ this is now a point or a friend.
153
+
154
+ 0:04:51.251 --> 0:05:07.095
155
+ And although in this one even there is, it's
156
+ possible to ambiguate this based on the context.
157
+
158
+ 0:05:07.127 --> 0:05:14.391
159
+ However, there is yeah, this relation to learn
160
+ is of course not that easy.
161
+
162
+ 0:05:14.614 --> 0:05:27.249
163
+ So the system might also learn more like shortcut
164
+ connections, which might be that in your training
165
+
166
+ 0:05:27.249 --> 0:05:31.798
167
+ data most of the doctors are males.
168
+
169
+ 0:05:32.232 --> 0:05:41.725
170
+ That is like that was too bigly analyzed and
171
+ biased, and we'll focus on that also in this.
172
+
173
+ 0:05:41.641 --> 0:05:47.664
174
+ In this lecture, however, of course, the system
175
+ might be a lot of other biases too, which have
176
+
177
+ 0:05:47.664 --> 0:05:50.326
178
+ been partly investigated in other fields.
179
+
180
+ 0:05:50.263 --> 0:05:53.498
181
+ But I think machine translation is not that
182
+ much.
183
+
184
+ 0:05:53.813 --> 0:05:57.637
185
+ For example, it can be based on your originals.
186
+
187
+ 0:05:57.737 --> 0:06:09.405
188
+ So there is an example for a sentiment analysis
189
+ that's a bit prominent.
190
+
191
+ 0:06:09.243 --> 0:06:15.081
192
+ A sentiment analysis means you're.
193
+
194
+ 0:06:15.035 --> 0:06:16.788
195
+ Like you're seeing it in reviews.
196
+
197
+ 0:06:17.077 --> 0:06:24.045
198
+ And then you can show that with baseline models,
199
+ if the name is Mohammed then the sentiment
200
+
201
+ 0:06:24.045 --> 0:06:30.786
202
+ in a lot of systems will be more negative than
203
+ if it's like a traditional European name.
204
+
205
+ 0:06:31.271 --> 0:06:33.924
206
+ Are with foods that is simple.
207
+
208
+ 0:06:33.839 --> 0:06:36.453
209
+ It's this type of restaurant.
210
+
211
+ 0:06:36.366 --> 0:06:38.809
212
+ It's positive and another.
213
+
214
+ 0:06:39.319 --> 0:06:49.510
215
+ You have other aspects, so we have seen this.
216
+
217
+ 0:06:49.289 --> 0:06:59.485
218
+ We have done some experiments in Vietnamese.
219
+
220
+ 0:06:59.559 --> 0:07:11.040
221
+ And then, for example, you can analyze that
222
+ if he's German, it will address him more
223
+
224
+ 0:07:11.040 --> 0:07:18.484
225
+ formally, while if he is North Korean it'll use
226
+ an informal.
227
+
228
+ 0:07:18.838 --> 0:07:24.923
229
+ So these are also possible types of biases.
230
+
231
+ 0:07:24.785 --> 0:07:31.012
232
+ However, this is difficult types of biases.
233
+
234
+ 0:07:31.251 --> 0:07:38.903
235
+ However, especially in translation, the bias
236
+ for gender is the most challenging because
237
+
238
+ 0:07:38.903 --> 0:07:42.989
239
+ we are treating gender in different languages.
240
+
241
+ 0:07:45.405 --> 0:07:46.930
242
+ Why is this challenging?
243
+
244
+ 0:07:48.148 --> 0:07:54.616
245
+ The reason for that is that there is a translation
246
+ mismatch and we have, I mean, one reason for
247
+
248
+ 0:07:54.616 --> 0:08:00.140
249
+ that is there's a translation mismatch and
250
+ that's the most challenging situation.
251
+
252
+ 0:08:00.073 --> 0:08:05.733
253
+ So there is there is different information
254
+ in the Sears language or in the target.
255
+
256
+ 0:08:06.046 --> 0:08:08.832
257
+ So if we have the English word 'player'.
258
+
259
+ 0:08:09.029 --> 0:08:12.911
260
+ It's there is no information about the gender
261
+ in there.
262
+
263
+ 0:08:12.842 --> 0:08:19.043
264
+ However, if you want to translate in German,
265
+ you cannot easily generate a word without a
266
+
267
+ 0:08:19.043 --> 0:08:20.437
268
+ gender information.
269
+
270
+ 0:08:20.367 --> 0:08:27.057
271
+ Of course, you could do something like 'Spieler*in',
272
+ but that sounds a bit weird if you're talking.
273
+
274
+ 0:08:27.027 --> 0:08:29.006
275
+ About a specific person.
276
+
277
+ 0:08:28.927 --> 0:08:32.333
278
+ Then you should use the appropriate font.
279
+
280
+ 0:08:32.692 --> 0:08:44.128
281
+ And so it's most challenging translation as
282
+ always in this situation where you have less
283
+
284
+ 0:08:44.128 --> 0:08:50.939
285
+ information on the source side but more information.
286
+
287
+ 0:08:51.911 --> 0:08:57.103
288
+ Similar things like if you think about Japanese,
289
+ for example where there's different formality
290
+
291
+ 0:08:57.103 --> 0:08:57.540
292
+ levels.
293
+
294
+ 0:08:57.485 --> 0:09:02.291
295
+ If in German there is no formality or like
296
+ two only or in English there's no formality
297
+
298
+ 0:09:02.291 --> 0:09:02.677
299
+ level.
300
+
301
+ 0:09:02.862 --> 0:09:08.139
302
+ And now you have to estimate the formality
303
+ level.
304
+
305
+ 0:09:08.034 --> 0:09:10.830
306
+ Of course, it takes some.
307
+
308
+ 0:09:10.722 --> 0:09:13.845
309
+ It's not directly possible.
310
+
311
+ 0:09:14.094 --> 0:09:20.475
312
+ What nowadays systems are doing is at least
313
+ assess.
314
+
315
+ 0:09:20.352 --> 0:09:27.472
316
+ This is a situation where don't have enough
317
+ information.
318
+
319
+ 0:09:27.567 --> 0:09:28.656
320
+ Translation.
321
+
322
+ 0:09:28.572 --> 0:09:34.939
323
+ So here you have that suggesting it can be
324
+ doctor or doctora in Spanish.
325
+
326
+ 0:09:35.115 --> 0:09:37.051
327
+ So that is a possibility.
328
+
329
+ 0:09:36.977 --> 0:09:41.597
330
+ However, it is of course very, very challenging
331
+ to find out.
332
+
333
+ 0:09:42.062 --> 0:09:46.130
334
+ Is there two really different meanings, or
335
+ is it not the case?
336
+
337
+ 0:09:46.326 --> 0:09:47.933
338
+ You can do the big rule base here.
339
+
340
+ 0:09:47.887 --> 0:09:49.496
341
+ Maybe don't know how they did it.
342
+
343
+ 0:09:49.990 --> 0:09:57.469
344
+ You can, of course, if you are focusing on
345
+ gender, the source and the target is different,
346
+
347
+ 0:09:57.469 --> 0:09:57.879
348
+ and.
349
+
350
+ 0:09:58.118 --> 0:10:05.799
351
+ But if you want to do it more general, it's
352
+ not that easy because there's always.
353
+
354
+ 0:10:06.166 --> 0:10:18.255
355
+ But it's not clear if these are really different
356
+ or if there's only slight differences.
357
+
358
+ 0:10:22.142 --> 0:10:36.451
359
+ Between that another reason why there is a
360
+ bias in there is typically the system tries
361
+
362
+ 0:10:36.451 --> 0:10:41.385
363
+ to always do the most simple thing.
364
+
365
+ 0:10:42.262 --> 0:10:54.483
366
+ And also in your training data there are unintended
367
+ shortcuts or clues only in the training data
368
+
369
+ 0:10:54.483 --> 0:10:59.145
370
+ because you sample them in some way.
371
+
372
+ 0:10:59.379 --> 0:11:06.257
373
+ This example, if she works in a hospital and
374
+ my friend is a nurse, then it might be that
375
+
376
+ 0:11:06.257 --> 0:11:07.184
377
+ one friend.
378
+
379
+ 0:11:08.168 --> 0:11:18.979
380
+ Male and female because it has learned that
381
+ in your training data a doctor is male and a nurse
382
+
383
+ 0:11:18.979 --> 0:11:20.802
384
+ is doing this.
385
+
386
+ 0:11:20.880 --> 0:11:29.587
387
+ And of course, if we are doing maximum likelihood
388
+ approximation as we are doing it in general,
389
+
390
+ 0:11:29.587 --> 0:11:30.962
391
+ we are always predicting the most probable output.
392
+
393
+ 0:11:30.951 --> 0:11:43.562
394
+ So that means if in your training data this
395
+ correlation is maybe in the case then your
396
+
397
+ 0:11:43.562 --> 0:11:48.345
398
+ predictions are always the same.
399
+
400
+ 0:11:48.200 --> 0:11:50.386
401
+ It typically.
402
+
403
+ 0:11:55.035 --> 0:12:06.007
404
+ What does it mean, of course, if we are having
405
+ this type of biases and if we are applying them?
406
+
407
+ 0:12:05.925 --> 0:12:14.821
408
+ It might be that the benefit of machine translation
409
+ rise, so more and more people can benefit from
410
+
411
+ 0:12:14.821 --> 0:12:20.631
412
+ the ability to talk to people in different
413
+ languages and so on.
414
+
415
+ 0:12:20.780 --> 0:12:27.261
416
+ But if you more often use it, problems of
417
+ the system also get more and more important.
418
+
419
+ 0:12:27.727 --> 0:12:36.984
420
+ And so if we are seeing that these problems
421
+ and people nowadays only start to analyze these
422
+
423
+ 0:12:36.984 --> 0:12:46.341
424
+ problems partly, also because if it hasn't
425
+ been used, it's not that important if the quality
426
+
427
+ 0:12:46.341 --> 0:12:47.447
428
+ is so bad.
429
+
430
+ 0:12:47.627 --> 0:12:51.907
431
+ Version or is mixing it all the time like
432
+ we have seen in old systems.
433
+
434
+ 0:12:51.847 --> 0:12:52.996
435
+ Then, of course,.
436
+
437
+ 0:12:53.053 --> 0:12:57.303
438
+ The issue is not that you have biased issues
439
+ that you at first need to create a right view.
440
+
441
+ 0:12:57.637 --> 0:13:10.604
442
+ So only with the wide application of the good
443
+ quality this becomes important, and then of
444
+
445
+ 0:13:10.604 --> 0:13:15.359
446
+ course you should look into how to address it.
447
+
448
+ 0:13:15.355 --> 0:13:21.355
449
+ Challenges in Machine Translation: In order
450
+ to first get aware of what are the challenges,
451
+
452
+ 0:13:21.355 --> 0:13:24.591
453
+ and that is a general idea not only about bias.
454
+
455
+ 0:13:24.764 --> 0:13:31.868
456
+ Of course, we have learned about BLEU scores,
457
+ so how can you evaluate the overall quality, and
458
+
459
+ 0:13:31.868 --> 0:13:36.006
460
+ they are very important, either BLEU or any
461
+ of that.
462
+
463
+ 0:13:35.928 --> 0:13:40.379
464
+ However, they are somehow giving us a general
465
+ overview.
466
+
467
+ 0:13:40.560 --> 0:13:58.410
468
+ And if we want to improve our systems, of
469
+ course it's important that we also do more
470
+
471
+ 0:13:58.410 --> 0:14:00.510
472
+ detailed.
473
+
474
+ 0:14:00.340 --> 0:14:05.828
475
+ Test sets which are very challenging in order
476
+ to see how good these systems really are.
477
+
478
+ 0:14:06.446 --> 0:14:18.674
479
+ Of course, one last reminder to that if you
480
+ do a challenge test set, it's typically good
481
+
482
+ 0:14:18.674 --> 0:14:24.581
483
+ to keep track of your general performance.
484
+
485
+ 0:14:24.784 --> 0:14:28.648
486
+ You don't want to improve normally then on
487
+ the general quality.
488
+
489
+ 0:14:28.688 --> 0:14:41.555
490
+ So if you build a system which will mitigate
491
+ some biases then the aim is that if you evaluate
492
+
493
+ 0:14:41.555 --> 0:14:45.662
494
+ it on the challenging biases.
495
+
496
+ 0:14:45.745 --> 0:14:53.646
497
+ You don't need to get better because the aggregated
498
+ versions don't really measure that aspect well,
499
+
500
+ 0:14:53.646 --> 0:14:57.676
501
+ but if you significantly drop in performance
502
+ then.
503
+
504
+ 0:15:00.000 --> 0:15:19.164
505
+ What are, in general, the harms people report
506
+ about, or why should you care about this?
507
+
508
+ 0:15:19.259 --> 0:15:23.598
509
+ And you're even then amplifying this type
510
+ of stereotypes.
511
+
512
+ 0:15:23.883 --> 0:15:33.879
513
+ And that is not what you want to achieve with
514
+ using this technology.
515
+
516
+ 0:15:33.734 --> 0:15:39.388
517
+ It's not working through some groups.
518
+
519
+ 0:15:39.819 --> 0:15:47.991
520
+ And secondly what is referred to as allocational
521
+ harms.
522
+
523
+ 0:15:47.845 --> 0:15:54.123
524
+ The system might not perform as well for some groups.
525
+
526
+ 0:15:54.314 --> 0:16:00.193
527
+ So another example of which we would like
528
+ to see is that sometimes the translation depends
529
+
530
+ 0:16:00.193 --> 0:16:01.485
531
+ on who is speaking.
532
+
533
+ 0:16:01.601 --> 0:16:03.463
534
+ So here you have it in French.
535
+
536
+ 0:16:03.723 --> 0:16:16.359
537
+ I cannot say it, but the word 'happy' in French has
538
+ to be expressed differently, whether it's a
539
+
540
+ 0:16:16.359 --> 0:16:20.902
541
+ male person or a female person.
542
+
543
+ 0:16:21.121 --> 0:16:28.917
544
+ It's nearly impossible to guess that or it's
545
+ impossible, so then you always select one.
546
+
547
+ 0:16:29.189 --> 0:16:37.109
548
+ And of course, since we do greedy search,
549
+ it will always generate the same, so you will
550
+
551
+ 0:16:37.109 --> 0:16:39.449
552
+ have a worse performance.
553
+
554
+ 0:16:39.779 --> 0:16:46.826
555
+ And of course not what we want to achieve
556
+ in average.
557
+
558
+ 0:16:46.696 --> 0:16:54.006
559
+ You might be then good, but you also have
560
+ the ability.
561
+
562
+ 0:16:54.234 --> 0:17:08.749
563
+ This is a biased problem or an interface problem
564
+ because mean you can say well.
565
+
566
+ 0:17:09.069 --> 0:17:17.358
567
+ And if you do it, we still have a system that
568
+ generates unusable output.
569
+
570
+ 0:17:17.244 --> 0:17:24.059
571
+ If you don't tell it what you want to do,
572
+ so in this case.
573
+
574
+ 0:17:24.244 --> 0:17:27.173
575
+ So in this case it's like if we don't have
576
+ enough information.
577
+
578
+ 0:17:27.467 --> 0:17:34.629
579
+ So you have to adapt your system in some way
580
+ that it can either access the information or output both options.
581
+
582
+ 0:17:34.894 --> 0:17:46.144
583
+ But yeah, how you mean there's different ways
584
+ of how to improve over that first thing is
585
+
586
+ 0:17:46.144 --> 0:17:47.914
587
+ you find out.
588
+
589
+ 0:17:48.688 --> 0:17:53.826
590
+ Then there is different ways of addressing
591
+ them, and they of course differ.
592
+
593
+ 0:17:53.759 --> 0:17:57.546
594
+ Is it a situation where the information is
595
+ available?
596
+
597
+ 0:17:58.038 --> 0:18:12.057
598
+ That's the first case we have, or is it a
599
+ situation where we don't have the information
600
+
601
+ 0:18:12.057 --> 0:18:13.332
602
+ either?
603
+
604
+ 0:18:14.154 --> 0:18:28.787
605
+ Or should give the system maybe the opportunity
606
+ to output those or say don't know this is still
607
+
608
+ 0:18:28.787 --> 0:18:29.701
609
+ open.
610
+
611
+ 0:18:29.769 --> 0:18:35.470
612
+ And even if they have enough information,
613
+ need this additional information, but they
614
+
615
+ 0:18:35.470 --> 0:18:36.543
616
+ are just doing.
617
+
618
+ 0:18:36.776 --> 0:18:51.132
619
+ Which is a bit based on how we find that there
620
+ is research on that, but it's not that easy
621
+
622
+ 0:18:51.132 --> 0:18:52.710
623
+ to solve.
624
+
625
+ 0:18:52.993 --> 0:19:05.291
626
+ But in general, detecting whether we have enough information
627
+ to do a good translation or whether information is
628
+
629
+ 0:19:05.291 --> 0:19:06.433
630
+ missing?
631
+
632
+ 0:19:09.669 --> 0:19:18.951
633
+ But before we come on how we will address
634
+ it or try to change it, and before we look
635
+
636
+ 0:19:18.951 --> 0:19:22.992
637
+ at how we can assess it, of course,.
638
+
639
+ 0:19:23.683 --> 0:19:42.820
640
+ And therefore I wanted to do a bit of a review
641
+ on how gender is represented in languages.
642
+
643
+ 0:19:43.743 --> 0:19:48.920
644
+ Of course, you can have a more fine-grained view.
645
+
646
+ 0:19:48.791 --> 0:20:00.571
647
+ It's not that everything in the group is the
648
+ same, but in general you have a large group.
649
+
650
+ 0:20:01.381 --> 0:20:08.347
651
+ For example, you even don't say ishi or but
652
+ it's just one word for it written.
653
+
654
+ 0:20:08.259 --> 0:20:16.101
655
+ Oh, don't know how it's pronounced, so you
656
+ cannot say from a sentence whether it's ishi
657
+
658
+ 0:20:16.101 --> 0:20:16.725
659
+ or it.
660
+
661
+ 0:20:17.937 --> 0:20:29.615
662
+ Of course, there are some exceptions for whether
663
+ it's a difference between male and female.
664
+
665
+ 0:20:29.488 --> 0:20:35.965
666
+ They have different names for brother and
667
+ sister.
668
+
669
+ 0:20:36.036 --> 0:20:41.772
670
+ So normally you cannot infer whether this
671
+ is a male speaker or speaking about a male
672
+
673
+ 0:20:41.772 --> 0:20:42.649
674
+ or a female.
675
+
676
+ 0:20:44.304 --> 0:20:50.153
677
+ Examples for these languages are, for example,
678
+ Finnish and Turkish.
679
+
680
+ 0:20:50.067 --> 0:21:00.205
681
+ There are more languages, but these are examples. Then
682
+ we have notional gender languages where
683
+
684
+ 0:21:00.205 --> 0:21:05.935
685
+ there's some gender information in there, but
686
+ it's.
687
+
688
+ 0:21:05.905 --> 0:21:08.169
689
+ And this is an example.
690
+
691
+ 0:21:08.075 --> 0:21:15.150
692
+ This is English, which is in that way a nice
693
+ example because most people.
694
+
695
+ 0:21:15.415 --> 0:21:20.164
696
+ So you have there some lexical gender and pronominal
697
+ gender.
698
+
699
+ 0:21:20.083 --> 0:21:23.305
700
+ I mean mamadeta there she-hee and him.
701
+
702
+ 0:21:23.643 --> 0:21:31.171
703
+ And very few words are marked like actor and
704
+ actress, but in general most words are not
705
+
706
+ 0:21:31.171 --> 0:21:39.468
707
+ marked, so it's teacher and lecturer and friend,
708
+ so in all these words the gender is not marked,
709
+
710
+ 0:21:39.468 --> 0:21:41.607
711
+ and so you cannot infer.
712
+
713
+ 0:21:42.622 --> 0:21:48.216
714
+ So the initial Turkish sentence here would
715
+ be translated to either he is a good friend
716
+
717
+ 0:21:48.216 --> 0:21:49.373
718
+ or she is a good.
719
+
720
+ 0:21:51.571 --> 0:22:05.222
721
+ In this case you would have them gender information
722
+ in there, but of course there's a good friend.
723
+
724
+ 0:22:07.667 --> 0:22:21.077
725
+ And then finally there are the grammatical
726
+ gender languages where each noun has a gender.
727
+
728
+ 0:22:20.926 --> 0:22:25.301
729
+ That's the case in Spanish.
730
+
731
+ 0:22:26.186 --> 0:22:34.025
732
+ This is mostly formal, but at least if you're
733
+ talking about a human that also agrees.
734
+
735
+ 0:22:34.214 --> 0:22:38.209
736
+ Of course, it's like the sun.
737
+
738
+ 0:22:38.076 --> 0:22:50.464
739
+ There is no clear thing why the sun should
740
+ be female, and in other language it's different.
741
+
742
+ 0:22:50.390 --> 0:22:56.100
743
+ The matching, and then you also have more
744
+ agreements with this that makes things more
745
+
746
+ 0:22:56.100 --> 0:22:56.963
747
+ complicated.
748
+
749
+ 0:22:57.958 --> 0:23:08.571
750
+ Here he is a good friend and the good is also
751
+ depending on whether it's male or female, so it's
752
+
753
+ 0:23:08.571 --> 0:23:17.131
754
+ changing also based on the gender so you have
755
+ a lot of gender information.
756
+
757
+ 0:23:17.777 --> 0:23:21.364
758
+ Get them, but do you always get them correctly?
759
+
760
+ 0:23:21.289 --> 0:23:25.101
761
+ It might be that they're in English, for example.
762
+
763
+ 0:23:28.748 --> 0:23:36.154
764
+ And since this is the case, and you need to
765
+ like often express the gender even though you
766
+
767
+ 0:23:36.154 --> 0:23:37.059
768
+ might not.
769
+
770
+ 0:23:37.377 --> 0:23:53.030
771
+ Aware of it or it's not possible, there are
772
+ some ways in German how to mark neutral forms.
773
+
774
+ 0:23:54.194 --> 0:24:03.025
775
+ But then it's again from the machine learning
776
+ side of view, of course quite challenging because
777
+
778
+ 0:24:03.025 --> 0:24:05.417
779
+ you only want to use the.
780
+
781
+ 0:24:05.625 --> 0:24:11.108
782
+ If it's known to the reader you want to use
783
+ the correct, the not mutual form but either
784
+
785
+ 0:24:11.108 --> 0:24:12.354
786
+ the male or female.
787
+
788
+ 0:24:13.013 --> 0:24:21.771
789
+ So they are assessing what is known to the
790
+ reader as a challenge which needs to in some
791
+
792
+ 0:24:21.771 --> 0:24:23.562
793
+ way be addressed.
794
+
795
+ 0:24:26.506 --> 0:24:30.887
796
+ Here why does that happen?
797
+
798
+ 0:24:30.725 --> 0:24:42.086
799
+ Three reasons we have that in a bit so one
800
+ is, of course, that your.
801
+
802
+ 0:24:42.162 --> 0:24:49.003
803
+ For example: If you look at the Europarl corpus,
804
+ which is an important resource for doing machine
805
+
806
+ 0:24:49.003 --> 0:24:49.920
807
+ translation.
808
+
809
+ 0:24:50.010 --> 0:24:59.208
810
+ Then there's only thirty percent of the speakers
811
+ are female, and so if you train a model on
812
+
813
+ 0:24:59.208 --> 0:25:06.606
814
+ that data, if you're translating to French,
815
+ there will be a male version.
816
+
817
+ 0:25:06.746 --> 0:25:10.762
818
+ And so you'll just have a lot more, like seventy
819
+ percent, of your male forms in it.
820
+
821
+ 0:25:10.971 --> 0:25:18.748
822
+ And that will be Yep will make the model therefore
823
+ from this data sub.
824
+
825
+ 0:25:18.898 --> 0:25:25.882
826
+ And of course this will be in the data for
827
+ a very long time.
828
+
829
+ 0:25:25.768 --> 0:25:33.669
830
+ So if there's more female speakers in the
831
+ European Parliament, but.
832
+
833
+ 0:25:33.933 --> 0:25:42.338
834
+ But we are training on historical data, so
835
+ even if there is for a long time, it will not
836
+
837
+ 0:25:42.338 --> 0:25:43.377
838
+ be in the.
839
+
840
+ 0:25:46.346 --> 0:25:57.457
841
+ Then besides this preexisting bias in the data, there
842
+ are of course technical biases which will amplify
843
+
844
+ 0:25:57.457 --> 0:25:58.800
845
+ this type of bias.
846
+
847
+ 0:25:59.039 --> 0:26:04.027
848
+ So one we already address, that's for example
849
+ sampling or beam search.
850
+
851
+ 0:26:03.957 --> 0:26:06.418
852
+ You get the most probable output.
853
+
854
+ 0:26:06.646 --> 0:26:16.306
855
+ So if there's a bias in your model, it will
856
+ amplify that not only in the case we had before,
857
+
858
+ 0:26:16.306 --> 0:26:19.423
859
+ and produce the male version.
860
+
861
+ 0:26:20.040 --> 0:26:32.873
862
+ So if you have the same source sentence like
863
+ 'I am happy' and in your training data it will
864
+
865
+ 0:26:32.873 --> 0:26:38.123
866
+ be male and female; if you're doing beam search you always get the more frequent one.
867
+
868
+ 0:26:38.418 --> 0:26:44.510
869
+ So in that way by doing this type of algorithmic
870
+ design you will amplify the bias.
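As a rough illustration of this amplification effect, here is a toy sketch with made-up numbers (the 70/30 split only echoes the Europarl statistic mentioned earlier): if the model learned to prefer the male form 70% of the time, always taking the argmax outputs it 100% of the time, while sampling roughly preserves the training ratio.

```python
import random

# Made-up conditional distribution for an ambiguous source like "I am happy"
# when translating into a gender-marking target language.
p = {"male_form": 0.7, "female_form": 0.3}

def argmax_decode():
    return max(p, key=p.get)                       # what greedy/beam-like decoding does

def sample_decode():
    return random.choices(list(p), weights=list(p.values()))[0]

n = 10_000
greedy = sum(argmax_decode() == "male_form" for _ in range(n)) / n
sampled = sum(sample_decode() == "male_form" for _ in range(n)) / n
print(f"male form: training share 0.70, argmax {greedy:.2f}, sampling {sampled:.2f}")
# argmax amplifies 0.70 -> 1.00, sampling stays close to the data distribution
```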
871
+
872
+ 0:26:44.604 --> 0:26:59.970
873
+ Another use case is if you think about a multilingual
874
+ machine translation, for example if you are
875
+
876
+ 0:26:59.970 --> 0:27:04.360
877
+ now doing a pivot language.
878
+
879
+ 0:27:04.524 --> 0:27:13.654
880
+ But if you're first translating to English, this
881
+ information might get lost and then you translate
882
+
883
+ 0:27:13.654 --> 0:27:14.832
884
+ to Spanish.
885
+
886
+ 0:27:15.075 --> 0:27:21.509
887
+ So while in general in this class there is
888
+ not this type of bias there,.
889
+
890
+ 0:27:22.922 --> 0:27:28.996
891
+ You might introduce it because you might have
892
+ good reasons for doing a modular system because
893
+
894
+ 0:27:28.996 --> 0:27:31.968
895
+ you don't have enough training data or so on.
896
+
897
+ 0:27:31.903 --> 0:27:37.570
898
+ It's performing better in average, but of
899
+ course by doing this choice you'll introduce
900
+
901
+ 0:27:37.570 --> 0:27:40.045
902
+ an additional type of bias into your.
903
+
904
+ 0:27:45.805 --> 0:27:52.212
905
+ And then there is what people refer to as
906
+ emergent bias, and that is, if you use a system
907
+
908
+ 0:27:52.212 --> 0:27:58.903
909
+ for a different use case as we see in, generally
910
+ it is the case that is performing worse, but
911
+
912
+ 0:27:58.903 --> 0:28:02.533
913
+ then of course you can have even more challenging biases.
914
+
915
+ 0:28:02.942 --> 0:28:16.196
916
+ So the extreme case would be if you train
917
+ a system only on male speakers, then of course
918
+
919
+ 0:28:16.196 --> 0:28:22.451
920
+ it will perform worse on female speakers.
921
+
922
+ 0:28:22.902 --> 0:28:36.287
923
+ So, of course, if you're doing this type of
924
+ problem, if you use a system for a different
925
+
926
+ 0:28:36.287 --> 0:28:42.152
927
+ situation than it was originally designed for, then.
928
+
929
+ 0:28:44.004 --> 0:28:54.337
930
+ And with this we would then go for type of
931
+ evaluation, but before we are looking at how
932
+
933
+ 0:28:54.337 --> 0:28:56.333
934
+ we can evaluate.
935
+
936
+ 0:29:00.740 --> 0:29:09.484
937
+ Stereotypes in Machine Translation: Before we
938
+ want to look into how we can improve the system,
939
+
940
+ 0:29:09.484 --> 0:29:13.527
941
+ think yeah, maybe at the moment most work.
942
+
943
+ 0:29:13.954 --> 0:29:21.659
944
+ And the one thing is the system trying to
945
+ look into stereotypes.
946
+
947
+ 0:29:21.541 --> 0:29:26.167
948
+ So how does a system use stereotypes?
949
+
950
+ 0:29:26.466 --> 0:29:29.443
951
+ So if you have the Hungarian sentence,.
952
+
953
+ 0:29:29.729 --> 0:29:33.805
954
+ Which should be he is an engineer or she is
955
+ an engineer.
956
+
957
+ 0:29:35.375 --> 0:29:43.173
958
+ And you cannot guess that because we saw that
959
+ he and she is not different in Hungarian.
960
+
961
+ 0:29:43.423 --> 0:29:57.085
962
+ Then you can have a test set where you have
963
+ these type of ailanomal occupations.
964
+
965
+ 0:29:56.977 --> 0:30:03.862
966
+ You have statistics from how is the distribution
967
+ by gender so you can automatically generate
968
+
969
+ 0:30:03.862 --> 0:30:04.898
970
+ the sentence.
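A minimal sketch of how such a challenge set could be generated automatically (the template, occupation list and statistics below are made-up placeholders, not the actual benchmark data): occupations with known gender statistics are slotted into a fixed sentence pattern, labelled stereotypical or anti-stereotypical.

```python
# Toy generator for an occupation challenge set (illustrative only).
occupations = {"engineer": 0.15, "nurse": 0.90}   # made-up share of female workers

def make_examples():
    examples = []
    for job, female_share in occupations.items():
        article = "an" if job[0] in "aeiou" else "a"
        for pronoun, gender in [("him", "male"), ("her", "female")]:
            src = f"I've known {pronoun} for a long time, my friend works as {article} {job}."
            stereotypical = (gender == "female") == (female_share > 0.5)
            examples.append((src, gender, "stereotypical" if stereotypical else "anti-stereotypical"))
    return examples

for src, gender, kind in make_examples():
    print(gender, kind, "->", src)
```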
971
+
972
+ 0:30:04.985 --> 0:30:21.333
973
+ Then you could put in jobs which are mostly
974
+ done by a man and then you can check how is
975
+
976
+ 0:30:21.333 --> 0:30:22.448
977
+ your.
978
+
979
+ 0:30:22.542 --> 0:30:33.276
980
+ That is one type of evaluating stereotypes
981
+ that one of the most famous benchmarks called
982
+
983
+ 0:30:33.276 --> 0:30:42.322
984
+ WinoMT is exactly doing. The second type of evaluation
985
+ is about gender preservation.
986
+
987
+ 0:30:42.342 --> 0:30:51.201
988
+ So that is exactly what we have seen beforehand.
989
+
990
+ 0:30:51.020 --> 0:31:00.244
991
+ If this information is not in the text itself,
992
+
993
+ 0:31:00.320 --> 0:31:01.875
994
+ Gender as a speaker.
995
+
996
+ 0:31:02.062 --> 0:31:04.450
997
+ And how good does a system do that?
998
+
999
+ 0:31:04.784 --> 0:31:09.675
1000
+ And we'll see there's, for example, one benchmark
1001
+ on this.
1002
+
1003
+ 0:31:09.592 --> 0:31:15.762
1004
+ For example: For Arabic there is one benchmark
1005
+ on this foot: Audio because if you're now think
1006
+
1007
+ 0:31:15.762 --> 0:31:16.801
1008
+ already of the.
1009
+
1010
+ 0:31:17.157 --> 0:31:25.257
1011
+ From when we're talking about speech translation,
1012
+ it might be interesting because in the speech
1013
+
1014
+ 0:31:25.257 --> 0:31:32.176
1015
+ signal you should have a better guess on whether
1016
+ it's a male or a female speaker.
1017
+
1018
+ 0:31:32.432 --> 0:31:38.928
1019
+ So but mean current systems, mostly you can
1020
+ always add, and they will just first transcribe.
1021
+
1022
+ 0:31:42.562 --> 0:31:45.370
1023
+ Yes, so how do these benchmarks?
1024
+
1025
+ 0:31:45.305 --> 0:31:51.356
1026
+ Look like that, the first one is here.
1027
+
1028
+ 0:31:51.201 --> 0:32:02.839
1029
+ There's an occupation test where it looks
1030
+ like a simple test set because.
1031
+
1032
+ 0:32:03.023 --> 0:32:10.111
1033
+ So I've known either hurry him or pronounce
1034
+ the name for a long time.
1035
+
1036
+ 0:32:10.010 --> 0:32:13.557
1037
+ My friend works as an occupation.
1038
+
1039
+ 0:32:13.833 --> 0:32:16.771
1040
+ So that is like all sentences in that look
1041
+ like that.
1042
+
1043
+ 0:32:17.257 --> 0:32:28.576
1044
+ So in this case you haven't had the biggest
1045
+ work in here, which is friends.
1046
+
1047
+ 0:32:28.427 --> 0:32:33.346
1048
+ So your only checking later is.
1049
+
1050
+ 0:32:34.934 --> 0:32:46.981
1051
+ This can be inferred from whether it's her
1052
+ or her or her, or if it's a proper name, so
1053
+
1054
+ 0:32:46.981 --> 0:32:55.013
1055
+ can you infer it from the name, and then you
1056
+ can compare.
1057
+
1058
+ 0:32:55.115 --> 0:33:01.744
1059
+ So is this because the job description is
1060
+ nearer to friend.
1061
+
1062
+ 0:33:01.633 --> 0:33:06.939
1063
+ Does the system get disturbed by this type
1064
+ of.
1065
+
1066
+ 0:33:08.828 --> 0:33:14.753
1067
+ And there you can then automatically assess
1068
+ yeah this type.
1069
+
1070
+ 0:33:14.774 --> 0:33:18.242
1071
+ Of course, that's what said at the beginning.
1072
+
1073
+ 0:33:18.167 --> 0:33:24.837
1074
+ You shouldn't only rely on that because if
1075
+ you only rely on it you can easily trick the
1076
+
1077
+ 0:33:24.837 --> 0:33:25.444
1078
+ system.
1079
+
1080
+ 0:33:25.368 --> 0:33:31.888
1081
+ So one type of sentence is translated, but
1082
+ of course it can give you very important.
1083
+
1084
+ 0:33:33.813 --> 0:33:35.309
1085
+ Any questions yeah.
1086
+
1087
+ 0:33:36.736 --> 0:33:44.553
1088
+ Much like the evaluation of stereotype, we
1089
+ want the system to agree with stereotypes because
1090
+
1091
+ 0:33:44.553 --> 0:33:46.570
1092
+ it increases precision.
1093
+
1094
+ 0:33:46.786 --> 0:33:47.979
1095
+ No, no, no.
1096
+
1097
+ 0:33:47.880 --> 0:33:53.088
1098
+ In this case, if we say oh yeah, he is an
1099
+ engineer.
1100
+
1101
+ 0:33:52.988 --> 0:34:01.602
1102
+ From the example, it's probably the most likely
1103
+ translation, probably in more cases.
1104
+
1105
+ 0:34:02.702 --> 0:34:08.611
1106
+ Now there is two things, so yeah yeah, so
1107
+ there is two ways of evaluating.
1108
+
1109
+ 0:34:08.533 --> 0:34:15.594
1110
+ The one thing is in this case he's using that
1111
+ he's an engineer, but there is conflicting
1112
+
1113
+ 0:34:15.594 --> 0:34:19.879
1114
+ information that in this case the engineer
1115
+ is female.
1116
+
1117
+ 0:34:20.380 --> 0:34:21.890
1118
+ So anything was.
1119
+
1120
+ 0:34:22.342 --> 0:34:29.281
1121
+ Information yes, so that is the one in the
1122
+ other case.
1123
+
1124
+ 0:34:29.155 --> 0:34:38.746
1125
+ Typically it's not evaluated in that, but
1126
+ in that time you really want it.
1127
+
1128
+ 0:34:38.898 --> 0:34:52.732
1129
+ That's why most of those cases you have evaluated
1130
+ in scenarios where you have context information.
1131
+
1132
+ 0:34:53.453 --> 0:34:58.878
1133
+ How to deal with the other thing is even more
1134
+ challenging to one case where it is the case
1135
+
1136
+ 0:34:58.878 --> 0:35:04.243
1137
+ is what I said before is when it's about the
1138
+ speaker so that the speech translation test.
1139
+
1140
+ 0:35:04.584 --> 0:35:17.305
1141
+ And there they try to look in a way that can
1142
+ you use, so use the audio also as input.
1143
+
1144
+ 0:35:18.678 --> 0:35:20.432
1145
+ Yeah.
1146
+
1147
+ 0:35:20.640 --> 0:35:30.660
1148
+ So if we have a reference where she is an
1149
+ engineer okay, are there efforts to adjust
1150
+
1151
+ 0:35:30.660 --> 0:35:37.497
1152
+ the metric so that our transmissions go into
1153
+ the correct?
1154
+
1155
+ 0:35:37.379 --> 0:35:38.689
1156
+ We don't.
1157
+
1158
+ 0:35:38.618 --> 0:35:40.389
1159
+ Only done for mean this is evaluation.
1160
+
1161
+ 0:35:40.344 --> 0:35:42.388
1162
+ You are not pushing the model for anything.
1163
+
1164
+ 0:35:43.023 --> 0:35:53.458
1165
+ But if you want to do it in training, that
1166
+ you're not doing it this way.
1167
+
1168
+ 0:35:53.315 --> 0:35:58.465
1169
+ I'm not aware of any direct model.
1170
+
1171
+ 0:35:58.638 --> 0:36:04.146
1172
+ Because you have to find out, is it known
1173
+ in this scenario or not?
1174
+
1175
+ 0:36:05.725 --> 0:36:12.622
1176
+ So at least I'm not aware of there's like
1177
+ the directive doing training try to assess
1178
+
1179
+ 0:36:12.622 --> 0:36:13.514
1180
+ more than.
1181
+
1182
+ 0:36:13.813 --> 0:36:18.518
1183
+ Mean there is data augmentation in the way
1184
+ that is done.
1185
+
1186
+ 0:36:18.436 --> 0:36:23.967
1187
+ Think we'll have that later, so what you can
1188
+ do is generate more.
1189
+
1190
+ 0:36:24.144 --> 0:36:35.355
1191
+ You can do that automatically or there's ways
1192
+ of biasing so that you can try to make your
1193
+
1194
+ 0:36:35.355 --> 0:36:36.600
1195
+ training.
1196
+
1197
+ 0:36:36.957 --> 0:36:46.228
1198
+ That's typically not done with focusing on
1199
+ scenarios where you check before or do have
1200
+
1201
+ 0:36:46.228 --> 0:36:47.614
1202
+ information.
1203
+
1204
+ 0:36:49.990 --> 0:36:58.692
1205
+ Mean, but for everyone it's not clear and
1206
+ agree with you in this scenario, the normal
1207
+
1208
+ 0:36:58.692 --> 0:37:01.222
1209
+ evaluation system where.
1210
+
1211
+ 0:37:01.341 --> 0:37:07.006
1212
+ Maybe you could say it shouldn't do always
1213
+ the same but have a distribution like a training
1214
+
1215
+ 0:37:07.006 --> 0:37:12.733
1216
+ data or something like that because otherwise
1217
+ we're amplifying but that current system can't
1218
+
1219
+ 0:37:12.733 --> 0:37:15.135
1220
+ do current systems can't predict both.
1221
+
1222
+ 0:37:15.073 --> 0:37:17.377
1223
+ That's why we see all the beginning.
1224
+
1225
+ 0:37:17.314 --> 0:37:20.864
1226
+ They have this extra interface where they
1227
+ then propose.
1228
+
1229
+ 0:37:24.784 --> 0:37:33.896
1230
+ Another thing is the WinoMT test set, and
1231
+ it started from a challenge set for co-reference
1232
+
1233
+ 0:37:33.896 --> 0:37:35.084
1234
+ resolution.
1235
+
1236
+ 0:37:34.985 --> 0:37:43.503
1237
+ Co-reference resolution means we have a pronoun
1238
+ like 'him' or 'her' and we need to find out what it refers to.
1239
+
1240
+ 0:37:43.823 --> 0:37:53.620
1241
+ So you have 'the doctor asked the nurse to help
1242
+ her in the procedure, and now her does not
1243
+
1244
+ 0:37:53.620 --> 0:37:55.847
1245
+ refer to the nurse.
1246
+
1247
+ 0:37:56.556 --> 0:38:10.689
1248
+ And there you of course have the same type
1249
+ of stewardesses and the same type of buyers
1250
+
1251
+ 0:38:10.689 --> 0:38:15.237
1252
+ as the machine translation.
1253
+
1254
+ 0:38:16.316 --> 0:38:25.165
1255
+ And no think that normally yeah mean maybe
1256
+ that's also biased.
1257
+
1258
+ 0:38:27.687 --> 0:38:37.514
1259
+ No, but if you ask somebody, I guess if you
1260
+ ask somebody, then I mean syntectically it's
1261
+
1262
+ 0:38:37.514 --> 0:38:38.728
1263
+ ambiguous.
1264
+
1265
+ 0:38:38.918 --> 0:38:50.248
1266
+ If you ask somebody to help, then the horror
1267
+ has to refer to that.
1268
+
1269
+ 0:38:50.079 --> 0:38:54.990
1270
+ So it should also help the.
1271
+
1272
+ 0:38:56.396 --> 0:38:57.469
1273
+ Of the time.
1274
+
1275
+ 0:38:57.386 --> 0:39:03.907
1276
+ The doctor is female and says please have
1277
+ me in the procedure, but the other.
1278
+
1279
+ 0:39:04.904 --> 0:39:09.789
1280
+ Oh, you mean that it's helping the third person.
1281
+
1282
+ 0:39:12.192 --> 0:39:16.140
1283
+ Yeah, agree that it could also be yes.
1284
+
1285
+ 0:39:16.039 --> 0:39:19.037
1286
+ Don't know how easy that is.
1287
+
1288
+ 0:39:18.933 --> 0:39:21.109
1289
+ Only know the test.
1290
+
1291
+ 0:39:21.321 --> 0:39:31.820
1292
+ Then guess yeah, then you need a situation
1293
+ context where you know the situation, the other
1294
+
1295
+ 0:39:31.820 --> 0:39:34.589
1296
+ person having problems.
1297
+
1298
+ 0:39:36.936 --> 0:39:42.251
1299
+ Yeah no yeah that is like here when there
1300
+ is additional ambiguity in there.
1301
+
1302
+ 0:39:45.465 --> 0:39:48.395
1303
+ See that pure text models is not always okay.
1304
+
1305
+ 0:39:48.331 --> 0:39:51.136
1306
+ How full mean there is a lot of work also.
1307
+
1308
+ 0:39:52.472 --> 0:40:00.119
1309
+ Will not cover that in the lecture, but there
1310
+ are things like multimodal machine translation
1311
+
1312
+ 0:40:00.119 --> 0:40:07.109
1313
+ where you try to add pictures or something
1314
+ like that to have more context, and then.
1315
+
1316
+ 0:40:10.370 --> 0:40:23.498
1317
+ Yeah, it starts with this, so in order to
1318
+ evaluate that what it does is that you translate
1319
+
1320
+ 0:40:23.498 --> 0:40:25.229
1321
+ the system.
1322
+
1323
+ 0:40:25.305 --> 0:40:32.310
1324
+ It's doing stereotyping so the doctor is male
1325
+ and the nurse is female.
1326
+
1327
+ 0:40:32.492 --> 0:40:42.362
1328
+ And then you're using word alignment, and
1329
+ then you check whether this gender maps with
1330
+
1331
+ 0:40:42.362 --> 0:40:52.345
1332
+ the annotated gender of there, and that is
1333
+ how you evaluate in this WinoMT setting.
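A toy version of this evaluation step (the word list and the alignment are stand-ins; the real pipeline uses automatic word alignment and morphological analysis): look up the target word aligned to the annotated entity and check whether its grammatical gender matches the gold gender.

```python
# Hypothetical German gendered forms for a few occupations.
GENDER_OF = {"Arzt": "male", "Aerztin": "female",
             "Krankenpfleger": "male", "Krankenschwester": "female"}

def correct_gender(hypothesis, alignment, src_index, gold_gender):
    """alignment maps source token index -> target token index (assumed given)."""
    tgt_word = hypothesis.split()[alignment[src_index]]
    return GENDER_OF.get(tgt_word.strip(".,")) == gold_gender

hyp = "Der Arzt bat die Krankenschwester um Hilfe ."
# Gold annotation: the doctor (source index 1) is female in this sentence.
print(correct_gender(hyp, alignment={1: 1}, src_index=1, gold_gender="female"))  # False
```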
1334
+
1335
+ 0:40:52.832 --> 0:40:59.475
1336
+ Mean, as you see, you're only focusing on
1337
+ the situation where you can or where the gender
1338
+
1339
+ 0:40:59.475 --> 0:41:00.214
1340
+ is known.
1341
+
1342
+ 0:41:00.140 --> 0:41:06.915
1343
+ Why for this one you don't do any evaluation,
1344
+ but because nurses can in that case be those
1345
+
1346
+ 0:41:06.915 --> 0:41:08.703
1347
+ and you cannot, as has.
1348
+
1349
+ 0:41:08.728 --> 0:41:19.112
1350
+ The benchmarks are at the moment designed
1351
+ in a way that you only evaluate things that
1352
+
1353
+ 0:41:19.112 --> 0:41:20.440
1354
+ are known.
1355
+
1356
+ 0:41:23.243 --> 0:41:25.081
1357
+ Then yeah, you can have a look.
1358
+
1359
+ 0:41:25.024 --> 0:41:28.905
1360
+ For example, here what people are looking
1361
+ is you can do the first.
1362
+
1363
+ 0:41:28.847 --> 0:41:32.150
1364
+ Oh well, the currency, how often does it do
1365
+ it correct?
1366
+
1367
+ 0:41:32.552 --> 0:41:41.551
1368
+ And there you see these numbers are a bit
1369
+ older.
1370
+
1371
+ 0:41:41.367 --> 0:41:51.838
1372
+ There's more work on that, but this is the
1373
+ first color.
1374
+
1375
+ 0:41:51.731 --> 0:42:01.311
1376
+ Because they do it like in this test, they
1377
+ do it twice, one with him and one with her.
1378
+
1379
+ 0:42:01.201 --> 0:42:04.838
1380
+ So the chance is fifty percent.
1381
+
1382
+ 0:42:05.065 --> 0:42:12.097
1383
+ Except somehow here, the one system seems
1384
+ to be quite good there that everything.
1385
+
1386
+ 0:42:13.433 --> 0:42:30.863
1387
+ What you can also do is look at the difference,
1388
+ where you need to predict female and the difference.
1389
+
1390
+ 0:42:30.850 --> 0:42:40.338
1391
+ It's more often correct on the male forms
1392
+ than on the female forms, and you see that
1393
+
1394
+ 0:42:40.338 --> 0:42:43.575
1395
+ it's except for this system.
1396
+
1397
+ 0:42:43.603 --> 0:42:53.507
1398
+ So would assume that they maybe in this one
1399
+ language did some type of method in there.
1400
+
1401
+ 0:42:55.515 --> 0:42:57.586
1402
+ If you are more often mean there is like.
1403
+
1404
+ 0:42:58.178 --> 0:43:01.764
1405
+ It's not a lot lower, there's one.
1406
+
1407
+ 0:43:01.662 --> 0:43:08.893
1408
+ I don't know why, but if you're always to
1409
+ the same then it should be.
1410
+
1411
+ 0:43:08.789 --> 0:43:14.679
1412
+ You seem to be counter intuitive, so maybe
1413
+ it's better.
1414
+
1415
+ 0:43:15.175 --> 0:43:18.629
1416
+ Don't know exactly how yes, but it's, it's
1417
+ true.
1418
+
1419
+ 0:43:19.019 --> 0:43:20.849
1420
+ Mean, there's very few cases.
1421
+
1422
+ 0:43:20.788 --> 0:43:22.686
1423
+ I also don't know for Russian.
1424
+
1425
+ 0:43:22.624 --> 0:43:27.480
1426
+ I mean, there is, I think, mainly for Russian
1427
+ where you have very low numbers.
1428
+
1429
+ 0:43:27.418 --> 0:43:30.076
1430
+ I mean, I would say like forty five or so.
1431
+
1432
+ 0:43:30.014 --> 0:43:32.878
1433
+ There can be more about renting and sampling.
1434
+
1435
+ 0:43:32.816 --> 0:43:37.287
1436
+ I don't know if they have even more gender
1437
+ or if they have a new tool.
1438
+
1439
+ 0:43:37.224 --> 0:43:38.424
1440
+ I don't think so.
1441
+
1442
+ 0:43:40.040 --> 0:43:46.901
1443
+ Then you have typically even a stronger bias
1444
+ here where you not do the differentiation between
1445
+
1446
+ 0:43:46.901 --> 0:43:53.185
1447
+ how often is it correct for me and the female,
1448
+ but you are distinguishing between the.
1449
+
1450
+ 0:43:53.553 --> 0:44:00.503
1451
+ So you're here, for you can check for each
1452
+ occupation, which is the most important.
1453
+
1454
+ 0:44:00.440 --> 0:44:06.182
1455
+ A comment one based on statistics, and then
1456
+ you take that on the one side and the anti
1457
+
1458
+ 0:44:06.182 --> 0:44:12.188
1459
+ stereotypically on the other side, and you
1460
+ see that not in all cases but in a lot of cases
1461
+
1462
+ 0:44:12.188 --> 0:44:16.081
1463
+ that null probabilities are even higher than
1464
+ on the other.
1465
+
1466
+ 0:44:21.061 --> 0:44:24.595
1467
+ Ah, I'm telling you there's something.
1468
+
1469
+ 0:44:28.668 --> 0:44:32.850
1470
+ But it has to be for a doctor.
1471
+
1472
+ 0:44:32.715 --> 0:44:39.597
1473
+ For example, for a doctor there three don't
1474
+ know.
1475
+
1476
+ 0:44:40.780 --> 0:44:44.275
1477
+ Yeah, but guess here it's mainly imminent
1478
+ job description.
1479
+
1480
+ 0:44:44.215 --> 0:44:45.108
1481
+ So yeah, but.
1482
+
1483
+ 0:44:50.050 --> 0:45:01.145
1484
+ And then there is the Arabic capital gender
1485
+ corpus where it is about more assessing how
1486
+
1487
+ 0:45:01.145 --> 0:45:03.289
1488
+ strong a singer.
1489
+
1490
+ 0:45:03.483 --> 0:45:09.445
1491
+ How that is done is the open subtitles.
1492
+
1493
+ 0:45:09.296 --> 0:45:18.690
1494
+ Corpus is like a corpus of subtitles generated
1495
+ by volunteers.
1496
+
1497
+ 0:45:18.558 --> 0:45:23.426
1498
+ For the Words Like I Mean Myself.
1499
+
1500
+ 0:45:23.303 --> 0:45:30.670
1501
+ And mine, and then they annotated the Arabic
1502
+ sentences, whether here I refer to as a female
1503
+
1504
+ 0:45:30.670 --> 0:45:38.198
1505
+ and masculine, or whether it's ambiguous, and
1506
+ then from the male and female one they generate
1507
+
1508
+ 0:45:38.198 --> 0:45:40.040
1509
+ types of translations.
1510
+
1511
+ 0:45:43.703 --> 0:45:51.921
1512
+ And then a bit more different test sets as
1513
+ the last one that is referred to as the machine.
1514
+
1515
+ 0:45:52.172 --> 0:45:57.926
1516
+ Corpus, which is based on these lectures.
1517
+
1518
+ 0:45:57.789 --> 0:46:05.464
1519
+ In general, this lecture is very important
1520
+ because it.
1521
+
1522
+ 0:46:05.765 --> 0:46:22.293
1523
+ And here is also interesting because you also
1524
+ have the audio signal, and it's done in the
1525
+
1526
+ 0:46:22.293 --> 0:46:23.564
1527
+ worst.
1528
+
1529
+ 0:46:23.763 --> 0:46:27.740
1530
+ In the first case is where it can only be
1531
+ determined based on the speaker.
1532
+
1533
+ 0:46:27.968 --> 0:46:30.293
1534
+ So something like am a good speaker.
1535
+
1536
+ 0:46:30.430 --> 0:46:32.377
1537
+ You cannot do that correctly.
1538
+
1539
+ 0:46:32.652 --> 0:46:36.970
1540
+ However, if you would have the audio signal
1541
+ you should have a much better guess.
1542
+
1543
+ 0:46:37.257 --> 0:46:47.812
1544
+ So it wasn't evaluated, especially machine
1545
+ translation and speech translation system,
1546
+
1547
+ 0:46:47.812 --> 0:46:53.335
1548
+ which take this into account or, of course,.
1549
+
1550
+ 0:46:57.697 --> 0:47:04.265
1551
+ The second thing is where you can do it based
1552
+ on the context.
1553
+
1554
+ 0:47:04.159 --> 0:47:08.717
1555
+ In this case we are not using artificial.
1556
+
1557
+ 0:47:11.011 --> 0:47:15.550
1558
+ Cope from the from the real data, so it's
1559
+ not like artificial creative data, but.
1560
+
1561
+ 0:47:15.815 --> 0:47:20.939
1562
+ Of course, in a lot more work you have to
1563
+ somehow find these in the corpus and use them
1564
+
1565
+ 0:47:20.939 --> 0:47:21.579
1566
+ as a test.
1567
+
1568
+ 0:47:21.601 --> 0:47:27.594
1569
+ Is something she got together with two of
1570
+ her dearest friends, this older woman, and
1571
+
1572
+ 0:47:27.594 --> 0:47:34.152
1573
+ then, of course, here friends can we get from
1574
+ the context, but it might be that some systems
1575
+
1576
+ 0:47:34.152 --> 0:47:36.126
1577
+ ignore that that should be.
1578
+
1579
+ 0:47:36.256 --> 0:47:43.434
1580
+ So you have two test sets in there, two types
1581
+ of benchmarks, and you want to determine which
1582
+
1583
+ 0:47:43.434 --> 0:47:43.820
1584
+ one.
1585
+
1586
+ 0:47:47.787 --> 0:47:54.443
1587
+ Modeling in Machine Translation: Yes, this is
1588
+ how we can evaluate it, so the next question
1589
+
1590
+ 0:47:54.443 --> 0:48:01.397
1591
+ is how can we improve our systems because that's
1592
+ normally how we do evaluation and why we do
1593
+
1594
+ 0:48:01.397 --> 0:48:04.238
1595
+ evaluation so before we go into that?
1596
+
1597
+ 0:48:08.508 --> 0:48:22.685
1598
+ One idea is to do what is referred to as modeling,
1599
+ so the idea is somehow change the model in
1600
+
1601
+ 0:48:22.685 --> 0:48:24.495
1602
+ a way that.
1603
+
1604
+ 0:48:24.965 --> 0:48:38.271
1605
+ And yes, one idea is, of course, if we are
1606
+ giving him more information, the system doesn't
1607
+
1608
+ 0:48:38.271 --> 0:48:44.850
1609
+ need to do a guess without this information.
1610
+
1611
+ 0:48:44.724 --> 0:48:47.253
1612
+ In order to just ambiguate the bias,.
1613
+
1614
+ 0:48:47.707 --> 0:48:59.746
1615
+ The first thing is you can do that on the
1616
+ sentence level, for example, especially if
1617
+
1618
+ 0:48:59.746 --> 0:49:03.004
1619
+ you have the speakers.
1620
+
1621
+ 0:49:03.063 --> 0:49:14.585
1622
+ You can annotate the sentence with whether
1623
+ a speaker is male or female, and then you
1624
+
1625
+ 0:49:14.585 --> 0:49:26.505
1626
+ can: Here we're seeing one thing which is very
1627
+ successful in neuromachine translation and
1628
+
1629
+ 0:49:26.505 --> 0:49:30.743
1630
+ other kinds of neural networks.
1631
+
1632
+ 0:49:31.711 --> 0:49:39.546
1633
+ However, in neuromachine translation, since
1634
+ we have no longer the strong correlation between
1635
+
1636
+ 0:49:39.546 --> 0:49:47.043
1637
+ input and output, the nice thing is you can
1638
+ normally put everything into your input, and
1639
+
1640
+ 0:49:47.043 --> 0:49:50.834
1641
+ if you have enough data, it's well balanced.
1642
+
1643
+ 0:49:51.151 --> 0:50:00.608
1644
+ So how you can do it here is you can add the
1645
+ token here saying female or male if the speaker
1646
+
1647
+ 0:50:00.608 --> 0:50:01.523
1648
+ is male.
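A minimal sketch of this kind of tagging (the tag strings and the example sentence are illustrative, not the lecture's actual setup): the gender is simply prepended as an extra pseudo-token on the source side, both when building the training data and at inference time.

```python
def tag_source(src, speaker_gender=None):
    """Prepend a pseudo-token such as '<female>' so the model can use it to disambiguate."""
    if speaker_gender is None:              # gender unknown: leave the sentence untouched
        return src
    return f"<{speaker_gender}> {src}"

print(tag_source("I am happy.", "female"))  # -> "<female> I am happy."
print(tag_source("I am happy."))            # -> "I am happy."
```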
1649
+
1650
+ 0:50:01.881 --> 0:50:07.195
1651
+ So, of course, this is no longer for human
1652
+ correct translation.
1653
+
1654
+ 0:50:07.112 --> 0:50:09.855
1655
+ It's like female Madam because.
1656
+
1657
+ 0:50:10.090 --> 0:50:22.951
1658
+ If you are doing the same thing then the translation
1659
+ would not be to translate female but can use
1660
+
1661
+ 0:50:22.951 --> 0:50:25.576
1662
+ it to disambiguate.
1663
+
1664
+ 0:50:25.865 --> 0:50:43.573
1665
+ And so this type of tagging is a very commonly
1666
+ used method in order to add more information.
1667
+
1668
+ 0:50:47.107 --> 0:50:54.047
1669
+ So this is first of all a very good thing,
1670
+ a very easy one.
1671
+
1672
+ 0:50:53.931 --> 0:50:57.637
1673
+ You don't have to change your.
1674
+
1675
+ 0:50:58.018 --> 0:51:04.581
1676
+ For example, has also been done if you think
1677
+ about formality in German.
1678
+
1679
+ 0:51:04.490 --> 0:51:11.479
1680
+ Whether you have to produce or, you can: We'll
1681
+ see it on Thursday.
1682
+
1683
+ 0:51:11.375 --> 0:51:19.621
1684
+ It's a very common approach for domains, so
1685
+ you put in the domain beforehand.
1686
+
1687
+ 0:51:19.515 --> 0:51:24.592
1688
+ This is from a Twitter or something like that.
1689
+
1690
+ 0:51:24.904 --> 0:51:36.239
1691
+ Of course, it only learns it if it has seen
1692
+ it and it dees them out, but in this case you
1693
+
1694
+ 0:51:36.239 --> 0:51:38.884
1695
+ don't need an equal.
1696
+
1697
+ 0:51:39.159 --> 0:51:42.593
1698
+ But however, it's still like challenging to
1699
+ get this availability.
1700
+
1701
+ 0:51:42.983 --> 0:51:55.300
1702
+ If you would do that on the first of all,
1703
+ of course, it only works if you really have
1704
+
1705
+ 0:51:55.300 --> 0:52:02.605
1706
+ data from speaking because otherwise it's unclear.
1707
+
1708
+ 0:52:02.642 --> 0:52:09.816
1709
+ You would only have the text and you would
1710
+ not easily see whether it is the male or the
1711
+
1712
+ 0:52:09.816 --> 0:52:14.895
1713
+ female speaker because this information has
1714
+ been removed from.
1715
+
1716
+ 0:52:16.456 --> 0:52:18.745
1717
+ Does anybody of you have an idea of how it
1718
+ fits?
1719
+
1720
+ 0:52:20.000 --> 0:52:25.480
1721
+ Manage that and still get the data of whether
1722
+ it's a male or a female speaking.
1723
+
1724
+ 0:52:32.152 --> 0:52:34.270
1725
+ Can do a small trick.
1726
+
1727
+ 0:52:34.174 --> 0:52:37.836
1728
+ We can just look on the target side.
1729
+
1730
+ 0:52:37.937 --> 0:52:43.573
1731
+ Mean this is, of course, only important if
1732
+ in the target side this is the case.
1733
+
1734
+ 0:52:44.004 --> 0:52:50.882
1735
+ So for your training data you can annotate
1736
+ it based on your target site in German you
1737
+
1738
+ 0:52:50.882 --> 0:52:51.362
1739
+ know.
1740
+
1741
+ 0:52:51.282 --> 0:52:58.383
1742
+ In German you don't know but in Spanish for
1743
+ example you know because different and then
1744
+
1745
+ 0:52:58.383 --> 0:53:00.400
1746
+ you can use grammatical.
1747
+
1748
+ 0:53:00.700 --> 0:53:10.964
1749
+ Of course, at test time you would still need to
1750
+ do that more interface decision.
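A toy heuristic in the spirit of this trick (the word lists are tiny stand-ins; a real setup would use a morphological analyser or richer rules): inspect gendered forms on the target side of the training data and derive from them the tag to attach to the source sentence.

```python
# Toy target-side labelling for an English->Spanish training corpus.
FEMALE_CUES = {"cansada", "encantada", "doctora"}
MALE_CUES = {"cansado", "encantado", "doctor"}

def speaker_tag_from_target(target_sentence):
    tokens = {w.strip(".,!?").lower() for w in target_sentence.split()}
    if tokens & FEMALE_CUES and not tokens & MALE_CUES:
        return "<female>"
    if tokens & MALE_CUES and not tokens & FEMALE_CUES:
        return "<male>"
    return ""                               # ambiguous: leave the source untagged

src, tgt = "I am tired.", "Estoy cansada."
print(f"{speaker_tag_from_target(tgt)} {src}".strip())  # -> "<female> I am tired."
```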
1751
+
1752
+ 0:53:13.954 --> 0:53:18.854
1753
+ And: You can, of course, do it even more advanced.
1754
+
1755
+ 0:53:18.898 --> 0:53:30.659
1756
+ You can even try to add these information
1757
+ to each word, so you're not doing it for the
1758
+
1759
+ 0:53:30.659 --> 0:53:32.687
1760
+ full sentence.
1761
+
1762
+ 0:53:32.572 --> 0:53:42.129
1763
+ If it's unknown, if it's female or if it's
1764
+ male, you know word alignment so you can't
1765
+
1766
+ 0:53:42.129 --> 0:53:42.573
1767
+ do.
1768
+
1769
+ 0:53:42.502 --> 0:53:55.919
1770
+ Here then you can do a word alignment, which
1771
+ is of course not always perfect, but roughly
1772
+
1773
+ 0:53:55.919 --> 0:53:59.348
1774
+ then you can annotate.
1775
+
1776
+ 0:54:01.401 --> 0:54:14.165
1777
+ Now you have these type of inputs where you
1778
+ have one information per word, but on the one
1779
+
1780
+ 0:54:14.165 --> 0:54:16.718
1781
+ end you have the.
1782
+
1783
+ 0:54:17.517 --> 0:54:26.019
1784
+ This has been used before in other scenarios,
1785
+ so you might not put in the gender, but in
1786
+
1787
+ 0:54:26.019 --> 0:54:29.745
1788
+ general this can be other information.
1789
+
1790
+ 0:54:30.090 --> 0:54:39.981
1791
+ And people refer to that or have used that
1792
+ as a factored translation model, so what you
1793
+
1794
+ 0:54:39.981 --> 0:54:42.454
1795
+ may do is you factor.
1796
+
1797
+ 0:54:42.742 --> 0:54:45.612
1798
+ You have the word itself.
1799
+
1800
+ 0:54:45.501 --> 0:54:48.513
1801
+ You might have the gender.
1802
+
1803
+ 0:54:48.401 --> 0:54:55.988
1804
+ You could have more information like don't
1805
+ know, the part of speech.
1806
+
1807
+ 0:54:56.316 --> 0:54:58.564
1808
+ And then you have an embedding for each of
1809
+ them.
1810
+
1811
+ 0:54:59.199 --> 0:55:03.599
1812
+ And you concatenate them, and then you have
1813
+ this concatenated embedding.
1814
+
1815
+ 0:55:03.563 --> 0:55:09.947
1816
+ Which says okay, this is a female plumber
1817
+ or a male plumber or so on.
1818
+
1819
+ 0:55:09.856 --> 0:55:18.032
1820
+ This has additional information and then you
1821
+ can train this factory model where you have
1822
+
1823
+ 0:55:18.032 --> 0:55:22.534
1824
+ the ability to give the model extra information.
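A minimal PyTorch-style sketch of such a factored input representation (vocabulary sizes, dimensions and the factor inventory are arbitrary placeholders): each word embedding is concatenated with the embedding of its gender factor before being fed to the encoder.

```python
import torch
import torch.nn as nn

class FactoredEmbedding(nn.Module):
    """Concatenate a word embedding with a factor (e.g. gender) embedding."""
    def __init__(self, vocab_size=10000, factor_size=3, d_word=500, d_factor=12):
        super().__init__()
        self.word = nn.Embedding(vocab_size, d_word)
        self.factor = nn.Embedding(factor_size, d_factor)   # 0=neutral, 1=male, 2=female

    def forward(self, word_ids, factor_ids):
        return torch.cat([self.word(word_ids), self.factor(factor_ids)], dim=-1)

emb = FactoredEmbedding()
words = torch.tensor([[5, 42, 7]])        # token ids of a short sentence (made up)
facts = torch.tensor([[0, 2, 0]])         # the middle word marked as female
print(emb(words, facts).shape)            # torch.Size([1, 3, 512])
```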
1825
+
1826
+ 0:55:23.263 --> 0:55:35.702
1827
+ And of course now if you are training this
1828
+ way directly you always need to have this information.
1829
+
1830
+ 0:55:36.576 --> 0:55:45.396
1831
+ So that might not be the best way if you want
1832
+ to use a translation system and sometimes don't
1833
+
1834
+ 0:55:45.396 --> 0:55:45.959
1835
+ have.
1836
+
1837
+ 0:55:46.866 --> 0:55:57.987
1838
+ So any idea of how you can train it or what
1839
+ machine learning technique you can use to deal
1840
+
1841
+ 0:55:57.987 --> 0:55:58.720
1842
+ with.
1843
+
1844
+ 0:56:03.263 --> 0:56:07.475
1845
+ Mainly despite it already, many of your things.
1846
+
1847
+ 0:56:14.154 --> 0:56:21.521
1848
+ Drop out so you sometimes put information
1849
+ in there and then you can use dropouts to inputs.
1850
+
1851
+ 0:56:21.861 --> 0:56:27.599
1852
+ Is sometimes put in this information in there,
1853
+ sometimes not, and the system is then able
1854
+
1855
+ 0:56:27.599 --> 0:56:28.874
1856
+ to deal with those.
1857
+
1858
+ 0:56:28.811 --> 0:56:34.776
1859
+ If it doesn't have the information, it's doing
1860
+ some of the best it can do, but if it has the
1861
+
1862
+ 0:56:34.776 --> 0:56:39.203
1863
+ information, it can use the information and
1864
+ maybe do a more rounded.
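A small sketch of this dropout idea (the drop rate is an arbitrary choice): during training the gender tag is randomly removed from some examples, so the model learns to translate both with and without the extra information.

```python
import random

def maybe_drop_tag(tagged_source, drop_prob=0.3):
    """Randomly strip a leading '<male>'/'<female>' tag from a training example."""
    tokens = tagged_source.split()
    if tokens and tokens[0] in ("<male>", "<female>") and random.random() < drop_prob:
        return " ".join(tokens[1:])
    return tagged_source

random.seed(0)
for _ in range(3):
    print(maybe_drop_tag("<female> I am happy ."))
```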
1865
+
1866
+ 0:56:46.766 --> 0:56:52.148
1867
+ Context Based Machine Translation: So then there
1868
+ is, of course, more ways to try to do a moderately
1869
+
1870
+ 0:56:52.148 --> 0:56:52.807
1871
+ biased one.
1872
+
1873
+ 0:56:52.993 --> 0:57:01.690
1874
+ We will only want to mention here because
1875
+ you'll have a full lecture on that next week
1876
+
1877
+ 0:57:01.690 --> 0:57:08.188
1878
+ and that is referred to where context based
1879
+ machine translation.
1880
+
1881
+ 0:57:08.728 --> 0:57:10.397
1882
+ Good, and in this other ones, but.
1883
+
1884
+ 0:57:10.750 --> 0:57:16.830
1885
+ If you translate several sentences well, of
1886
+ course, there are more situations where you
1887
+
1888
+ 0:57:16.830 --> 0:57:17.866
1889
+ can disambiguate.
1890
+
1891
+ 0:57:18.118 --> 0:57:23.996
1892
+ Because it might be that the information is
1893
+ not in the current sentence, but it's in the
1894
+
1895
+ 0:57:23.996 --> 0:57:25.911
1896
+ previous sentence or before.
1897
+
1898
+ 0:57:26.967 --> 0:57:33.124
1899
+ If you have the mean with the speaker maybe
1900
+ not, but if it's referring to, you can core
1901
+
1902
+ 0:57:33.124 --> 0:57:33.963
1903
+ references.
1904
+
1905
+ 0:57:34.394 --> 0:57:40.611
1906
+ They are often referring to things in the
1907
+ previous sentence so you can use them in order
1908
+
1909
+ 0:57:40.611 --> 0:57:44.104
1910
+ to: And that can be done basically and very
1911
+ easy.
1912
+
1913
+ 0:57:44.034 --> 0:57:47.438
1914
+ You'll see more advanced options, but the
1915
+ main.
1916
+
1917
+ 0:57:48.108 --> 0:57:58.516
1918
+ Mean, no machine translation is a sequence
1919
+ to sequence model, which can use any input
1920
+
1921
+ 0:57:58.516 --> 0:58:02.993
1922
+ sequence to output sequence mapping.
1923
+
1924
+ 0:58:02.872 --> 0:58:04.337
1925
+ So now at.
1926
+
1927
+ 0:58:04.484 --> 0:58:11.281
1928
+ So then you can do, for example, five to five
1929
+ translations, or also five to one, and so on.
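One simple way to feed such extra context, sketched below under the assumption that sentences are joined with a special separator token (the details vary between systems): the previous sentences are concatenated to the current one and the model is trained on the longer sequence.

```python
SEP = " <sep> "   # assumed separator token added to the vocabulary

def with_context(sentences, i, window=2):
    """Return sentence i prefixed by up to `window` previous sentences."""
    start = max(0, i - window)
    return SEP.join(sentences[start:i + 1])

doc = ["She is a doctor.", "I met her yesterday.", "I am going to the bank."]
print(with_context(doc, 2))
# -> "She is a doctor. <sep> I met her yesterday. <sep> I am going to the bank."
```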
1930
+
1931
+ 0:58:11.811 --> 0:58:19.211
1932
+ This is not a method like only dedicated to
1933
+ bias, of course, but the hope is.
1934
+
1935
+ 0:58:19.139 --> 0:58:25.534
1936
+ If you're using this because I mean bias often,
1937
+ we have seen that it rises in situations where
1938
+
1939
+ 0:58:25.534 --> 0:58:27.756
1940
+ we're not having enough context.
1941
+
1942
+ 0:58:27.688 --> 0:58:32.940
1943
+ So the idea is if we generally increase our
1944
+ context, it will also help this.
1945
+
1946
+ 0:58:32.932 --> 0:58:42.378
1947
+ Of course, it will help other situations where
1948
+ you need context to disambiguate.
1949
+
1950
+ 0:58:43.603 --> 0:58:45.768
1951
+ Get There If You're Saying I'm Going to the
1952
+ Bank.
1953
+
1954
+ 0:58:46.286 --> 0:58:54.761
1955
+ It's not directly from this sentence clear
1956
+ whether it's the financial institute or the bench
1957
+
1958
+ 0:58:54.761 --> 0:58:59.093
1959
+ for sitting, but maybe if you say afterward,.
1960
+
1961
+ 0:59:02.322 --> 0:59:11.258
1962
+ And then there is in generally a very large
1963
+ amount of work on debiasing the word embeddings.
1964
+
1965
+ 0:59:11.161 --> 0:59:20.098
1966
+ So the one I hear like, I mean, I think that
1967
+ partly comes from the fact that like a first.
1968
+
1969
+ 0:59:21.041 --> 0:59:26.925
1970
+ Or that first research was done often on inspecting
1971
+ the word embeddings and seeing whether they
1972
+
1973
+ 0:59:26.925 --> 0:59:32.503
1974
+ are biased or not, and people found out how
1975
+ there is some bias in there, and then the idea
1976
+
1977
+ 0:59:32.503 --> 0:59:38.326
1978
+ is oh, if you remove them from the word embedded
1979
+ in already, then maybe your system later will
1980
+
1981
+ 0:59:38.326 --> 0:59:39.981
1982
+ not have that strong of a.
1983
+
1984
+ 0:59:40.520 --> 0:59:44.825
1985
+ So how can that work?
1986
+
1987
+ 0:59:44.629 --> 0:59:56.360
1988
+ Or maybe first, how do word embeddings encode
1989
+ bias in there?
1990
+
1991
+ 0:59:56.161 --> 0:59:57.221
1992
+ So.
1993
+
1994
+ 0:59:57.137 --> 1:00:06.152
1995
+ So you can look at the word embedding, and
1996
+ then you can compare the distance of the word
1997
+
1998
+ 1:00:06.152 --> 1:00:11.116
1999
+ compared: And there's like interesting findings.
2000
+
2001
+ 1:00:11.015 --> 1:00:18.285
2002
+ For example, you have the difference in occupation
2003
+ and how similar.
2004
+
2005
+ 1:00:18.678 --> 1:00:33.068
2006
+ And of course it's not a perfect correlation,
2007
+ but you see some type of correlation: jobs
2008
+
2009
+ 1:00:33.068 --> 1:00:37.919
2010
+ which have a high occupation.
2011
+
2012
+ 1:00:37.797 --> 1:00:41.387
2013
+ They also are more similar to the word what
2014
+ we're going to be talking about.
2015
+
2016
+ 1:00:43.023 --> 1:00:50.682
2017
+ Maybe a secretary is also a bit difficult,
2018
+ but because yeah maybe it's more often.
2019
+
2020
+ 1:00:50.610 --> 1:00:52.438
2021
+ Done in general by by women.
2022
+
2023
+ 1:00:52.375 --> 1:00:58.208
2024
+ However, there is a secretary like the Secretary
2025
+ of State or so, the German minister, which
2026
+
2027
+ 1:00:58.208 --> 1:01:03.406
2028
+ I of course know that many so in the statistics
2029
+ they are not counting that often.
2030
+
2031
+ 1:01:03.543 --> 1:01:11.576
2032
+ But in the data they of course occur quite often,
2033
+ so there's different ways of different meanings.
2034
+
2035
+ 1:01:14.154 --> 1:01:23.307
2036
+ So how can you not try to remove this type
2037
+ of bias?
2038
+
2039
+ 1:01:23.131 --> 1:01:32.992
2040
+ One way is the idea of hard-debiasing the
2041
+ embeddings.
2042
+
2043
+ 1:01:33.113 --> 1:01:39.354
2044
+ So if you remember on word embeddings think
2045
+ we have this image that you can do the difference
2046
+
2047
+ 1:01:39.354 --> 1:01:44.931
2048
+ between man and woman and add this difference
2049
+ to 'king' and then you end up near 'queen'.
2050
+
2051
+ 1:01:45.865 --> 1:01:57.886
2052
+ So here's the idea we want to remove this
2053
+ gender information from some things which should
2054
+
2055
+ 1:01:57.886 --> 1:02:00.132
2056
+ not have gender.
2057
+
2058
+ 1:02:00.120 --> 1:02:01.386
2059
+ The word engineer.
2060
+
2061
+ 1:02:01.320 --> 1:02:06.854
2062
+ There is no information about the gender in
2063
+ that, so you should remove this type.
2064
+
2065
+ 1:02:07.347 --> 1:02:16.772
2066
+ Of course, you first need to find out where
2067
+ this information is and you can.
2068
+
2069
+ 1:02:17.037 --> 1:02:23.603
2070
+ However, normally if you do the difference
2071
+ like the subspace by only one example, it's
2072
+
2073
+ 1:02:23.603 --> 1:02:24.659
2074
+ not the best.
2075
+
2076
+ 1:02:24.924 --> 1:02:31.446
2077
+ So you can do the same thing for things like
2078
+ brother and sister, man and dad, and then you
2079
+
2080
+ 1:02:31.446 --> 1:02:38.398
2081
+ can somehow take the average of these differences
2082
+ saying this is a vector which maps a male from
2083
+
2084
+ 1:02:38.398 --> 1:02:39.831
2085
+ to the female form.
2086
+
2087
+ 1:02:40.660 --> 1:02:50.455
2088
+ And then you can try to neutralize this gender
2089
+ information on this dimension.
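A compact numpy sketch of this neutralisation step (random stand-in vectors again; the full hard-debiasing method also uses a PCA over several definitional pairs and an extra equalisation step): estimate a gender direction from word pairs and project it out of words that should be gender-neutral.

```python
import numpy as np

rng = np.random.default_rng(1)
emb = {w: rng.normal(size=50) for w in
       ["he", "she", "man", "woman", "brother", "sister", "engineer", "nurse"]}

# 1) Estimate the gender direction as the average difference of definitional pairs.
pairs = [("he", "she"), ("man", "woman"), ("brother", "sister")]
g = np.mean([emb[m] - emb[f] for m, f in pairs], axis=0)
g /= np.linalg.norm(g)

# 2) Neutralise: remove the component along g from words that should carry no gender.
def neutralize(v, direction):
    return v - (v @ direction) * direction

for w in ["engineer", "nurse"]:
    emb[w] = neutralize(emb[w], g)
    print(w, "component along gender direction:", round(float(emb[w] @ g), 6))  # ~0.0
```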
2090
+
2091
+ 1:02:50.490 --> 1:02:57.951
2092
+ You can find it's subspace or dimensional.
2093
+
2094
+ 1:02:57.777 --> 1:03:08.884
2095
+ It would be a line, but now this is dimensional,
2096
+ and then you.
2097
+
2098
+ 1:03:08.728 --> 1:03:13.104
2099
+ Representation: Where you remove this type
2100
+ of embellishment.
2101
+
2102
+ 1:03:15.595 --> 1:03:18.178
2103
+ This is, of course, quite strong of the questions.
2104
+
2105
+ 1:03:18.128 --> 1:03:19.058
2106
+ How good does it?
2107
+
2108
+ 1:03:19.006 --> 1:03:20.714
2109
+ Thanks tell them for one other.
2110
+
2111
+ 1:03:20.880 --> 1:03:28.256
2112
+ But it's an idea we are trying to after learning
2113
+ before we are using the word embeddings for
2114
+
2115
+ 1:03:28.256 --> 1:03:29.940
2116
+ machine translation.
2117
+
2118
+ 1:03:29.859 --> 1:03:37.303
2119
+ We are trying to remove the gender information
2120
+ from the jobs and then have a representation
2121
+
2122
+ 1:03:37.303 --> 1:03:38.679
2123
+ which hopefully.
2124
+
2125
+ 1:03:40.240 --> 1:03:45.047
2126
+ A similar idea is the one of gender-neutral
2127
+ GloVe.
2128
+
2129
+ 1:03:44.949 --> 1:03:50.250
2130
+ Glove is another technique to learn word embeddings.
2131
+
2132
+ 1:03:50.750 --> 1:03:52.870
2133
+ Think we discussed one shortly.
2134
+
2135
+ 1:03:52.804 --> 1:03:56.183
2136
+ It was word2vec, which was one of the first
2137
+ one.
2138
+
2139
+ 1:03:56.456 --> 1:04:04.383
2140
+ But there are other of course methods how
2141
+ you can train word embeddings and glove as
2142
+
2143
+ 1:04:04.383 --> 1:04:04.849
2144
+ one.
2145
+
2146
+ 1:04:04.756 --> 1:04:07.464
2147
+ The idea is we're training.
2148
+
2149
+ 1:04:07.747 --> 1:04:19.007
2150
+ At least this is somehow a bit separated,
2151
+ so where you have part of the vector is gender
2152
+
2153
+ 1:04:19.007 --> 1:04:20.146
2154
+ neutral.
2155
+
2156
+ 1:04:20.300 --> 1:04:29.247
2157
+ What you need therefore is three sets of words,
2158
+ so you have male words, female words, and neutral words.
2159
+
2160
+ 1:04:29.769 --> 1:04:39.071
2161
+ And then you're trying to learn some type
2162
+ of vector where some dimensions are not.
2163
+
2164
+ 1:04:39.179 --> 1:04:51.997
2165
+ So the idea is can learn a representation
2166
+ where at least know that this part is gender
2167
+
2168
+ 1:04:51.997 --> 1:04:56.123
2169
+ neutral and the other part.
2170
+
2171
+ 1:05:00.760 --> 1:05:03.793
2172
+ How can we do that?
2173
+
2174
+ 1:05:03.641 --> 1:05:12.363
2175
+ How can we change the system to learn anything
2176
+ specific?
2177
+
2178
+ 1:05:12.210 --> 1:05:20.476
2179
+ Nearly in all cases this works by the loss
2180
+ function.
2181
+
2182
+ 1:05:20.520 --> 1:05:26.206
2183
+ And that is more a general approach in machine
2184
+ translation.
2185
+
2186
+ 1:05:26.111 --> 1:05:30.567
2187
+ The general loss function is we are learning.
2188
+
2189
+ 1:05:31.111 --> 1:05:33.842
2190
+ Here is the same idea.
2191
+
2192
+ 1:05:33.723 --> 1:05:44.378
2193
+ You have the general loss function in order
2194
+ to learn good embeddings and then you try to
2195
+
2196
+ 1:05:44.378 --> 1:05:48.688
2197
+ introduce additional loss function.
2198
+
2199
+ 1:05:48.969 --> 1:05:58.213
2200
+ Yes, I think yes, yes, that's the solution,
2201
+ and how you make sure that if I have training
2202
+
2203
+ 1:05:58.213 --> 1:06:07.149
2204
+ data where all nurses are female, how do you make sure
2205
+ that the algorithm puts it into neutral?
2206
+
2207
+ 1:06:07.747 --> 1:06:12.448
2208
+ And you need, so this is like for only the
2209
+ first learning of word embeddings.
2210
+
2211
+ 1:06:12.388 --> 1:06:18.019
2212
+ Then the idea is if you have word embeddings
2213
+ where the gender is separate and then you train
2214
+
2215
+ 1:06:18.019 --> 1:06:23.711
2216
+ on top of that machine translation where you
2217
+ don't change the embeddings, it should hopefully
2218
+
2219
+ 1:06:23.711 --> 1:06:25.225
2220
+ be less and less biased.
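A tiny sketch of the step just described — keeping the (debiased) embeddings fixed while the translation model is trained on top. PyTorch is assumed, and the matrix is random only so the snippet runs:

```python
import torch
import torch.nn as nn

# stand-in for an already debiased (vocab x dim) embedding matrix
debiased_matrix = torch.randn(1000, 300)

# freeze=True keeps the vectors fixed, so later MT training cannot
# re-introduce the gender correlations into them
emb = nn.Embedding.from_pretrained(debiased_matrix, freeze=True)
```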
2221
+
2222
+ 1:06:25.865 --> 1:06:33.465
2223
+ And in order to train that yes you need additional
2224
+ information, so this information needs to be
2225
+
2226
+ 1:06:33.465 --> 1:06:40.904
2227
+ hand-defined and they can't be general, so
2228
+ you need to have a list of these are male persons
2229
+
2230
+ 1:06:40.904 --> 1:06:44.744
2231
+ or males these are nouns for females and these.
2232
+
2233
+ 1:06:49.429 --> 1:06:52.575
2234
+ So the first step, of course, we still want
2235
+ to have good word embeddings.
2236
+
2237
+ 1:06:54.314 --> 1:07:04.100
2238
+ So you have the normal objective function
2239
+ of the word embedding.
2240
+
2241
+ 1:07:03.949 --> 1:07:09.524
2242
+ It's something like the similarity.
2243
+
2244
+ 1:07:09.849 --> 1:07:19.751
2245
+ How it's exactly derived is not that important
2246
+ because we're not interested in GloVe itself,
2247
+
2248
+ 1:07:19.751 --> 1:07:23.195
2249
+ but you have any loss function.
2250
+
2251
+ 1:07:23.087 --> 1:07:26.857
2252
+ Of course, you have to keep that.
2253
+
2254
+ 1:07:27.167 --> 1:07:38.977
2255
+ And then there are three more loss functions
2256
+ that you can add: So the one is you take the
2257
+
2258
+ 1:07:38.977 --> 1:07:51.325
2259
+ average value of all the male words and the
2260
+ average word embedding of all the female words.
2261
+
2262
+ 1:07:51.731 --> 1:08:00.066
2263
+ So the good thing about this is we don't always
2264
+ need to have for one word the male and the
2265
+
2266
+ 1:08:00.066 --> 1:08:05.837
2267
+ female worship, so it's only like we have a
2268
+ set of male words.
2269
+
2270
+ 1:08:06.946 --> 1:08:21.719
2271
+ So this is just saying yeah, we want these
2272
+ two should be somehow similar to each other.
2273
+
2274
+ 1:08:21.551 --> 1:08:25.421
2275
+ It shouldn't be that.
2276
+
2277
+ 1:08:30.330 --> 1:08:40.081
2278
+ Should be the other one, or think this should
2279
+ be it.
2280
+
2281
+ 1:08:39.897 --> 1:08:45.975
2282
+ This is agenda, the average of.
2283
+
2284
+ 1:08:45.945 --> 1:09:01.206
2285
+ The average should be the same, but if you're
2286
+ looking at the gender dimension, the female should be at the other end.
2287
+
2288
+ 1:09:01.681 --> 1:09:06.959
2289
+ This is like on these dimensions, the male
2290
+ should be on the one and the female on the
2291
+
2292
+ 1:09:06.959 --> 1:09:07.388
2293
+ other.
2294
+
2295
+ 1:09:07.627 --> 1:09:16.123
2296
+ The same yeah, this gender information should
2297
+ be there, so you're pushing all the males to
2298
+
2299
+ 1:09:16.123 --> 1:09:17.150
2300
+ the one side and all the females to the other.
2301
+
2302
+ 1:09:21.541 --> 1:09:23.680
2303
+ Then the neutral words should be.
2304
+
2305
+ 1:09:23.604 --> 1:09:30.389
2306
+ If you have that you see the neutral words,
2307
+ they should be in the middle of between the
2308
+
2309
+ 1:09:30.389 --> 1:09:32.008
2310
+ male and the female.
2311
+
2312
+ 1:09:32.012 --> 1:09:48.261
2313
+ So you take the middle point between all
2314
+ male and female words and just somehow putting
2315
+
2316
+ 1:09:48.261 --> 1:09:51.691
2317
+ the neutral words.
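A rough sketch of the three extra loss terms described above, added on top of whatever the base GloVe objective is. The split into one reserved gender dimension and the three word lists are illustrative assumptions, not the exact published formulation:

```python
import numpy as np

def gender_penalties(E, male_idx, female_idx, neutral_idx, g=1):
    neutral_part, gender_part = E[:, :-g], E[:, -g:]   # last g dims carry gender

    # 1) averages of male and female words should match on the neutral part
    l_avg = np.sum((neutral_part[male_idx].mean(0) - neutral_part[female_idx].mean(0)) ** 2)

    # 2) on the gender dimension, push male words to one end and female to the other
    l_push = np.sum((gender_part[male_idx] - 1.0) ** 2) + np.sum((gender_part[female_idx] + 1.0) ** 2)

    # 3) neutral words should sit at the midpoint between male and female words
    mid = 0.5 * (gender_part[male_idx].mean(0) + gender_part[female_idx].mean(0))
    l_mid = np.sum((gender_part[neutral_idx] - mid) ** 2)

    return l_avg, l_push, l_mid
```

These terms would simply be added, with some weights, to the base embedding loss.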
2318
+
2319
+ 1:09:52.912 --> 1:09:56.563
2320
+ And then you're learning them, and then you
2321
+ can apply them in different ways.
2322
+
2323
+ 1:09:57.057 --> 1:10:03.458
2324
+ So you have this a bit in the pre-training
2325
+ thing.
2326
+
2327
+ 1:10:03.330 --> 1:10:10.337
2328
+ You can use the pre-trained embeddings on
2329
+ the output.
2330
+
2331
+ 1:10:10.208 --> 1:10:23.179
2332
+ Or you can use them, and then you can analyze
2333
+ what happens instead of training them directly.
2334
+
2335
+ 1:10:23.041 --> 1:10:30.506
2336
+ If have this additional loss, which tries
2337
+ to optimize.
2338
+
2339
+ 1:10:32.432 --> 1:10:42.453
2340
+ And then it was evaluated exactly on the sentences
2341
+ we had at the beginning where it is about know
2342
+
2343
+ 1:10:42.453 --> 1:10:44.600
2344
+ her for a long time.
2345
+
2346
+ 1:10:44.498 --> 1:10:48.693
2347
+ My friend works as an accounting clerk.
2348
+
2349
+ 1:10:48.788 --> 1:10:58.049
2350
+ So all these examples are not very difficult
2351
+ to translate, but the question is how often
2352
+
2353
+ 1:10:58.049 --> 1:10:58.660
2354
+ does?
2355
+
2356
+ 1:11:01.621 --> 1:11:06.028
2357
+ That it's not that complicated as you see
2358
+ here, so even the baseline.
2359
+
2360
+ 1:11:06.366 --> 1:11:10.772
2361
+ If you're doing nothing it is working quite well;
2362
+ it's most challenging.
2363
+
2364
+ 1:11:10.709 --> 1:11:16.401
2365
+ It seems overall in the situation where it's
2366
+ a name, so for he and him he has learned the
2367
+
2368
+ 1:11:16.401 --> 1:11:22.282
2369
+ correlation because that's maybe not surprisingly
2370
+ because this correlation occurs more often
2371
+
2372
+ 1:11:22.282 --> 1:11:23.927
2373
+ than with any name there.
2374
+
2375
+ 1:11:24.044 --> 1:11:31.749
2376
+ If you have a name that you can extract, that
2377
+ is talking about Mary, that's female is a lot
2378
+
2379
+ 1:11:31.749 --> 1:11:34.177
2380
+ harder to extract than this.
2381
+
2382
+ 1:11:34.594 --> 1:11:40.495
2383
+ So you'll see already in the baseline this
2384
+ is yeah, not working, not working.
2385
+
2386
+ 1:11:43.403 --> 1:11:47.159
2387
+ And for all the other cases it's working very
2388
+ well.
2389
+
2390
+ 1:11:47.787 --> 1:11:53.921
2391
+ Where the best one is achieved here with
2392
+ hard debiasing both on the encoder and on the decoder.
2393
+
2394
+ 1:11:57.077 --> 1:12:09.044
2395
+ It makes sense that a hard debiasing on the
2396
+ decoder doesn't really work because there you
2397
+
2398
+ 1:12:09.044 --> 1:12:12.406
2399
+ have gender information.
2400
+
2401
+ 1:12:14.034 --> 1:12:17.406
2402
+ For GloVe it seems to already work here.
2403
+
2404
+ 1:12:17.323 --> 1:12:20.204
2405
+ That's maybe surprising and yeah.
2406
+
2407
+ 1:12:20.260 --> 1:12:28.263
2408
+ So there is no clear else we don't have numbers
2409
+ for that doesn't really work well on the other.
2410
+
2411
+ 1:12:28.179 --> 1:12:30.517
2412
+ So how much do I use then?
2413
+
2414
+ 1:12:33.693 --> 1:12:44.720
2415
+ Then as a last way of improving that is a
2416
+ bit what we had mentioned before.
2417
+
2418
+ 1:12:44.575 --> 1:12:48.499
2419
+ That is what is referred.
2420
+
2421
+ 1:12:48.488 --> 1:12:59.133
2422
+ One problem is the bias in the data so you
2423
+ can adapt your data so you can just try to
2424
+
2425
+ 1:12:59.133 --> 1:13:01.485
2426
+ find equal amount.
2427
+
2428
+ 1:13:01.561 --> 1:13:11.368
2429
+ In your data like you adapt your data and
2430
+ then you fine-tune your model on the smaller but
2431
+
2432
+ 1:13:11.368 --> 1:13:12.868
2433
+ you can try.
2434
+
2435
+ 1:13:18.298 --> 1:13:19.345
2436
+ This is line okay.
2437
+
2438
+ 1:13:19.290 --> 1:13:21.584
2439
+ We have access to the data to the model.
2440
+
2441
+ 1:13:21.528 --> 1:13:23.041
2442
+ We can improve our model.
2443
+
2444
+ 1:13:24.564 --> 1:13:31.328
2445
+ One situation we haven't talked a lot about
2446
+ but another situation might also be and that's
2447
+
2448
+ 1:13:31.328 --> 1:13:37.942
2449
+ even getting more important is oh you want
2450
+ to work with a model which you don't have but
2451
+
2452
+ 1:13:37.942 --> 1:13:42.476
2453
+ you want to improve the model without having
2454
+ access so when.
2455
+
2456
+ 1:13:42.862 --> 1:13:49.232
2457
+ Nowadays there are a lot of companies who
2458
+ are not developing their own system but they're
2459
+
2460
+ 1:13:49.232 --> 1:13:52.983
2461
+ using or something like that or machine translation.
2462
+
2463
+ 1:13:53.313 --> 1:13:59.853
2464
+ So there is interest that you might not be
2465
+ able to find children with models completely.
2466
+
2467
+ 1:14:00.080 --> 1:14:10.068
2468
+ So the question is, can you do some type of
2469
+ black box adaptation of a system that takes
2470
+
2471
+ 1:14:10.068 --> 1:14:20.055
2472
+ the black box system but tries to improve it
2473
+ in some ways through: There's some ways of
2474
+
2475
+ 1:14:20.055 --> 1:14:21.417
2476
+ doing that.
2477
+
2478
+ 1:14:21.304 --> 1:14:30.328
2479
+ One is called black box injection and that's
2480
+ what is referred to as prompt.
2481
+
2482
+ 1:14:30.730 --> 1:14:39.793
2483
+ So the problem is if you have sentences you
2484
+ don't have information about the speakers.
2485
+
2486
+ 1:14:39.689 --> 1:14:43.130
2487
+ So how can you put information?
2488
+
2489
+ 1:14:43.984 --> 1:14:53.299
2490
+ And what we know from a large language model,
2491
+ we just prompt them, and you can do that.
2492
+
2493
+ 1:14:53.233 --> 1:14:59.545
2494
+ Translating directly, I love you, you said
2495
+ she said to him, I love you, and then of course
2496
+
2497
+ 1:14:59.545 --> 1:15:01.210
2498
+ you have to strip away.
2499
+
2500
+ 1:15:01.181 --> 1:15:06.629
2501
+ I mean, you cannot prevent the model from
2502
+ translating that, but you should be able to
2503
+
2504
+ 1:15:06.629 --> 1:15:08.974
2505
+ see what is the translation of this.
2506
+
2507
+ 1:15:08.910 --> 1:15:14.849
2508
+ One can strip that away, and now the system
2509
+ had hopefully the information that it's somebody
2510
+
2511
+ 1:15:14.849 --> 1:15:15.552
2512
+ like that.
2513
+
2514
+ 1:15:15.488 --> 1:15:17.023
2515
+ The speaker is female.
2516
+
2517
+ 1:15:18.198 --> 1:15:23.222
2518
+ Because you're no longer translating love
2519
+ you, but you're translating the sentence she
2520
+
2521
+ 1:15:23.222 --> 1:15:24.261
2522
+ said to him love.
2523
+
2524
+ 1:15:24.744 --> 1:15:37.146
2525
+ And so you insert this information as contextual
2526
+ information around it and don't have to change
2527
+
2528
+ 1:15:37.146 --> 1:15:38.567
2529
+ the model.
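A minimal sketch of this black-box injection; `translate` stands for whatever opaque MT API is available (hypothetical here), and the wrapping and stripping is the whole trick:

```python
def translate_with_speaker_gender(sentence, speaker_is_female, translate):
    # wrap the sentence in a context that reveals the speaker's gender,
    # send it through the unchanged system, then strip the context again
    prefix = "She said to him: " if speaker_is_female else "He said to her: "
    output = translate(prefix + '"' + sentence + '"')
    return output.split('"')[1] if '"' in output else output

# dummy "translator" that just echoes its input, only to show the mechanics
print(translate_with_speaker_gender("I love you.", True, lambda s: s))
```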
2530
+
2531
+ 1:15:41.861 --> 1:15:54.518
2532
+ Researches in Machine Translation: The last idea
2533
+ is to do what is referred to as lattice rescoring,
2534
+
2535
+ 1:15:54.518 --> 1:16:01.115
2536
+ so the idea there is you generate a translation.
2537
+
2538
+ 1:16:01.481 --> 1:16:18.547
2539
+ And now you have an additional component which
2540
+ tries to add possibilities where gender information
2541
+
2542
+ 1:16:18.547 --> 1:16:21.133
2543
+ might be lost.
2544
+
2545
+ 1:16:21.261 --> 1:16:29.687
2546
+ It's just a graph in this way, a simplified
2547
+ graph where there's always one word between
2548
+
2549
+ 1:16:29.687 --> 1:16:31.507
2550
+ two nodes and you.
2551
+
2552
+ 1:16:31.851 --> 1:16:35.212
2553
+ So you have something like Zi is an ads or
2554
+ a Zi is an ads.
2555
+
2556
+ 1:16:35.535 --> 1:16:41.847
2557
+ And then you can generate all possible variants.
2558
+
2559
+ 1:16:41.718 --> 1:16:49.320
2560
+ Then, of course, we're not done because the
2561
+ final output.
2562
+
2563
+ 1:16:50.530 --> 1:16:56.999
2564
+ Then you can re-score the system by a gender
2565
+ de-biased model.
2566
+
2567
+ 1:16:56.895 --> 1:17:03.414
2568
+ So the nice thing is why why don't we directly
2569
+ use our model?
2570
+
2571
+ 1:17:03.309 --> 1:17:10.356
2572
+ The idea is our model, which is only focusing
2573
+ on gender debiasing.
2574
+
2575
+ 1:17:10.530 --> 1:17:16.470
2576
+ It can be, for example, if it's just trained
2577
+ on some synthetical data, it will not be that
2578
+
2579
+ 1:17:16.470 --> 1:17:16.862
2580
+ well.
2581
+
2582
+ 1:17:16.957 --> 1:17:21.456
2583
+ But what we can do then is now you can rescore
2584
+ the possible translations in here.
2585
+
2586
+ 1:17:21.721 --> 1:17:31.090
2587
+ And here the cases of course in general structure
2588
+ is already done how to translate the words.
2589
+
2590
+ 1:17:31.051 --> 1:17:42.226
2591
+ Then you're only using the second component
2592
+ in order to rescore some variants and then
2593
+
2594
+ 1:17:42.226 --> 1:17:45.490
2595
+ get the best translation.
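A heavily simplified sketch of the lattice idea: take the first-pass output, branch on gendered alternatives, and let a separate gender-aware scorer pick the best path. The alternative table and the scorer here are dummies for illustration only:

```python
from itertools import product

def rescore_gender_variants(tokens, alternatives, score):
    # one slot per word; words with a gendered alternative get extra options
    options = [[t] + alternatives.get(t, []) for t in tokens]
    candidates = [" ".join(path) for path in product(*options)]
    # the (gender-debiased) model only has to re-rank these variants
    return max(candidates, key=score)

best = rescore_gender_variants(
    ["sie", "ist", "Arzt"],
    {"Arzt": ["Ärztin"]},                  # illustrative alternative table
    score=lambda s: float("Ärztin" in s),  # stand-in for a gender-aware scorer
)
print(best)  # "sie ist Ärztin"
```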
2596
+
2597
+ 1:17:45.925 --> 1:17:58.553
2598
+ And: As the last one there is the post processing
2599
+ so you can't have it.
2600
+
2601
+ 1:17:58.538 --> 1:18:02.830
2602
+ Mean this was one way of post-processing was
2603
+ to generate the lattice and retranslate it.
2604
+
2605
+ 1:18:03.123 --> 1:18:08.407
2606
+ But you can also have a processing, for example
2607
+ only on the target side where you have additional
2608
+
2609
+ 1:18:08.407 --> 1:18:12.236
2610
+ components with checks about the gender which
2611
+ maybe only knows gender.
2612
+
2613
+ 1:18:12.182 --> 1:18:17.073
2614
+ So it's not a machine translation component
2615
+ but more like a grammatical checker which can
2616
+
2617
+ 1:18:17.073 --> 1:18:19.193
2618
+ be used as post-processing to do that.
2619
+
2620
+ 1:18:19.579 --> 1:18:22.926
2621
+ Think about it a bit like when you use ChatGPT.
2622
+
2623
+ 1:18:22.850 --> 1:18:25.833
2624
+ There's also a lot of post processing.
2625
+
2626
+ 1:18:25.757 --> 1:18:32.618
2627
+ If you ask it directly, it would tell you
2628
+ how to build a bomb, but they have some checks
2629
+
2630
+ 1:18:32.618 --> 1:18:35.932
2631
+ either before or after to prevent such things.
2632
+
2633
+ 1:18:36.356 --> 1:18:40.580
2634
+ So often there might be an application system.
2635
+
2636
+ 1:18:40.490 --> 1:18:44.716
2637
+ There might be extra pre and post processing.
2638
+
2639
+ 1:18:48.608 --> 1:18:52.589
2640
+ And yeah, with this we're at the end of.
2641
+
2642
+ 1:18:52.512 --> 1:19:09.359
2643
+ To this lecture where we focused on the bias,
2644
+ but think a lot of these techniques we have
2645
+
2646
+ 1:19:09.359 --> 1:19:11.418
2647
+ seen here.
2648
+
2649
+ 1:19:11.331 --> 1:19:17.664
2650
+ So, on the one hand, we saw that evaluating
2651
+ just pure BLEU scores might not always be enough.
2652
+
2653
+ 1:19:17.677 --> 1:19:18.947
2654
+ Mean it's very important.
2655
+
2656
+ 1:19:20.000 --> 1:19:30.866
2657
+ Always do that, but if you want to check and
2658
+ some specific things are important, then you
2659
+
2660
+ 1:19:30.866 --> 1:19:35.696
2661
+ might have to do dedicated evaluations.
2662
+
2663
+ 1:19:36.036 --> 1:19:44.296
2664
+ If it is now translating for the President and
2665
+ it, like in German, guesses the wrong gender, it is not very
2666
+
2667
+ 1:19:44.296 --> 1:19:45.476
2668
+ appropriate.
2669
+
2670
+ 1:19:45.785 --> 1:19:53.591
2671
+ So it might be important if characteristics
2672
+ of your system are essential to have dedicated
2673
+
2674
+ 1:19:53.591 --> 1:19:54.620
2675
+ evaluation.
2676
+
2677
+ 1:19:55.135 --> 1:20:02.478
2678
+ And then if you have that, of course, it might
2679
+ be also important to develop dedicated techniques.
2680
+
2681
+ 1:20:02.862 --> 1:20:10.988
2682
+ We have seen today some ways how to mitigate biases,
2683
+ but I hope you see that a lot of these techniques
2684
+
2685
+ 1:20:10.988 --> 1:20:13.476
2686
+ you can also use to mitigate.
2687
+
2688
+ 1:20:13.573 --> 1:20:31.702
2689
+ At least related things you can adjust the
2690
+ training data you can do for other things.
2691
+
2692
+ 1:20:33.253 --> 1:20:36.022
2693
+ Before we are finishing, do we have any
2694
+ more questions?
2695
+
2696
+ 1:20:41.761 --> 1:20:47.218
2697
+ Then thanks a lot, and then we will see each
2698
+ other again on the first step.
2699
+
demo_data/lectures/Lecture-13-04.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f89fc932d5818061ea4e7490a1ea9a58c6b937b7696d69d117fca50623f0a2
3
+ size 108699463
demo_data/lectures/Lecture-14-27.06.2023/English.vtt ADDED
@@ -0,0 +1,2753 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.921 --> 0:00:14.926
4
+ Introduction: Hey, welcome to today's lecture,
5
+ what we today want to look at is how we can
6
+
7
+ 0:00:14.926 --> 0:00:16.403
8
+ make new.
9
+
10
+ 0:00:16.796 --> 0:00:26.458
11
+ So until now we have this global system, the
12
+ encoder and the decoder mostly, and we haven't
13
+
14
+ 0:00:26.458 --> 0:00:29.714
15
+ really thought about how long.
16
+
17
+ 0:00:30.170 --> 0:00:42.684
18
+ And what we, for example, know is yeah, you
19
+ can make the systems bigger in different ways.
20
+
21
+ 0:00:42.546 --> 0:00:47.088
22
+ We can make them deeper so the.
23
+
24
+ 0:00:47.407 --> 0:00:56.331
25
+ And if we have at least enough data that typically
26
+ helps you make the performance better.
27
+
28
+ 0:00:56.576 --> 0:01:00.620
29
+ But of course leads to problems that we need
30
+ more resources.
31
+
32
+ 0:01:00.554 --> 0:01:06.556
33
+ That is a problem at universities where we
34
+ have typically limited computation capacities.
35
+
36
+ 0:01:06.489 --> 0:01:11.759
37
+ So at some point you have such big models
38
+ that you cannot train them anymore.
39
+
40
+ 0:01:13.033 --> 0:01:23.792
41
+ And also for companies it is of course important
42
+ what it costs to generate a translation,
43
+
44
+ 0:01:23.792 --> 0:01:26.984
45
+ just by power consumption.
46
+
47
+ 0:01:27.667 --> 0:01:35.386
48
+ So yeah, there's different reasons why you
49
+ want to do efficient machine translation.
50
+
51
+ 0:01:36.436 --> 0:01:48.338
52
+ One reason is there are different ways of
53
+ how you can improve your machine translation
54
+
55
+ 0:01:48.338 --> 0:01:50.527
56
+ system once we.
57
+
58
+ 0:01:50.670 --> 0:01:55.694
59
+ There can be different types of data we looked
60
+ into data crawling, monolingual data.
61
+
62
+ 0:01:55.875 --> 0:01:59.024
63
+ All this data and the aim is always.
64
+
65
+ 0:01:59.099 --> 0:02:05.735
66
+ Of course, we are not just purely interested
67
+ in having more data, but the idea why we want
68
+
69
+ 0:02:05.735 --> 0:02:12.299
70
+ to have more data is that more data also means
71
+ that we have better quality because mostly
72
+
73
+ 0:02:12.299 --> 0:02:17.550
74
+ we are interested in increasing the quality
75
+ of the machine translation.
76
+
77
+ 0:02:18.838 --> 0:02:24.892
78
+ But there's also other ways of how you can
79
+ improve the quality of a machine translation.
80
+
81
+ 0:02:25.325 --> 0:02:36.450
82
+ And what is, of course, that is where most
83
+ research is focusing on.
84
+
85
+ 0:02:36.287 --> 0:02:44.471
86
+ It means all we want to build better algorithms.
87
+
88
+ 0:02:44.684 --> 0:02:48.199
89
+ Course: The other things are normally as good.
90
+
91
+ 0:02:48.124 --> 0:02:54.596
92
+ Sometimes it's easier to improve, so often
93
+ it's easier to just collect more data than
94
+
95
+ 0:02:54.596 --> 0:02:57.455
96
+ to invent some great new algorithms.
97
+
98
+ 0:02:57.380 --> 0:03:00.317
99
+ But yeah, both of them are important.
100
+
101
+ 0:03:00.920 --> 0:03:09.812
102
+ But there is this third thing, especially
103
+ with neural machine translation, and that means
104
+
105
+ 0:03:09.812 --> 0:03:11.590
106
+ we make a bigger.
107
+
108
+ 0:03:11.751 --> 0:03:16.510
109
+ Can be, as said, that we have more layers,
110
+ that we have wider layers.
111
+
112
+ 0:03:16.442 --> 0:03:19.928
113
+ Ensembles: The other thing we talked a bit about
114
+ is ensemble.
115
+
116
+ 0:03:19.870 --> 0:03:24.534
117
+ That means we are not building one new machine
118
+ translation system.
119
+
120
+ 0:03:24.965 --> 0:03:27.505
121
+ And we can easily build four.
122
+
123
+ 0:03:27.420 --> 0:03:32.319
124
+ What is the typical strategy to build different
125
+ systems?
126
+
127
+ 0:03:32.233 --> 0:03:33.188
128
+ Remember.
129
+
130
+ 0:03:35.795 --> 0:03:40.119
131
+ It should be of course a bit different if
132
+ you have the same.
133
+
134
+ 0:03:40.048 --> 0:03:44.550
135
+ If they all predict the same then combining
136
+ them doesn't help.
137
+
138
+ 0:03:44.478 --> 0:03:48.981
139
+ So what is the easiest way if you have to
140
+ build four systems?
141
+
142
+ 0:03:51.711 --> 0:04:01.747
143
+ And the Charleston's will take, but this is
144
+ the best output of a single system.
145
+
146
+ 0:04:02.362 --> 0:04:10.165
147
+ Mean now, it's really three different systems
148
+ so that you later can combine them and maybe
149
+
150
+ 0:04:10.165 --> 0:04:11.280
151
+ the average.
152
+
153
+ 0:04:11.194 --> 0:04:16.683
154
+ Ensembles typically mean that you average
155
+ all probabilities.
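A toy sketch of what averaging the ensemble's probabilities at one decoding step looks like; the numbers are made up:

```python
import numpy as np

def ensemble_step(prob_dists):
    # one row per model: average the next-token distributions, then pick the best
    avg = np.mean(prob_dists, axis=0)
    return int(np.argmax(avg)), avg

p = np.array([[0.1, 0.6, 0.2, 0.1],   # model 1
              [0.2, 0.5, 0.2, 0.1],   # model 2
              [0.3, 0.3, 0.3, 0.1]])  # model 3
print(ensemble_step(p)[0])  # token 1 wins after averaging
```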
156
+
157
+ 0:04:19.439 --> 0:04:24.227
158
+ The idea is to think about neural networks.
159
+
160
+ 0:04:24.118 --> 0:04:29.279
161
+ There's one parameter which you can easily adjust.
162
+
163
+ 0:04:29.169 --> 0:04:36.527
164
+ That's exactly the easiest way to randomize
165
+ with three different.
166
+
167
+ 0:04:37.017 --> 0:04:43.119
168
+ They have the same architecture, so all the
169
+ hyperparameters are the same, but they are
170
+
171
+ 0:04:43.119 --> 0:04:43.891
172
+ different.
173
+
174
+ 0:04:43.821 --> 0:04:46.558
175
+ They will have different predictions.
176
+
177
+ 0:04:48.228 --> 0:04:52.572
178
+ So, of course, bigger amounts.
179
+
180
+ 0:04:52.432 --> 0:05:05.300
181
+ Some of these are a bit the easiest way of
182
+ improving your quality because you don't really
183
+
184
+ 0:05:05.300 --> 0:05:08.269
185
+ have to do anything.
186
+
187
+ 0:05:08.588 --> 0:05:12.588
188
+ There is limits on that bigger models only
189
+ get better.
190
+
191
+ 0:05:12.515 --> 0:05:19.098
192
+ If you have enough training data you can't
193
+ do like a handheld layer and you will not work
194
+
195
+ 0:05:19.098 --> 0:05:24.877
196
+ on very small data but with a recent amount
197
+ of data that is the easiest thing.
198
+
199
+ 0:05:25.305 --> 0:05:33.726
200
+ However, they are challenging with making
201
+ better models, bigger motors, and that is the
202
+
203
+ 0:05:33.726 --> 0:05:34.970
204
+ computation.
205
+
206
+ 0:05:35.175 --> 0:05:44.482
207
+ So, of course, if you have a bigger model
208
+ that can mean that you have longer running
209
+
210
+ 0:05:44.482 --> 0:05:49.518
211
+ times, if you have models, you have to times.
212
+
213
+ 0:05:51.171 --> 0:05:56.685
214
+ Normally you cannot paralyze the different
215
+ layers because the input to one layer is always
216
+
217
+ 0:05:56.685 --> 0:06:02.442
218
+ the output of the previous layer, so you propagate
219
+ that so it will also increase your runtime.
220
+
221
+ 0:06:02.822 --> 0:06:10.720
222
+ Then you have to store all your models in
223
+ memory.
224
+
225
+ 0:06:10.562 --> 0:06:21.027
226
+ If you have double weights you will have:
227
+ Is more difficult to then do back propagation.
228
+
229
+ 0:06:20.909 --> 0:06:27.674
230
+ You have to store in between the activations,
231
+ so there's not only do you increase the model
232
+
233
+ 0:06:27.674 --> 0:06:31.865
234
+ in your memory, but also all these other variables
235
+ that.
236
+
237
+ 0:06:34.414 --> 0:06:36.734
238
+ And so in general it is more expensive.
239
+
240
+ 0:06:37.137 --> 0:06:54.208
241
+ And therefore there's good reasons in looking
242
+ into can we make these models sound more efficient.
243
+
244
+ 0:06:54.134 --> 0:07:00.982
245
+ So it's been through the viewer, you can have
246
+ it okay, have one and one day of training time,
247
+
248
+ 0:07:00.982 --> 0:07:01.274
249
+ or.
250
+
251
+ 0:07:01.221 --> 0:07:07.535
252
+ Forty thousand euros and then what is the
253
+ best machine translation system I can get within
254
+
255
+ 0:07:07.535 --> 0:07:08.437
256
+ this budget.
257
+
258
+ 0:07:08.969 --> 0:07:19.085
259
+ And then, of course, you can make the models
260
+ bigger, but then you have to train them shorter,
261
+
262
+ 0:07:19.085 --> 0:07:24.251
263
+ and then we can make more efficient algorithms.
264
+
265
+ 0:07:25.925 --> 0:07:31.687
266
+ Efficiency: If you think about efficiency, there's
267
+ a bit different scenarios.
268
+
269
+ 0:07:32.312 --> 0:07:43.635
270
+ So if you're more of coming from the research
271
+ community, what you'll be doing is building
272
+
273
+ 0:07:43.635 --> 0:07:47.913
274
+ a lot of models in your research.
275
+
276
+ 0:07:48.088 --> 0:07:58.645
277
+ So you're having your test set of maybe sentences,
278
+ calculating the BLEU score, then another model.
279
+
280
+ 0:07:58.818 --> 0:08:08.911
281
+ So what that means is typically you're training
282
+ on millions of sentences, so your training time
283
+
284
+ 0:08:08.911 --> 0:08:14.944
285
+ is long, maybe a day, but maybe in other cases
286
+ a week.
287
+
288
+ 0:08:15.135 --> 0:08:22.860
289
+ The testing is not really the cost efficient,
290
+ but the training is very costly.
291
+
292
+ 0:08:23.443 --> 0:08:37.830
293
+ If you are more thinking of building models
294
+ for application, the scenario is quite different.
295
+
296
+ 0:08:38.038 --> 0:08:46.603
297
+ And then you keep it running, and maybe thousands
298
+ of customers are using it in translating.
299
+
300
+ 0:08:46.510 --> 0:08:47.729
301
+ So in that.
302
+
303
+ 0:08:48.168 --> 0:08:59.577
304
+ And we will see that it is not always the
305
+ same type of challenges; you can parallelize some
306
+
307
+ 0:08:59.577 --> 0:09:07.096
308
+ things in training, which you cannot parallelize
309
+ in testing.
310
+
311
+ 0:09:07.347 --> 0:09:14.124
312
+ For example, in training you have to do back
313
+ propagation, so you have to store the activations.
314
+
315
+ 0:09:14.394 --> 0:09:23.901
316
+ Therefore, in testing we briefly discussed
317
+ that we would do it in more detail today in
318
+
319
+ 0:09:23.901 --> 0:09:24.994
320
+ training.
321
+
322
+ 0:09:25.265 --> 0:09:36.100
323
+ You know they're a target and you can process
324
+ everything in parallel while in testing.
325
+
326
+ 0:09:36.356 --> 0:09:46.741
327
+ So you can only do one word at a time, and
328
+ so you can less paralyze this.
329
+
330
+ 0:09:46.601 --> 0:09:50.536
331
+ Therefore, it's important.
332
+
333
+ 0:09:52.712 --> 0:09:55.347
334
+ Is a specific task on this.
335
+
336
+ 0:09:55.253 --> 0:10:03.158
337
+ For example, it's the efficiency task where
338
+ it's about making things as efficient.
339
+
340
+ 0:10:03.123 --> 0:10:09.230
341
+ Is possible and they can look at different
342
+ resources.
343
+
344
+ 0:10:09.117 --> 0:10:14.209
345
+ So how much deep fuel run time do you need?
346
+
347
+ 0:10:14.454 --> 0:10:19.366
348
+ See how much memory you need or you can have
349
+ a fixed memory budget and then have to build
350
+
351
+ 0:10:19.366 --> 0:10:20.294
352
+ the best system.
353
+
354
+ 0:10:20.500 --> 0:10:29.010
355
+ And here is a bit like an example of that,
356
+ so there's three teams from Edinburgh from
357
+
358
+ 0:10:29.010 --> 0:10:30.989
359
+ and they submitted.
360
+
361
+ 0:10:31.131 --> 0:10:36.278
362
+ So then, of course, if you want to know the
363
+ most efficient system you have to do a bit
364
+
365
+ 0:10:36.278 --> 0:10:36.515
366
+ of.
367
+
368
+ 0:10:36.776 --> 0:10:44.656
369
+ You want to have a better quality or more
370
+ runtime and there's not the one solution.
371
+
372
+ 0:10:44.562 --> 0:10:46.724
373
+ You can improve your.
374
+
375
+ 0:10:46.946 --> 0:10:49.662
376
+ And that you see that there are different
377
+ systems.
378
+
379
+ 0:10:49.909 --> 0:11:06.051
380
+ Here is how many words you can do for a second
381
+ on the clock, and you want to be as talk as
382
+
383
+ 0:11:06.051 --> 0:11:07.824
384
+ possible.
385
+
386
+ 0:11:08.068 --> 0:11:08.889
387
+ And you see here a bit.
388
+
389
+ 0:11:08.855 --> 0:11:09.985
390
+ This is a little bit different.
391
+
392
+ 0:11:11.051 --> 0:11:27.717
393
+ You want to be there on the top right corner
394
+ and you can get a score of something between
395
+
396
+ 0:11:27.717 --> 0:11:29.014
397
+ words.
398
+
399
+ 0:11:30.250 --> 0:11:34.161
400
+ Two hundred and fifty thousand, then you'll
401
+ ever come and score zero point three.
402
+
403
+ 0:11:34.834 --> 0:11:41.243
404
+ There is, of course, any bit of a decision,
405
+ but the question is, like how far can you again?
406
+
407
+ 0:11:41.174 --> 0:11:47.756
408
+ Some of all these points on this line would
409
+ be winners because they are somehow most efficient
410
+
411
+ 0:11:47.756 --> 0:11:53.923
412
+ in a way that there's no system which achieves
413
+ the same quality with less computational.
414
+
415
+ 0:11:57.657 --> 0:12:04.119
416
+ Resources: So there's the one question of which
417
+ resources are you interested.
418
+
419
+ 0:12:04.034 --> 0:12:07.362
420
+ Are you running it on CPU or GPU?
421
+
422
+ 0:12:07.264 --> 0:12:11.671
423
+ There's different ways of parallelizing stuff.
424
+
425
+ 0:12:14.654 --> 0:12:20.777
426
+ Another dimension is how you process your
427
+ data.
428
+
429
+ 0:12:20.649 --> 0:12:27.157
430
+ There's really the best processing and streaming.
431
+
432
+ 0:12:27.647 --> 0:12:34.672
433
+ So in batch processing you have the whole
434
+ document available so you can translate all
435
+
436
+ 0:12:34.672 --> 0:12:39.981
437
+ sentences in perimeter and then you're interested
438
+ in throughput.
439
+
440
+ 0:12:40.000 --> 0:12:43.844
441
+ But you can then process, for example, especially
442
+ in GPUs.
443
+
444
+ 0:12:43.778 --> 0:12:49.772
445
+ That's interesting, you're not translating
446
+ one sentence at a time, but you're translating
447
+
448
+ 0:12:49.772 --> 0:12:56.099
449
+ one hundred sentences or so in parallel, so
450
+ you have one more dimension where you can paralyze
451
+
452
+ 0:12:56.099 --> 0:12:57.964
453
+ and then be more efficient.
454
+
455
+ 0:12:58.558 --> 0:13:14.863
456
+ On the other hand, for example sorts of documents,
457
+ so we learned that if you do batch processing
458
+
459
+ 0:13:14.863 --> 0:13:16.544
460
+ you have.
461
+
462
+ 0:13:16.636 --> 0:13:24.636
463
+ Then, of course, it makes sense to sort the
464
+ sentences in order to have the minimum thing
465
+
466
+ 0:13:24.636 --> 0:13:25.535
467
+ attached.
468
+
469
+ 0:13:27.427 --> 0:13:32.150
470
+ The other scenario is more the streaming scenario
471
+ where you do life translation.
472
+
473
+ 0:13:32.512 --> 0:13:40.212
474
+ So in that case you can't wait for the whole
475
+ document to pass, but you have to do.
476
+
477
+ 0:13:40.520 --> 0:13:49.529
478
+ And then, for example, that's especially in
479
+ situations like speech translation, and then
480
+
481
+ 0:13:49.529 --> 0:13:53.781
482
+ you're interested in things like latency.
483
+
484
+ 0:13:53.680 --> 0:14:00.362
485
+ So how much do you have to wait to get the
486
+ output of a sentence?
487
+
488
+ 0:14:06.566 --> 0:14:15.703
489
+ Finally, there is the thing about the implementation:
490
+ Today we're mainly looking at different algorithms,
491
+
492
+ 0:14:15.703 --> 0:14:23.115
493
+ different models of how you can model them
494
+ in your machine translation system, but of
495
+
496
+ 0:14:23.115 --> 0:14:29.235
497
+ course for the same algorithms there's also
498
+ different implementations.
499
+
500
+ 0:14:29.489 --> 0:14:38.643
501
+ So, for example, for a machine translation
502
+ this tool could be very fast.
503
+
504
+ 0:14:38.638 --> 0:14:46.615
505
+ So they have like coded a lot of the operations
506
+ very low resource, not low resource, low level
507
+
508
+ 0:14:46.615 --> 0:14:49.973
509
+ on the directly on the QDAC kernels in.
510
+
511
+ 0:14:50.110 --> 0:15:00.948
512
+ So the same attention network is typically
513
+ more efficient in that type of algorithm.
514
+
515
+ 0:15:00.880 --> 0:15:02.474
516
+ Than in in any other.
517
+
518
+ 0:15:03.323 --> 0:15:13.105
519
+ Of course, it might be other disadvantages,
520
+ so if you're a little worker or have worked
521
+
522
+ 0:15:13.105 --> 0:15:15.106
523
+ in the practical.
524
+
525
+ 0:15:15.255 --> 0:15:22.604
526
+ Because it's normally easier to understand,
527
+ easier to change, and so on, but there is again
528
+
529
+ 0:15:22.604 --> 0:15:23.323
530
+ a train.
531
+
532
+ 0:15:23.483 --> 0:15:29.440
533
+ You have to think about, do you want to include
534
+ this into my study or comparison or not?
535
+
536
+ 0:15:29.373 --> 0:15:36.450
537
+ Should it be like I compare different implementations
538
+ and I also find the most efficient implementation?
539
+
540
+ 0:15:36.383 --> 0:15:39.148
541
+ Or is it only about the pure algorithm?
542
+
543
+ 0:15:42.742 --> 0:15:50.355
544
+ Yeah, when building these systems there is
545
+ a different trade-off to do.
546
+
547
+ 0:15:50.850 --> 0:15:56.555
548
+ So there's one of the traders between memory
549
+ and throughput, so how many words can generate
550
+
551
+ 0:15:56.555 --> 0:15:57.299
552
+ per second.
553
+
554
+ 0:15:57.557 --> 0:16:03.351
555
+ So typically you can easily like increase
556
+ your scruple by increasing the batch size.
557
+
558
+ 0:16:03.643 --> 0:16:06.899
559
+ So that means you are translating more sentences
560
+ in parallel.
561
+
562
+ 0:16:07.107 --> 0:16:09.241
563
+ And GPUs are very good at that stuff.
564
+
565
+ 0:16:09.349 --> 0:16:15.161
566
+ It should translate one sentence or one hundred
567
+ sentences, not the same time, but its.
568
+
569
+ 0:16:15.115 --> 0:16:20.784
570
+ Rough are very similar because they are at
571
+ this efficient metrics multiplication so that
572
+
573
+ 0:16:20.784 --> 0:16:24.415
574
+ you can do the same operation on all sentences
575
+ parallel.
576
+
577
+ 0:16:24.351 --> 0:16:30.133
578
+ So typically that means if you increase your
579
+ benchmark you can do more things in parallel
580
+
581
+ 0:16:30.133 --> 0:16:31.996
582
+ and you will translate more.
583
+
584
+ 0:16:31.952 --> 0:16:33.370
585
+ Second.
586
+
587
+ 0:16:33.653 --> 0:16:43.312
588
+ On the other hand, with this advantage, of
589
+ course you will need higher badge sizes and
590
+
591
+ 0:16:43.312 --> 0:16:44.755
592
+ more memory.
593
+
594
+ 0:16:44.965 --> 0:16:56.452
595
+ To begin with, the other problem is that you
596
+ have such big models that you can only translate
597
+
598
+ 0:16:56.452 --> 0:16:59.141
599
+ with lower bed sizes.
600
+
601
+ 0:16:59.119 --> 0:17:08.466
602
+ If you are running out of memory with translating,
603
+ one idea to go on that is to decrease your.
604
+
605
+ 0:17:13.453 --> 0:17:24.456
606
+ Then there is the thing about quality in Screwport,
607
+ of course, and before it's like larger models,
608
+
609
+ 0:17:24.456 --> 0:17:28.124
610
+ but in generally higher quality.
611
+
612
+ 0:17:28.012 --> 0:17:31.906
613
+ The first one is always this way.
614
+
615
+ 0:17:32.092 --> 0:17:38.709
616
+ Course: Not always larger model helps you
617
+ have over fitting at some point, but in generally.
618
+
619
+ 0:17:43.883 --> 0:17:52.901
620
+ And with this a bit on this training and testing
621
+ thing we had before.
622
+
623
+ 0:17:53.113 --> 0:17:58.455
624
+ So it wears all the difference between training
625
+ and testing, and for the encoder and decoder.
626
+
627
+ 0:17:58.798 --> 0:18:09.553
628
+ So if we are looking at what mentioned before
629
+ at training time, we have a source sentence
630
+
631
+ 0:18:09.553 --> 0:18:17.201
632
+ here: And how this is processed on a is not
633
+ the attention here.
634
+
635
+ 0:18:17.081 --> 0:18:21.840
636
+ That's a tubical transformer.
637
+
638
+ 0:18:22.162 --> 0:18:31.626
639
+ And how we can do that on a is that we can
640
+ paralyze the ear ever since.
641
+
642
+ 0:18:31.494 --> 0:18:40.512
643
+ The first thing to know is: So that is, of
644
+ course, not in all cases.
645
+
646
+ 0:18:40.382 --> 0:18:49.184
647
+ We'll later talk about speech translation
648
+ where we might want to translate.
649
+
650
+ 0:18:49.389 --> 0:18:56.172
651
+ Without the general case in, it's like you
652
+ have the full sentence you want to translate.
653
+
654
+ 0:18:56.416 --> 0:19:02.053
655
+ So the important thing is we are here everything
656
+ available on the source side.
657
+
658
+ 0:19:03.323 --> 0:19:13.524
659
+ And then this was one of the big advantages
660
+ that you can remember back of transformer.
661
+
662
+ 0:19:13.407 --> 0:19:15.759
663
+ There are several.
664
+
665
+ 0:19:16.156 --> 0:19:25.229
666
+ But the other one is now that we can calculate
667
+ the full layer.
668
+
669
+ 0:19:25.645 --> 0:19:29.318
670
+ There is no dependency between this and this
671
+ state or this and this state.
672
+
673
+ 0:19:29.749 --> 0:19:36.662
674
+ So we always did like here to calculate the
675
+ key value and query, and based on that you
676
+
677
+ 0:19:36.662 --> 0:19:37.536
678
+ calculate.
679
+
680
+ 0:19:37.937 --> 0:19:46.616
681
+ Which means we can do all these calculations
682
+ here in parallel and in parallel.
683
+
684
+ 0:19:48.028 --> 0:19:55.967
685
+ And there, of course, is this very efficiency
686
+ because again for GPS it's too bigly possible
687
+
688
+ 0:19:55.967 --> 0:20:00.887
689
+ to do these things in parallel and one after
690
+ each other.
691
+
692
+ 0:20:01.421 --> 0:20:10.311
693
+ And then we can also for each layer one by
694
+ one, and then we calculate here the encoder.
695
+
696
+ 0:20:10.790 --> 0:20:21.921
697
+ In training now an important thing is that
698
+ for the decoder we have the full sentence available
699
+
700
+ 0:20:21.921 --> 0:20:28.365
701
+ because we know this is the target we should
702
+ generate.
703
+
704
+ 0:20:29.649 --> 0:20:33.526
705
+ We have models now in a different way.
706
+
707
+ 0:20:33.426 --> 0:20:38.299
708
+ This hidden state is only on the previous
709
+ ones.
710
+
711
+ 0:20:38.598 --> 0:20:51.887
712
+ And the first thing here depends only on this
713
+ information, so you see if you remember we
714
+
715
+ 0:20:51.887 --> 0:20:56.665
716
+ had this masked self-attention.
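As a reminder of what that masked self-attention does, a minimal illustration of the causal mask (numpy, purely illustrative):

```python
import numpy as np

def causal_mask(length):
    # position i may only attend to positions <= i, which is what lets
    # training process all target positions in parallel
    return np.tril(np.ones((length, length), dtype=bool))

print(causal_mask(4).astype(int))
```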
717
+
718
+ 0:20:56.896 --> 0:21:04.117
719
+ So that means, of course, we can only calculate
720
+ the decoder once the encoder is done, but that's.
721
+
722
+ 0:21:04.444 --> 0:21:06.656
723
+ Percent can calculate the end quarter.
724
+
725
+ 0:21:06.599 --> 0:21:08.926
726
+ Then we can calculate here the decoder.
727
+
728
+ 0:21:09.569 --> 0:21:25.566
729
+ But again in training we have x, y and that
730
+ is available so we can calculate everything
731
+
732
+ 0:21:25.566 --> 0:21:27.929
733
+ in parallel.
734
+
735
+ 0:21:28.368 --> 0:21:40.941
736
+ So the interesting thing or advantage of transformer
737
+ is in training.
738
+
739
+ 0:21:40.759 --> 0:21:46.414
740
+ We can do it for the decoder.
741
+
742
+ 0:21:46.866 --> 0:21:54.457
743
+ That means you will have more calculations
744
+ because you can only calculate one layer at
745
+
746
+ 0:21:54.457 --> 0:22:02.310
747
+ a time, but for example the length which is
748
+ too bigly quite long or doesn't really matter
749
+
750
+ 0:22:02.310 --> 0:22:03.270
751
+ that much.
752
+
753
+ 0:22:05.665 --> 0:22:10.704
754
+ However, in testing this situation is different.
755
+
756
+ 0:22:10.602 --> 0:22:13.280
757
+ In testing we only have.
758
+
759
+ 0:22:13.713 --> 0:22:21.427
760
+ So this means we start with a sense: We don't
761
+ know the full sentence yet because we ought
762
+
763
+ 0:22:21.427 --> 0:22:29.054
764
+ to regularly generate that so for the encoder
765
+ we have the same here but for the decoder.
766
+
767
+ 0:22:29.409 --> 0:22:39.598
768
+ In this case we only have the first and the
769
+ second instinct, but only for all states in
770
+
771
+ 0:22:39.598 --> 0:22:40.756
772
+ parallel.
773
+
774
+ 0:22:41.101 --> 0:22:51.752
775
+ And then we can do the next step for y because
776
+ we are putting our most probable one.
777
+
778
+ 0:22:51.626 --> 0:22:58.646
779
+ We do greedy search or beam search, but you
780
+ cannot do.
781
+
782
+ 0:23:03.663 --> 0:23:16.838
783
+ Yes, so if we are interesting in making things
784
+ more efficient for testing, which we see, for
785
+
786
+ 0:23:16.838 --> 0:23:22.363
787
+ example in the scenario of really our.
788
+
789
+ 0:23:22.642 --> 0:23:34.286
790
+ It makes sense that we think about our architecture
791
+ and that we are currently working on attention
792
+
793
+ 0:23:34.286 --> 0:23:35.933
794
+ based models.
795
+
796
+ 0:23:36.096 --> 0:23:44.150
797
+ The decoder there is some of the most time
798
+ spent testing and testing.
799
+
800
+ 0:23:44.035 --> 0:23:47.146
801
+ It's similar, but during.
802
+
803
+ 0:23:47.167 --> 0:23:50.248
804
+ Nothing about beam search.
805
+
806
+ 0:23:50.134 --> 0:23:59.835
807
+ It might be even more complicated because
808
+ in beam search you have to try different.
809
+
810
+ 0:24:02.762 --> 0:24:15.140
811
+ So the question is what can you now do in
812
+ order to make your model more efficient and
813
+
814
+ 0:24:15.140 --> 0:24:21.905
815
+ better in translation in these types of cases?
816
+
817
+ 0:24:24.604 --> 0:24:30.178
818
+ And the one thing is to look into the encoded
819
+ decoder trailer.
820
+
821
+ 0:24:30.690 --> 0:24:43.898
822
+ And then until now we typically assume that
823
+ the depth of the encoder and the depth of the
824
+
825
+ 0:24:43.898 --> 0:24:48.154
826
+ decoder is roughly the same.
827
+
828
+ 0:24:48.268 --> 0:24:55.553
829
+ So if you haven't thought about it, you just
830
+ take what is running well.
831
+
832
+ 0:24:55.452 --> 0:24:57.683
833
+ You would try to do.
834
+
835
+ 0:24:58.018 --> 0:25:04.148
836
+ However, we saw now that there is a quite
837
+ big challenge and the runtime is a lot longer
838
+
839
+ 0:25:04.148 --> 0:25:04.914
840
+ than here.
841
+
842
+ 0:25:05.425 --> 0:25:14.018
843
+ The question is also the case for the calculations,
844
+ or do we have there the same issue that we
845
+
846
+ 0:25:14.018 --> 0:25:21.887
847
+ only get the good quality if we are having
848
+ high and high, so we know that making these
849
+
850
+ 0:25:21.887 --> 0:25:25.415
851
+ more depths is increasing our quality.
852
+
853
+ 0:25:25.425 --> 0:25:31.920
854
+ But what we haven't talked about is really
855
+ important that we increase the depth the same
856
+
857
+ 0:25:31.920 --> 0:25:32.285
858
+ way.
859
+
860
+ 0:25:32.552 --> 0:25:41.815
861
+ So what we can put instead also do is something
862
+ like this where you have a deep encoder and
863
+
864
+ 0:25:41.815 --> 0:25:42.923
865
+ a shallow.
866
+
867
+ 0:25:43.163 --> 0:25:57.386
868
+ So that would be that you, for example, have
869
+ instead of having layers on the encoder, and
870
+
871
+ 0:25:57.386 --> 0:25:59.757
872
+ layers on the.
873
+
874
+ 0:26:00.080 --> 0:26:10.469
875
+ So in this case the overall depth from start
876
+ to end would be similar and so hopefully.
877
+
878
+ 0:26:11.471 --> 0:26:21.662
879
+ But we could a lot more things hear parallelized,
880
+ and hear what is costly at the end during decoding
881
+
882
+ 0:26:21.662 --> 0:26:22.973
883
+ the decoder.
884
+
885
+ 0:26:22.872 --> 0:26:29.331
886
+ Because that does change in an outer regressive
887
+ way, there we.
888
+
889
+ 0:26:31.411 --> 0:26:33.727
890
+ And that that can be analyzed.
891
+
892
+ 0:26:33.652 --> 0:26:38.744
893
+ So here is some examples: Where people have
894
+ done all this.
895
+
896
+ 0:26:39.019 --> 0:26:55.710
897
+ So here it's mainly interested on the orange
898
+ things, which is auto-regressive about the
899
+
900
+ 0:26:55.710 --> 0:26:57.607
901
+ speed up.
902
+
903
+ 0:26:57.717 --> 0:27:15.031
904
+ You have the system, so agree is not exactly
905
+ the same, but it's similar.
906
+
907
+ 0:27:15.055 --> 0:27:23.004
908
+ It's always the case if you look at speed
909
+ up.
910
+
911
+ 0:27:22.831 --> 0:27:31.647
912
+ Think they put a speed of so that's the baseline.
913
+
914
+ 0:27:31.771 --> 0:27:35.348
915
+ So between and times as fast.
916
+
917
+ 0:27:35.229 --> 0:27:42.623
918
+ If you switch from a system to where you have
919
+ layers in the.
920
+
921
+ 0:27:42.782 --> 0:27:52.309
922
+ You see that although you have slightly more
923
+ parameters, more calculations are also roughly
924
+
925
+ 0:27:52.309 --> 0:28:00.283
926
+ the same, but you can speed out because now
927
+ during testing you can paralyze.
928
+
929
+ 0:28:02.182 --> 0:28:09.754
930
+ The other thing is that you're speeding up,
931
+ but if you look at the performance it's similar,
932
+
933
+ 0:28:09.754 --> 0:28:13.500
934
+ so sometimes you improve, sometimes you lose.
935
+
936
+ 0:28:13.419 --> 0:28:20.422
937
+ There's a bit of losing English to Romania,
938
+ but in general the quality is very slow.
939
+
940
+ 0:28:20.680 --> 0:28:30.343
941
+ So you see that you can keep a similar performance
942
+ while improving your speed by just having different.
943
+
944
+ 0:28:30.470 --> 0:28:34.903
945
+ And you also see the encoder layers from speed.
946
+
947
+ 0:28:34.811 --> 0:28:38.125
948
+ They don't really metal that much.
949
+
950
+ 0:28:38.030 --> 0:28:38.712
951
+ Most.
952
+
953
+ 0:28:38.979 --> 0:28:50.319
954
+ Because if you compare the 12th system to
955
+ the 6th system you have a lower performance
956
+
957
+ 0:28:50.319 --> 0:28:57.309
958
+ with 6th and colder layers but the speed is
959
+ similar.
960
+
961
+ 0:28:57.897 --> 0:29:02.233
962
+ And see the huge decrease is it maybe due
963
+ to a lack of data.
964
+
965
+ 0:29:03.743 --> 0:29:11.899
966
+ Good idea would say it's not the case.
967
+
968
+ 0:29:11.690 --> 0:29:23.195
969
+ Romanian English should have the same number
970
+ of data.
971
+
972
+ 0:29:24.224 --> 0:29:31.184
973
+ Maybe it's just that something in that language.
974
+
975
+ 0:29:31.042 --> 0:29:40.704
976
+ If you generate Romanian maybe they need more
977
+ target dependencies.
978
+
979
+ 0:29:42.882 --> 0:29:46.263
980
+ The Wine's the Eye Also Don't Know Any Sex
981
+ People Want To.
982
+
983
+ 0:29:47.887 --> 0:29:49.034
984
+ There could be yeah the.
985
+
986
+ 0:29:49.889 --> 0:30:02.316
987
+ As the maybe if you go from like a movie sphere
988
+ to a hybrid sphere, you can: It's very much
989
+
990
+ 0:30:02.316 --> 0:30:12.447
991
+ easier to expand the vocabulary to English,
992
+ but it must be the vocabulary.
993
+
994
+ 0:30:13.333 --> 0:30:21.147
995
+ Have to check, but would assume that in this
996
+ case the system is not retrained, but it's
997
+
998
+ 0:30:21.147 --> 0:30:22.391
999
+ trained with.
1000
+
1001
+ 0:30:22.902 --> 0:30:30.213
1002
+ And that's why I was assuming that they have
1003
+ the same, but maybe you'll write that in this
1004
+
1005
+ 0:30:30.213 --> 0:30:35.595
1006
+ piece, for example, if they were pre-trained,
1007
+ the decoder English.
1008
+
1009
+ 0:30:36.096 --> 0:30:43.733
1010
+ But don't remember exactly if they do something
1011
+ like that, but that could be a good.
1012
+
1013
+ 0:30:45.325 --> 0:30:52.457
1014
+ So this is one of the easiest ways to speed
1015
+ up.
1016
+
1017
+ 0:30:52.314 --> 0:31:01.446
1018
+ You just switch to hyperparameters, not to
1019
+ implement anything.
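In, for example, PyTorch this really is just a hyperparameter choice; a minimal sketch with illustrative sizes, not the exact models from the comparison above:

```python
import torch.nn as nn

# deep encoder, shallow decoder: the decoder is the autoregressive (slow) part,
# so most of the depth is moved to the encoder
model = nn.Transformer(
    d_model=512, nhead=8,
    num_encoder_layers=12,
    num_decoder_layers=2,
)
```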
1020
+
1021
+ 0:31:02.722 --> 0:31:08.340
1022
+ Architecture: Of course, there's other ways
1023
+ of doing that.
1024
+
1025
+ 0:31:08.242 --> 0:31:11.809
1026
+ We'll look into two things.
1027
+
1028
+ 0:31:11.682 --> 0:31:16.527
1029
+ The other thing is the architecture.
1030
+
1031
+ 0:31:16.796 --> 0:31:28.154
1032
+ We are now at some of the baselines that we
1033
+ are doing.
1034
+
1035
+ 0:31:28.488 --> 0:31:39.978
1036
+ However, in translation in the decoder side,
1037
+ it might not be the best solution.
1038
+
1039
+ 0:31:39.834 --> 0:31:41.857
1040
+ There is no.
1041
+
1042
+ 0:31:42.222 --> 0:31:47.130
1043
+ So we can use different types of architectures,
1044
+ also in the encoder and the.
1045
+
1046
+ 0:31:47.747 --> 0:31:52.475
1047
+ And there's two ways of what you could do
1048
+ different, or there's more ways.
1049
+
1050
+ 0:31:52.912 --> 0:31:54.825
1051
+ We will look into two todays.
1052
+
1053
+ 0:31:54.761 --> 0:31:58.843
1054
+ The one is average attention, which is a very
1055
+ simple solution.
1056
+
1057
+ 0:31:59.419 --> 0:32:01.464
1058
+ You can do as it says.
1059
+
1060
+ 0:32:01.375 --> 0:32:04.527
1061
+ It's not really attending anymore.
1062
+
1063
+ 0:32:04.437 --> 0:32:08.760
1064
+ It's just like equal attendance to everything.
1065
+
1066
+ 0:32:09.249 --> 0:32:23.422
1067
+ And the other idea, which is currently done
1068
+ in most systems which are optimized to efficiency,
1069
+
1070
+ 0:32:23.422 --> 0:32:24.913
1071
+ is we're.
1072
+
1073
+ 0:32:25.065 --> 0:32:32.623
1074
+ But on the decoder side we are then not using
1075
+ transformer or self attention, but we are using
1076
+
1077
+ 0:32:32.623 --> 0:32:39.700
1078
+ recurrent neural network because they are the
1079
+ disadvantage of recurrent neural network.
1080
+
1081
+ 0:32:39.799 --> 0:32:48.353
1082
+ And then the recurrent is normally easier
1083
+ to calculate because it only depends on inputs,
1084
+
1085
+ 0:32:48.353 --> 0:32:49.684
1086
+ the input on.
1087
+
1088
+ 0:32:51.931 --> 0:33:02.190
1089
+ So what is the difference between decoding
1090
+ and why is the tension maybe not sufficient
1091
+
1092
+ 0:33:02.190 --> 0:33:03.841
1093
+ for decoding?
1094
+
1095
+ 0:33:04.204 --> 0:33:14.390
1096
+ If we want to populate the new state, we only
1097
+ have to look at the input and the previous
1098
+
1099
+ 0:33:14.390 --> 0:33:15.649
1100
+ state, so.
1101
+
1102
+ 0:33:16.136 --> 0:33:19.029
1103
+ We are more conditional here networks.
1104
+
1105
+ 0:33:18.955 --> 0:33:20.000
1106
+ We have the.
1107
+
1108
+ 0:33:19.980 --> 0:33:31.291
1109
+ Dependency to a fixed number of previous ones,
1110
+ but that's rarely used for decoding.
1111
+
1112
+ 0:33:31.156 --> 0:33:39.776
1113
+ In contrast, in transformer we have this large
1114
+ dependency, so.
1115
+
1116
+ 0:33:40.000 --> 0:33:52.760
1117
+ So from t minus one to y t so that is somehow
1118
+ and mainly not very efficient in this way mean
1119
+
1120
+ 0:33:52.760 --> 0:33:56.053
1121
+ it's very good because.
1122
+
1123
+ 0:33:56.276 --> 0:34:03.543
1124
+ However, the disadvantage is that we also
1125
+ have to do all these calculations, so if we
1126
+
1127
+ 0:34:03.543 --> 0:34:10.895
1128
+ more view from the point of view of efficient
1129
+ calculation, this might not be the best.
1130
+
1131
+ 0:34:11.471 --> 0:34:20.517
1132
+ So the question is, can we change our architecture
1133
+ to keep some of the advantages but make things
1134
+
1135
+ 0:34:20.517 --> 0:34:21.994
1136
+ more efficient?
1137
+
1138
+ 0:34:24.284 --> 0:34:31.131
1139
+ The one idea is what is called the average
1140
+ attention, and the interesting thing is this
1141
+
1142
+ 0:34:31.131 --> 0:34:32.610
1143
+ work surprisingly.
1144
+
1145
+ 0:34:33.013 --> 0:34:38.917
1146
+ So the only idea what you're doing is doing
1147
+ the decoder.
1148
+
1149
+ 0:34:38.813 --> 0:34:42.592
1150
+ You're not doing attention anymore.
1151
+
1152
+ 0:34:42.487 --> 0:34:46.794
1153
+ The attention weights are all the same.
1154
+
1155
+ 0:34:47.027 --> 0:35:00.723
1156
+ So you don't calculate with query and key
1157
+ the different weights, and then you just take
1158
+
1159
+ 0:35:00.723 --> 0:35:03.058
1160
+ equal weights.
1161
+
1162
+ 0:35:03.283 --> 0:35:07.585
1163
+ So here would be one third from this, one
1164
+ third from this, and one third.
1165
+
1166
+ 0:35:09.009 --> 0:35:14.719
1167
+ And while it is sufficient you can now do
1168
+ precalculation and things get more efficient.
1169
+
1170
+ 0:35:15.195 --> 0:35:18.803
1171
+ So first go the formula that's maybe not directed
1172
+ here.
1173
+
1174
+ 0:35:18.979 --> 0:35:38.712
1175
+ So the difference here is that your new hidden
1176
+ state is the sum of all the hidden states, then.
1177
+
1178
+ 0:35:38.678 --> 0:35:40.844
1179
+ So here would be with this.
1180
+
1181
+ 0:35:40.767 --> 0:35:45.023
1182
+ It would be one third of this plus one third
1183
+ of this.
1184
+
1185
+ 0:35:46.566 --> 0:35:57.162
1186
+ But if you calculate it this way, it's not
1187
+ yet being more efficient because you still
1188
+
1189
+ 0:35:57.162 --> 0:36:01.844
1190
+ have to sum over here all the hidden.
1191
+
1192
+ 0:36:04.524 --> 0:36:22.932
1193
+ But you can now easily speed up these things
1194
+ by having an in between value, which is just
1195
+
1196
+ 0:36:22.932 --> 0:36:24.568
1197
+ always.
1198
+
1199
+ 0:36:25.585 --> 0:36:30.057
1200
+ If you take this as ten to one, you take this
1201
+ one class this one.
1202
+
1203
+ 0:36:30.350 --> 0:36:36.739
1204
+ Because this one then was before this, and
1205
+ this one was this, so in the end.
1206
+
1207
+ 0:36:37.377 --> 0:36:49.545
1208
+ So now this one is not the final one in order
1209
+ to get the final one to do the average.
1210
+
1211
+ 0:36:49.404 --> 0:36:50.158
1212
+ So.
1213
+
1214
+ 0:36:50.430 --> 0:37:00.264
1215
+ But then if you do this calculation with speed
1216
+ up you can do it with a fixed number of steps.
1217
+
1218
+ 0:37:00.180 --> 0:37:11.300
1219
+ Instead of the sum, which depends on the length, so
1220
+ you only have to do calculations to calculate
1221
+
1222
+ 0:37:11.300 --> 0:37:12.535
1223
+ this one.
1224
+
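A minimal sketch of the running-sum trick just described (not from the lecture materials; NumPy is assumed and all names are illustrative): the cumulative state is updated once per step, so each position costs a constant number of operations instead of a sum over the whole history.

```python
import numpy as np

def average_attention(Y):
    """Y: (T, d) decoder layer inputs; returns the averaged states.

    g_t = (1/t) * sum_{k<=t} y_k, computed with a running sum
    g~_t = g~_{t-1} + y_t, so the per-step cost does not grow with t.
    """
    out = np.empty_like(Y)
    running = np.zeros(Y.shape[1])
    for t in range(Y.shape[0]):
        running += Y[t]              # g~_t = g~_{t-1} + y_t
        out[t] = running / (t + 1)   # equal weight 1/t for every previous position
    return out
```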
1225
+ 0:37:12.732 --> 0:37:21.718
1226
+ Can you do the lakes and the lakes?
1227
+
1228
+ 0:37:21.469 --> 0:37:32.707
1229
+ For example, light bulb here now takes and.
1230
+
1231
+ 0:37:32.993 --> 0:37:38.762
1232
+ That's a very good point and that's why this
1233
+ is now in the image.
1234
+
1235
+ 0:37:38.675 --> 0:37:44.533
1236
+ It's not very good so this is the one with
1237
+ tilder and the tilder.
1238
+
1239
+ 0:37:44.884 --> 0:37:57.895
1240
+ So this one is just the sum of these two,
1241
+ because this is just this one.
1242
+
1243
+ 0:37:58.238 --> 0:38:08.956
1244
+ So the sum of this is exactly as the sum of
1245
+ these, and the sum of these is the sum of here.
1246
+
1247
+ 0:38:08.840 --> 0:38:15.133
1248
+ So you only do the sum in here, and the multiplying.
1249
+
1250
+ 0:38:15.255 --> 0:38:22.145
1251
+ So what you can mainly do here is you can
1252
+ do it more mathematically.
1253
+
1254
+ 0:38:22.045 --> 0:38:31.532
1255
+ You can do this by taking the one over t out of the
1256
+ sum, and then you can calculate the sum different.
1257
+
1258
+ 0:38:36.256 --> 0:38:42.443
1259
+ That maybe looks a bit weird and simple, so
1260
+ we were all talking about this great attention
1261
+
1262
+ 0:38:42.443 --> 0:38:47.882
1263
+ that we can focus on different parts, and a
1264
+ bit surprising on this work is now.
1265
+
1266
+ 0:38:47.814 --> 0:38:53.322
1267
+ In the end it might also work well without
1268
+ really attending and just doing equal weights.
1269
+
1270
+ 0:38:53.954 --> 0:38:56.164
1271
+ Mean it's not that easy.
1272
+
1273
+ 0:38:56.376 --> 0:38:58.261
1274
+ It's like sometimes this is working.
1275
+
1276
+ 0:38:58.210 --> 0:39:00.452
1277
+ There are also reports where it doesn't work that well.
1278
+
1279
+ 0:39:01.481 --> 0:39:05.848
1280
+ But I think it's an interesting way and it
1281
+ maybe shows that a lot of.
1282
+
1283
+ 0:39:05.805 --> 0:39:10.624
1284
+ Things in the self-attention or in the transformer paper
1285
+ which are more put as like yet.
1286
+
1287
+ 0:39:10.563 --> 0:39:15.890
1288
+ These are some hyperparameters around it,
1289
+ like that you do the layer norm in between,
1290
+
1291
+ 0:39:15.890 --> 0:39:21.769
1292
+ and that you do a feed forward before, and
1293
+ things like that, that these are also all important,
1294
+
1295
+ 0:39:21.769 --> 0:39:25.566
1296
+ and that the right set up around that is also
1297
+ very important.
1298
+
1299
+ 0:39:28.969 --> 0:39:38.598
1300
+ The other thing you can do in the end is not
1301
+ completely different from this one.
1302
+
1303
+ 0:39:38.479 --> 0:39:42.524
1304
+ It's just like a very different.
1305
+
1306
+ 0:39:42.942 --> 0:39:54.338
1307
+ And that is a recurrent network which also
1308
+ has this type of highway connection that can
1309
+
1310
+ 0:39:54.338 --> 0:40:01.330
1311
+ ignore the recurrent unit and directly put
1312
+ the input.
1313
+
1314
+ 0:40:01.561 --> 0:40:10.770
1315
+ It's not really adding out, but if you see
1316
+ the hitting step is your input, but what you
1317
+
1318
+ 0:40:10.770 --> 0:40:15.480
1319
+ can do is somehow directly go to the output.
1320
+
1321
+ 0:40:17.077 --> 0:40:28.390
1322
+ These are the four components of the simple
1323
+ recurrent unit, and the unit is motivated by GRUs
1324
+
1325
+ 0:40:28.390 --> 0:40:33.418
1326
+ and by LSTMs, which we have seen before.
1327
+
1328
+ 0:40:33.513 --> 0:40:43.633
1329
+ And that has proven to be very good for RNNs,
1330
+ which allows you to have a gate on your state.
1331
+
1332
+ 0:40:44.164 --> 0:40:48.186
1333
+ In this thing we have two gates, the reset
1334
+ gate and the forget gate.
1335
+
1336
+ 0:40:48.768 --> 0:40:57.334
1337
+ So first we have the general structure which
1338
+ has a cell state.
1339
+
1340
+ 0:40:57.198 --> 0:41:01.282
1341
+ Here we have the cell state.
1342
+
1343
+ 0:41:01.361 --> 0:41:09.661
1344
+ And then this goes next, and we always get
1345
+ the different cell states over the times that.
1346
+
1347
+ 0:41:10.030 --> 0:41:11.448
1348
+ This is the cell state.
1349
+
1350
+ 0:41:11.771 --> 0:41:16.518
1351
+ How do we now calculate that just assume we
1352
+ have an initial cell state here?
1353
+
1354
+ 0:41:17.017 --> 0:41:19.670
1355
+ But the first thing is we're doing the forget
1356
+ gate.
1357
+
1358
+ 0:41:20.060 --> 0:41:34.774
1359
+ The forget gate models whether the new cell
1360
+ state mainly depend on the previous cell state
1361
+
1362
+ 0:41:34.774 --> 0:41:40.065
1363
+ or should it depend on our input.
1364
+
1365
+ 0:41:40.000 --> 0:41:41.356
1366
+ Like Add to Them.
1367
+
1368
+ 0:41:41.621 --> 0:41:42.877
1369
+ How can we model that?
1370
+
1371
+ 0:41:44.024 --> 0:41:45.599
1372
+ First we were at a cocktail.
1373
+
1374
+ 0:41:45.945 --> 0:41:52.151
1375
+ The forget gate is depending on the state at t minus one.
1376
+
1377
+ 0:41:52.006 --> 0:41:56.485
1378
+ You also see here the formula.
1379
+
1380
+ 0:41:57.057 --> 0:42:01.963
1381
+ So we are multiplying both the cell state
1382
+ and our input.
1383
+
1384
+ 0:42:01.877 --> 0:42:04.893
1385
+ With some weights we are getting.
1386
+
1387
+ 0:42:05.105 --> 0:42:08.472
1388
+ We are adding some bias vector and then
1389
+ we are doing a sigmoid on that.
1390
+
1391
+ 0:42:08.868 --> 0:42:13.452
1392
+ So in the end we have numbers between zero
1393
+ and one saying for each dimension.
1394
+
1395
+ 0:42:13.853 --> 0:42:22.041
1396
+ Like how much if it's near to zero we will
1397
+ mainly use the new input.
1398
+
1399
+ 0:42:21.922 --> 0:42:31.891
1400
+ If it's near to one we will keep the previous state
1401
+ and ignore the input at this dimension.
1402
+
1403
+ 0:42:33.313 --> 0:42:40.173
1404
+ And by this motivation we can then create
1405
+ here the new cell state, and here you see
1406
+
1407
+ 0:42:40.173 --> 0:42:41.141
1408
+ the formula.
1409
+
1410
+ 0:42:41.601 --> 0:42:55.048
1411
+ So you take your forget gate and multiply
1412
+ it with your previous cell state.
1413
+
1414
+ 0:42:54.841 --> 0:43:00.435
1415
+ So if my was around then.
1416
+
1417
+ 0:43:00.800 --> 0:43:07.405
1418
+ In the other case, when the value was others,
1419
+ that's what you added.
1420
+
1421
+ 0:43:07.309 --> 0:43:10.949
1422
+ Then you're adding a transformation.
1423
+
1424
+ 0:43:11.351 --> 0:43:24.284
1425
+ So if this value was maybe zero then you're
1426
+ putting most of the information from the input in.
1427
+
1428
+ 0:43:25.065 --> 0:43:26.947
1429
+ Is already your element?
1430
+
1431
+ 0:43:26.872 --> 0:43:30.540
1432
+ The only question is now based on your element.
1433
+
1434
+ 0:43:30.463 --> 0:43:32.072
1435
+ What is the output?
1436
+
1437
+ 0:43:33.253 --> 0:43:47.951
1438
+ And there you have another opportunity so
1439
+ you can either take the output or instead you
1440
+
1441
+ 0:43:47.951 --> 0:43:50.957
1442
+ prefer the input.
1443
+
1444
+ 0:43:52.612 --> 0:43:58.166
1445
+ So is the value also the same for the reset
1446
+ gate and the forget gate?
1447
+
1448
+ 0:43:58.087 --> 0:43:59.422
1449
+ Yes, the movie.
1450
+
1451
+ 0:44:00.900 --> 0:44:10.004
1452
+ Yes exactly so the matrices are different
1453
+ and therefore it can be and that should be
1454
+
1455
+ 0:44:10.004 --> 0:44:16.323
1456
+ and maybe there is sometimes you want to have
1457
+ information.
1458
+
1459
+ 0:44:16.636 --> 0:44:23.843
1460
+ So here again we have this vector with values
1461
+ between zero and one which says controlling how
1462
+
1463
+ 0:44:23.843 --> 0:44:25.205
1464
+ the information.
1465
+
1466
+ 0:44:25.505 --> 0:44:36.459
1467
+ And then the output is calculated here similar
1468
+ to a cell stage, but again input is from.
1469
+
1470
+ 0:44:36.536 --> 0:44:45.714
1471
+ So either the reset gate decides should give
1472
+ what is currently stored in there, or.
1473
+
1474
+ 0:44:46.346 --> 0:44:58.647
1475
+ So it's not exactly as the thing we had before,
1476
+ with the residual connections where we added
1477
+
1478
+ 0:44:58.647 --> 0:45:01.293
1479
+ up, but here we do.
1480
+
1481
+ 0:45:04.224 --> 0:45:08.472
1482
+ This is the general idea of a simple recurrent
1483
+ neural network.
1484
+
1485
+ 0:45:08.405 --> 0:45:13.094
1486
+ Then we will now look at how we can make things
1487
+ even more efficient.
1488
+
1489
+ 0:45:13.026 --> 0:45:17.106
1490
+ But first do you have more questions on how
1491
+ it is working?
1492
+
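As a rough sketch of one step of such a simple recurrent unit (assuming an SRU-style parametrization; the exact formulas on the slides may differ slightly, and the names here are illustrative):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sru_step(x, c_prev, Wx, Wf, bf, Wr, br):
    """One step: the forget gate mixes the old cell state and the new input,
    the reset gate mixes the cell state and the raw input (highway path)."""
    x_tilde = Wx @ x                       # transformed input
    f = sigmoid(Wf @ x + bf)               # forget gate, values in [0, 1] per dimension
    c = f * c_prev + (1.0 - f) * x_tilde   # new cell state
    r = sigmoid(Wr @ x + br)               # reset gate
    h = r * c + (1.0 - r) * x              # output with highway connection to x
    return h, c
```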
1493
+ 0:45:23.063 --> 0:45:38.799
1494
+ Now these calculations are a bit where things
1495
+ get more efficient because this somehow.
1496
+
1497
+ 0:45:38.718 --> 0:45:43.177
1498
+ It depends on all the other dimensions for the
1499
+ second one also.
1500
+
1501
+ 0:45:43.423 --> 0:45:48.904
1502
+ Because if you do a matrix multiplication
1503
+ with a vector like for the output vector, each
1504
+
1505
+ 0:45:48.904 --> 0:45:52.353
1506
+ dimension of the output vector depends on all
1507
+ the other.
1508
+
1509
+ 0:45:52.973 --> 0:46:06.561
1510
+ The cell state here depends because this one
1511
+ is used here, and somehow the first dimension
1512
+
1513
+ 0:46:06.561 --> 0:46:11.340
1514
+ of the cell state only depends.
1515
+
1516
+ 0:46:11.931 --> 0:46:17.973
1517
+ In order to make that, of course, is sometimes
1518
+ again making things less parallelizable if things
1519
+
1520
+ 0:46:17.973 --> 0:46:18.481
1521
+ depend.
1522
+
1523
+ 0:46:19.359 --> 0:46:35.122
1524
+ We can easily make that different by changing
1525
+ from the matrix product to an element-wise product.
1526
+
1527
+ 0:46:35.295 --> 0:46:51.459
1528
+ So you do first, just like inside here, you
1529
+ take like the first dimension, my second dimension.
1530
+
1531
+ 0:46:52.032 --> 0:46:53.772
1532
+ Is, of course, narrow.
1533
+
1534
+ 0:46:53.696 --> 0:46:59.295
1535
+ This should be reset or this should be because
1536
+ it should be a different.
1537
+
1538
+ 0:46:59.899 --> 0:47:12.053
1539
+ Now the first dimension only depends on the
1540
+ first dimension, so you don't have dependencies
1541
+
1542
+ 0:47:12.053 --> 0:47:16.148
1543
+ any longer between dimensions.
1544
+
1545
+ 0:47:18.078 --> 0:47:25.692
1546
+ Maybe it gets a bit clearer if you see about
1547
+ it in this way, so what we have to do now.
1548
+
1549
+ 0:47:25.966 --> 0:47:31.911
1550
+ First, we have to do a matrix multiplication
1551
+ on to gather and to get the.
1552
+
1553
+ 0:47:32.292 --> 0:47:38.041
1554
+ And then we only have the element wise operations
1555
+ where we take this output.
1556
+
1557
+ 0:47:37.966 --> 0:47:38.722
1558
+ We take.
1559
+
1560
+ 0:47:39.179 --> 0:47:42.978
1561
+ Minus one and our original.
1562
+
1563
+ 0:47:42.842 --> 0:47:52.750
1564
+ Here we only have element-wise operations which
1565
+ can be optimally parallelized.
1566
+
1567
+ 0:47:53.273 --> 0:48:07.603
1568
+ So here we can additionally parallelize
1569
+ across the dimensions and don't have to do that sequentially.
1570
+
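A sketch of that reordering (illustrative NumPy code, not the lecture's reference implementation): the three projections are computed for the whole sequence with batched matrix multiplications, and the remaining sequential loop contains only element-wise operations.

```python
import numpy as np

def sru_sequence(X, c0, Wx, Wf, bf, Wr, br):
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    Xt = X @ Wx.T                  # (T, d) transformed inputs, parallel over time
    F = sigmoid(X @ Wf.T + bf)     # (T, d) forget gates, parallel over time
    R = sigmoid(X @ Wr.T + br)     # (T, d) reset gates, parallel over time
    H, c = np.empty_like(X), c0
    for t in range(X.shape[0]):    # only cheap element-wise work stays sequential
        c = F[t] * c + (1.0 - F[t]) * Xt[t]
        H[t] = R[t] * c + (1.0 - R[t]) * X[t]
    return H
```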
1571
+ 0:48:09.929 --> 0:48:24.255
1572
+ Yeah, but this you can do like in parallel
1573
+ again for all xts.
1574
+
1575
+ 0:48:24.544 --> 0:48:33.014
1576
+ Here you can't do it in parallel, but you
1577
+ only have to do it at each step, and then you
1578
+
1579
+ 0:48:33.014 --> 0:48:34.650
1580
+ can parallelize.
1581
+
1582
+ 0:48:35.495 --> 0:48:39.190
1583
+ But this maybe for the dimension.
1584
+
1585
+ 0:48:39.081 --> 0:48:42.036
1586
+ Maybe it's also important.
1587
+
1588
+ 0:48:41.926 --> 0:48:45.898
1589
+ I don't know if they have tried it.
1590
+
1591
+ 0:48:45.787 --> 0:48:55.386
1592
+ I assume it's not only for dimension reduction,
1593
+ but it's hard because you can easily.
1594
+
1595
+ 0:49:01.001 --> 0:49:08.164
1596
+ People have even like made the second thing
1597
+ even more easy.
1598
+
1599
+ 0:49:08.044 --> 0:49:10.214
1600
+ So there is this.
1601
+
1602
+ 0:49:10.093 --> 0:49:17.897
1603
+ This is how we have the highway connections
1604
+ in the transformer.
1605
+
1606
+ 0:49:17.776 --> 0:49:20.708
1607
+ Then it's like you do.
1608
+
1609
+ 0:49:20.780 --> 0:49:24.789
1610
+ So that is like how things are put together
1611
+ as a transformer.
1612
+
1613
+ 0:49:25.125 --> 0:49:39.960
1614
+ And that is a similar and simple recurring
1615
+ neural network where you do exactly the same
1616
+
1617
+ 0:49:39.960 --> 0:49:44.512
1618
+ for the so you don't have.
1619
+
1620
+ 0:49:46.326 --> 0:49:47.503
1621
+ This type of things.
1622
+
1623
+ 0:49:49.149 --> 0:50:01.196
1624
+ And with this we are at the end of how to
1625
+ make efficient architectures before we go to
1626
+
1627
+ 0:50:01.196 --> 0:50:02.580
1628
+ the next.
1629
+
1630
+ 0:50:13.013 --> 0:50:23.004
1631
+ Teacher Models: Between the encoder, the decoder,
1632
+ and the architectures there is a next technique
1633
+
1634
+ 0:50:23.004 --> 0:50:28.977
1635
+ which is used in nearly all deep learning very
1636
+ successfully.
1637
+
1638
+ 0:50:29.449 --> 0:50:43.463
1639
+ So the idea is can we extract the knowledge
1640
+ from a large network into a smaller one, but
1641
+
1642
+ 0:50:43.463 --> 0:50:45.983
1643
+ it performs similarly.
1644
+
1645
+ 0:50:47.907 --> 0:50:53.217
1646
+ And the nice thing is that this really works,
1647
+ and it may be very, very surprising.
1648
+
1649
+ 0:50:53.673 --> 0:51:03.000
1650
+ So the idea is that we have a large strong
1651
+ model which we train for long, and the question
1652
+
1653
+ 0:51:03.000 --> 0:51:07.871
1654
+ is: Can that help us to train a smaller model?
1655
+
1656
+ 0:51:08.148 --> 0:51:16.296
1657
+ So can what we refer to as teacher model tell
1658
+ us better to build a small student model than
1659
+
1660
+ 0:51:16.296 --> 0:51:17.005
1661
+ before.
1662
+
1663
+ 0:51:17.257 --> 0:51:27.371
1664
+ So what we're before in it as a student model,
1665
+ we learn from the data and that is how we train
1666
+
1667
+ 0:51:27.371 --> 0:51:28.755
1668
+ our systems.
1669
+
1670
+ 0:51:29.249 --> 0:51:37.949
1671
+ The question is: Can we train this small model
1672
+ better if we are not only learning from the
1673
+
1674
+ 0:51:37.949 --> 0:51:46.649
1675
+ data, but we are also learning from a large
1676
+ model which has been trained maybe in the same
1677
+
1678
+ 0:51:46.649 --> 0:51:47.222
1679
+ data?
1680
+
1681
+ 0:51:47.667 --> 0:51:55.564
1682
+ So that you have then in the end a smaller
1683
+ model that is somehow better performing than.
1684
+
1685
+ 0:51:55.895 --> 0:51:59.828
1686
+ And maybe that's on the first view.
1687
+
1688
+ 0:51:59.739 --> 0:52:05.396
1689
+ Very very surprising because it has seen the
1690
+ same data so it should have learned the same
1691
+
1692
+ 0:52:05.396 --> 0:52:11.053
1693
+ so the baseline model trained only on the data
1694
+ and the student teacher knowledge to still
1695
+
1696
+ 0:52:11.053 --> 0:52:11.682
1697
+ model it.
1698
+
1699
+ 0:52:11.619 --> 0:52:17.387
1700
+ They all have seen only this data because
1701
+ your teacher modeling was also trained typically
1702
+
1703
+ 0:52:17.387 --> 0:52:19.162
1704
+ only on this data, however.
1705
+
1706
+ 0:52:20.580 --> 0:52:30.071
1707
+ It has by now shown that by many ways the
1708
+ model trained in the teacher-student framework
1709
+
1710
+ 0:52:30.071 --> 0:52:32.293
1711
+ is performing better.
1712
+
1713
+ 0:52:33.473 --> 0:52:40.971
1714
+ A bit of an explanation when we see how that
1715
+ works.
1716
+
1717
+ 0:52:40.827 --> 0:52:46.141
1718
+ There's different ways of doing it.
1719
+
1720
+ 0:52:45.993 --> 0:52:47.199
1721
+ Maybe.
1722
+
1723
+ 0:52:47.567 --> 0:52:51.501
1724
+ So how does it work?
1725
+
1726
+ 0:52:51.314 --> 0:53:04.787
1727
+ This is our student network, the normal one,
1728
+ some type of neural network.
1729
+
1730
+ 0:53:04.597 --> 0:53:06.147
1731
+ We're.
1732
+
1733
+ 0:53:06.586 --> 0:53:17.050
1734
+ So we are training the model to predict the
1735
+ same thing as we are doing that by calculating.
1736
+
1737
+ 0:53:17.437 --> 0:53:23.173
1738
+ The cross-entropy loss was defined in a way
1739
+ where saying all the probabilities for the
1740
+
1741
+ 0:53:23.173 --> 0:53:25.332
1742
+ correct word should be as high.
1743
+
1744
+ 0:53:25.745 --> 0:53:32.207
1745
+ So you are calculating your output probabilities
1746
+ always, and each time step you have an alphabet
1747
+
1748
+ 0:53:32.207 --> 0:53:33.055
1749
+ probability.
1750
+
1751
+ 0:53:32.990 --> 0:53:38.639
1752
+ What is the most probable in the next word
1753
+ and your training signal is put as much of
1754
+
1755
+ 0:53:38.639 --> 0:53:43.368
1756
+ your probability mass to the correct word to
1757
+ the word that is there in.
1758
+
1759
+ 0:53:43.903 --> 0:53:51.367
1760
+ And this is achieved by this cross-entropy
1761
+ loss, which sums over all training
1762
+
1763
+ 0:53:51.367 --> 0:53:58.664
1764
+ examples and all positions, sums over the
1765
+ full vocabulary, and then this one is this
1766
+
1767
+ 0:53:58.664 --> 0:54:03.947
1768
+ one if this current word is the k-th word
1769
+ in the vocabulary.
1770
+
1771
+ 0:54:04.204 --> 0:54:18.001
1772
+ And then we take here the log probability
1773
+ of that, so what we made me do is: We have
1774
+
1775
+ 0:54:18.001 --> 0:54:27.200
1776
+ this matrix here, so each position times your
1777
+ vocabulary size.
1778
+
1779
+ 0:54:27.507 --> 0:54:38.656
1780
+ In the end what you just do is some of these
1781
+ three log probabilities, and then you want
1782
+
1783
+ 0:54:38.656 --> 0:54:40.785
1784
+ to have as much.
1785
+
1786
+ 0:54:41.041 --> 0:54:54.614
1787
+ So although this is a sum over this matrix
1788
+ here, in the end of each dimension you.
1789
+
1790
+ 0:54:54.794 --> 0:55:06.366
1791
+ So that is a normal cross-entropy loss that
1792
+ we have discussed at the very beginning of
1793
+
1794
+ 0:55:06.366 --> 0:55:07.016
1795
+ how.
1796
+
1797
+ 0:55:08.068 --> 0:55:15.132
1798
+ So what can we do differently in the teacher
1799
+ network?
1800
+
1801
+ 0:55:15.001 --> 0:55:23.376
1802
+ We also have a teacher network which is trained
1803
+ on large data.
1804
+
1805
+ 0:55:24.224 --> 0:55:35.957
1806
+ And of course this distribution might be better
1807
+ than the one from the small model because it's.
1808
+
1809
+ 0:55:36.456 --> 0:55:40.941
1810
+ So in this case we have now the training signal
1811
+ from the teacher network.
1812
+
1813
+ 0:55:41.441 --> 0:55:46.262
1814
+ And it's the same way as we had before.
1815
+
1816
+ 0:55:46.142 --> 0:55:56.483
1817
+ The only difference is we're training not
1818
+ towards the ground truth probability distribution
1819
+
1820
+ 0:55:56.483 --> 0:55:59.160
1821
+ here, which is sharp.
1822
+
1823
+ 0:55:59.299 --> 0:56:11.303
1824
+ That's also a probability, so this word has
1825
+ a high probability, but have some probability.
1826
+
1827
+ 0:56:12.612 --> 0:56:19.577
1828
+ And that is the main difference.
1829
+
1830
+ 0:56:19.366 --> 0:56:30.345
1831
+ Typically you do like the interpolation of
1832
+ these.
1833
+
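As a sketch of this word-level distillation objective (assuming a simple interpolation weight alpha; the names are illustrative and not tied to a specific toolkit):

```python
import numpy as np

def log_softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

def distillation_loss(student_logits, teacher_probs, gold_ids, alpha=0.5):
    """student_logits: (T, V); teacher_probs: (T, V) soft targets;
    gold_ids: (T,) reference word indices; alpha: interpolation weight."""
    logp = log_softmax(student_logits)
    ce_gold = -logp[np.arange(len(gold_ids)), gold_ids].mean()  # ground-truth signal
    ce_teacher = -(teacher_probs * logp).sum(axis=-1).mean()    # soft teacher signal
    return alpha * ce_gold + (1.0 - alpha) * ce_teacher
```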
1834
+ 0:56:33.213 --> 0:56:38.669
1835
+ Because there's more information contained
1836
+ in the distribution than in the ground truth,
1837
+
1838
+ 0:56:38.669 --> 0:56:44.187
1839
+ because it encodes more information about the
1840
+ language, because language always has more
1841
+
1842
+ 0:56:44.187 --> 0:56:47.907
1843
+ options to put alone, that's the same sentence
1844
+ yes exactly.
1845
+
1846
+ 0:56:47.845 --> 0:56:53.115
1847
+ So there's ambiguity in there that is encoded
1848
+ hopefully very well in the teacher.
1849
+
1850
+ 0:56:53.513 --> 0:56:57.257
1851
+ Trade you two networks so better than a student
1852
+ network you have in there from your learner.
1853
+
1854
+ 0:56:57.537 --> 0:57:05.961
1855
+ So maybe often there's only one correct word,
1856
+ but it might be two or three, and then all
1857
+
1858
+ 0:57:05.961 --> 0:57:10.505
1859
+ of these three have a probability distribution.
1860
+
1861
+ 0:57:10.590 --> 0:57:21.242
1862
+ And then is the main advantage or one explanation
1863
+ of why it's better to train from the teacher.
1864
+
1865
+ 0:57:21.361 --> 0:57:32.652
1866
+ Of course, it's good to also keep the signal
1867
+ in there because then you can prevent it because
1868
+
1869
+ 0:57:32.652 --> 0:57:33.493
1870
+ crazy.
1871
+
1872
+ 0:57:37.017 --> 0:57:49.466
1873
+ Any more questions on the first type of knowledge
1874
+ distillation, also distribution changes.
1875
+
1876
+ 0:57:50.550 --> 0:58:02.202
1877
+ Coming around again, this would put it a bit
1878
+ different, so this is not a solution to maintenance
1879
+
1880
+ 0:58:02.202 --> 0:58:04.244
1881
+ or distribution.
1882
+
1883
+ 0:58:04.744 --> 0:58:12.680
1884
+ But don't think it's performing worse than
1885
+ only doing the ground truth because they also.
1886
+
1887
+ 0:58:13.113 --> 0:58:21.254
1888
+ So it's more like it's not improving you would
1889
+ assume it's similarly helping you, but.
1890
+
1891
+ 0:58:21.481 --> 0:58:28.145
1892
+ Of course, if you now have a teacher, maybe
1893
+ you have no data on your target domain,
1894
+
1895
+ 0:58:28.145 --> 0:58:28.524
1896
+ but.
1897
+
1898
+ 0:58:28.888 --> 0:58:39.895
1899
+ Then you can use this one which is not the
1900
+ ground truth but helpful to learn better for
1901
+
1902
+ 0:58:39.895 --> 0:58:42.147
1903
+ the distribution.
1904
+
1905
+ 0:58:46.326 --> 0:58:57.012
1906
+ The second idea is to do sequence level knowledge
1907
+ distillation, so what we have in this case
1908
+
1909
+ 0:58:57.012 --> 0:59:02.757
1910
+ is we have looked at each position independently.
1911
+
1912
+ 0:59:03.423 --> 0:59:05.436
1913
+ Mean, we do that often.
1914
+
1915
+ 0:59:05.352 --> 0:59:10.930
1916
+ We are not generating a lot of sequences,
1917
+ but that has a problem.
1918
+
1919
+ 0:59:10.845 --> 0:59:13.932
1920
+ We have this propagation of errors.
1921
+
1922
+ 0:59:13.846 --> 0:59:16.765
1923
+ We start with one area and then.
1924
+
1925
+ 0:59:17.237 --> 0:59:27.419
1926
+ So if we are doing word-level knowledge distillation,
1927
+ we are treating each word in the sentence independently.
1928
+
1929
+ 0:59:28.008 --> 0:59:32.091
1930
+ So we are not trying to like somewhat model
1931
+ the dependency between.
1932
+
1933
+ 0:59:32.932 --> 0:59:47.480
1934
+ We can try to do that by sequence level knowledge
1935
+ distillation, but the problem is, of course:
1936
+
1937
+ 0:59:47.847 --> 0:59:53.478
1938
+ So we can that for each position we can get
1939
+ a distribution over all the words at this.
1940
+
1941
+ 0:59:53.793 --> 1:00:05.305
1942
+ But if we want to have a distribution of all
1943
+ possible target sentences, that's not possible
1944
+
1945
+ 1:00:05.305 --> 1:00:06.431
1946
+ because.
1947
+
1948
+ 1:00:08.508 --> 1:00:15.940
1949
+ Yeah, so we can then again do a bit of a hack
1950
+ on that.
1951
+
1952
+ 1:00:15.805 --> 1:00:23.240
1953
+ If we can't have a distribution of all sentences,
1954
+ it.
1955
+
1956
+ 1:00:23.843 --> 1:00:30.764
1957
+ So what we can do is you can now use the
1958
+ teacher network and sample different translations.
1959
+
1960
+ 1:00:31.931 --> 1:00:39.327
1961
+ And now we can do different ways to train
1962
+ them.
1963
+
1964
+ 1:00:39.173 --> 1:00:49.345
1965
+ We can use them as their probability, the
1966
+ easiest one to assume.
1967
+
1968
+ 1:00:50.050 --> 1:00:56.373
1969
+ So what that ends to is that we're taking
1970
+ our teacher network, we're generating some
1971
+
1972
+ 1:00:56.373 --> 1:01:01.135
1973
+ translations, and these ones we're using as
1974
+ additional trading.
1975
+
1976
+ 1:01:01.781 --> 1:01:11.382
1977
+ Then we have mainly done this sequence level
1978
+ because the teacher network takes us.
1979
+
1980
+ 1:01:11.266 --> 1:01:17.515
1981
+ These are all probable translations of the
1982
+ sentence.
1983
+
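A rough pipeline sketch of sequence-level knowledge distillation as described here; `teacher.translate` and `train_student` are hypothetical helpers, not an existing API:

```python
def sequence_level_kd(teacher, student, training_data, beam_size=5):
    distilled = []
    for src, _reference in training_data:
        # Decode the source side of the training data with the teacher;
        # its output becomes the (easier, more consistent) training target.
        hyp = teacher.translate(src, beam_size=beam_size)
        distilled.append((src, hyp))
    train_student(student, distilled)   # normal MT training on the distilled pairs
    return student
```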
1984
+ 1:01:26.286 --> 1:01:34.673
1985
+ And then you can do a bit of a yeah, and you
1986
+ can try to better make a bit of an interpolated
1987
+
1988
+ 1:01:34.673 --> 1:01:36.206
1989
+ version of that.
1990
+
1991
+ 1:01:36.716 --> 1:01:42.802
1992
+ So what people have also done is like sequence
1993
+ level interpolation.
1994
+
1995
+ 1:01:42.717 --> 1:01:52.873
1996
+ You generate here several translations: But
1997
+ then you don't use all of them.
1998
+
1999
+ 1:01:52.739 --> 1:02:00.660
2000
+ You do some metrics on which of these ones.
2001
+
2002
+ 1:02:01.021 --> 1:02:12.056
2003
+ So it's a bit: rather than training on this ground
2004
+ truth, which might be improbable or unreachable
2005
+
2006
+ 1:02:12.056 --> 1:02:16.520
2007
+ because we can generate everything.
2008
+
2009
+ 1:02:16.676 --> 1:02:23.378
2010
+ And we are giving it an easier solution which
2011
+ is also good quality and training of that.
2012
+
2013
+ 1:02:23.703 --> 1:02:32.602
2014
+ So you're not training it on a very difficult
2015
+ solution, but you're training it on an easier
2016
+
2017
+ 1:02:32.602 --> 1:02:33.570
2018
+ solution.
2019
+
2020
+ 1:02:36.356 --> 1:02:38.494
2021
+ Any More Questions to This.
2022
+
2023
+ 1:02:40.260 --> 1:02:41.557
2024
+ Yeah.
2025
+
2026
+ 1:02:41.461 --> 1:02:44.296
2027
+ Good.
2028
+
2029
+ 1:02:43.843 --> 1:03:01.642
2030
+ The next idea is to look at the vocabulary, so the problem
2031
+ is we have seen that vocabulary calculations
2032
+
2033
+ 1:03:01.642 --> 1:03:06.784
2034
+ are often very time-consuming.
2035
+
2036
+ 1:03:09.789 --> 1:03:19.805
2037
+ The thing is that most of the vocabulary is
2038
+ not needed for each sentence, so in each sentence.
2039
+
2040
+ 1:03:20.280 --> 1:03:28.219
2041
+ The question is: Can we somehow easily precalculate,
2042
+ which words are probable to occur in the sentence,
2043
+
2044
+ 1:03:28.219 --> 1:03:30.967
2045
+ and then only calculate these ones?
2046
+
2047
+ 1:03:31.691 --> 1:03:34.912
2048
+ And this can be done so.
2049
+
2050
+ 1:03:34.784 --> 1:03:43.934
2051
+ For example, if you have sentenced card, it's
2052
+ probably not happening.
2053
+
2054
+ 1:03:44.164 --> 1:03:48.701
2055
+ So what you can try to do is to limit your
2056
+ vocabulary.
2057
+
2058
+ 1:03:48.618 --> 1:03:51.096
2059
+ You're considering for each.
2060
+
2061
+ 1:03:51.151 --> 1:04:04.693
2062
+ So you're no longer taking the full vocabulary
2063
+ as possible output, but you're restricting.
2064
+
2065
+ 1:04:06.426 --> 1:04:18.275
2066
+ What typically works is that we limit it by
2067
+ the most frequent words we always take because
2068
+
2069
+ 1:04:18.275 --> 1:04:23.613
2070
+ these are not so easy to align to words.
2071
+
2072
+ 1:04:23.964 --> 1:04:32.241
2073
+ We take the most frequent target words and
2074
+ then words that often align with one of the
2075
+
2076
+ 1:04:32.241 --> 1:04:32.985
2077
+ source.
2078
+
2079
+ 1:04:33.473 --> 1:04:46.770
2080
+ So for each source word you calculate the
2081
+ word alignment on your training data, and then
2082
+
2083
+ 1:04:46.770 --> 1:04:51.700
2084
+ you calculate which words occur.
2085
+
2086
+ 1:04:52.352 --> 1:04:57.680
2087
+ And then for decoding you build this union
2088
+ of maybe the source word list that other.
2089
+
2090
+ 1:04:59.960 --> 1:05:02.145
2091
+ Are like for each source work.
2092
+
2093
+ 1:05:02.075 --> 1:05:08.745
2094
+ One of the most frequent translations of these
2095
+ source words, for example for each source work
2096
+
2097
+ 1:05:08.745 --> 1:05:13.003
2098
+ like in the most frequent ones, and then the
2099
+ most frequent.
2100
+
2101
+ 1:05:13.193 --> 1:05:24.333
2102
+ In total, if you have short sentences, you
2103
+ have a lot less words, so in most cases it's
2104
+
2105
+ 1:05:24.333 --> 1:05:26.232
2106
+ not more than.
2107
+
2108
+ 1:05:26.546 --> 1:05:33.957
2109
+ And so you have dramatically reduced your
2110
+ vocabulary, and thereby can also speed up decoding.
2111
+
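A sketch of this vocabulary selection (illustrative data structures, not a specific toolkit): the candidate set is the union of globally frequent target words and, per source word, its most frequent aligned translations; only these output rows are then computed.

```python
def candidate_vocab(src_tokens, top_frequent, align_table, k_per_word=10):
    """top_frequent: set of most frequent target words (always kept);
    align_table: source word -> target words ranked by alignment counts."""
    candidates = set(top_frequent)
    for w in src_tokens:
        candidates.update(align_table.get(w, [])[:k_per_word])
    return candidates   # restrict the output softmax to these indices
```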
2112
+ 1:05:35.495 --> 1:05:43.757
2113
+ That easy does anybody see what is challenging
2114
+ here and why that might not always need.
2115
+
2116
+ 1:05:47.687 --> 1:05:54.448
2117
+ The performance is not why this might not.
2118
+
2119
+ 1:05:54.291 --> 1:06:01.842
2120
+ If you implement it, it might not be a strong.
2121
+
2122
+ 1:06:01.941 --> 1:06:06.053
2123
+ You have to store this list.
2124
+
2125
+ 1:06:05.911 --> 1:06:14.138
2126
+ You have to burn the union and of course your
2127
+ safe time.
2128
+
2129
+ 1:06:14.554 --> 1:06:21.920
2130
+ The second thing the vocabulary is used in
2131
+ our last step, so we have the hidden state,
2132
+
2133
+ 1:06:21.920 --> 1:06:23.868
2134
+ and then we calculate.
2135
+
2136
+ 1:06:24.284 --> 1:06:29.610
2137
+ Now we are not longer calculating them for
2138
+ all output words, but for a subset of them.
2139
+
2140
+ 1:06:30.430 --> 1:06:35.613
2141
+ However, this matrix multiplication is typically
2142
+ parallelized with the perfect but good.
2143
+
2144
+ 1:06:35.956 --> 1:06:46.937
2145
+ But if you not only calculate some of them,
2146
+ if you're not modeling it right, it will take
2147
+
2148
+ 1:06:46.937 --> 1:06:52.794
2149
+ as long as before because of the nature of
2150
+ the.
2151
+
2152
+ 1:06:56.776 --> 1:07:07.997
2153
+ Here for beam search there's some ideas of
2154
+ course you can go back to greedy search because
2155
+
2156
+ 1:07:07.997 --> 1:07:10.833
2157
+ that's more efficient.
2158
+
2159
+ 1:07:11.651 --> 1:07:18.347
2160
+ And better quality, and you can buffer some
2161
+ states in between, so how much buffering it's
2162
+
2163
+ 1:07:18.347 --> 1:07:22.216
2164
+ again this tradeoff between calculation and
2165
+ memory.
2166
+
2167
+ 1:07:25.125 --> 1:07:37.723
2168
+ Autoregressive Model: Then at the end of today
2169
+ what we want to look into is one last type
2170
+
2171
+ 1:07:37.723 --> 1:07:42.902
2172
+ of new machine translation approach.
2173
+
2174
+ 1:07:43.403 --> 1:07:53.621
2175
+ And the idea is what we've already seen in
2176
+ our first two steps is that this autoregressive
2177
+
2178
+ 1:07:53.621 --> 1:07:57.246
2179
+ park is taking community coding.
2180
+
2181
+ 1:07:57.557 --> 1:08:04.461
2182
+ Can process everything in parallel, but we
2183
+ are always taking the most probable and then.
2184
+
2185
+ 1:08:05.905 --> 1:08:10.476
2186
+ The question is: Do we really need to do that?
2187
+
2188
+ 1:08:10.378 --> 1:08:14.015
2189
+ Therefore, there is a bunch of work.
2190
+
2191
+ 1:08:13.917 --> 1:08:16.518
2192
+ Can we do it differently?
2193
+
2194
+ 1:08:16.418 --> 1:08:19.622
2195
+ Can we generate a full target?
2196
+
2197
+ 1:08:20.160 --> 1:08:29.417
2198
+ We'll see it's not that easy and there's still
2199
+ an open debate whether this is really faster
2200
+
2201
+ 1:08:29.417 --> 1:08:31.832
2202
+ and quality, but think.
2203
+
2204
+ 1:08:32.712 --> 1:08:45.594
2205
+ So, as said, what we have done is our encoder
2206
+ decoder where we can process our encoder color,
2207
+
2208
+ 1:08:45.594 --> 1:08:50.527
2209
+ and then the output always depends.
2210
+
2211
+ 1:08:50.410 --> 1:08:54.709
2212
+ We generate the output and then we have to
2213
+ put it here the wide because then everything
2214
+
2215
+ 1:08:54.709 --> 1:08:56.565
2216
+ depends on the purpose of the output.
2217
+
2218
+ 1:08:56.916 --> 1:09:10.464
2219
+ This is what is referred to as an auto-regressive
2220
+ model, and nearly all speech generation and
2221
+
2222
+ 1:09:10.464 --> 1:09:16.739
2223
+ language generation works in this auto-regressive way.
2224
+
2225
+ 1:09:18.318 --> 1:09:21.132
2226
+ So the motivation is, can we do that more
2227
+ efficiently?
2228
+
2229
+ 1:09:21.361 --> 1:09:31.694
2230
+ And can we somehow process all target words
2231
+ in parallel?
2232
+
2233
+ 1:09:31.513 --> 1:09:41.305
2234
+ So instead of doing it one by one, we are
2235
+ inputting.
2236
+
2237
+ 1:09:45.105 --> 1:09:46.726
2238
+ So how does it work?
2239
+
2240
+ 1:09:46.649 --> 1:09:50.589
2241
+ So let's first have a basic auto regressive
2242
+ mode.
2243
+
2244
+ 1:09:50.810 --> 1:09:53.551
2245
+ So the encoder looks as it is before.
2246
+
2247
+ 1:09:53.478 --> 1:09:58.311
2248
+ That's maybe not surprising because here we
2249
+ know we can paralyze.
2250
+
2251
+ 1:09:58.618 --> 1:10:04.592
2252
+ So we have put in here our encoder input and
2253
+ generated the encoder states, so that's exactly
2254
+
2255
+ 1:10:04.592 --> 1:10:05.295
2256
+ the same.
2257
+
2258
+ 1:10:05.845 --> 1:10:16.069
2259
+ Machine Translation: However, now we need to
2260
+ do one more thing: One challenge is what we
2261
+
2262
+ 1:10:16.069 --> 1:10:26.764
2263
+ had before and that's a challenge of natural
2264
+ language generation like machine translation.
2265
+
2266
+ 1:10:32.672 --> 1:10:38.447
2267
+ We generate until we generate this out of
2268
+ end of center stock, but if we now generate
2269
+
2270
+ 1:10:38.447 --> 1:10:44.625
2271
+ everything at once that's no longer possible,
2272
+ so we cannot generate as long because we only
2273
+
2274
+ 1:10:44.625 --> 1:10:45.632
2275
+ generated one.
2276
+
2277
+ 1:10:46.206 --> 1:10:58.321
2278
+ So the question is how can we now determine
2279
+ how long the sequence is, and we can also accelerate.
2280
+
2281
+ 1:11:00.000 --> 1:11:06.384
2282
+ Yes, but there would be one idea, and there
2283
+ is other work which tries to do that.
2284
+
2285
+ 1:11:06.806 --> 1:11:15.702
2286
+ However, in here there's some work already
2287
+ done before and maybe you remember we had the
2288
+
2289
+ 1:11:15.702 --> 1:11:20.900
2290
+ IBM models and there was this concept of fertility.
2291
+
2292
+ 1:11:21.241 --> 1:11:26.299
2293
+ The concept of fertility means: for
2294
+ one source word, into how many target words does
2295
+
2296
+ 1:11:26.299 --> 1:11:27.104
2297
+ it translate?
2298
+
2299
+ 1:11:27.847 --> 1:11:34.805
2300
+ And exactly that we try to do here, and that
2301
+ means we are calculating like at the top we
2302
+
2303
+ 1:11:34.805 --> 1:11:36.134
2304
+ are calculating.
2305
+
2306
+ 1:11:36.396 --> 1:11:42.045
2307
+ So it says word is translated into word.
2308
+
2309
+ 1:11:41.908 --> 1:11:54.173
2310
+ Word might be translated into words into,
2311
+ so we're trying to predict in how many words.
2312
+
2313
+ 1:11:55.935 --> 1:12:10.314
2314
+ And then the end of the anchor, so this is
2315
+ like a length estimation.
2316
+
2317
+ 1:12:10.105 --> 1:12:15.532
2318
+ You can do it otherwise.
2319
+
2320
+ 1:12:16.236 --> 1:12:24.526
2321
+ You initialize your decoder input and we know
2322
+ it's good with word embeddings so we're trying
2323
+
2324
+ 1:12:24.526 --> 1:12:28.627
2325
+ to do the same thing and what people then do.
2326
+
2327
+ 1:12:28.538 --> 1:12:35.225
2328
+ They initialize it again with word embedding
2329
+ but in the frequency of the.
2330
+
2331
+ 1:12:35.315 --> 1:12:36.460
2332
+ So we have the cartilage.
2333
+
2334
+ 1:12:36.896 --> 1:12:47.816
2335
+ So one has two, so twice the is and then one
2336
+ is, so that is then our initialization.
2337
+
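A sketch of this fertility-based initialization (illustrative; the lecture's exact setup may differ): each source embedding is copied according to its predicted fertility, which also fixes the output length.

```python
def init_decoder_input(src_embeddings, fertilities):
    decoder_input = []
    for emb, fert in zip(src_embeddings, fertilities):
        decoder_input.extend([emb] * fert)   # fertility 2 copies the embedding twice,
                                             # fertility 0 drops the source word
    return decoder_input                     # target length = sum(fertilities)
```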
2338
+ 1:12:48.208 --> 1:12:57.151
2339
+ In other words, if you don't predict fertilities
2340
+ but predict lengths, you can just initialize
2341
+
2342
+ 1:12:57.151 --> 1:12:57.912
2343
+ second.
2344
+
2345
+ 1:12:58.438 --> 1:13:07.788
2346
+ This often works a bit better, but that's
2347
+ the other.
2348
+
2349
+ 1:13:07.611 --> 1:13:16.436
2350
+ Now you have everything in training and testing.
2351
+
2352
+ 1:13:16.656 --> 1:13:18.621
2353
+ This is all available at once.
2354
+
2355
+ 1:13:20.280 --> 1:13:31.752
2356
+ Then we can generate everything in parallel,
2357
+ so we have the decoder stack, and that is now
2358
+
2359
+ 1:13:31.752 --> 1:13:33.139
2360
+ as before.
2361
+
2362
+ 1:13:35.395 --> 1:13:41.555
2363
+ And then we're doing the translation predictions
2364
+ here on top of it in order to do.
2365
+
2366
+ 1:13:43.083 --> 1:13:59.821
2367
+ And then we are predicting here the target
2368
+ words and once predicted, and that is the basic
2369
+
2370
+ 1:13:59.821 --> 1:14:00.924
2371
+ idea.
2372
+
2373
+ 1:14:01.241 --> 1:14:08.171
2374
+ Machine translation: Where the idea is, we
2375
+ don't have to do one by one what we're.
2376
+
2377
+ 1:14:10.210 --> 1:14:13.900
2378
+ So this looks really, really, really great.
2379
+
2380
+ 1:14:13.816 --> 1:14:20.314
2381
+ On the first view there's one challenge with
2382
+ this, and this is the baseline.
2383
+
2384
+ 1:14:20.230 --> 1:14:27.572
2385
+ Of course there's some improvements, but in
2386
+ general the quality is often significant.
2387
+
2388
+ 1:14:28.068 --> 1:14:32.075
2389
+ So here you see the baseline models.
2390
+
2391
+ 1:14:31.967 --> 1:14:38.468
2392
+ You have a loss of ten blue points or something
2393
+ like that.
2394
+
2395
+ 1:14:38.878 --> 1:14:40.230
2396
+ So why does it change?
2397
+
2398
+ 1:14:40.171 --> 1:14:41.642
2399
+ So why is it happening?
2400
+
2401
+ 1:14:43.903 --> 1:14:56.250
2402
+ If you look at the errors there is repetitive
2403
+ tokens, so you have like or things like that.
2404
+
2405
+ 1:14:56.536 --> 1:15:01.995
2406
+ Broken sentences or disfluent sentences, so that's
2407
+ exactly where autoregressive models are
2408
+
2409
+ 1:15:01.995 --> 1:15:04.851
2410
+ very good, we say that's a bit of a problem.
2411
+
2412
+ 1:15:04.788 --> 1:15:07.392
2413
+ They generate very fluent translations.
2414
+
2415
+ 1:15:07.387 --> 1:15:10.898
2416
+ Translation: Sometimes there doesn't have
2417
+ to do anything with the input.
2418
+
2419
+ 1:15:11.411 --> 1:15:14.047
2420
+ But generally it really looks always very
2421
+ fluent.
2422
+
2423
+ 1:15:14.995 --> 1:15:20.865
2424
+ Here exactly the opposite, so the problem
2425
+ is that we don't have really fluent translations.
2426
+
2427
+ 1:15:21.421 --> 1:15:26.123
2428
+ And that is mainly due to the challenge that
2429
+ we have this independent assumption.
2430
+
2431
+ 1:15:26.646 --> 1:15:35.873
2432
+ So in this case, the probability of Y of the
2433
+ second position is independent of the probability
2434
+
2435
+ 1:15:35.873 --> 1:15:40.632
2436
+ of X, so we don't know what was there generated.
2437
+
2438
+ 1:15:40.535 --> 1:15:43.743
2439
+ We're just generating it there.
2440
+
2441
+ 1:15:43.964 --> 1:15:55.439
2442
+ You can see it also in a bit of examples.
2443
+
2444
+ 1:15:55.166 --> 1:16:03.646
2445
+ You can over-penalize shifts.
2446
+
2447
+ 1:16:04.024 --> 1:16:10.566
2448
+ And the problem is this is already an improvement
2449
+ again, but this is also similar to.
2450
+
2451
+ 1:16:11.071 --> 1:16:21.017
2452
+ So you can, for example, translate heeded
2453
+ back, or maybe you could also translate it
2454
+
2455
+ 1:16:21.017 --> 1:16:31.197
2456
+ with: But on their feeling down in feeling
2457
+ down, if the first position thinks of their
2458
+
2459
+ 1:16:31.197 --> 1:16:34.591
2460
+ feeling done and the second.
2461
+
2462
+ 1:16:35.075 --> 1:16:42.908
2463
+ So each position here and that is one of the
2464
+ main issues here doesn't know what the other.
2465
+
2466
+ 1:16:43.243 --> 1:16:53.846
2467
+ And for example, if you are translating something
2468
+ with, you can often translate things in two
2469
+
2470
+ 1:16:53.846 --> 1:16:58.471
2471
+ ways: German with a different agreement.
2472
+
2473
+ 1:16:58.999 --> 1:17:02.058
2474
+ And then here where you have to decide do
2475
+ a used jet.
2476
+
2477
+ 1:17:02.162 --> 1:17:05.460
2478
+ Interpretator: It doesn't know which word
2479
+ it has to select.
2480
+
2481
+ 1:17:06.086 --> 1:17:14.789
2482
+ Mean, of course, it knows a hidden state,
2483
+ but in the end you have a probability distribution.
2484
+
2485
+ 1:17:16.256 --> 1:17:20.026
2486
+ And that is the important thing in the outer
2487
+ regressive month.
2488
+
2489
+ 1:17:19.966 --> 1:17:24.295
2490
+ You know that because you have put it in you
2491
+ here, you don't know that.
2492
+
2493
+ 1:17:24.235 --> 1:17:29.624
2494
+ If it's equal probable here to two, you don't
2495
+ Know Which Is Selected, and of course that
2496
+
2497
+ 1:17:29.624 --> 1:17:32.833
2498
+ depends on what should be the latest traction
2499
+ under.
2500
+
2501
+ 1:17:33.333 --> 1:17:39.554
2502
+ Yep, that's the undershift, and we're going
2503
+ to last last the next time.
2504
+
2505
+ 1:17:39.467 --> 1:17:40.007
2506
+ Yes.
2507
+
2508
+ 1:17:40.840 --> 1:17:44.935
2509
+ Doesn't this also appear in and like now we're
2510
+ talking about physical training?
2511
+
2512
+ 1:17:46.586 --> 1:17:48.412
2513
+ The thing is in the auto regress.
2514
+
2515
+ 1:17:48.358 --> 1:17:50.185
2516
+ If you give it the correct one,.
2517
+
2518
+ 1:17:50.450 --> 1:17:55.827
2519
+ So if you predict here comma what the reference
2520
+ is feeling then you tell the model here.
2521
+
2522
+ 1:17:55.767 --> 1:17:59.540
2523
+ The last one was feeling and then it knows
2524
+ it has to be done.
2525
+
2526
+ 1:17:59.479 --> 1:18:04.045
2527
+ But here it doesn't know that because it doesn't
2528
+ get as input as a right.
2529
+
2530
+ 1:18:04.204 --> 1:18:24.286
2531
+ Yes, that's a bit depending on what.
2532
+
2533
+ 1:18:24.204 --> 1:18:27.973
2534
+ But in training, of course, you just try to
2535
+ make the highest one the current one.
2536
+
2537
+ 1:18:31.751 --> 1:18:38.181
2538
+ So what you can do is things like CTC loss
2539
+ which can adjust for this.
2540
+
2541
+ 1:18:38.089 --> 1:18:42.809
2542
+ So then you can also have this shifted correction.
2543
+
2544
+ 1:18:42.716 --> 1:18:50.584
2545
+ If you're doing this type of correction in
2546
+ the CTC loss you don't get full penalty.
2547
+
2548
+ 1:18:50.930 --> 1:18:58.486
2549
+ Just shifted by one, so it's a bit of a different
2550
+ loss, which is mainly used in, but.
2551
+
2552
+ 1:19:00.040 --> 1:19:03.412
2553
+ It can be used in order to address this problem.
2554
+
2555
+ 1:19:04.504 --> 1:19:13.844
2556
+ The other problem is that outer regressively
2557
+ we have the label buyers that tries to disimmigrate.
2558
+
2559
+ 1:19:13.749 --> 1:19:20.517
2560
+ That's the example did before was if you translate
2561
+ thank you to Dung.
2562
+
2563
+ 1:19:20.460 --> 1:19:31.925
2564
+ And then it might end up because it learns
2565
+ in the first position and the second also.
2566
+
2567
+ 1:19:32.492 --> 1:19:43.201
2568
+ In order to prevent that, it would be helpful
2569
+ for one output, only one output, so that makes
2570
+
2571
+ 1:19:43.201 --> 1:19:47.002
2572
+ the system already better learn.
2573
+
2574
+ 1:19:47.227 --> 1:19:53.867
2575
+ Might be that for slightly different inputs
2576
+ you have different outputs, but for the same.
2577
+
2578
+ 1:19:54.714 --> 1:19:57.467
2579
+ That we can luckily very easily solve.
2580
+
2581
+ 1:19:59.119 --> 1:19:59.908
2582
+ And it's done.
2583
+
2584
+ 1:19:59.855 --> 1:20:04.117
2585
+ We just learned the technique about it, which
2586
+ is called knowledge distillation.
2587
+
2588
+ 1:20:04.985 --> 1:20:13.398
2589
+ So what we can do and the easiest solution
2590
+ to improve your non-autoregressive model is to
2591
+
2592
+ 1:20:13.398 --> 1:20:16.457
2593
+ train an auto regressive model.
2594
+
2595
+ 1:20:16.361 --> 1:20:22.959
2596
+ Then you decode your whole training data
2597
+ with this model and then.
2598
+
2599
+ 1:20:23.603 --> 1:20:27.078
2600
+ While the main advantage of that is that this
2601
+ is more consistent,.
2602
+
2603
+ 1:20:27.407 --> 1:20:33.995
2604
+ So for the same input you always have the
2605
+ same output.
2606
+
2607
+ 1:20:33.875 --> 1:20:41.903
2608
+ So you have to make your training data more
2609
+ consistent and learn.
2610
+
2611
+ 1:20:42.482 --> 1:20:54.471
2612
+ So there is another advantage of knowledge
2613
+ distillation and that advantage is you have
2614
+
2615
+ 1:20:54.471 --> 1:20:59.156
2616
+ more consistent training signals.
2617
+
2618
+ 1:21:04.884 --> 1:21:10.630
2619
+ There's another to make the things more easy
2620
+ at the beginning.
2621
+
2622
+ 1:21:10.539 --> 1:21:16.469
2623
+ There's this plants model, black model where
2624
+ you do more masks.
2625
+
2626
+ 1:21:16.756 --> 1:21:26.080
2627
+ So during training, especially at the beginning,
2628
+ you give some correct solutions at the beginning.
2629
+
2630
+ 1:21:28.468 --> 1:21:38.407
2631
+ And there is this tokens at a time, so the
2632
+ idea is to establish other regressive training.
2633
+
2634
+ 1:21:40.000 --> 1:21:50.049
2635
+ And some targets are open, so you always predict
2636
+ only like first auto regression is K.
2637
+
2638
+ 1:21:50.049 --> 1:21:59.174
2639
+ It puts one, so you always have one input
2640
+ and one output, then you do partial.
2641
+
2642
+ 1:21:59.699 --> 1:22:05.825
2643
+ So in that way you can slowly learn what is
2644
+ a good and what is a bad answer.
2645
+
2646
+ 1:22:08.528 --> 1:22:10.862
2647
+ It doesn't sound very impressive.
2648
+
2649
+ 1:22:10.793 --> 1:22:12.536
2650
+ Don't contact me anyway.
2651
+
2652
+ 1:22:12.466 --> 1:22:15.326
2653
+ Go all over your training data several.
2654
+
2655
+ 1:22:15.875 --> 1:22:20.655
2656
+ You can even switch in between.
2657
+
2658
+ 1:22:20.506 --> 1:22:29.321
2659
+ There is a homework on this thing where you
2660
+ try to start.
2661
+
2662
+ 1:22:31.271 --> 1:22:41.563
2663
+ You have to learn so there's a whole work
2664
+ on that so this is often happening and it doesn't
2665
+
2666
+ 1:22:41.563 --> 1:22:46.598
2667
+ mean it's less efficient but still it helps.
2668
+
2669
+ 1:22:49.389 --> 1:22:57.979
2670
+ For later maybe here are some examples of
2671
+ how much things help.
2672
+
2673
+ 1:22:57.845 --> 1:23:04.961
2674
+ Maybe one point here is that it's really important.
2675
+
2676
+ 1:23:05.365 --> 1:23:13.787
2677
+ Here's the translation performance and speed.
2678
+
2679
+ 1:23:13.604 --> 1:23:24.410
2680
+ One point which is a point is if you compare
2681
+ researchers.
2682
+
2683
+ 1:23:24.784 --> 1:23:33.880
2684
+ So yeah, if you're compared to one very weak
2685
+ baseline transformer even with beam search,
2686
+
2687
+ 1:23:33.880 --> 1:23:40.522
2688
+ then you're ten times slower than a very strong
2689
+ auto regressive.
2690
+
2691
+ 1:23:40.961 --> 1:23:50.047
2692
+ If you make a strong baseline then it's going
2693
+ down to depending on times and here like: You
2694
+
2695
+ 1:23:50.047 --> 1:23:53.504
2696
+ have a lot of different speed ups.
2697
+
2698
+ 1:23:53.405 --> 1:24:03.262
2699
+ Generally, it makes a strong baseline and
2700
+ not very simple transformer.
2701
+
2702
+ 1:24:07.407 --> 1:24:19.020
2703
+ Half Precision: Yeah, with this one last thing
2704
+ that you can do to speed up things and also
2705
+
2706
+ 1:24:19.020 --> 1:24:25.936
2707
+ reduce your memory is what is called half precision.
2708
+
2709
+ 1:24:26.326 --> 1:24:29.139
2710
+ And especially for decoding issues for training.
2711
+
2712
+ 1:24:29.081 --> 1:24:31.150
2713
+ Sometimes it also gets less stable.
2714
+
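A small PyTorch sketch of half-precision inference (the model here is a stand-in, not the lecture's system); for training, mixed precision with loss scaling is the usual, more stable choice.

```python
import torch
import torch.nn as nn

model = nn.Linear(512, 512)          # stand-in for a full translation model
inputs = torch.randn(8, 512)

if torch.cuda.is_available():
    # Convert the weights to fp16 once, cast the inputs on the fly.
    model = model.half().cuda()
    with torch.no_grad():
        output = model(inputs.half().cuda())
else:
    output = model(inputs)

# For training, frameworks typically use mixed precision
# (torch.cuda.amp.autocast + GradScaler) rather than pure fp16,
# because pure half-precision training tends to be less stable.
```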
2715
+ 1:24:32.592 --> 1:24:45.184
2716
+ With this we close nearly wait a bit, so what
2717
+ you should remember is that efficient machine
2718
+
2719
+ 1:24:45.184 --> 1:24:46.963
2720
+ translation.
2721
+
2722
+ 1:24:47.007 --> 1:24:51.939
2723
+ We have, for example, looked at knowledge
2724
+ distillation.
2725
+
2726
+ 1:24:51.851 --> 1:24:55.967
2727
+ We have looked at non auto regressive models.
2728
+
2729
+ 1:24:55.877 --> 1:24:57.671
2730
+ We have different.
2731
+
2732
+ 1:24:58.898 --> 1:25:02.383
2733
+ For today and then only requests.
2734
+
2735
+ 1:25:02.281 --> 1:25:08.432
2736
+ So if you haven't done so, please fill out
2737
+ the evaluation.
2738
+
2739
+ 1:25:08.388 --> 1:25:20.127
2740
+ So now if you have done so think then you
2741
+ should have and with the online people hopefully.
2742
+
2743
+ 1:25:20.320 --> 1:25:29.758
2744
+ Only possibility to tell us what things are
2745
+ good and what not the only one but the most
2746
+
2747
+ 1:25:29.758 --> 1:25:30.937
2748
+ efficient.
2749
+
2750
+ 1:25:31.851 --> 1:25:35.871
2751
+ So think of all the students doing it in this
2752
+ case okay and then thank.
2753
+
demo_data/lectures/Lecture-14-27.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59f384b3137c89cb3f00f2020badb6eb5ff6de5043bd9e015adab92072e27e62
3
+ size 113488295
demo_data/lectures/Lecture-15-11.07.2023/English.vtt ADDED
@@ -0,0 +1,2295 @@
1
+ WEBVTT
2
+
3
+ 0:00:00.060 --> 0:00:07.061
4
+ Introduction: OK good, so today's lecture is
5
+ on unsupervised machine translation, so what
6
+
7
+ 0:00:07.061 --> 0:00:13.512
8
+ you have seen so far is different techniques
9
+ around supervised NMT, so you are.
10
+
11
+ 0:00:13.593 --> 0:00:18.552
12
+ Data right so let's say in English coppers
13
+ you are one file and then in German you have
14
+
15
+ 0:00:18.552 --> 0:00:23.454
16
+ another file which is sentence to sentence
17
+ la and then you try to build systems around
18
+
19
+ 0:00:23.454 --> 0:00:23.679
20
+ it.
21
+
22
+ 0:00:24.324 --> 0:00:30.130
23
+ But what's different about this lecture is
24
+ that you assume that you have no final data
25
+
26
+ 0:00:30.130 --> 0:00:30.663
27
+ at all.
28
+
29
+ 0:00:30.597 --> 0:00:37.120
30
+ You only have monolingual data and the question
31
+ is how can we build systems to translate between
32
+
33
+ 0:00:37.120 --> 0:00:39.406
34
+ these two languages right and so.
35
+
36
+ 0:00:39.359 --> 0:00:44.658
37
+ This is a bit more realistic scenario because
38
+ you have so many languages in the world.
39
+
40
+ 0:00:44.597 --> 0:00:50.288
41
+ You cannot expect to have parallel data between
42
+ all the two languages and so, but in typical
43
+
44
+ 0:00:50.288 --> 0:00:55.612
45
+ cases you have newspapers and so on, which
46
+ is like monolingual files, and the question
47
+
48
+ 0:00:55.612 --> 0:00:57.999
49
+ is can we build something around them?
50
+
51
+ 0:00:59.980 --> 0:01:01.651
52
+ They like said for today.
53
+
54
+ 0:01:01.586 --> 0:01:05.844
55
+ First we'll start up with the interactions,
56
+ so why do we need it?
57
+
58
+ 0:01:05.780 --> 0:01:11.549
59
+ and also some infusion on how these models
60
+ work before going into the technical details.
61
+
62
+ 0:01:11.484 --> 0:01:17.303
63
+ I want to also go through an example, which
64
+ kind of gives you more understanding on how
65
+
66
+ 0:01:17.303 --> 0:01:19.264
67
+ people came up with these models.
68
+
69
+ 0:01:20.820 --> 0:01:23.905
70
+ Then the rest of the lecture is going to be
71
+ two parts.
72
+
73
+ 0:01:23.849 --> 0:01:26.045
74
+ One is we're going to translate words.
75
+
76
+ 0:01:25.989 --> 0:01:29.949
77
+ We're not going to care about how can we translate
78
+ the full sentence.
79
+
80
+ 0:01:29.893 --> 0:01:35.136
81
+ But given to monolingual files, how can we
82
+ get a dictionary basically, which is much easier
83
+
84
+ 0:01:35.136 --> 0:01:37.814
85
+ than generating something in a sentence level?
86
+
87
+ 0:01:38.698 --> 0:01:43.533
88
+ Then we're going to go into the harder case,
89
+ which is unsupervised sentence-level translation.
90
+
91
+ 0:01:44.204 --> 0:01:50.201
92
+ And here what you'll see is what are the training
93
+ objectives which are quite different than the
94
+
95
+ 0:01:50.201 --> 0:01:55.699
96
+ word translation and also where it doesn't
97
+ but because this is also quite important and
98
+
99
+ 0:01:55.699 --> 0:02:01.384
100
+ it's one of the reasons why unsupervised MT is
101
+ not used anymore, because the limitations kind
102
+
103
+ 0:02:01.384 --> 0:02:03.946
104
+ of go away from the realistic use cases.
105
+
106
+ 0:02:04.504 --> 0:02:06.922
107
+ And then that leads to the multilingual
108
+ models.
109
+
110
+ 0:02:06.873 --> 0:02:07.131
111
+ So.
112
+
113
+ 0:02:07.807 --> 0:02:12.915
114
+ People are trying to do to build systems for
115
+ languages that will not have any parallel data.
116
+
117
+ 0:02:12.860 --> 0:02:17.689
118
+ Is use multilingual models and combine with
119
+ these training objectives to get better at
120
+
121
+ 0:02:17.689 --> 0:02:17.911
122
+ it.
123
+
124
+ 0:02:17.856 --> 0:02:18.152
125
+ So.
126
+
127
+ 0:02:18.658 --> 0:02:24.396
128
+ People are not trying to build bilingual systems
129
+ currently for unsupervised machine translation,
130
+
131
+ 0:02:24.396 --> 0:02:30.011
132
+ but I think it's good to know how they came
133
+ to hear this point and what they're doing now.
134
+
135
+ 0:02:30.090 --> 0:02:34.687
136
+ You also see some patterns overlapping which
137
+ people are using.
138
+
139
+ 0:02:36.916 --> 0:02:41.642
140
+ So as you said before, and you probably hear
141
+ it multiple times now is that we have seven
142
+
143
+ 0:02:41.642 --> 0:02:43.076
144
+ thousand languages around.
145
+
146
+ 0:02:43.903 --> 0:02:49.460
147
+ Can be different dialects in someone, so it's
148
+ quite hard to distinguish what's the language,
149
+
150
+ 0:02:49.460 --> 0:02:54.957
151
+ but you can typically approximate that seven
152
+ thousand and that leads to twenty five million
153
+
154
+ 0:02:54.957 --> 0:02:59.318
155
+ pairs, which is the obvious reason why we do
156
+ not have any parallel data.
157
+
158
+ 0:03:00.560 --> 0:03:06.386
159
+ So you want to build an empty system for all
160
+ possible language pests and the question is
161
+
162
+ 0:03:06.386 --> 0:03:07.172
163
+ how can we?
164
+
165
+ 0:03:08.648 --> 0:03:13.325
166
+ The typical use case, but there are actually
167
+ quite few interesting use cases than what you
168
+
169
+ 0:03:13.325 --> 0:03:14.045
170
+ would expect.
171
+
172
+ 0:03:14.614 --> 0:03:20.508
173
+ One is the animal languages, which is the
174
+ real thing that's happening right now with.
175
+
176
+ 0:03:20.780 --> 0:03:26.250
177
+ The dog but with dolphins and so on, but I
178
+ couldn't find a picture that could show this,
179
+
180
+ 0:03:26.250 --> 0:03:31.659
181
+ but if you are interested in stuff like this
182
+ you can check out the website where people
183
+
184
+ 0:03:31.659 --> 0:03:34.916
185
+ are actually trying to understand how animals
186
+ speak.
187
+
188
+ 0:03:35.135 --> 0:03:37.356
189
+ It's Also a Bit More About.
190
+
191
+ 0:03:37.297 --> 0:03:44.124
192
+ Knowing what the animals want to say but may
193
+ not die dead but still people are trying to
194
+
195
+ 0:03:44.124 --> 0:03:44.661
196
+ do it.
197
+
198
+ 0:03:45.825 --> 0:03:50.689
199
+ More realistic thing that's happening is the
200
+ translation of programming languages.
201
+
202
+ 0:03:51.371 --> 0:03:56.963
203
+ And so this is quite a quite good scenario
204
+ for unsupervised NMT, is that you have
205
+
206
+ 0:03:56.963 --> 0:04:02.556
207
+ a lot of code available online right in C++
208
+ and in Python and the question is how can
209
+
210
+ 0:04:02.556 --> 0:04:08.402
211
+ we translate by just looking at the code alone
212
+ and no parallel functions and so on and this
213
+
214
+ 0:04:08.402 --> 0:04:10.754
215
+ is actually quite good right now so.
216
+
217
+ 0:04:12.032 --> 0:04:16.111
218
+ See how these techniques were applied to do
219
+ the programming translation.
220
+
221
+ 0:04:18.258 --> 0:04:23.882
222
+ And then you can also think of language as
223
+ something that is quite common so you can take
224
+
225
+ 0:04:23.882 --> 0:04:24.194
226
+ off.
227
+
228
+ 0:04:24.132 --> 0:04:29.594
229
+ Think of formal sentences in English as one
230
+ language and informal sentences in English
231
+
232
+ 0:04:29.594 --> 0:04:35.433
233
+ as another language and then learn the kind
234
+ to stay between them and then it kind of becomes
235
+
236
+ 0:04:35.433 --> 0:04:37.380
237
+ a style plan for a problem so.
238
+
239
+ 0:04:38.358 --> 0:04:43.042
240
+ Although it's translation, you can consider
241
+ different characteristics of a language and
242
+
243
+ 0:04:43.042 --> 0:04:46.875
244
+ then separate them as two different languages
245
+ and then try to map them.
246
+
247
+ 0:04:46.822 --> 0:04:52.022
248
+ So it's not only about languages, but you
249
+ can also do quite cool things by using unsupervised
250
+
251
+ 0:04:52.022 --> 0:04:54.327
252
+ techniques, which are quite possible also.
253
+
254
+ 0:04:56.256 --> 0:04:56.990
255
+ I am so.
256
+
257
+ 0:04:56.909 --> 0:05:04.292
258
+ This is kind of TV modeling for many of the
259
+ use cases that we have for ours, ours and MD.
260
+
261
+ 0:05:04.210 --> 0:05:11.835
262
+ But before we go into the modeling of these
263
+ systems, what I want you to do is look at these
264
+
265
+ 0:05:11.835 --> 0:05:12.415
266
+ dummy.
267
+
268
+ 0:05:13.813 --> 0:05:19.720
269
+ We have text and language one, text and language
270
+ two right, and nobody knows what these languages
271
+
272
+ 0:05:19.720 --> 0:05:20.082
273
+ mean.
274
+
275
+ 0:05:20.021 --> 0:05:23.719
276
+ They completely are made up right, and the
277
+ question is also.
278
+
279
+ 0:05:23.658 --> 0:05:29.344
280
+ They're not parallel lines, so the first line
281
+ here and the first line is not a line, they're
282
+
283
+ 0:05:29.344 --> 0:05:30.811
284
+ just monolingual files.
285
+
286
+ 0:05:32.052 --> 0:05:38.281
287
+ And now think about how can you translate
288
+ the word M1 from language one to language two,
289
+
290
+ 0:05:38.281 --> 0:05:41.851
291
+ and this kind of you see how we try to model
292
+ this.
293
+
294
+ 0:05:42.983 --> 0:05:47.966
295
+ Would take your time and then think of how
296
+ can you translate more into language two?
297
+
298
+ 0:06:41.321 --> 0:06:45.589
299
+ About the model, if you ask somebody who doesn't
300
+ know anything about machine translation right,
301
+
302
+ 0:06:45.589 --> 0:06:47.411
303
+ and then you ask them to translate more.
304
+
305
+ 0:07:01.201 --> 0:07:10.027
306
+ But it's also not quite easy if you think
307
+ of the way that I made this example is relatively
308
+
309
+ 0:07:10.027 --> 0:07:10.986
310
+ easy, so.
311
+
312
+ 0:07:11.431 --> 0:07:17.423
313
+ Basically, the first two sentences are these
314
+ two: A, B, C is E, and G cured up the U, V
315
+
316
+ 0:07:17.423 --> 0:07:21.849
317
+ is L, A, A, C, S, and S, on and this is used
318
+ towards the German.
319
+
320
+ 0:07:22.662 --> 0:07:25.241
321
+ And then when you join these two words, it's.
322
+
323
+ 0:07:25.205 --> 0:07:32.445
324
+ English German the third line and the last
325
+ line, and then the fourth line is the first
326
+
327
+ 0:07:32.445 --> 0:07:38.521
328
+ line, so German language, English, and then
329
+ speak English, speak German.
330
+
331
+ 0:07:38.578 --> 0:07:44.393
332
+ So this is how I made up the example
333
+ and what the intuition here is that you assume
334
+
335
+ 0:07:44.393 --> 0:07:50.535
336
+ that the languages have a fundamental structure
337
+ right and it's the same across all languages.
338
+
339
+ 0:07:51.211 --> 0:07:57.727
340
+ Doesn't matter what language you are thinking
341
+ of words kind of you have in the same way join
342
+
343
+ 0:07:57.727 --> 0:07:59.829
344
+ together is the same way and.
345
+
346
+ 0:07:59.779 --> 0:08:06.065
347
+ And plasma sign thinks the same way but this
348
+ is not a realistic assumption for sure but
349
+
350
+ 0:08:06.065 --> 0:08:12.636
351
+ it's actually a decent one to make and if you
352
+ can think of this like if you can assume this
353
+
354
+ 0:08:12.636 --> 0:08:16.207
355
+ then we can model systems in an unsupervised
356
+ way.
357
+
358
+ 0:08:16.396 --> 0:08:22.743
359
+ So this is the intuition that I want to give,
360
+ and you can see that whenever assumptions fail,
361
+
362
+ 0:08:22.743 --> 0:08:23.958
363
+ the systems fail.
364
+
365
+ 0:08:23.891 --> 0:08:29.824
366
+ So in practice whenever we go far away from
367
+ these assumptions, the systems try to more
368
+
369
+ 0:08:29.824 --> 0:08:30.778
370
+ time to fail.
371
+
372
+ 0:08:33.753 --> 0:08:39.711
373
+ So the example that I gave was actually perfect
374
+ mapping right, so it never really sticks bad.
375
+
376
+ 0:08:39.648 --> 0:08:45.321
377
+ They have the same number of words, same sentence
378
+ structure, perfect mapping, and so on.
379
+
380
+ 0:08:45.257 --> 0:08:50.995
381
+ This doesn't happen, but let's assume that
382
+ this happens and try to see how we can moral.
383
+
384
+ 0:08:53.493 --> 0:08:59.017
385
+ Unsupervised word translation: Okay, now let's
386
+ go a bit more formal, so what you want to do
387
+
388
+ 0:08:59.017 --> 0:09:01.042
389
+ is unsupervised word translation.
390
+
391
+ 0:09:01.901 --> 0:09:08.773
392
+ Here the task is that we have input data as
393
+ monolingual data, so a bunch of sentences in
394
+
395
+ 0:09:08.773 --> 0:09:15.876
396
+ one file and a bunch of sentences another file
397
+ in two different languages, and the question
398
+
399
+ 0:09:15.876 --> 0:09:18.655
400
+ is how can we get a bilingual dictionary?
401
+
402
+ 0:09:19.559 --> 0:09:25.134
403
+ So if you look at the picture you see that
404
+ it's just kind of projected down into two dimension
405
+
406
+ 0:09:25.134 --> 0:09:30.358
407
+ planes, but it's basically when you map them
408
+ into a plot you see that the words that are
409
+
410
+ 0:09:30.358 --> 0:09:35.874
411
+ parallel are closer together, and the question
412
+ is how can we do it just looking at two files?
413
+
414
+ 0:09:36.816 --> 0:09:42.502
415
+ And you can say that what we want to basically
416
+ do is create a dictionary in the end given
417
+
418
+ 0:09:42.502 --> 0:09:43.260
419
+ two files.
420
+
421
+ 0:09:43.197 --> 0:09:45.410
422
+ So this is the task that we want.
423
+
424
+ 0:09:46.606 --> 0:09:52.262
425
+ And the first step on how we do this is to
426
+ learn word vectors, and this can be whatever
427
+
428
+ 0:09:52.262 --> 0:09:56.257
429
+ techniques that you have seen before, word2vec,
430
+ GloVe, or so on.
431
+
432
+ 0:09:56.856 --> 0:10:00.699
433
+ So you take a monolingual data and try to
434
+ learn word embeddings.
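To make this first step concrete, here is a minimal sketch of training monolingual embeddings with gensim's word2vec; the file names are hypothetical placeholders for the two monolingual corpora (one tokenized sentence per line), not files from this repository.

    from gensim.models import Word2Vec

    # Minimal sketch: train independent monolingual embeddings per language.
    def train_embeddings(path, dim=300):
        sentences = [line.split() for line in open(path, encoding="utf-8")]
        model = Word2Vec(sentences, vector_size=dim, window=5, min_count=5, sg=1)
        return model.wv  # one vector per word

    src_vectors = train_embeddings("mono.en.txt")  # hypothetical English corpus
    tgt_vectors = train_embeddings("mono.de.txt")  # hypothetical German corpus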
435
+
436
+ 0:10:02.002 --> 0:10:07.675
437
+ Then you plot them into a graph, and then
438
+ typically what you would see is that they're
439
+
440
+ 0:10:07.675 --> 0:10:08.979
441
+ not aligned at all.
442
+
443
+ 0:10:08.914 --> 0:10:14.693
444
+ One word space is somewhere, and one word
445
+ space is somewhere else, and this is what you
446
+
447
+ 0:10:14.693 --> 0:10:18.043
448
+ would typically expect to see in the in the
449
+ image.
450
+
451
+ 0:10:19.659 --> 0:10:23.525
452
+ Now our assumption was that both lines we
453
+ just have the same.
454
+
455
+ 0:10:23.563 --> 0:10:28.520
456
+ Culture and so that we can use this information
457
+ to learn the mapping between these two spaces.
458
+
459
+ 0:10:30.130 --> 0:10:37.085
460
+ So before how we do it, I think this is quite
461
+ famous already, and everybody knows it a bit
462
+
463
+ 0:10:37.085 --> 0:10:41.824
464
+ more is that word embeddings capture semantic
465
+ relations right.
466
+
467
+ 0:10:41.747 --> 0:10:48.245
468
+ So the distance between man and woman is approximately
469
+ the same as king and prince.
470
+
471
+ 0:10:48.888 --> 0:10:54.620
472
+ It also holds for verb tenses, country-capital
473
+ and so on, so there are some relationships
474
+
475
+ 0:10:54.620 --> 0:11:00.286
476
+ happening in the word embedding space, which
477
+ is quite clear for at least one language.
478
+
479
+ 0:11:03.143 --> 0:11:08.082
480
+ Now if you think of this, let's say of the
481
+ English word embeddings.
482
+
483
+ 0:11:08.006 --> 0:11:14.746
484
+ Let's say the German word embeddings, and the way
485
+ king, queen, man, woman are organized is the same
486
+
487
+ 0:11:14.746 --> 0:11:17.734
488
+ as the German translation of his word.
489
+
490
+ 0:11:17.998 --> 0:11:23.336
491
+ This is the main idea is that although they
492
+ are somewhere else, the relationship is the
493
+
494
+ 0:11:23.336 --> 0:11:28.008
495
+ same between the both languages and we can
496
+ use this to to learn the mapping.
497
+
498
+ 0:11:31.811 --> 0:11:35.716
499
+ It's not only for these four words; it
500
+ happens for all the words in the language,
501
+
502
+ 0:11:35.716 --> 0:11:37.783
503
+ and so we can use this to learn the mapping.
504
+
505
+ 0:11:39.179 --> 0:11:43.828
506
+ This is the main idea is that both embeddings
507
+ have a similar shape.
508
+
509
+ 0:11:43.759 --> 0:11:48.431
510
+ It's only that they're just not aligned and
511
+ so you go to the here.
512
+
513
+ 0:11:48.362 --> 0:11:50.821
514
+ They kind of have a similar shape.
515
+
516
+ 0:11:50.751 --> 0:11:57.211
517
+ They're just in some different spaces and
518
+ what you need to do is to map them into a common
519
+
520
+ 0:11:57.211 --> 0:11:57.708
521
+ space.
522
+
523
+ 0:12:06.086 --> 0:12:12.393
524
+ We want a W such that if we multiply W with X,
525
+ they both become aligned.
526
+
527
+ 0:12:35.335 --> 0:12:41.097
528
+ That's true, but there are also many works
529
+ that have the relationship right, and we hope
530
+
531
+ 0:12:41.097 --> 0:12:43.817
532
+ that this is enough to learn the mapping.
533
+
534
+ 0:12:43.752 --> 0:12:49.823
535
+ So there's always going to be a bit of noise,
536
+ as in how when we align them they're not going
537
+
538
+ 0:12:49.823 --> 0:12:51.716
539
+ to be exactly the same, but.
540
+
541
+ 0:12:51.671 --> 0:12:57.293
542
+ What you can expect is that there are these
543
+ main works that allow us to learn the mapping,
544
+
545
+ 0:12:57.293 --> 0:13:02.791
546
+ so it's not going to be perfect, but it's an
547
+ approximation that we make to to see how it
548
+
549
+ 0:13:02.791 --> 0:13:04.521
550
+ works and then practice it.
551
+
552
+ 0:13:04.459 --> 0:13:10.078
553
+ Also, it's not that the fact that women do
554
+ not have any relationship does not affect that
555
+
556
+ 0:13:10.078 --> 0:13:10.452
557
+ much.
558
+
559
+ 0:13:10.550 --> 0:13:15.429
560
+ A lot of words usually have, so it kind of
561
+ works out in practice.
562
+
563
+ 0:13:22.242 --> 0:13:34.248
564
+ I have not heard about it, but if you want
565
+ to say something about it, I would be interested,
566
+
567
+ 0:13:34.248 --> 0:13:37.346
568
+ but we can do it later.
569
+
570
+ 0:13:41.281 --> 0:13:44.133
571
+ Usual case: This is supervised.
572
+
573
+ 0:13:45.205 --> 0:13:49.484
574
+ First way is to do a supervised word translation
575
+ where we have a dictionary right and that we
576
+
577
+ 0:13:49.484 --> 0:13:53.764
578
+ can use that to learn the mapping, but in our
579
+ case we assume that we have nothing right so
580
+
581
+ 0:13:53.764 --> 0:13:55.222
582
+ we only have monolingual data.
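For contrast, the supervised variant mentioned here has a closed-form solution: with a seed dictionary you can solve the orthogonal Procrustes problem for W. A minimal numpy sketch under that assumption (X and Y are row-aligned embedding matrices for the dictionary pairs; this is the supervised baseline, not the unsupervised method discussed next):

    import numpy as np

    def procrustes_mapping(X, Y):
        # Orthogonal Procrustes: find orthogonal W minimizing ||X W - Y||_F,
        # where row i of X and row i of Y embed a dictionary word pair.
        U, _, Vt = np.linalg.svd(X.T @ Y)
        return U @ Vt  # d x d orthogonal mapping; apply it as X @ W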
583
+
584
+ 0:13:56.136 --> 0:14:03.126
585
+ Then we need unsupervised learning to figure
586
+ out W, and we're going to use GANs to find
587
+
588
+ 0:14:03.126 --> 0:14:06.122
589
+ W, and it's quite a nice way to do it.
590
+
591
+ 0:14:08.248 --> 0:14:15.393
592
+ So just before I go into how we use it for our use
593
+ case, I'm going to go briefly over GANs right,
594
+
595
+ 0:14:15.393 --> 0:14:19.940
596
+ so we have two components: generator and discriminator.
597
+
598
+ 0:14:21.441 --> 0:14:27.052
599
+ The generator tries to generate something obviously,
600
+ and the discriminator tries to see if it's
601
+
602
+ 0:14:27.052 --> 0:14:30.752
603
+ real data or something that is generated by
604
+ the generation.
605
+
606
+ 0:14:31.371 --> 0:14:37.038
607
+ And there's like this two player game where
608
+ the generator tries to fool and the discriminator tries
609
+
610
+ 0:14:37.038 --> 0:14:41.862
611
+ not to get fooled, and they try to build these
612
+ two components and try to learn W.
613
+
614
+ 0:14:43.483 --> 0:14:53.163
615
+ Okay, so let's say we have two languages,
616
+ X and Y right, so the X language has N words
617
+
618
+ 0:14:53.163 --> 0:14:56.167
619
+ with numbering dimensions.
620
+
621
+ 0:14:56.496 --> 0:14:59.498
622
+ So what I'm reading is matrix is peak or something.
623
+
624
+ 0:14:59.440 --> 0:15:02.174
625
+ Then we have target language Y with M words.
626
+
627
+ 0:15:02.116 --> 0:15:06.945
628
+ I'm also the same amount of things I mentioned
629
+ and then we have a matrix peak or.
630
+
631
+ 0:15:07.927 --> 0:15:13.784
632
+ Basically what you're going to do is use word2vec
633
+ and learn our word embeddings.
634
+
635
+ 0:15:14.995 --> 0:15:23.134
636
+ Now we have these X embeddings, Y embeddings, and
637
+ what you want to learn is W, such that W X and
638
+
639
+ 0:15:23.134 --> 0:15:24.336
640
+ Y are aligned.
641
+
642
+ 0:15:29.209 --> 0:15:35.489
643
+ With GANs you have two steps, one is a discriminative
644
+ step and one is the the mapping step and the
645
+
646
+ 0:15:35.489 --> 0:15:41.135
647
+ discriminative step is to see if the embeddings
648
+ are from the source or mapped embedding.
649
+
650
+ 0:15:41.072 --> 0:15:44.689
651
+ So it's going to be much scary when I go to
652
+ the figure.
653
+
654
+ 0:15:46.306 --> 0:15:50.041
655
+ So we have a monolingual documents with two
656
+ different languages.
657
+
658
+ 0:15:49.983 --> 0:15:54.498
659
+ From here we get our source language embeddings,
660
+ target language embeddings right.
661
+
662
+ 0:15:54.440 --> 0:15:58.905
663
+ Then we randomly initialize the transformation
664
+ matrix W.
665
+
666
+ 0:15:58.905 --> 0:16:05.603
667
+ Then we have the discriminator which tries
668
+ to see if it's WX or Y, so it needs to know
669
+
670
+ 0:16:05.603 --> 0:16:13.379
671
+ that this is a mapped one and this is the original
672
+ language, and so if you look at the loss function
673
+
674
+ 0:16:13.379 --> 0:16:20.076
675
+ here, it's basically that source is one given
676
+ WX, so this is from the source language.
677
+
678
+ 0:16:23.543 --> 0:16:27.339
679
+ Which means it's the target language em yeah.
680
+
681
+ 0:16:27.257 --> 0:16:34.437
682
+ It's just like my figure is not that great,
683
+ but you can assume that they are totally.
684
+
685
+ 0:16:40.260 --> 0:16:43.027
686
+ So this is the kind of the loss function.
687
+
688
+ 0:16:42.961 --> 0:16:46.338
689
+ We have N source words, M target words, and
690
+ so on.
691
+
692
+ 0:16:46.272 --> 0:16:52.341
693
+ So that's why you have one by M, one by M,
694
+ and the discriminator is to just see if they're
695
+
696
+ 0:16:52.341 --> 0:16:55.742
697
+ mapped or they're from the original target
698
+ number.
699
+
700
+ 0:16:57.317 --> 0:17:04.024
701
+ And then we have the mapping step where we
702
+ train W to fool the the discriminators.
703
+
704
+ 0:17:04.564 --> 0:17:10.243
705
+ So here it's the same way, but what you're
706
+ going to just do is inverse the loss function.
707
+
708
+ 0:17:10.180 --> 0:17:15.829
709
+ So now we freeze the discriminators, so it's
710
+ important to note that in the previous sect
711
+
712
+ 0:17:15.829 --> 0:17:20.844
713
+ we freezed the transformation matrix, and here
714
+ we freezed your discriminators.
715
+
716
+ 0:17:22.482 --> 0:17:30.228
717
+ And now it's to fool the discriminated rights,
718
+ so it should predict that the source is zero
719
+
720
+ 0:17:30.228 --> 0:17:37.889
721
+ given the map numbering, and the source is
722
+ one given the target numbering, which is wrong,
723
+
724
+ 0:17:37.889 --> 0:17:40.920
725
+ which is why we're training the W.
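A minimal PyTorch sketch of these two alternating steps; the architecture and hyper-parameters here are illustrative assumptions rather than the original recipe, and x_batch / y_batch stand for batches of the pre-trained source and target word embeddings:

    import torch
    import torch.nn as nn

    d = 300
    W = nn.Linear(d, d, bias=False)                                    # mapping to learn
    D = nn.Sequential(nn.Linear(d, 512), nn.LeakyReLU(), nn.Linear(512, 1))
    opt_w = torch.optim.SGD(W.parameters(), lr=0.1)
    opt_d = torch.optim.SGD(D.parameters(), lr=0.1)
    bce = nn.BCEWithLogitsLoss()

    def discriminator_step(x_batch, y_batch):
        # D is trained to say "mapped" (1) for W x and "real target" (0) for y
        logits = torch.cat([D(W(x_batch).detach()), D(y_batch)])
        labels = torch.cat([torch.ones(len(x_batch), 1), torch.zeros(len(y_batch), 1)])
        loss = bce(logits, labels)
        opt_d.zero_grad(); loss.backward(); opt_d.step()

    def mapping_step(x_batch, y_batch):
        # W is trained to fool the discriminator: same loss with inverted labels,
        # and only W's parameters are updated
        logits = torch.cat([D(W(x_batch)), D(y_batch)])
        labels = torch.cat([torch.zeros(len(x_batch), 1), torch.ones(len(y_batch), 1)])
        loss = bce(logits, labels)
        opt_w.zero_grad(); loss.backward(); opt_w.step()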
726
+
727
+ 0:17:40.920 --> 0:17:46.308
728
+ Any questions on this okay so then how do
729
+ we know when to stop?
730
+
731
+ 0:17:46.224 --> 0:17:55.845
732
+ We just train until we reach convergence right
733
+ and then we have our W hopefully train and
734
+
735
+ 0:17:55.845 --> 0:17:59.265
736
+ map them into an aligned space.
737
+
738
+ 0:18:02.222 --> 0:18:07.097
739
+ The question is how can we evaluate this mapping?
740
+
741
+ 0:18:07.000 --> 0:18:13.902
742
+ Does anybody know what we can use to mapping
743
+ or evaluate the mapping?
744
+
745
+ 0:18:13.803 --> 0:18:15.879
746
+ How good is a word?
747
+
748
+ 0:18:28.969 --> 0:18:33.538
749
+ We use as I said we use a dictionary, at least
750
+ in the end.
751
+
752
+ 0:18:33.461 --> 0:18:40.179
753
+ We need a dictionary to evaluate, so this
754
+ is our only final, so we aren't using it at
755
+
756
+ 0:18:40.179 --> 0:18:42.600
757
+ all in attaining data and the.
758
+
759
+ 0:18:43.223 --> 0:18:49.681
760
+ One way is to check what's the precision for
761
+ our dictionary, just that.
762
+
763
+ 0:18:50.650 --> 0:18:52.813
764
+ The first nearest neighbor and see if it's
765
+ there on.
766
+
767
+ 0:18:53.573 --> 0:18:56.855
768
+ But this is quite strict because there's a
769
+ lot of noise in the embedding space right.
770
+
771
+ 0:18:57.657 --> 0:19:03.114
772
+ Not always your first neighbor is going to
773
+ be the translation, so what people also report
774
+
775
+ 0:19:03.114 --> 0:19:05.055
776
+ is precision at five and so on.
777
+
778
+ 0:19:04.994 --> 0:19:10.175
779
+ So you take the five nearest neighbors and see
780
+ if the translation is in there and so on.
781
+
782
+ 0:19:10.114 --> 0:19:15.529
783
+ So the more you increase, the more likely
784
+ that there is a translation because where I'm
785
+
786
+ 0:19:15.529 --> 0:19:16.698
787
+ being quite noisy.
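As a concrete sketch, precision@k for an induced mapping W could be computed roughly like this (numpy; gold_pairs is the held-out evaluation dictionary as (source index, target index) pairs; names are illustrative):

    import numpy as np

    def precision_at_k(W, X_src, Y_tgt, gold_pairs, k=5):
        # Map the source embeddings, normalise, and check whether the gold target
        # word is among the k nearest target neighbours by cosine similarity.
        mapped = X_src @ W
        mapped = mapped / np.linalg.norm(mapped, axis=1, keepdims=True)
        tgt = Y_tgt / np.linalg.norm(Y_tgt, axis=1, keepdims=True)
        hits = 0
        for i, j in gold_pairs:
            sims = tgt @ mapped[i]
            topk = np.argpartition(-sims, k)[:k]
            hits += int(j in topk)
        return hits / len(gold_pairs)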
788
+
789
+ 0:19:19.239 --> 0:19:25.924
790
+ What's interesting is that people have used
791
+ dictionary to to learn word translation, but
792
+
793
+ 0:19:25.924 --> 0:19:32.985
794
+ the way of doing this is much better than using
795
+ a dictionary, so somehow our assumption helps
796
+
797
+ 0:19:32.985 --> 0:19:36.591
798
+ us to to build better than a supervised system.
799
+
800
+ 0:19:39.099 --> 0:19:42.985
801
+ So as you see on the top you have precision
802
+ at one five ten.
803
+
804
+ 0:19:42.922 --> 0:19:47.310
805
+ These are the typical numbers that you report
806
+ for word translation.
807
+
808
+ 0:19:48.868 --> 0:19:55.996
809
+ But GANs are usually quite tricky to train,
810
+ and it does not converge on all language pairs,
811
+
812
+ 0:19:55.996 --> 0:20:02.820
813
+ and this kind of goes back to a assumption
814
+ that they kind of behave in the same structure
815
+
816
+ 0:20:02.820 --> 0:20:03.351
817
+ right.
818
+
819
+ 0:20:03.275 --> 0:20:07.144
820
+ But if you take a language like English and
821
+ some.
822
+
823
+ 0:20:07.087 --> 0:20:12.203
824
+ Other languages are almost very lotus, so
825
+ it's quite different from English and so on.
826
+
827
+ 0:20:12.144 --> 0:20:13.623
828
+ Then I've one language,.
829
+
830
+ 0:20:13.564 --> 0:20:18.754
831
+ So whenever whenever our assumption fails,
832
+ these unsupervised techniques always do not
833
+
834
+ 0:20:18.754 --> 0:20:21.200
835
+ converge or just give really bad scores.
836
+
837
+ 0:20:22.162 --> 0:20:27.083
838
+ And so the fact is that the monolingual embeddings
839
+ for distant languages are too far.
840
+
841
+ 0:20:27.024 --> 0:20:30.950
842
+ They do not share the same structure, and
843
+ so they do not converge.
844
+
845
+ 0:20:32.452 --> 0:20:39.380
846
+ And so I just want to mention that there is
847
+ a better retrieval technique than the nearest
848
+
849
+ 0:20:39.380 --> 0:20:41.458
850
+ neighbor, which is called.
851
+
852
+ 0:20:42.882 --> 0:20:46.975
853
+ But it's more advanced than mathematical,
854
+ so I didn't want to go in it now.
855
+
856
+ 0:20:46.921 --> 0:20:51.811
857
+ But if your interest is in some quite good
858
+ retrieval segments, you can just look at these
859
+
860
+ 0:20:51.811 --> 0:20:53.007
861
+ if you're interested.
862
+
863
+ 0:20:55.615 --> 0:20:59.216
864
+ Cure for word translation: Okay, so this is
865
+ about the word translation.
866
+
867
+ 0:20:59.167 --> 0:21:02.278
868
+ Does anybody have any questions of cure?
869
+
870
+ 0:21:06.246 --> 0:21:07.501
871
+ Was the worst answer?
872
+
873
+ 0:21:07.444 --> 0:21:12.545
874
+ It was a bit easier than a sentence right,
875
+ so you just assume that there's a mapping and
876
+
877
+ 0:21:12.545 --> 0:21:14.551
878
+ then you try to learn the mapping.
879
+
880
+ 0:21:14.493 --> 0:21:19.641
881
+ But now it's a bit more difficult because
882
+ you need to generate stuff also, which is quite
883
+
884
+ 0:21:19.641 --> 0:21:20.798
885
+ much more trickier.
886
+
887
+ 0:21:22.622 --> 0:21:28.512
888
+ Task here is that we have our input as monolingual
889
+ data for both languages as before, but
890
+
891
+ 0:21:28.512 --> 0:21:34.017
892
+ now what we want to do is instead of translating
893
+ word by word we want to do sentence.
894
+
895
+ 0:21:37.377 --> 0:21:44.002
896
+ We have word2vec now and so on to learn
897
+ word embeddings, but sentence embeddings are
898
+
899
+ 0:21:44.002 --> 0:21:50.627
900
+ actually not the site powered often, at least
901
+ when people tried to work on unsupervised
902
+
903
+ 0:21:50.627 --> 0:21:51.445
904
+ NMT before.
905
+
906
+ 0:21:52.632 --> 0:21:54.008
907
+ Now they're a bit okay.
908
+
909
+ 0:21:53.951 --> 0:21:59.028
910
+ I mean, as you've seen in the practice on
911
+ where we used places, they were quite decent.
912
+
913
+ 0:21:58.971 --> 0:22:03.007
914
+ But then it's also the case on which data
915
+ it's trained on and so on.
916
+
917
+ 0:22:02.949 --> 0:22:03.261
918
+ So.
919
+
920
+ 0:22:04.164 --> 0:22:09.666
921
+ Sentence embeddings are definitely much
922
+ harder to get than word embeddings, so this
923
+
924
+ 0:22:09.666 --> 0:22:13.776
925
+ is a bit more complicated than the task that
926
+ you've seen before.
927
+
928
+ 0:22:16.476 --> 0:22:16.994
929
+ How U.
930
+
931
+ 0:22:16.994 --> 0:22:17.216
932
+ N.
933
+
934
+ 0:22:17.216 --> 0:22:17.438
935
+ T.
936
+
937
+ 0:22:17.438 --> 0:22:19.659
938
+ WorksBefore we go into how U.
939
+
940
+ 0:22:19.659 --> 0:22:19.881
941
+ N.
942
+
943
+ 0:22:19.881 --> 0:22:20.103
944
+ M.
945
+
946
+ 0:22:20.103 --> 0:22:20.325
947
+ T.
948
+
949
+ 0:22:20.325 --> 0:22:24.470
950
+ Works, so this is your typical supervised
951
+ system right.
952
+
953
+ 0:22:24.396 --> 0:22:29.537
954
+ So we have parallel data source sentence target
955
+ sentences.
956
+
957
+ 0:22:29.447 --> 0:22:31.166
958
+ We have a source encoder.
959
+
960
+ 0:22:31.471 --> 0:22:36.709
961
+ We have a target decoder and then we try to
962
+ minimize the cross entropy loss on this parallel
963
+
964
+ 0:22:36.709 --> 0:22:37.054
965
+ data.
966
+
967
+ 0:22:37.157 --> 0:22:39.818
968
+ And this is how we train our typical system.
969
+
970
+ 0:22:43.583 --> 0:22:49.506
971
+ But now we do not have any parallel data,
972
+ and so the intuition here is that if we can
973
+
974
+ 0:22:49.506 --> 0:22:55.429
975
+ learn language independent representations
976
+ at the encoder outputs, then we can pass
977
+
978
+ 0:22:55.429 --> 0:22:58.046
979
+ it along to the decoder that we want.
980
+
981
+ 0:22:58.718 --> 0:23:03.809
982
+ It's going to get more clear in the future,
983
+ but I'm trying to give a bit more intuition
984
+
985
+ 0:23:03.809 --> 0:23:07.164
986
+ before I'm going to show you all the planning
987
+ objectives.
988
+
989
+ 0:23:08.688 --> 0:23:15.252
990
+ So I assume that we have these different encoders
991
+ right, so it's not only two, you have a bunch
992
+
993
+ 0:23:15.252 --> 0:23:21.405
994
+ of different source language encoders, a bunch
995
+ of different target language decoders, and
996
+
997
+ 0:23:21.405 --> 0:23:26.054
998
+ also I assume that the encoder is in the same
999
+ representation space.
1000
+
1001
+ 0:23:26.706 --> 0:23:31.932
1002
+ If you give a sentence in English and the
1003
+ same sentence in German, the embeddings are
1004
+
1005
+ 0:23:31.932 --> 0:23:38.313
1006
+ quite the same, so like the multilingual word embeddings
1007
+ right, and so then what we can do is, depending
1008
+
1009
+ 0:23:38.313 --> 0:23:42.202
1010
+ on the language we want, pass it to the the
1011
+ appropriate decoder.
1012
+
1013
+ 0:23:42.682 --> 0:23:50.141
1014
+ And so the kind of goal here is to find out
1015
+ a way to create language independent representations
1016
+
1017
+ 0:23:50.141 --> 0:23:52.909
1018
+ and then pass it to the decodement.
1019
+
1020
+ 0:23:54.975 --> 0:23:59.714
1021
+ Just keep in mind that you're trying to do
1022
+ language independent for some reason, but it's
1023
+
1024
+ 0:23:59.714 --> 0:24:02.294
1025
+ going to be more clear once we see how it works.
1026
+
1027
+ 0:24:05.585 --> 0:24:12.845
1028
+ So in total we have three objectives that
1029
+ we're going to try to train in our systems,
1030
+
1031
+ 0:24:12.845 --> 0:24:16.981
1032
+ so this is and all of them use monolingual
1033
+ data.
1034
+
1035
+ 0:24:17.697 --> 0:24:19.559
1036
+ So there's no parallel data at all.
1037
+
1038
+ 0:24:19.503 --> 0:24:24.448
1039
+ The first one is denoising autoencoding,
1040
+ so it's more like you add noise to
1041
+
1042
+ 0:24:24.448 --> 0:24:27.404
1043
+ the sentence, and then reconstruct the original.
1044
+
1045
+ 0:24:28.388 --> 0:24:34.276
1046
+ Then we have the on-the-fly back translation,
1047
+ so this is where you take a sentence, generate
1048
+
1049
+ 0:24:34.276 --> 0:24:39.902
1050
+ a translation, and then learn the the word
1051
+ smarting, which I'm going to show pictures
1052
+
1053
+ 0:24:39.902 --> 0:24:45.725
1054
+ later, and then we have an adversarial
1055
+ training to learn the language independent
1056
+
1057
+ 0:24:45.725 --> 0:24:46.772
1058
+ representation.
1059
+
1060
+ 0:24:47.427 --> 0:24:52.148
1061
+ So somehow we'll fill in these three tasks
1062
+ or retain on these three tasks.
1063
+
1064
+ 0:24:52.085 --> 0:24:55.324
1065
+ We somehow get an answer to President M.
1066
+
1067
+ 0:24:55.324 --> 0:24:55.561
1068
+ T.
1069
+
1070
+ 0:24:55.561 --> 0:25:02.513
1071
+ OK, so first we're going to do is denoising
1072
+ autoencoding right, so as I said we add
1073
+
1074
+ 0:25:02.513 --> 0:25:06.305
1075
+ noise to the sentence, so we take our sentence.
1076
+
1077
+ 0:25:06.826 --> 0:25:09.709
1078
+ And then there are different ways to add noise.
1079
+
1080
+ 0:25:09.649 --> 0:25:11.463
1081
+ You can shuffle words around.
1082
+
1083
+ 0:25:11.402 --> 0:25:12.621
1084
+ You can drop words.
1085
+
1086
+ 0:25:12.560 --> 0:25:18.284
1087
+ Do whatever you want to do as long as there's
1088
+ enough information to reconstruct the original
1089
+
1090
+ 0:25:18.284 --> 0:25:18.900
1091
+ sentence.
1092
+
1093
+ 0:25:19.719 --> 0:25:25.051
1094
+ And then we assume that the noised one and
1095
+ the original one are parallel data and train
1096
+
1097
+ 0:25:25.051 --> 0:25:26.687
1098
+ similar to the supervised.
1099
+
1100
+ 0:25:28.168 --> 0:25:30.354
1101
+ So we have a source sentence.
1102
+
1103
+ 0:25:30.281 --> 0:25:32.476
1104
+ We have a noisy source right.
1105
+
1106
+ 0:25:32.403 --> 0:25:37.037
1107
+ So here what basically happened is that the
1108
+ word got shuffled.
1109
+
1110
+ 0:25:36.963 --> 0:25:38.964
1111
+ One word is dropped right.
1112
+
1113
+ 0:25:38.890 --> 0:25:41.208
1114
+ So this was a noise of source.
1115
+
1116
+ 0:25:41.133 --> 0:25:47.042
1117
+ And then we treat the noise of source and
1118
+ source as a sentence pair basically.
1119
+
1120
+ 0:25:49.009 --> 0:25:53.874
1121
+ We train by optimizing the cross entropy
1122
+ loss similar to.
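A minimal sketch of such a noise function (word dropping plus a local shuffle); the exact noise model is a design choice assumed here, not prescribed by the lecture:

    import random

    def add_noise(tokens, drop_prob=0.1, shuffle_window=3):
        # Randomly drop tokens, then shuffle the survivors within a small window.
        kept = [t for t in tokens if random.random() > drop_prob] or tokens[:1]
        keys = [i + random.uniform(0, shuffle_window) for i in range(len(kept))]
        return [t for _, t in sorted(zip(keys, kept))]

    source = "the cat sat on the mat".split()
    training_pair = (add_noise(source), source)   # (noisy input, clean target)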
1123
+
1124
+ 0:25:57.978 --> 0:26:03.211
1125
+ Basically a picture to show what's happening
1126
+ and we have the nice resources.
1127
+
1128
+ 0:26:03.163 --> 0:26:09.210
1129
+ Now is the target and then we have the reconstructed
1130
+ original source and original tag and since
1131
+
1132
+ 0:26:09.210 --> 0:26:14.817
1133
+ the languages are different we have our source
1134
+ hand coded target and coded source coded.
1135
+
1136
+ 0:26:17.317 --> 0:26:20.202
1137
+ And for this task we only need monolingual
1138
+ data.
1139
+
1140
+ 0:26:20.143 --> 0:26:25.249
1141
+ We don't need any parallel data because it's
1142
+ just taking a sentence and shuffling it and
1143
+
1144
+ 0:26:25.249 --> 0:26:27.446
1145
+ reconstructing the the original one.
1146
+
1147
+ 0:26:28.848 --> 0:26:31.058
1148
+ And we are four different blocks.
1149
+
1150
+ 0:26:30.993 --> 0:26:36.842
1151
+ This is kind of very important to keep in
1152
+ mind on how we change these connections later.
1153
+
1154
+ 0:26:41.121 --> 0:26:49.093
1155
+ Then this is more like the mathematical formulation
1156
+ where you predict source given the noisy.
1157
+
1158
+ 0:26:52.492 --> 0:26:55.090
1159
+ So that was the denoising autoencoding.
1160
+
1161
+ 0:26:55.025 --> 0:26:58.404
1162
+ The second step is on-the-fly back translation.
1163
+
1164
+ 0:26:59.479 --> 0:27:06.386
1165
+ So what we do is, we put our model inference
1166
+ mode right, we take a source of sentences,
1167
+
1168
+ 0:27:06.386 --> 0:27:09.447
1169
+ and we generate a translation pattern.
1170
+
1171
+ 0:27:09.829 --> 0:27:18.534
1172
+ It might be completely wrong or maybe partially
1173
+ correct or so on, but we assume that the model
1174
+
1175
+ 0:27:18.534 --> 0:27:20.091
1176
+ knows of it and.
1177
+
1178
+ 0:27:20.680 --> 0:27:25.779
1179
+ We generate T-hat right and then what we do
1180
+ is assume that T-hat, or not assume, but T-hat
1181
+
1182
+ 0:27:25.779 --> 0:27:27.572
1183
+ and S are a sentence pair right.
1184
+
1185
+ 0:27:27.516 --> 0:27:29.927
1186
+ That's how we can handle the translation.
1187
+
1188
+ 0:27:30.530 --> 0:27:38.824
1189
+ So we train a supervised system on this sentence
1190
+ pair, so we do inference and then build a reverse
1191
+
1192
+ 0:27:38.824 --> 0:27:39.924
1193
+ translation.
1194
+
1195
+ 0:27:42.442 --> 0:27:49.495
1196
+ To be a bit more concrete, so we have a source
1197
+ sentence right, then we generate the translation,
1198
+
1199
+ 0:27:49.495 --> 0:27:55.091
1200
+ then we give the generated translation as an
1201
+ input and try to predict the.
1202
+
1203
+ 0:27:58.378 --> 0:28:03.500
1204
+ This is how we would do in practice right,
1205
+ so not before the source encoder was connected
1206
+
1207
+ 0:28:03.500 --> 0:28:08.907
1208
+ to the source decoder, but now we interchanged
1209
+ connections, so the source encoder is connected
1210
+
1211
+ 0:28:08.907 --> 0:28:10.216
1212
+ to the target decoder.
1213
+
1214
+ 0:28:10.159 --> 0:28:13.291
1215
+ The target encoder is turned into the source
1216
+ decoder.
1217
+
1218
+ 0:28:13.974 --> 0:28:20.747
1219
+ And given s we get t-hat and given t we get
1220
+ s-hat, so this is the first time.
1221
+
1222
+ 0:28:21.661 --> 0:28:24.022
1223
+ On the second time step, what you're going
1224
+ to do is reverse.
1225
+
1226
+ 0:28:24.664 --> 0:28:32.625
1227
+ So as that is here, t hat is here, and given
1228
+ s hat we are trying to predict t, and given
1229
+
1230
+ 0:28:32.625 --> 0:28:34.503
1231
+ t hat we are trying.
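Putting the two directions together, one round of on-the-fly back-translation could be sketched like this; translate and train_step are hypothetical wrappers around the two encoder/decoder pairings, not a specific library API:

    def backtranslation_round(src_batch, tgt_batch, src2tgt, tgt2src):
        # 1) inference with the current models: synthetic translations
        t_hat = [src2tgt.translate(s) for s in src_batch]
        s_hat = [tgt2src.translate(t) for t in tgt_batch]
        # 2) supervised updates in the reverse direction:
        #    synthetic (possibly wrong) input, gold monolingual output
        tgt2src.train_step(inputs=t_hat, targets=src_batch)
        src2tgt.train_step(inputs=s_hat, targets=tgt_batch)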
1232
+
1233
+ 0:28:36.636 --> 0:28:39.386
1234
+ Is this clear you have any questions on?
1235
+
1236
+ 0:28:45.405 --> 0:28:50.823
1237
+ Bit more mathematically, we try to play the
1238
+ class, give and take and so it's always the
1239
+
1240
+ 0:28:50.823 --> 0:28:53.963
1241
+ supervised NMT technique that we are trying
1242
+ to do.
1243
+
1244
+ 0:28:53.901 --> 0:28:59.684
1245
+ But you're trying to create this synthetic
1246
+ pairs that kind of help us to build an unsupervised
1247
+
1248
+ 0:28:59.684 --> 0:29:00.182
1249
+ system.
1250
+
1251
+ 0:29:02.362 --> 0:29:08.611
1252
+ Now also with maybe you can see here is that
1253
+ if the source encoder and target encoder outputs are
1254
+
1255
+ 0:29:08.611 --> 0:29:14.718
1256
+ language independent, we can always shuffle
1257
+ the connections and the translations.
1258
+
1259
+ 0:29:14.647 --> 0:29:21.252
1260
+ That's why it was important to find a way
1261
+ to generate language independent representations.
1262
+
1263
+ 0:29:21.441 --> 0:29:26.476
1264
+ And the way we try to force this language
1265
+ independence is the GAN step.
1266
+
1267
+ 0:29:27.627 --> 0:29:34.851
1268
+ So the third step kind of combines all of
1269
+ them is where we try to use a GAN to make the
1270
+
1271
+ 0:29:34.851 --> 0:29:37.959
1272
+ encoder output language independent.
1273
+
1274
+ 0:29:37.875 --> 0:29:42.826
1275
+ So here it's the same picture but from a different
1276
+ paper.
1277
+
1278
+ 0:29:42.741 --> 0:29:43.196
1279
+ So.
1280
+
1281
+ 0:29:43.343 --> 0:29:48.888
1282
+ We have X source, X target which is monolingual
1283
+ in data.
1284
+
1285
+ 0:29:48.796 --> 0:29:50.189
1286
+ We add noise.
1287
+
1288
+ 0:29:50.690 --> 0:29:54.736
1289
+ Then we encode it using the source and the
1290
+ target encoders right.
1291
+
1292
+ 0:29:54.675 --> 0:29:58.247
1293
+ Then we get the latent space Z source and
1294
+ Z target right.
1295
+
1296
+ 0:29:58.185 --> 0:30:03.451
1297
+ Then we decode and try to reconstruct the
1298
+ original one and this is the auto encoding
1299
+
1300
+ 0:30:03.451 --> 0:30:08.470
1301
+ loss which takes the X source which is the
1302
+ original one and then the translated.
1303
+
1304
+ 0:30:08.468 --> 0:30:09.834
1305
+ Predicted output.
1306
+
1307
+ 0:30:09.758 --> 0:30:16.699
1308
+ So here, it always is the auto encoding step
1309
+ where the GAN comes in is in between the
1310
+
1311
+ 0:30:16.699 --> 0:30:24.097
1312
+ encoder outputs, and here we have a discriminator
1313
+ which tries to predict which language the latent
1314
+
1315
+ 0:30:24.097 --> 0:30:25.241
1316
+ space is from.
1317
+
1318
+ 0:30:26.466 --> 0:30:33.782
1319
+ So given Z source it has to predict that the
1320
+ representation is from a language source and
1321
+
1322
+ 0:30:33.782 --> 0:30:39.961
1323
+ given Z target it has to predict the representation
1324
+ from a language target.
1325
+
1326
+ 0:30:40.520 --> 0:30:45.135
1327
+ And our headquarters are kind of teaching
1328
+ data right now, and then we have a separate
1329
+
1330
+ 0:30:45.135 --> 0:30:49.803
1331
+ network discriminator which tries to predict
1332
+ which language the latent spaces are from.
1333
+
1334
+ 0:30:53.393 --> 0:30:57.611
1335
+ And then this one is when we combine GANs
1336
+ with the autoencoding step.
1337
+
1338
+ 0:30:57.552 --> 0:31:02.765
1339
+ Then we had an on the fly back translation
1340
+ step right, and so here what we're trying to
1341
+
1342
+ 0:31:02.765 --> 0:31:03.002
1343
+ do.
1344
+
1345
+ 0:31:03.863 --> 0:31:07.260
1346
+ Is the same, basically just exactly the same.
1347
+
1348
+ 0:31:07.186 --> 0:31:12.947
1349
+ But when we are doing the training, we are
1350
+ at the adversarial laws here, so.
1351
+
1352
+ 0:31:13.893 --> 0:31:20.762
1353
+ We take our X source, generate an intermediate
1354
+ translation, so Y target and Y source right?
1355
+
1356
+ 0:31:20.690 --> 0:31:27.309
1357
+ This is the previous time step, and then we
1358
+ have to encode the new sentences and basically
1359
+
1360
+ 0:31:27.309 --> 0:31:32.765
1361
+ make them language independent or train to
1362
+ make them language independent.
1363
+
1364
+ 0:31:33.974 --> 0:31:43.502
1365
+ And then the hope is that now if we do this
1366
+ using monolingual data alone we can just switch
1367
+
1368
+ 0:31:43.502 --> 0:31:47.852
1369
+ connections and then get our translation.
1370
+
1371
+ 0:31:47.748 --> 0:31:49.619
1372
+ So the scale of.
1373
+
1374
+ 0:31:54.574 --> 0:32:03.749
1375
+ And so as I said before, GANs are quite good
1376
+ for vision right, so this is kind of like the
1377
+
1378
+ 0:32:03.749 --> 0:32:11.312
1379
+ CycleGAN approach that you might have seen
1380
+ in any computer vision course.
1381
+
1382
+ 0:32:11.911 --> 0:32:19.055
1383
+ Somehow protect that place at least not as
1384
+ promising as for merchants, and so people.
1385
+
1386
+ 0:32:18.972 --> 0:32:23.708
1387
+ What they did is to enforce this language
1388
+ independence.
1389
+
1390
+ 0:32:25.045 --> 0:32:31.226
1391
+ They try to use a shared encoder instead of
1392
+ having these different encoders right, and
1393
+
1394
+ 0:32:31.226 --> 0:32:37.835
1395
+ so this is basically the same training objectives
1396
+ as before, but what you're going to do now
1397
+
1398
+ 0:32:37.835 --> 0:32:43.874
1399
+ is learn cross language language and then use
1400
+ the single encoder for both languages.
1401
+
1402
+ 0:32:44.104 --> 0:32:49.795
1403
+ And this kind also forces them to be in the
1404
+ same space, and then you can choose whichever
1405
+
1406
+ 0:32:49.795 --> 0:32:50.934
1407
+ decoder you want.
1408
+
1409
+ 0:32:52.552 --> 0:32:58.047
1410
+ You can use GANs or you can just use a shared
1411
+ encoder and try to build your unsupervised
1412
+
1413
+ 0:32:58.047 --> 0:32:58.779
1414
+ MTT system.
1415
+
1416
+ 0:33:08.488 --> 0:33:09.808
1417
+ These are now the.
1418
+
1419
+ 0:33:09.738 --> 0:33:15.984
1420
+ The enhancements that you can do on top of
1421
+ your unsupervised system is one you can create
1422
+
1423
+ 0:33:15.984 --> 0:33:16.686
1424
+ a shared.
1425
+
1426
+ 0:33:18.098 --> 0:33:22.358
1427
+ On top of the shared encoder you can add
1428
+ your GAN loss or whatever so there's a lot
1429
+
1430
+ 0:33:22.358 --> 0:33:22.550
1431
+ of.
1432
+
1433
+ 0:33:24.164 --> 0:33:28.909
1434
+ Parallel data by word translation: The other
1435
+ thing that is more relevant right now is that
1436
+
1437
+ 0:33:28.909 --> 0:33:33.709
1438
+ you can create parallel data by word to word
1439
+ translation right because you know how to do
1440
+
1441
+ 0:33:33.709 --> 0:33:35.468
1442
+ unsupervised word translation.
1443
+
1444
+ 0:33:36.376 --> 0:33:40.548
1445
+ First step is to create parallel data, assuming
1446
+ that word translations are quite good.
1447
+
1448
+ 0:33:41.361 --> 0:33:47.162
1449
+ And then you train a supervised NMT
1450
+ model on this likely wrong parallel data,
1451
+
1452
+ 0:33:47.162 --> 0:33:50.163
1453
+ but somehow gives you a good starting point.
1454
+
1455
+ 0:33:50.097 --> 0:33:56.072
1456
+ So you build your supervised NMT system
1457
+ on the word translation data, and then you
1458
+
1459
+ 0:33:56.072 --> 0:33:59.967
1460
+ initialize it before you're doing unsupervised
1461
+ NMT.
1462
+
1463
+ 0:34:00.260 --> 0:34:05.810
1464
+ And the hope is that when you're doing the
1465
+ back translation, it's a good starting
1466
+
1467
+ 0:34:05.810 --> 0:34:11.234
1468
+ point, but it's one technique that you can
1469
+ do to improve your unsupervised NMT.
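A sketch of that warm-start step, assuming an induced source-to-target dictionary from the unsupervised word translation above; the names are illustrative:

    def word_by_word_corpus(src_sentences, dictionary):
        # Build rough synthetic parallel data by replacing each source token with
        # its dictionary translation; unknown tokens are copied through unchanged.
        pairs = []
        for sentence in src_sentences:
            translated = [dictionary.get(tok, tok) for tok in sentence.split()]
            pairs.append((sentence, " ".join(translated)))
        return pairs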
1470
+
1471
+ 0:34:17.097 --> 0:34:23.697
1472
+ Back translation technique: In the previous
1473
+ case we had: The way we know when to stop was
1474
+
1475
+ 0:34:23.697 --> 0:34:26.547
1476
+ to see convergence on the GAN training.
1477
+
1478
+ 0:34:26.472 --> 0:34:28.838
1479
+ Actually, all we want to know is when W
1480
+
1481
+ 0:34:28.838 --> 0:34:32.053
1482
+ converges, which is quite easy to know when
1483
+ to stop.
1484
+
1485
+ 0:34:31.993 --> 0:34:37.486
1486
+ But in a realistic case, we don't have any
1487
+ parallel data right, so there's no validation.
1488
+
1489
+ 0:34:37.425 --> 0:34:42.003
1490
+ Or I mean, we might have test data in the
1491
+ end, but there's no validation.
1492
+
1493
+ 0:34:43.703 --> 0:34:48.826
1494
+ How will we tune our hyper parameters in this
1495
+ case because it's not really there's nothing
1496
+
1497
+ 0:34:48.826 --> 0:34:49.445
1498
+ for us to?
1499
+
1500
+ 0:34:50.130 --> 0:34:53.326
1501
+ Or the gold data in a sense like so.
1502
+
1503
+ 0:34:53.239 --> 0:35:01.188
1504
+ How do you think we can evaluate such systems
1505
+ or how can we tune hyper parameters in this?
1506
+
1507
+ 0:35:11.711 --> 0:35:17.089
1508
+ So what you're going to do is use the back
1509
+ translation technique.
1510
+
1511
+ 0:35:17.007 --> 0:35:24.299
1512
+ It's like a common technique where you have
1513
+ nothing okay that is to use back translation
1514
+
1515
+ 0:35:24.299 --> 0:35:26.921
1516
+ somehow and what you can do is.
1517
+
1518
+ 0:35:26.839 --> 0:35:31.674
1519
+ The main idea is validate on how good the
1520
+ reconstruction.
1521
+
1522
+ 0:35:32.152 --> 0:35:37.534
1523
+ So the idea is that if you have a good system
1524
+ then the intermediate translation is quite
1525
+
1526
+ 0:35:37.534 --> 0:35:39.287
1527
+ good and going back is easy.
1528
+
1529
+ 0:35:39.227 --> 0:35:44.651
1530
+ But if it's just noise that you generate in
1531
+ the forward step then it's really hard to go
1532
+
1533
+ 0:35:44.651 --> 0:35:46.967
1534
+ back, which is kind of the main idea.
1535
+
1536
+ 0:35:48.148 --> 0:35:53.706
1537
+ So the way it works is that we take a source
1538
+ sentence, we generate a translation in target
1539
+
1540
+ 0:35:53.706 --> 0:35:59.082
1541
+ language right, and then again can state the
1542
+ generated sentence and compare it with the
1543
+
1544
+ 0:35:59.082 --> 0:36:01.342
1545
+ original one, and if they're closer.
1546
+
1547
+ 0:36:01.841 --> 0:36:09.745
1548
+ It means that we have a good system, and if
1549
+ they are far this is kind of like an unsupervised
1550
+
1551
+ 0:36:09.745 --> 0:36:10.334
1552
+ grade.
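A sketch of that round-trip criterion; src2tgt and tgt2src are hypothetical translate() wrappers around the two directions, and sacrebleu is one possible way to score the reconstruction:

    import sacrebleu

    def round_trip_score(src_sentences, src2tgt, tgt2src):
        # Translate source -> target -> back to source and measure how well
        # the round trip reconstructs the original input (higher is better).
        forward = [src2tgt.translate(s) for s in src_sentences]
        back = [tgt2src.translate(t) for t in forward]
        return sacrebleu.corpus_bleu(back, [src_sentences]).score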
1553
+
1554
+ 0:36:17.397 --> 0:36:21.863
1555
+ As far as the amount of data that you need.
1556
+
1557
+ 0:36:23.083 --> 0:36:27.995
1558
+ This was like the first initial resistance
1559
+ on on these systems is that you had.
1560
+
1561
+ 0:36:27.933 --> 0:36:32.067
1562
+ They wanted to do English and French and they
1563
+ had fifteen million.
1564
+
1565
+ 0:36:32.005 --> 0:36:37.972
1566
+ There were fifteen million monolingual sentences
1567
+ so it's quite a lot and they were able to get
1568
+
1569
+ 0:36:37.972 --> 0:36:40.582
1570
+ thirty-two BLEU on these kinds of setups.
1571
+
1572
+ 0:36:41.721 --> 0:36:47.580
1573
+ But unsurprisingly if you have zero point
1574
+ one million parallel sentences you get the same
1575
+
1576
+ 0:36:47.580 --> 0:36:48.455
1577
+ performance.
1578
+
1579
+ 0:36:48.748 --> 0:36:50.357
1580
+ So it's a lot of training.
1581
+
1582
+ 0:36:50.298 --> 0:36:55.924
1583
+ It's a lot of monolingual data, but monolingual
1584
+ data is relatively easy to obtain is the fact
1585
+
1586
+ 0:36:55.924 --> 0:37:01.251
1587
+ that the training is also quite longer than
1588
+ the supervised system, but it's unsupervised
1589
+
1590
+ 0:37:01.251 --> 0:37:04.304
1591
+ so it's kind of the trade off that you are
1592
+ making.
1593
+
1594
+ 0:37:07.367 --> 0:37:13.101
1595
+ The other thing to note is that it's English
1596
+ and French, which is very close to our exemptions.
1597
+
1598
+ 0:37:13.041 --> 0:37:18.238
1599
+ Also, the monolingual data that they took
1600
+ are kind of from similar domains and so on.
1601
+
1602
+ 0:37:18.638 --> 0:37:27.564
1603
+ So that's why they're able to build such a
1604
+ good system, but you'll see later that it fails.
1605
+
1606
+ 0:37:36.256 --> 0:37:46.888
1607
+ Voice, and so mean what people usually do
1608
+ is first build a system right using whatever
1609
+
1610
+ 0:37:46.888 --> 0:37:48.110
1611
+ parallel.
1612
+
1613
+ 0:37:48.608 --> 0:37:56.549
1614
+ Then they use monolingual data and do back
1615
+ translation, so this is always being the standard
1616
+
1617
+ 0:37:56.549 --> 0:38:04.148
1618
+ way to to improve, and what people have seen
1619
+ is that: You don't even need zero point one
1620
+
1621
+ 0:38:04.148 --> 0:38:05.429
1622
+ million right.
1623
+
1624
+ 0:38:05.344 --> 0:38:10.701
1625
+ You just need like ten thousand or so on and
1626
+ then you do the monolingual back translation
1627
+
1628
+ 0:38:10.701 --> 0:38:12.173
1629
+ and you're still better.
1630
+
1631
+ 0:38:12.114 --> 0:38:13.295
1632
+ The answer is why.
1633
+
1634
+ 0:38:13.833 --> 0:38:19.534
1635
+ The question is it's really worth trying to
1636
+ to do this or maybe it's always better to find
1637
+
1638
+ 0:38:19.534 --> 0:38:20.787
1639
+ some parallel data.
1640
+
1641
+ 0:38:20.725 --> 0:38:26.076
1642
+ I'll expand a bit of money on getting few
1643
+ parallel data and then use it to start and
1644
+
1645
+ 0:38:26.076 --> 0:38:27.776
1646
+ find to build your system.
1647
+
1648
+ 0:38:27.713 --> 0:38:33.757
1649
+ So it was kind of the understanding that bilingual
1650
+ unsupervised systems are not really that useful.
1651
+
1652
+ 0:38:50.710 --> 0:38:54.347
1653
+ The thing is that with unlabeled data.
1654
+
1655
+ 0:38:57.297 --> 0:39:05.488
1656
+ Not in an obtaining signal, so when we are
1657
+ starting basically what we want to do is first
1658
+
1659
+ 0:39:05.488 --> 0:39:13.224
1660
+ get a good translation system and then use
1661
+ an unlabeled monolingual data to improve.
1662
+
1663
+ 0:39:13.613 --> 0:39:15.015
1664
+ But if you start from U.
1665
+
1666
+ 0:39:15.015 --> 0:39:15.183
1667
+ N.
1668
+
1669
+ 0:39:15.183 --> 0:39:20.396
1670
+ Empty our model might be really bad like it
1671
+ would be somewhere translating completely wrong.
1672
+
1673
+ 0:39:20.760 --> 0:39:26.721
1674
+ And then when you find your unlabeled data,
1675
+ it basically might be harming, or maybe the
1676
+
1677
+ 0:39:26.721 --> 0:39:28.685
1678
+ same as supervised applause.
1679
+
1680
+ 0:39:28.617 --> 0:39:35.323
1681
+ So the thing is, I hope, by fine tuning on
1682
+ labeled data as first is to get a good initialization.
1683
+
1684
+ 0:39:35.835 --> 0:39:38.404
1685
+ And then use the unsupervised techniques to
1686
+ get better.
1687
+
1688
+ 0:39:38.818 --> 0:39:42.385
1689
+ But if your starting point is really bad then
1690
+ it's not.
1691
+
1692
+ 0:39:45.185 --> 0:39:47.324
1693
+ Year so as we said before.
1694
+
1695
+ 0:39:47.245 --> 0:39:52.451
1696
+ This is kind of like the self supervised training
1697
+ usually works.
1698
+
1699
+ 0:39:52.371 --> 0:39:54.777
1700
+ First we have parallel data.
1701
+
1702
+ 0:39:56.456 --> 0:39:58.062
1703
+ Source language is X.
1704
+
1705
+ 0:39:57.989 --> 0:39:59.604
1706
+ Target language is Y.
1707
+
1708
+ 0:39:59.531 --> 0:40:05.961
1709
+ In the end we want a system that does X to
1710
+ Y, not Y to X, but first we want to train a
1711
+
1712
+ 0:40:05.961 --> 0:40:10.544
1713
+ backward model as it is Y to X, so target language
1714
+ to source.
1715
+
1716
+ 0:40:11.691 --> 0:40:17.353
1717
+ Then we take our monolingual target
1718
+ sentences, use our backward model to generate
1719
+
1720
+ 0:40:17.353 --> 0:40:21.471
1721
+ synthetic source, and then we join them with
1722
+ our original data.
1723
+
1724
+ 0:40:21.406 --> 0:40:27.568
1725
+ So now we have this noisy input, but always
1726
+ the gold output, which is kind of really important
1727
+
1728
+ 0:40:27.568 --> 0:40:29.514
1729
+ when you're doing back translation.
1730
+
1731
+ 0:40:30.410 --> 0:40:36.992
1732
+ And then you can concatenate these data sets
1733
+ and then you can train your X to Y translation
1734
+
1735
+ 0:40:36.992 --> 0:40:44.159
1736
+ system and then you can always do this in multiple
1737
+ steps and usually three, four steps which kind
1738
+
1739
+ 0:40:44.159 --> 0:40:48.401
1740
+ of improves always and then finally get your
1741
+ best system.
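The loop described above, sketched with hypothetical train/translate helpers passed in as arguments (not a specific toolkit API):

    def iterative_back_translation(parallel, mono_tgt, train, rounds=3):
        # 1) backward model (target -> source) from the small gold parallel set
        backward = train(pairs=[(y, x) for x, y in parallel])
        forward = None
        for _ in range(rounds):
            # 2) synthetic source sentences for the gold monolingual targets
            synthetic = [(backward.translate(y), y) for y in mono_tgt]
            # 3) forward model (source -> target) on gold + synthetic data;
            #    the backward model can be refreshed the same way each round
            forward = train(pairs=list(parallel) + synthetic)
        return forward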
1742
+
1743
+ 0:40:49.029 --> 0:40:54.844
1744
+ The point that I'm trying to make is that
1745
+ although answers and MPs the scores that I've
1746
+
1747
+ 0:40:54.844 --> 0:41:00.659
1748
+ shown before were quite good, you probably
1749
+ can get the same performance with with fifty
1750
+
1751
+ 0:41:00.659 --> 0:41:06.474
1752
+ thousand sentences, and also the languages
1753
+ that they've shown are quite similar and the
1754
+
1755
+ 0:41:06.474 --> 0:41:08.654
1756
+ texts were from the same domain.
1757
+
1758
+ 0:41:14.354 --> 0:41:21.494
1759
+ So any questions on u n m t ok yeah.
1760
+
1761
+ 0:41:22.322 --> 0:41:28.714
1762
+ Multilinguality: So after this fact that this approach
1763
+ was already better than unsupervised NMT, what people
1764
+
1765
+ 0:41:28.714 --> 0:41:34.655
1766
+ have tried is to use this idea of multilinguality
1767
+ as you have seen in the previous lecture.
1768
+
1769
+ 0:41:34.590 --> 0:41:41.029
1770
+ The question is how can we do this knowledge
1771
+ transfer from high resource language to lower
1772
+
1773
+ 0:41:41.029 --> 0:41:42.232
1774
+ source language?
1775
+
1776
+ 0:41:44.484 --> 0:41:51.074
1777
+ One way to promote this language independent
1778
+ representations is to share the encoder and
1779
+
1780
+ 0:41:51.074 --> 0:41:57.960
1781
+ decoder for all languages, all their available
1782
+ languages, and that kind of hopefully enables
1783
+
1784
+ 0:41:57.960 --> 0:42:00.034
1785
+ the the knowledge transfer.
1786
+
1787
+ 0:42:03.323 --> 0:42:08.605
1788
+ When we're doing multilinguality, the two
1789
+ questions we need to to think of is how does
1790
+
1791
+ 0:42:08.605 --> 0:42:09.698
1792
+ the encoder know?
1793
+
1794
+ 0:42:09.637 --> 0:42:14.495
1795
+ How does the encoder encoder know which language
1796
+ that we're dealing with that?
1797
+
1798
+ 0:42:15.635 --> 0:42:20.715
1799
+ You already might have known the answer also,
1800
+ and the second question is how can we promote
1801
+
1802
+ 0:42:20.715 --> 0:42:24.139
1803
+ the encoder to generate language independent
1804
+ representations?
1805
+
1806
+ 0:42:25.045 --> 0:42:32.580
1807
+ By solving these two problems we can take
1808
+ help of high resource languages to do unsupervised
1809
+
1810
+ 0:42:32.580 --> 0:42:33.714
1811
+ translations.
1812
+
1813
+ 0:42:34.134 --> 0:42:40.997
1814
+ Typical example would be you want to do unsupervised MT
1815
+ between English and Dutch right, but you have
1816
+
1817
+ 0:42:40.997 --> 0:42:47.369
1818
+ parallel data between English and German, so
1819
+ the question is can we use this parallel data
1820
+
1821
+ 0:42:47.369 --> 0:42:51.501
1822
+ to help building an unsupervised system between English
1823
+ and Dutch?
1824
+
1825
+ 0:42:56.296 --> 0:43:01.240
1826
+ For the first one we try to take help of language
1827
+ embeddings for tokens, and this kind of is
1828
+
1829
+ 0:43:01.240 --> 0:43:05.758
1830
+ a straightforward way to know to tell them
1831
+ well which language they're dealing with.
1832
+
1833
+ 0:43:06.466 --> 0:43:11.993
1834
+ And for the second one we're going to look
1835
+ at some pre training objectives which are also
1836
+
1837
+ 0:43:11.993 --> 0:43:17.703
1838
+ kind of unsupervised so we need monolingual
1839
+ data mostly and this kind of helps us to promote
1840
+
1841
+ 0:43:17.703 --> 0:43:20.221
1842
+ the language independent representation.
1843
+
1844
+ 0:43:23.463 --> 0:43:29.954
1845
+ So the first pretraining model that we'll
1846
+ look at is XLM, which is quite famous if
1847
+
1848
+ 0:43:29.954 --> 0:43:32.168
1849
+ you haven't heard of it yet.
1850
+
1851
+ 0:43:32.552 --> 0:43:40.292
1852
+ And: The way it works is that it's basically
1853
+ a transformer encoder right, so it's like the
1854
+
1855
+ 0:43:40.292 --> 0:43:42.419
1856
+ just the encoder module.
1857
+
1858
+ 0:43:42.334 --> 0:43:44.499
1859
+ No, there's no decoder here.
1860
+
1861
+ 0:43:44.884 --> 0:43:51.481
1862
+ And what we're trying to do is mask out tokens
1863
+ in a sequence and try to predict these masked
1864
+
1865
+ 0:43:51.481 --> 0:43:52.061
1866
+ tokens.
1867
+
1868
+ 0:43:51.988 --> 0:43:55.469
1869
+ So this is typically called masked language modeling.
1870
+
1871
+ 0:43:55.996 --> 0:44:05.419
1872
+ Typical language modeling that you see is
1873
+ the standard language modeling where you predict
1874
+
1875
+ 0:44:05.419 --> 0:44:08.278
1876
+ the next token in English.
1877
+
1878
+ 0:44:08.172 --> 0:44:11.140
1879
+ Then we have the position embeddings.
1880
+
1881
+ 0:44:11.871 --> 0:44:18.774
1882
+ Then we have the token embeddings, and then
1883
+ here we have the mask token, and then we have
1884
+
1885
+ 0:44:18.774 --> 0:44:22.378
1886
+ the transformer encoder blocks to predict the masked tokens.
1887
+
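A small illustrative sketch of the masked-language-modelling objective just described; the mask rate and token strings are assumptions for the example.

```python
import random

# Mask a fraction of tokens and keep the originals as prediction targets.
def mask_tokens(tokens, mask_rate=0.15, mask_symbol="[MASK]"):
    corrupted, targets = [], []
    for tok in tokens:
        if random.random() < mask_rate:
            corrupted.append(mask_symbol)
            targets.append(tok)        # the model must predict this token
        else:
            corrupted.append(tok)
            targets.append(None)       # no loss on unmasked positions
    return corrupted, targets

print(mask_tokens("how are you doing today".split()))
```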
1888
+ 0:44:24.344 --> 0:44:30.552
1889
+ We do this for all languages using the same
1890
+ transformer encoder, and this kind of helps
1891
+
1892
+ 0:44:30.552 --> 0:44:36.760
1893
+ us to push the sentence embeddings, or
1894
+ the output of the encoder, into a common space
1895
+
1896
+ 0:44:36.760 --> 0:44:37.726
1897
+ for multiple languages.
1898
+
1899
+ 0:44:42.782 --> 0:44:49.294
1900
+ So first we train an MLM on both the
1901
+ source and target language sides, and then
1902
+
1903
+ 0:44:49.294 --> 0:44:54.928
1904
+ we use it as a starting point for the encoder
1905
+ and decoder of a UNMT system.
1906
+
1907
+ 0:44:55.475 --> 0:45:03.034
1908
+ So we take the monolingual data, build a masked
1909
+ language model on both source and target languages,
1910
+
1911
+ 0:45:03.034 --> 0:45:07.129
1912
+ and then reuse it to initialize that in
1913
+ the U.
1914
+
1915
+ 0:45:07.129 --> 0:45:07.365
1916
+ N.
1917
+
1918
+ 0:45:07.365 --> 0:45:07.601
1919
+ M.
1920
+
1921
+ 0:45:07.601 --> 0:45:07.837
1922
+ T.
1923
+
1924
+ 0:45:07.837 --> 0:45:14.688
1925
+ Here we look at two languages, but you can
1926
+ also do it with one hundred languages once.
1927
+
1928
+ 0:45:14.609 --> 0:45:20.174
1929
+ So there are pretrained checkpoints that you can
1930
+ use, which have seen quite
1931
+
1932
+ 0:45:20.174 --> 0:45:21.662
1933
+ a lot of data, and you can use
1934
+
1935
+ 0:45:21.597 --> 0:45:24.412
1936
+ them always as a starting point for your U.
1937
+
1938
+ 0:45:24.412 --> 0:45:24.608
1939
+ N.
1940
+
1941
+ 0:45:24.608 --> 0:45:27.292
1942
+ MT system, which in practice works well.
1943
+
1944
+ 0:45:31.491 --> 0:45:36.759
1945
+ One detail is that since this is an encoder
1946
+ block only, and your U.
1947
+
1948
+ 0:45:36.759 --> 0:45:36.988
1949
+ N.
1950
+
1951
+ 0:45:36.988 --> 0:45:37.217
1952
+ M.
1953
+
1954
+ 0:45:37.217 --> 0:45:37.446
1955
+ T.
1956
+
1957
+ 0:45:37.446 --> 0:45:40.347
1958
+ system is encoder-decoder, right.
1959
+
1960
+ 0:45:40.271 --> 0:45:47.517
1961
+ So there's this cross attention that's missing,
1962
+ but you can always initialize that randomly.
1963
+
1964
+ 0:45:47.440 --> 0:45:48.373
1965
+ It's fine.
1966
+
1967
+ 0:45:48.508 --> 0:45:53.077
1968
+ Not everything is initialized, but it's still
1969
+ decent.
1970
+
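A rough sketch of this kind of initialisation, with assumed module names rather than the original codebase: tensors whose names and shapes match are taken over from the pretrained MLM, and whatever has no counterpart, such as the decoder's cross-attention, keeps its random initialisation.

```python
import torch.nn as nn

def init_from_mlm(seq2seq: nn.Module, mlm: nn.Module) -> int:
    target_shapes = {k: v.shape for k, v in seq2seq.state_dict().items()}
    matching = {k: v for k, v in mlm.state_dict().items()
                if k in target_shapes and target_shapes[k] == v.shape}
    seq2seq.load_state_dict(matching, strict=False)  # the rest stays random
    return len(matching)  # number of tensors taken over from the MLM
```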
1971
+ 0:45:56.056 --> 0:46:02.141
1972
+ Then the other one is mBART,
1973
+ and here you see that this kind of builds on
1974
+
1975
+ 0:46:02.141 --> 0:46:07.597
1976
+ the unsupervised training objective, which
1977
+ is denoising auto-encoding.
1978
+
1979
+ 0:46:08.128 --> 0:46:14.337
1980
+ So what they do is they say that we don't
1981
+ even need to do the back-translation during
1982
+
1983
+ 0:46:14.337 --> 0:46:17.406
1984
+ pre-training, but you can do it later.
1985
+
1986
+ 0:46:17.335 --> 0:46:24.954
1987
+ We just do denoising auto-encoding
1988
+ on all the different languages, and that also gives
1989
+
1990
+ 0:46:24.954 --> 0:46:32.651
1991
+ you out-of-the-box good performance, so what
1992
+ we basically have here is the transformer encoder.
1993
+
1994
+ 0:46:34.334 --> 0:46:37.726
1995
+ You are trying to generate a reconstructed
1996
+ sequence.
1997
+
1998
+ 0:46:37.662 --> 0:46:38.946
1999
+ You need a decoder.
2000
+
2001
+ 0:46:39.899 --> 0:46:42.022
2002
+ So we gave an input sentence.
2003
+
2004
+ 0:46:41.952 --> 0:46:48.138
2005
+ We try to predict the masked tokens,
2006
+ or rather we try to reconstruct the original
2007
+
2008
+ 0:46:48.138 --> 0:46:52.475
2009
+ sentence from the input segments, which was
2010
+ corrupted right.
2011
+
2012
+ 0:46:52.404 --> 0:46:57.169
2013
+ So this is the same denoising objective that
2014
+ you have seen before.
2015
+
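An illustrative noising function in the spirit of this denoising pre-training (span masking plus sentence permutation); the exact corruption scheme used in the papers differs in details, this is only a sketch for intuition.

```python
import random

def corrupt(sentences, mask_symbol="[MASK]", mask_rate=0.3):
    noisy = []
    for sent in sentences:
        tokens = sent.split()
        # replace a random contiguous span by a single mask symbol
        span = max(1, int(len(tokens) * mask_rate))
        start = random.randrange(0, max(1, len(tokens) - span + 1))
        tokens[start:start + span] = [mask_symbol]
        noisy.append(" ".join(tokens))
    random.shuffle(noisy)          # permute sentence order
    return noisy                   # the model is trained to reconstruct the input

doc = ["the cat sat on the mat", "it was a sunny day"]
print(corrupt(doc), "->", doc)
```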
2016
+ 0:46:58.418 --> 0:46:59.737
2017
+ This is for English.
2018
+
2019
+ 0:46:59.674 --> 0:47:04.156
2020
+ I think this is for Japanese and then once
2021
+ we do it for all languages.
2022
+
2023
+ 0:47:04.093 --> 0:47:09.567
2024
+ I mean they have these variants with twenty
2025
+ five, fifty languages or so on, and then you can fine
2026
+
2027
+ 0:47:09.567 --> 0:47:11.795
2028
+ tune on your sentence and document level data.
2029
+
2030
+ 0:47:13.073 --> 0:47:20.454
2031
+ And so they use this for the supervised
2032
+ techniques, but you can also use this as initialization
2033
+
2034
+ 0:47:20.454 --> 0:47:25.058
2035
+ for unsupervised systems built on that, which also
2036
+ works in practice.
2037
+
2038
+ 0:47:30.790 --> 0:47:36.136
2039
+ Then we have these, so still now we kind of
2040
+ didn't see these systems benefit from the
2041
+
2042
+ 0:47:36.136 --> 0:47:38.840
2043
+ high resource language right, so as I said.
2044
+
2045
+ 0:47:38.878 --> 0:47:44.994
2046
+ You can use English-German as something for English
2047
+ to Dutch, and if you want English to Catalan, you
2048
+
2049
+ 0:47:44.994 --> 0:47:46.751
2050
+ can use English to French.
2051
+
2052
+ 0:47:48.408 --> 0:47:55.866
2053
+ One typical way to do this is to use pivot
2054
+ translation, right, where you take the.
2055
+
2056
+ 0:47:55.795 --> 0:48:01.114
2057
+ So here it's Finnish to Greek, so you translate,
2058
+ say, first from Finnish to English, then English
2059
+
2060
+ 0:48:01.114 --> 0:48:03.743
2061
+ to Greek, and then you get the translation.
2062
+
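A tiny illustration of pivoting through a high-resource language; the two translate callables are placeholders for whatever MT systems are available, not a real API.

```python
def pivot_translate(text, translate_src_to_pivot, translate_pivot_to_tgt):
    pivot_text = translate_src_to_pivot(text)     # e.g. Finnish -> English
    return translate_pivot_to_tgt(pivot_text)     # e.g. English -> Greek

# toy usage with dummy systems
fi_en = lambda s: f"<en translation of: {s}>"
en_el = lambda s: f"<el translation of: {s}>"
print(pivot_translate("hyvää huomenta", fi_en, en_el))
```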
2063
+ 0:48:04.344 --> 0:48:10.094
2064
+ What's important is that you have these different
2065
+ techniques and you can always think of which
2066
+
2067
+ 0:48:10.094 --> 0:48:12.333
2068
+ one to use given the data situation.
2069
+
2070
+ 0:48:12.273 --> 0:48:18.007
2071
+ So if it was like Finnish to Greek maybe
2072
+ pivoting is better because you might get good Finnish
2073
+
2074
+ 0:48:18.007 --> 0:48:20.020
2075
+ to English and English to Greek.
2076
+
2077
+ 0:48:20.860 --> 0:48:23.255
2078
+ Sometimes it also depends on the language
2079
+ pair.
2080
+
2081
+ 0:48:23.205 --> 0:48:27.577
2082
+ There might be some information loss and so
2083
+ on, so there are quite a few variables you
2084
+
2085
+ 0:48:27.577 --> 0:48:30.040
2086
+ need to think of and decide which system to
2087
+ use.
2088
+
2089
+ 0:48:32.752 --> 0:48:39.654
2090
+ Then there's a zero shot, which probably also
2091
+ you've seen in the multilingual lecture, and how
2092
+
2093
+ 0:48:39.654 --> 0:48:45.505
2094
+ if you can improve the language independence
2095
+ then your zero shot gets better.
2096
+
2097
+ 0:48:45.430 --> 0:48:52.107
2098
+ So maybe if you use the multilingual models
2099
+ and do zero shot directly, it's quite good.
2100
+
2101
+ 0:48:53.093 --> 0:48:58.524
2102
+ So we have zero shot and pivot, and then
2103
+ we have the unsupervised translation where
2104
+
2105
+ 0:48:58.524 --> 0:49:00.059
2106
+ we can translate between languages
2107
+
2108
+ 0:49:00.600 --> 0:49:02.762
2109
+ even when there is no parallel data.
2110
+
2111
+ 0:49:06.686 --> 0:49:07.565
2112
+ Is to solve.
2113
+
2114
+ 0:49:07.497 --> 0:49:11.960
2115
+ So sometimes what we have seen so far is that
2116
+ we basically have.
2117
+
2118
+ 0:49:15.255 --> 0:49:16.754
2119
+ To do from looking at it.
2120
+
2121
+ 0:49:16.836 --> 0:49:19.307
2122
+ These two files alone you can create a dictionary.
2123
+
2124
+ 0:49:19.699 --> 0:49:26.773
2125
+ You can build an unsupervised MT system, not
2126
+ always, but if the domains are similar and the
2127
+
2128
+ 0:49:26.773 --> 0:49:28.895
2129
+ languages are similar.
2130
+
2131
+ 0:49:28.816 --> 0:49:36.279
2132
+ But if there are distant languages, then the
2133
+ unsupervised technique doesn't usually work really
2134
+
2135
+ 0:49:36.279 --> 0:49:36.756
2136
+ well.
2137
+
2138
+ 0:49:37.617 --> 0:49:40.297
2139
+ What would be better
2140
+
2141
+ 0:49:40.720 --> 0:49:46.338
2142
+ is that if you can get some parallel
2143
+ data from somewhere or do bitext mining that
2144
+
2145
+ 0:49:46.338 --> 0:49:51.892
2146
+ we have seen in the LASER practical,
2147
+ then you can use that to initialize your
2148
+
2149
+ 0:49:51.892 --> 0:49:57.829
2150
+ system and then train, let's say, a semi-supervised
2151
+ MT system, and that would be better than
2152
+
2153
+ 0:49:57.829 --> 0:50:00.063
2154
+ just building an unsupervised one.
2155
+
2156
+ 0:50:00.820 --> 0:50:06.546
2157
+ With that we are at the end.
2158
+
2159
+ 0:50:07.207 --> 0:50:08.797
2160
+ Quickly could be.
2161
+
2162
+ 0:50:16.236 --> 0:50:25.591
2163
+ In common, they can catch the worst because
2164
+ the thing about finding a language is: And
2165
+
2166
+ 0:50:25.591 --> 0:50:35.053
2167
+ there's another joy in playing these games,
2168
+ almost in the middle of a game, and she's a
2169
+
2170
+ 0:50:35.053 --> 0:50:40.107
2171
+ characteristic too, and she is a global waver.
2172
+
2173
+ 0:50:56.916 --> 0:51:03.798
2174
+ Next talk inside and this somehow gives them
2175
+ many abilities, not only translation but other
2176
+
2177
+ 0:51:03.798 --> 0:51:08.062
2178
+ than that there are quite a few things that
2179
+ they can do.
2180
+
2181
+ 0:51:10.590 --> 0:51:17.706
2182
+ But the translation in itself usually doesn't
2183
+ work as well as if you build a
2184
+
2185
+ 0:51:17.706 --> 0:51:20.878
2186
+ specific system for your use case.
2187
+
2188
+ 0:51:22.162 --> 0:51:27.924
2189
+ I would guess that it's usually better than
2190
+ the LLM, but you can always adapt the LLM to
2191
+
2192
+ 0:51:27.924 --> 0:51:31.355
2193
+ the task that you want, and then it could be
2194
+ better.
2195
+
2196
+ 0:51:32.152 --> 0:51:37.849
2197
+ An LLM out of the box might not be the
2198
+ best choice for your task, of course.
2199
+
2200
+ 0:51:37.775 --> 0:51:44.138
2201
+ For me, I'm working on new air translation,
2202
+ so it's more about translating software.
2203
+
2204
+ 0:51:45.065 --> 0:51:50.451
2205
+ And it's quite a niche domain as well,
2206
+ and if you use the LLMs out of the box, they're
2207
+
2208
+ 0:51:50.451 --> 0:51:53.937
2209
+ actually quite bad compared to the systems
2210
+ that we built.
2211
+
2212
+ 0:51:54.414 --> 0:51:56.736
2213
+ But you can do these different techniques
2214
+ like prompting.
2215
+
2216
+ 0:51:57.437 --> 0:52:03.442
2217
+ What people usually do is few-shot prompting,
2218
+ where they give similar translation pairs in
2219
+
2220
+ 0:52:03.442 --> 0:52:08.941
2221
+ the prompt and then ask it to translate and
2222
+ then that kind of improves the performance
2223
+
2224
+ 0:52:08.941 --> 0:52:09.383
2225
+ a lot.
2226
+
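A sketch of the kind of prompt people build for this: retrieve a few similar source/target pairs and prepend them before the sentence to translate. The example pairs and language names are made up for illustration.

```python
def build_prompt(examples, source_sentence, src="English", tgt="Dutch"):
    lines = [f"Translate from {src} to {tgt}."]
    for s, t in examples:
        lines.append(f"{src}: {s}\n{tgt}: {t}")
    lines.append(f"{src}: {source_sentence}\n{tgt}:")
    return "\n\n".join(lines)

examples = [("Good morning.", "Goedemorgen."), ("How are you?", "Hoe gaat het?")]
print(build_prompt(examples, "See you tomorrow."))
```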
2227
+ 0:52:09.320 --> 0:52:15.124
2228
+ So there are different techniques that you
2229
+ can do to adapt your LLMs and then it might
2230
+
2231
+ 0:52:15.124 --> 0:52:16.400
2232
+ be better than the.
2233
+
2234
+ 0:52:16.376 --> 0:52:17.742
2235
+ task-specific system.
2236
+
2237
+ 0:52:18.418 --> 0:52:22.857
2238
+ But if you're looking for niche things, I
2239
+ don't think LLMs are that good.
2240
+
2241
+ 0:52:22.802 --> 0:52:26.268
2242
+ But if you want to do, let's say, unsupervised
2243
+ translation.
2244
+
2245
+ 0:52:26.213 --> 0:52:29.974
2246
+ In this case you can never be sure that they
2247
+ haven't seen the data.
2248
+
2249
+ 0:52:29.918 --> 0:52:35.048
2250
+ First of all, whether they have seen data in
2251
+ that language or not, and if it's on the internet,
2252
+
2253
+ 0:52:35.048 --> 0:52:36.832
2254
+ they probably did see the data.
2255
+
2256
+ 0:52:40.360 --> 0:53:00.276
2257
+ I feel like they have pretty good understanding
2258
+ of each million people.
2259
+
2260
+ 0:53:04.784 --> 0:53:09.059
2261
+ Depends on the language, but I'm pretty surprised
2262
+ that it works on a low-resource language.
2263
+
2264
+ 0:53:09.009 --> 0:53:11.122
2265
+ I would expect it to work on German and.
2266
+
2267
+ 0:53:11.972 --> 0:53:13.633
2268
+ But if you take a low-resource language,
2269
+
2270
+ 0:53:14.474 --> 0:53:20.973
2271
+ Don't think it works, and also there are quite
2272
+ a few papers where they've already showed that
2273
+
2274
+ 0:53:20.973 --> 0:53:27.610
2275
+ if you build a system yourself in the typical
2276
+ way to build a system, it's quite a bit better than
2277
+
2278
+ 0:53:27.610 --> 0:53:29.338
2279
+ the LLM.
2280
+
2281
+ 0:53:29.549 --> 0:53:34.883
2282
+ But you can always do things with LLMs to
2283
+ get better, but then I'm probably.
2284
+
2285
+ 0:53:37.557 --> 0:53:39.539
2286
+ Any more questions?
2287
+
2288
+ 0:53:41.421 --> 0:53:47.461
2289
+ So if not then we're going to end the lecture
2290
+ here and then on Thursday we're going to have
2291
+
2292
+ 0:53:47.461 --> 0:53:51.597
2293
+ document-level MT, which is also run by me, so
2294
+ thanks for coming.
2295
+
demo_data/lectures/Lecture-15-11.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62985057e3dfdb7c34a3ef8e74a9b52e9529b2a974ff62438c617e6d699b5a89
3
+ size 81272567
demo_data/lectures/Lecture-18-18.07.2023/English.vtt ADDED
@@ -0,0 +1,2738 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.541 --> 0:00:06.914
4
+ IntroOkay, so welcome back to today's lecture.
5
+
6
+ 0:00:08.528 --> 0:00:23.334
7
+ We want to talk about is speech translation,
8
+ so we'll have two lectures in this week about
9
+
10
+ 0:00:23.334 --> 0:00:26.589
11
+ speech translation.
12
+
13
+ 0:00:27.087 --> 0:00:36.456
14
+ And so in the last week we'll have some exercise
15
+ and repetition.
16
+
17
+ 0:00:36.312 --> 0:00:46.692
18
+ We want to look at what is now to do when
19
+ we want to translate speech.
20
+
21
+ 0:00:46.946 --> 0:00:55.675
22
+ So we want to address the specific challenges
23
+ that occur when we switch from translating
24
+
25
+ 0:00:55.675 --> 0:00:56.754
26
+ text to translating speech.
27
+
28
+ 0:00:57.697 --> 0:01:13.303
29
+ Today we will look at the more general picture
30
+ out and build the systems.
31
+
32
+ 0:01:13.493 --> 0:01:22.219
33
+ Speech TranslationAnd then secondly an end-to-end
34
+ approach where we are going to put in audio
35
+
36
+ 0:01:22.219 --> 0:01:23.623
37
+ and generate.
38
+
39
+ 0:01:24.224 --> 0:01:41.439
40
+ Which are the main dominant systems which
41
+ are used in research and commercial systems.
42
+
43
+ 0:01:43.523 --> 0:01:56.879
44
+ More general, what is the general task of
45
+ speech translation that is shown here?
46
+
47
+ 0:01:56.714 --> 0:02:01.832
48
+ The idea is we have a speech.
49
+
50
+ 0:02:02.202 --> 0:02:12.838
51
+ Then we want to have a system which takes
52
+ this audio and then translates it into another
53
+
54
+ 0:02:12.838 --> 0:02:14.033
55
+ language.
56
+
57
+ 0:02:15.095 --> 0:02:20.694
58
+ Then it's no longer as clear the output modality.
59
+
60
+ 0:02:20.582 --> 0:02:33.378
61
+ In contrast, for humans we can typically have:
62
+ So you can either have more textual translation,
63
+
64
+ 0:02:33.378 --> 0:02:37.911
65
+ then you have subtitles, and so on.
66
+
67
+ 0:02:38.538 --> 0:02:57.010
68
+ Or you want to have it also as audio, like
69
+ it's done for human interpretation?
70
+
71
+ 0:02:57.417 --> 0:03:03.922
72
+ See there is not the one best solution, so
73
+ all of this one is always better.
74
+
75
+ 0:03:03.837 --> 0:03:09.415
76
+ It heavily depends on what is the use of what
77
+ the people prefer.
78
+
79
+ 0:03:09.929 --> 0:03:14.950
80
+ For example, you can think of if you know
81
+ a bit the source of language, but you're a
82
+
83
+ 0:03:14.950 --> 0:03:17.549
84
+ bit unsure and don't understand everything.
85
+
86
+ 0:03:17.490 --> 0:03:23.138
87
+ Then maybe text output is better for this person because
88
+ you can direct your ear to what was said and
89
+
90
+ 0:03:23.138 --> 0:03:26.706
91
+ only if you're unsure you check down with your
92
+ translation.
93
+
94
+ 0:03:27.727 --> 0:03:33.511
95
+ Are another things that might be preferable
96
+ to have a complete spoken of.
97
+
98
+ 0:03:34.794 --> 0:03:48.727
99
+ So there are both ones for a long time in
100
+ automatic systems focused mainly on text output.
101
+
102
+ 0:03:48.574 --> 0:04:06.741
103
+ In most cases. But of course you can always
104
+ hand the text to text-to-speech systems which generate
105
+
106
+ 0:04:06.741 --> 0:04:09.958
107
+ audio from that.
108
+
109
+ 0:04:12.772 --> 0:04:14.494
110
+ Why should we care about that?
111
+
112
+ 0:04:14.438 --> 0:04:15.773
113
+ Why should we do that?
114
+
115
+ 0:04:17.737 --> 0:04:24.141
116
+ There is the nice thing that yeah, with a
117
+ globalized world, we are able to now interact
118
+
119
+ 0:04:24.141 --> 0:04:25.888
120
+ with a lot more people.
121
+
122
+ 0:04:25.815 --> 0:04:29.206
123
+ You can do some conferences around the world.
124
+
125
+ 0:04:29.132 --> 0:04:31.567
126
+ We can travel around the world.
127
+
128
+ 0:04:31.671 --> 0:04:37.802
129
+ We can by Internet watch movies from all over
130
+ the world and watch TV from all over the world.
131
+
132
+ 0:04:38.618 --> 0:04:47.812
133
+ However, there is still this barrier that
134
+ is mainly to watch videos, either in English
135
+
136
+ 0:04:47.812 --> 0:04:49.715
137
+ or in a language.
138
+
139
+ 0:04:50.250 --> 0:05:00.622
140
+ So what is currently happening in order to
141
+ reach a large audience is that everybody.
142
+
143
+ 0:05:00.820 --> 0:05:07.300
144
+ So if we are going, for example, to a conferences,
145
+ these are international conferences.
146
+
147
+ 0:05:08.368 --> 0:05:22.412
148
+ However, everybody will then speak English
149
+ since that is some of the common language that
150
+
151
+ 0:05:22.412 --> 0:05:26.001
152
+ everybody understands.
153
+
154
+ 0:05:26.686 --> 0:05:32.929
155
+ So on the other hand, we cannot have
156
+ human interpreters everywhere.
157
+
158
+ 0:05:32.892 --> 0:05:37.797
159
+ You have that maybe in the European Parliament
160
+ or in important business meetings.
161
+
162
+ 0:05:38.078 --> 0:05:47.151
163
+ But this is relatively expensive, and so the
164
+ question is, can we enable communication in
165
+
166
+ 0:05:47.151 --> 0:05:53.675
167
+ your mother tongue without having to have human
168
+ interpretation?
169
+
170
+ 0:05:54.134 --> 0:06:04.321
171
+ And there like speech translation can be helpful
172
+ in order to help you bridge this gap.
173
+
174
+ 0:06:06.726 --> 0:06:22.507
175
+ In this case, there are different scenarios
176
+ of how you can apply speech translation.
177
+
178
+ 0:06:22.422 --> 0:06:29.282
179
+ That's typically more interactive than we
180
+ are talking about text translation.
181
+
182
+ 0:06:29.194 --> 0:06:32.802
183
+ Text translation is most commonly used.
184
+
185
+ 0:06:33.153 --> 0:06:41.637
186
+ Of course, nowadays there's things like chat
187
+ and so on where it could also be interactive.
188
+
189
+ 0:06:42.082 --> 0:06:48.299
190
+ In contrast to speech translation, that is
191
+ less static, so there is different ways of
192
+
193
+ 0:06:48.299 --> 0:06:48.660
194
+ how.
195
+
196
+ 0:06:49.149 --> 0:07:00.544
197
+ The one scenario is what is called consecutive translation
198
+ where you first get an input, then you translate
199
+
200
+ 0:07:00.544 --> 0:07:03.799
201
+ this fixed input, and then.
202
+
203
+ 0:07:04.944 --> 0:07:12.823
204
+ With me, which means you have always like
205
+ fixed, yeah fixed challenges which you need
206
+
207
+ 0:07:12.823 --> 0:07:14.105
208
+ to translate.
209
+
210
+ 0:07:14.274 --> 0:07:25.093
211
+ You don't need to like beat your mind what
212
+ are the boundaries where there's an end.
213
+
214
+ 0:07:25.405 --> 0:07:31.023
215
+ Also, there is no overlapping.
216
+
217
+ 0:07:30.842 --> 0:07:42.986
218
+ There is always a one-person sentence that
219
+ is getting translated.
220
+
221
+ 0:07:43.443 --> 0:07:51.181
222
+ Of course, this has a disadvantage that it
223
+ makes the conversation a lot longer because
224
+
225
+ 0:07:51.181 --> 0:07:55.184
226
+ you always have only speech and translation.
227
+
228
+ 0:07:57.077 --> 0:08:03.780
229
+ For example, if you would use that for a presentation
230
+ there would be yeah quite get quite long, if
231
+
232
+ 0:08:03.780 --> 0:08:09.738
233
+ I would just imagine you sitting here in the
234
+ lecture I would say three sentences that I
235
+
236
+ 0:08:09.738 --> 0:08:15.765
237
+ would wait for this interpreter to translate
238
+ it, then I would say the next two sentences
239
+
240
+ 0:08:15.765 --> 0:08:16.103
241
+ and.
242
+
243
+ 0:08:16.676 --> 0:08:28.170
244
+ That is why in these situations, for example,
245
+ if you have a direct conversation with a patient,
246
+
247
+ 0:08:28.170 --> 0:08:28.888
248
+ then.
249
+
250
+ 0:08:29.209 --> 0:08:32.733
251
+ But still there it's too big to be taking
252
+ them very long.
253
+
254
+ 0:08:33.473 --> 0:08:42.335
255
+ And that's why there's also the research on
256
+ simultaneous translation, where the idea is
257
+
258
+ 0:08:42.335 --> 0:08:43.644
259
+ in parallel.
260
+
261
+ 0:08:43.964 --> 0:08:46.179
262
+ That Is the Dining for Human.
263
+
264
+ 0:08:46.126 --> 0:08:52.429
265
+ Interpretation like if you think of things
266
+ like the European Parliament where they of
267
+
268
+ 0:08:52.429 --> 0:08:59.099
269
+ course not only speak always one sentence but
270
+ are just giving their speech and in parallel
271
+
272
+ 0:08:59.099 --> 0:09:04.157
273
+ human interpreters are translating the speech
274
+ into another language.
275
+
276
+ 0:09:04.985 --> 0:09:12.733
277
+ The same thing is interesting for automatic
278
+ speech translation where we in parallel generate
279
+
280
+ 0:09:12.733 --> 0:09:13.817
281
+ translation.
282
+
283
+ 0:09:15.415 --> 0:09:32.271
284
+ The challenges then, of course, are that we
285
+ need to segment our speech into chunks somehow.
286
+
287
+ 0:09:32.152 --> 0:09:34.903
288
+ We just looked for the dots we saw.
289
+
290
+ 0:09:34.827 --> 0:09:38.619
291
+ There are some challenges that we have to
292
+ check.
293
+
294
+ 0:09:38.541 --> 0:09:41.020
295
+ The Doctor may not understand.
296
+
297
+ 0:09:41.201 --> 0:09:47.478
298
+ But in general, getting sentence boundaries
299
+ in text is not really a research question.
300
+
301
+ 0:09:47.647 --> 0:09:51.668
302
+ While in speech translation, this is not that
303
+ easy.
304
+
305
+ 0:09:51.952 --> 0:10:05.908
306
+ Either getting that in the audio is difficult
307
+ because it's not like we typically do breaks
308
+
309
+ 0:10:05.908 --> 0:10:09.742
310
+ when there's a sentence.
311
+
312
+ 0:10:10.150 --> 0:10:17.432
313
+ And even if you then see the transcript and
314
+ would have to add the punctuation, this is
315
+
316
+ 0:10:17.432 --> 0:10:18.101
317
+ not as.
318
+
319
+ 0:10:20.340 --> 0:10:25.942
320
+ Another question is how many speakers we have
321
+ here.
322
+
323
+ 0:10:25.834 --> 0:10:31.761
324
+ In presentations you have more like a single
325
+ speaker.
326
+
327
+ 0:10:31.931 --> 0:10:40.186
328
+ That is normally easier from the part of audio
329
+ processing, so in general in speech translation.
330
+
331
+ 0:10:40.460 --> 0:10:49.308
332
+ You can have different challenges and they
333
+ can be of different components.
334
+
335
+ 0:10:49.190 --> 0:10:56.039
336
+ In addition to translation, you have: And
337
+ if you're not going, for example, the magical
338
+
339
+ 0:10:56.039 --> 0:11:00.398
340
+ speaker, there are significantly additional
341
+ challenges.
342
+
343
+ 0:11:00.720 --> 0:11:10.313
344
+ So we as humans we are very good in filtering
345
+ out noises, or if two people speak in parallel
346
+
347
+ 0:11:10.313 --> 0:11:15.058
348
+ to like separate these two speakers and hear.
349
+
350
+ 0:11:15.495 --> 0:11:28.300
351
+ However, if you want to do that with automatic
352
+ systems that is very challenging so that you
353
+
354
+ 0:11:28.300 --> 0:11:33.172
355
+ can separate the speakers so that.
356
+
357
+ 0:11:33.453 --> 0:11:41.284
358
+ For the more of you have this multi-speaker
359
+ scenario, typically it's also less well prepared.
360
+
361
+ 0:11:41.721 --> 0:11:45.807
362
+ So you're getting very, we'll talk about the
363
+ spontaneous effects.
364
+
365
+ 0:11:46.186 --> 0:11:53.541
366
+ So people like will stop in the middle of
367
+ the sentence, they change their sentence, and
368
+
369
+ 0:11:53.541 --> 0:12:01.481
370
+ so on, and like filtering these disfluencies
371
+ out of the text and working with them is often
372
+
373
+ 0:12:01.481 --> 0:12:02.986
374
+ very challenging.
375
+
376
+ 0:12:05.565 --> 0:12:09.144
377
+ So these are all additional challenges when
378
+ you have multiple speakers.
379
+
380
+ 0:12:10.330 --> 0:12:19.995
381
+ Then there's a question of an online or offline
382
+ system, sometimes textbook station.
383
+
384
+ 0:12:19.880 --> 0:12:21.844
385
+ We also mainly.
386
+
387
+ 0:12:21.962 --> 0:12:36.507
388
+ That means you can take the whole text and
389
+ you can translate it in a batch.
390
+
391
+ 0:12:37.337 --> 0:12:44.344
392
+ However, for speech translation there's also
393
+ several scenarios where this is the case.
394
+
395
+ 0:12:44.264 --> 0:12:51.488
396
+ For example, when you're translating a movie,
397
+ it's not only that you don't have to do it
398
+
399
+ 0:12:51.488 --> 0:12:54.735
400
+ live, but you can take the whole movie.
401
+
402
+ 0:12:55.215 --> 0:13:05.473
403
+ However, there is also a lot of situations
404
+ where you don't have this opportunity like
405
+
406
+ 0:13:05.473 --> 0:13:06.785
407
+ or sports.
408
+
409
+ 0:13:07.247 --> 0:13:13.963
410
+ And you don't want to like first like let
411
+ around a sports event and then like show in
412
+
413
+ 0:13:13.963 --> 0:13:19.117
414
+ the game three hours later then there is not
415
+ really any interest.
416
+
417
+ 0:13:19.399 --> 0:13:31.118
418
+ So you have to do it live, and so we have
419
+ the additional challenge of translating the
420
+
421
+ 0:13:31.118 --> 0:13:32.208
422
+ system.
423
+
424
+ 0:13:32.412 --> 0:13:42.108
425
+ There are still things on the one end of course.
426
+
427
+ 0:13:41.910 --> 0:13:49.632
428
+ It needs to be real time translation.
429
+
430
+ 0:13:49.869 --> 0:13:54.153
431
+ It's taking longer, then you're getting more
432
+ and more and more delayed.
433
+
434
+ 0:13:55.495 --> 0:14:05.245
435
+ So it maybe seems simple, but there have been
436
+ research systems which are undertime slower
437
+
438
+ 0:14:05.245 --> 0:14:07.628
439
+ than real time or so.
440
+
441
+ 0:14:07.520 --> 0:14:15.104
442
+ If you want to show what is possible with
443
+ the best current systems,.
444
+
445
+ 0:14:16.596 --> 0:14:18.477
446
+ But that isn't even not enough.
447
+
448
+ 0:14:18.918 --> 0:14:29.593
449
+ The other question: You can have a system
450
+ which is even like several times real time.
451
+
452
+ 0:14:29.509 --> 0:14:33.382
453
+ In less than one second, it might still be
454
+ not useful.
455
+
456
+ 0:14:33.311 --> 0:14:39.646
457
+ Then the question is like the latency, so
458
+ how much time has passed since you can produce
459
+
460
+ 0:14:39.646 --> 0:14:39.931
461
+ an.
462
+
463
+ 0:14:40.120 --> 0:14:45.814
464
+ It might be that in average you can like concress
465
+ it, but you still can't do it directly.
466
+
467
+ 0:14:45.751 --> 0:14:51.547
468
+ You need to do it after, or you need to have
469
+ the full context of thirty seconds before you
470
+
471
+ 0:14:51.547 --> 0:14:55.178
472
+ can output something, and then you have a large
473
+ latency.
474
+
475
+ 0:14:55.335 --> 0:15:05.871
476
+ So it can be that do it as fast as it is produced,
477
+ but have to wait until the food.
478
+
479
+ 0:15:06.426 --> 0:15:13.772
480
+ So we'll look into that on Thursday how we
481
+ can then generate translations that are having
482
+
483
+ 0:15:13.772 --> 0:15:14.996
484
+ a low latency.
485
+
486
+ 0:15:15.155 --> 0:15:21.587
487
+ You can imagine, for example, in German that
488
+ it's maybe quite challenging since the verb
489
+
490
+ 0:15:21.587 --> 0:15:23.466
491
+ is often like at the end.
492
+
493
+ 0:15:23.394 --> 0:15:30.108
494
+ If you're using perfect tense, like with 'haben' and
495
+ so on, and then in English you have to directly
496
+
497
+ 0:15:30.108 --> 0:15:30.983
498
+ produce it.
499
+
500
+ 0:15:31.311 --> 0:15:38.757
501
+ So if you really want to have no context you
502
+ might need to wait until the end of the sentence.
503
+
504
+ 0:15:41.021 --> 0:15:45.920
505
+ Besides that, of course, offline and it gives
506
+ you more additional help.
507
+
508
+ 0:15:45.852 --> 0:15:51.399
509
+ Context Based SystemsI think last week you
510
+ talked about context based systems that typically
511
+
512
+ 0:15:51.399 --> 0:15:55.575
513
+ have context from maybe from the past but maybe
514
+ also from the future.
515
+
516
+ 0:15:55.595 --> 0:16:02.923
517
+ Then, of course, you cannot use anything from
518
+ the future in this case, but you can use the past.
519
+
520
+ 0:16:07.407 --> 0:16:24.813
521
+ Finally, there is a thing about how you want
522
+ to present it to the audience in automatic
523
+
524
+ 0:16:24.813 --> 0:16:27.384
525
+ translation.
526
+
527
+ 0:16:27.507 --> 0:16:31.361
528
+ There is also the thing that you want to do.
529
+
530
+ 0:16:31.275 --> 0:16:35.302
531
+ All your outfits are running like the system.
532
+
533
+ 0:16:35.996 --> 0:16:36.990
534
+ Top of it.
535
+
536
+ 0:16:36.900 --> 0:16:44.315
537
+ Then they answered questions: How should it
538
+ be spoken so you can do things like.
539
+
540
+ 0:16:46.586 --> 0:16:52.507
541
+ Voice cloning so that it's like even the same
542
+ voice as the original speaker.
543
+
544
+ 0:16:53.994 --> 0:16:59.081
545
+ And if you do text or dubbing then there might
546
+ be additional constraints.
547
+
548
+ 0:16:59.012 --> 0:17:05.614
549
+ So if you think about subtitles: And they
550
+ should be readable, and we are too big to speak
551
+
552
+ 0:17:05.614 --> 0:17:07.961
553
+ faster than you can maybe read.
554
+
555
+ 0:17:08.908 --> 0:17:14.239
556
+ So you might need to shorten your text.
557
+
558
+ 0:17:14.105 --> 0:17:20.170
559
+ People say that a subtitle can be two lines.
560
+
561
+ 0:17:20.035 --> 0:17:26.103
562
+ Each line can be this number of characters.
563
+
564
+ 0:17:26.346 --> 0:17:31.753
565
+ So you cannot like if you have too long text,
566
+ we might need to shorten that to do that.
567
+
568
+ 0:17:32.052 --> 0:17:48.272
569
+ Similarly, if you think about dubbing, if
570
+ you want to produce dubbing voice, then the
571
+
572
+ 0:17:48.272 --> 0:17:50.158
573
+ original.
574
+
575
+ 0:17:51.691 --> 0:17:59.294
576
+ Here is another problem that we have different
577
+ settings like a more formal setting and let's
578
+
579
+ 0:17:59.294 --> 0:18:00.602
580
+ have different.
581
+
582
+ 0:18:00.860 --> 0:18:09.775
583
+ If you think about the United Nations maybe
584
+ you want more former things and between friends
585
+
586
+ 0:18:09.775 --> 0:18:14.911
587
+ maybe that former and there are languages which
588
+ use.
589
+
590
+ 0:18:15.355 --> 0:18:21.867
591
+ That is sure that is an important research
592
+ question.
593
+
594
+ 0:18:21.744 --> 0:18:28.013
595
+ To do that would more think of it more generally.
596
+
597
+ 0:18:28.308 --> 0:18:32.902
598
+ That's important in text translation.
599
+
600
+ 0:18:32.781 --> 0:18:41.003
601
+ If you translate a letter to your boss, it
602
+ should sound different.
603
+
604
+ 0:18:42.202 --> 0:18:53.718
605
+ So there is a question of how you can do this
606
+ style work on how you can do that.
607
+
608
+ 0:18:53.576 --> 0:19:00.545
609
+ For example, if you can specify that you might.
610
+
611
+ 0:19:00.460 --> 0:19:10.954
612
+ So you can tag the sentence to generate a formal or informal
613
+ style because, as you correctly said, this
614
+
615
+ 0:19:10.954 --> 0:19:16.709
616
+ is especially challenging again in the situations.
617
+
618
+ 0:19:16.856 --> 0:19:20.111
619
+ Of course, there are ways of like being formal
620
+ or less formal.
621
+
622
+ 0:19:20.500 --> 0:19:24.940
623
+ But it's not like as clear as you do it, for
624
+ example, in German where you have the twin
625
+
626
+ 0:19:24.940 --> 0:19:25.091
627
+ C.
628
+
629
+ 0:19:25.091 --> 0:19:26.857
630
+ So there is no one-to-one mapping.
631
+
632
+ 0:19:27.287 --> 0:19:34.269
633
+ If you want to make that sure you can build
634
+ a system which generates different styles in
635
+
636
+ 0:19:34.269 --> 0:19:38.662
637
+ the output, so yeah that's definitely also
638
+ a challenge.
639
+
640
+ 0:19:38.584 --> 0:19:43.763
641
+ It just may be not mentioned here because
642
+ it's not specific now.
643
+
644
+ 0:19:44.524 --> 0:19:54.029
645
+ Generally, of course, these are all challenges
646
+ in how to customize and adapt systems to use
647
+
648
+ 0:19:54.029 --> 0:19:56.199
649
+ cases with specific requirements.
650
+
651
+ 0:20:00.360 --> 0:20:10.230
652
+ Cascading SystemsSpeech translation has been
653
+ done for quite a while and it's maybe not surprising
654
+
655
+ 0:20:10.230 --> 0:20:13.554
656
+ it started with more simple use.
657
+
658
+ 0:20:13.793 --> 0:20:24.557
659
+ So people first started to look into, for
660
+ example, limited domain translations.
661
+
662
+ 0:20:24.424 --> 0:20:33.728
663
+ The tourist was typically application if you're
664
+ going to a new city.
665
+
666
+ 0:20:34.834 --> 0:20:44.028
667
+ Then there are several open things of doing
668
+ open domain translation, especially people.
669
+
670
+ 0:20:44.204 --> 0:20:51.957
671
+ Like where there's a lot of data so you could
672
+ build systems which are more open domain,
673
+
674
+ 0:20:51.957 --> 0:20:55.790
675
+ but of course it's still a bit restrictive.
676
+
677
+ 0:20:55.703 --> 0:20:59.061
678
+ It's true in the European Parliament.
679
+
680
+ 0:20:58.973 --> 0:21:01.892
681
+ People talk about anything but.
682
+
683
+ 0:21:02.162 --> 0:21:04.820
684
+ And so it's not completely used for everything.
685
+
686
+ 0:21:05.165 --> 0:21:11.545
687
+ Nowadays we've seen this technology in a lot
688
+ of different situations guess you ought.
689
+
690
+ 0:21:11.731 --> 0:21:17.899
691
+ Use it so there is some basic technologies
692
+ where you can use them already.
693
+
694
+ 0:21:18.218 --> 0:21:33.599
695
+ There is still a lot of open questions going
696
+ from if you are going to really spontaneous
697
+
698
+ 0:21:33.599 --> 0:21:35.327
699
+ meetings.
700
+
701
+ 0:21:35.655 --> 0:21:41.437
702
+ Then these systems typically work good for
703
+ like some languages where we have a lot of
704
+
705
+ 0:21:41.437 --> 0:21:42.109
706
+ friendly.
707
+
708
+ 0:21:42.742 --> 0:21:48.475
709
+ But if we want to go for really low resource
710
+ data then things are often challenging.
711
+
712
+ 0:21:48.448 --> 0:22:02.294
713
+ Last week we had a workshop on spoken language
714
+ translation and there is a low-resource data
715
+
716
+ 0:22:02.294 --> 0:22:05.756
717
+ track which is dialed.
718
+
719
+ 0:22:05.986 --> 0:22:06.925
720
+ And so on.
721
+
722
+ 0:22:06.840 --> 0:22:14.700
723
+ All these languages can still then have significantly
724
+ lower performance than for high-resource languages.
725
+
726
+ 0:22:17.057 --> 0:22:20.126
727
+ So how does this work?
728
+
729
+ 0:22:19.993 --> 0:22:30.061
730
+ If we want to do speech translation, there's
731
+ like three basic technology: So on the one
732
+
733
+ 0:22:30.061 --> 0:22:40.815
734
+ hand, it's automatic speech recognition where
735
+ automatic speech recognition normally transcribes
736
+
737
+ 0:22:40.815 --> 0:22:41.615
738
+ audio.
739
+
740
+ 0:22:42.822 --> 0:22:58.289
741
+ Then what we talked about here is machine
742
+ translation, which takes input and translates
743
+
744
+ 0:22:58.289 --> 0:23:01.276
745
+ into the target.
746
+
747
+ 0:23:02.642 --> 0:23:11.244
748
+ And the very simple model now, if you think
749
+ about it, is of course the similar combination.
750
+
751
+ 0:23:11.451 --> 0:23:14.740
752
+ We have solved all these parts in a salt bedrock.
753
+
754
+ 0:23:14.975 --> 0:23:31.470
755
+ We are working on all these problems there,
756
+ so if we want to do speech translation, maybe
757
+
758
+ 0:23:31.331 --> 0:23:35.058
759
+ Such problems we just put all these combinations
760
+ together.
761
+
762
+ 0:23:35.335 --> 0:23:45.130
763
+ And then you get what is called a cascaded
764
+ system, where first you take your audio.
765
+
766
+ 0:23:45.045 --> 0:23:59.288
767
+ To take this as input and generate the output,
768
+ and then you take this text output, put it
769
+
770
+ 0:23:59.288 --> 0:24:00.238
771
+ into.
772
+
773
+ 0:24:00.640 --> 0:24:05.782
774
+ So in that way you
775
+
776
+ 0:24:08.008 --> 0:24:18.483
777
+ have now a solution for doing speech
778
+ translation with these types of systems, and
779
+
780
+ 0:24:18.483 --> 0:24:20.874
781
+ this type is called a cascaded system.
782
+
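A bare-bones sketch of the cascaded pipeline just described. The three stage functions are hypothetical placeholders, standing in for a real ASR model, a segmentation/re-punctuation step, and an MT model.

```python
def cascaded_speech_translation(audio, asr, segment, mt):
    transcript = asr(audio)                 # audio -> source-language text
    sentences = segment(transcript)         # add punctuation, split sentences
    return [mt(sentence) for sentence in sentences]

# toy usage with dummy components
asr = lambda a: "hello how are you i am fine"
segment = lambda t: ["Hello, how are you?", "I am fine."]
mt = lambda s: f"<translation of: {s}>"
print(cascaded_speech_translation(b"...", asr, segment, mt))
```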
783
+ 0:24:21.681 --> 0:24:28.303
784
+ It is still often reaching state of the art,
785
+ however it has benefits and disadvantages.
786
+
787
+ 0:24:28.668 --> 0:24:41.709
788
+ So the one big benefit is we have independent
789
+ components and some of that is nice.
790
+
791
+ 0:24:41.552 --> 0:24:48.469
792
+ So if there are great ideas put into your.
793
+
794
+ 0:24:48.788 --> 0:24:57.172
795
+ And then some other times people develop a
796
+ new good way of how to improve.
797
+
798
+ 0:24:57.060 --> 0:25:00.976
799
+ You can also take this model and.
800
+
801
+ 0:25:01.381 --> 0:25:07.639
802
+ So you can leverage improvements from all
803
+ the different communities in order to adapt.
804
+
805
+ 0:25:08.288 --> 0:25:18.391
806
+ Furthermore, we would like to see, since all
807
+ of them is learning, that the biggest advantage
808
+
809
+ 0:25:18.391 --> 0:25:23.932
810
+ is that we have training data for each individual.
811
+
812
+ 0:25:24.164 --> 0:25:34.045
813
+ So there's a lot less training data where
814
+ you have the English audio, so it's easy to
815
+
816
+ 0:25:34.045 --> 0:25:34.849
817
+ train.
818
+
819
+ 0:25:36.636 --> 0:25:48.595
820
+ Now am a one that we will focus on when talking
821
+ about the cascaded approach is that often it.
822
+
823
+ 0:25:48.928 --> 0:25:58.049
824
+ So you need to adapt each component a bit
825
+ so that it's adapting to its input and.
826
+
827
+ 0:25:58.278 --> 0:26:08.728
828
+ So we'll focus there especially on how to
829
+ combine and since said the main focus is: So
830
+
831
+ 0:26:08.728 --> 0:26:18.578
832
+ if you would directly use the ASR output, that might
833
+ not work as well as you would like.
834
+
835
+ 0:26:18.918 --> 0:26:33.467
836
+ So a major challenge when building a cascaded
837
+ speech translation system is how we can
838
+
839
+ 0:26:33.467 --> 0:26:38.862
840
+ adapt these components and how we can combine them.
841
+
842
+ 0:26:41.681 --> 0:26:43.918
843
+ So why, why is this the kick?
844
+
845
+ 0:26:44.164 --> 0:26:49.183
846
+ So it would look quite nice.
847
+
848
+ 0:26:49.010 --> 0:26:54.617
849
+ It seems to be very reasonable.
850
+
851
+ 0:26:54.442 --> 0:26:58.196
852
+ You have some audio.
853
+
854
+ 0:26:58.018 --> 0:27:03.388
855
+ You put it into your system.
856
+
857
+ 0:27:04.965 --> 0:27:23.759
858
+ However, this is a bit wishful thinking,
859
+ because if you speak, what you speak is more spontaneous.
860
+
861
+ 0:27:23.984 --> 0:27:29.513
862
+ And especially, you rarely have punctuation
863
+ in there, while the MT system.
864
+
865
+ 0:27:29.629 --> 0:27:43.247
866
+ They assume, of course, that it's a full sentence,
867
+ that you don't have there some.
868
+
869
+ 0:27:43.523 --> 0:27:55.087
870
+ So we see we want to get this bridge between
871
+ the ASR output and the MT input, and we might need
872
+
873
+ 0:27:55.087 --> 0:27:56.646
874
+ an additional component.
875
+
876
+ 0:27:58.778 --> 0:28:05.287
877
+ And that is typically what is referred to
878
+ as a re-casing and re-punctuation system.
879
+
880
+ 0:28:05.445 --> 0:28:15.045
881
+ So the idea is that you might be good to have
882
+ something like an adapter here in between,
883
+
884
+ 0:28:15.045 --> 0:28:20.007
885
+ which really tries to adapt the speech input.
886
+
887
+ 0:28:20.260 --> 0:28:28.809
888
+ That can be at different levels, but it might
889
+ be even more rephrasing.
890
+
891
+ 0:28:29.569 --> 0:28:40.620
892
+ If you think of the sentence, if you have
893
+ false starts, then when speaking you sometimes
894
+
895
+ 0:28:40.620 --> 0:28:41.986
896
+ assume oh.
897
+
898
+ 0:28:41.901 --> 0:28:52.224
899
+ You restart it, then you might want to delete
900
+ that because if you read it you don't want
901
+
902
+ 0:28:52.224 --> 0:28:52.688
903
+ to.
904
+
905
+ 0:28:56.096 --> 0:28:57.911
906
+ Why is this yeah?
907
+
908
+ 0:28:57.810 --> 0:29:01.445
909
+ The case in punctuation important.
910
+
911
+ 0:29:02.622 --> 0:29:17.875
912
+ One important thing is directly for the challenge
913
+ is when speak is just a continuous stream of
914
+
915
+ 0:29:17.875 --> 0:29:18.999
916
+ words.
917
+
918
+ 0:29:19.079 --> 0:29:27.422
919
+ When just speaking, punctuation marks, casing
920
+ and so on are not there naturally.
921
+
922
+ 0:29:27.507 --> 0:29:30.281
923
+ However, they are of course important.
924
+
925
+ 0:29:30.410 --> 0:29:33.877
926
+ They are first of all very important for readability.
927
+
928
+ 0:29:34.174 --> 0:29:41.296
929
+ If you have once read a text without punctuation
930
+ marks, you need more time to process it.
931
+
932
+ 0:29:41.861 --> 0:29:47.375
933
+ They're sometimes even semantically important.
934
+
935
+ 0:29:47.258 --> 0:29:52.892
936
+ There's a list for grandpa and big difference.
937
+
938
+ 0:29:53.553 --> 0:30:00.089
939
+ And so this, of course, with humans as well,
940
+ it'd be easy to distinguish by again doing
941
+
942
+ 0:30:00.089 --> 0:30:01.426
943
+ it automatically.
944
+
945
+ 0:30:01.352 --> 0:30:06.181
946
+ It's more typically and finally, in our case,
947
+ if we want to do.
948
+
949
+ 0:30:06.386 --> 0:30:13.672
950
+ We are assuming normally sentence wise, so
951
+ we always enter out system which is like one
952
+
953
+ 0:30:13.672 --> 0:30:16.238
954
+ sentence by the next sentence.
955
+
956
+ 0:30:16.736 --> 0:30:26.058
957
+ If you want to do speech translation of a
958
+ continuous stream, then of course what are
959
+
960
+ 0:30:26.058 --> 0:30:26.716
961
+ your.
962
+
963
+ 0:30:28.168 --> 0:30:39.095
964
+ And the easiest and most straightforward situation
965
+ is, of course, if you have a continuously.
966
+
967
+ 0:30:39.239 --> 0:30:51.686
968
+ And if it generates your punctuation marks,
969
+ it's easy to separate your text into sentences.
970
+
971
+ 0:30:52.032 --> 0:31:09.157
972
+ So we can again reuse our system and thereby
973
+ have a normal MT system run on this continuous stream.
974
+
975
+ 0:31:14.174 --> 0:31:21.708
976
+ These are a bit older numbers, but they show
977
+ you a bit also how important all that is.
978
+
979
+ 0:31:21.861 --> 0:31:31.719
980
+ So the best is if you use the reference
981
+ transcript: you get roughly a BLEU score of.
982
+
983
+ 0:31:32.112 --> 0:31:47.678
984
+ If you have as it is with some air based length
985
+ segmentation, then you get something like.
986
+
987
+ 0:31:47.907 --> 0:31:57.707
988
+ If you then use the segments correctly as
989
+ it's done from the reference, you get one BLEU
990
+
991
+ 0:31:57.707 --> 0:32:01.010
992
+ point and another BLEU point.
993
+
994
+ 0:32:01.201 --> 0:32:08.085
995
+ So you see that you have been total like nearly
996
+ two blue points just by having the correct
997
+
998
+ 0:32:08.085 --> 0:32:09.144
999
+ segmentation.
1000
+
1001
+ 0:32:10.050 --> 0:32:21.178
1002
+ This shows you that it's important to estimate
1003
+ as good a segmentation because even if you
1004
+
1005
+ 0:32:21.178 --> 0:32:25.629
1006
+ still have the same errors in your transcript.
1007
+
1008
+ 0:32:27.147 --> 0:32:35.718
1009
+ Is to be into this movement, which is also
1010
+ not as unusual as we do in translation.
1011
+
1012
+ 0:32:36.736 --> 0:32:40.495
1013
+ So this is done by looking at the reference.
1014
+
1015
+ 0:32:40.412 --> 0:32:48.055
1016
+ It should show you how much these scores are
1017
+ done to just analyze how important are these.
1018
+
1019
+ 0:32:47.971 --> 0:32:55.700
1020
+ So you take the ASR transcript and you look
1021
+ at the reference and it's only done for the.
1022
+
1023
+ 0:32:55.635 --> 0:33:05.843
1024
+ If we have optimal punctuations, if our model
1025
+ is as good and optimal, so as a reference we
1026
+
1027
+ 0:33:05.843 --> 0:33:15.939
1028
+ could: But of course this is not how we can
1029
+ do it in reality because we don't have access
1030
+
1031
+ 0:33:15.939 --> 0:33:16.948
1032
+ to that.
1033
+
1034
+ 0:33:17.657 --> 0:33:24.044
1035
+ Because one would invade you okay, why should
1036
+ we do that?
1037
+
1038
+ 0:33:23.933 --> 0:33:28.781
1039
+ If we have the optimal then it's possible.
1040
+
1041
+ 0:33:31.011 --> 0:33:40.060
1042
+ And yeah, that is why a typical system does
1043
+ not only yeah depend on if our key component.
1044
+
1045
+ 0:33:40.280 --> 0:33:56.468
1046
+ But in between you have this segmentation
1047
+ in there in order to have more input and.
1048
+
1049
+ 0:33:56.496 --> 0:34:01.595
1050
+ You can also prefer often this invariability
1051
+ over the average study.
1052
+
1053
+ 0:34:04.164 --> 0:34:17.896
1054
+ SegmentationSo the task of segmentation is
1055
+ to re-segment the text into what is called
1056
+
1057
+ 0:34:17.896 --> 0:34:24.283
1058
+ sentence-like units, and you also assign case and punctuation.
1059
+
1060
+ 0:34:24.444 --> 0:34:39.421
1061
+ That is more a traditional thing because for
1062
+ a long time case information was not provided.
1063
+
1064
+ 0:34:39.879 --> 0:34:50.355
1065
+ So there was any good ASR system which directly
1066
+ provides you with case information and this
1067
+
1068
+ 0:34:50.355 --> 0:34:52.746
1069
+ may not be any more.
1070
+
1071
+ 0:34:56.296 --> 0:35:12.060
1072
+ How that can be done is you can have three
1073
+ different approaches because that was some
1074
+
1075
+ 0:35:12.060 --> 0:35:16.459
1076
+ of the most common one.
1077
+
1078
+ 0:35:17.097 --> 0:35:23.579
1079
+ Course: That is not the only thing you can
1080
+ do.
1081
+
1082
+ 0:35:23.441 --> 0:35:30.891
1083
+ You can also try to train the data to generate
1084
+ that.
1085
+
1086
+ 0:35:31.891 --> 0:35:41.324
1087
+ On the other hand, that is of course more
1088
+ challenging.
1089
+
1090
+ 0:35:41.153 --> 0:35:47.503
1091
+ You need some type of segmentation.
1092
+
1093
+ 0:35:48.028 --> 0:35:59.382
1094
+ I mean, of course, you can easily remove case and
1095
+ punctuation information from your data and then
1096
+
1097
+ 0:35:59.382 --> 0:36:05.515
1098
+ train a system which goes from non-cased to cased text.
1099
+
1100
+ 0:36:05.945 --> 0:36:15.751
1101
+ You can also, of course, try to combine these
1102
+ two into one so that you directly translate
1103
+
1104
+ 0:36:15.751 --> 0:36:17.386
1105
+ from non-cased, non-punctuated input.
1106
+
1107
+ 0:36:17.817 --> 0:36:24.722
1108
+ What is more happening by now is that you
1109
+ also try to provide these to that you provide.
1110
+
1111
+ 0:36:24.704 --> 0:36:35.267
1112
+ The ASR is a segmentation directly get these
1113
+ information in there.
1114
+
1115
+ 0:36:35.110 --> 0:36:45.597
1116
+ The systems that combine the A's and A's are:
1117
+ Yes, there is a valid rule.
1118
+
1119
+ 0:36:45.455 --> 0:36:51.182
1120
+ What we come later to today is that you do
1121
+ audio to text in the target language.
1122
+
1123
+ 0:36:51.111 --> 0:36:54.880
1124
+ That is what is referred to as an end to end
1125
+ system.
1126
+
1127
+ 0:36:54.809 --> 0:36:59.686
1128
+ So it's directly and this is still more often
1129
+ done for text output.
1130
+
1131
+ 0:36:59.614 --> 0:37:03.416
1132
+ But there are also end-to-end systems which
1133
+ directly generate audio.
1134
+
1135
+ 0:37:03.683 --> 0:37:09.109
1136
+ There you have additional challenges by how
1137
+ to even measure if things are correct or not.
1138
+
1139
+ 0:37:09.089 --> 0:37:10.522
1140
+ Mean for text.
1141
+
1142
+ 0:37:10.427 --> 0:37:18.074
1143
+ You can mention, in other words, that for
1144
+ audio the audio signal is even more.
1145
+
1146
+ 0:37:18.318 --> 0:37:27.156
1147
+ That's why it's currently mostly speech to
1148
+ text, but that is one single system, but of
1149
+
1150
+ 0:37:27.156 --> 0:37:27.969
1151
+ course.
1152
+
1153
+ 0:37:32.492 --> 0:37:35.605
1154
+ Yeah, how can you do that?
1155
+
1156
+ 0:37:35.490 --> 0:37:45.161
1157
+ You can do adding this punctuation information:
1158
+ we will look into three approaches.
1159
+
1160
+ 0:37:45.039 --> 0:37:53.132
1161
+ You can do that as a sequence labeling problem
1162
+ or as a monolingual.
1163
+
1164
+ 0:37:54.534 --> 0:37:57.145
1165
+ Let's have a little bit of a series.
1166
+
1167
+ 0:37:57.075 --> 0:37:59.485
1168
+ This was some of the first ideas.
1169
+
1170
+ 0:37:59.414 --> 0:38:04.545
1171
+ There's the idea where you try to do it mainly
1172
+ based on language model.
1173
+
1174
+ 0:38:04.474 --> 0:38:11.446
1175
+ So how probable is that there is a punctuation
1176
+ that was done with, like, old-style n-gram language
1177
+
1178
+ 0:38:11.446 --> 0:38:12.884
1179
+ models to visually.
1180
+
1181
+ 0:38:13.073 --> 0:38:24.687
1182
+ So you can, for example, if you have an n-gram
1183
+ language model to calculate the score of Hello,
1184
+
1185
+ 0:38:24.687 --> 0:38:25.787
1186
+ how are?
1187
+
1188
+ 0:38:25.725 --> 0:38:33.615
1189
+ And then you compare this probability and
1190
+ take the one which has the highest probability.
1191
+
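A toy version of this language-model idea: insert each candidate punctuation mark (or nothing) after a word, score the variants with some language model, and keep the highest-scoring one. The `lm_logprob` callable is a placeholder, not a real library API.

```python
CANDIDATES = ["", ".", ",", "?"]

def best_punctuation(prefix, next_word, lm_logprob):
    variants = [f"{prefix}{p} {next_word}" for p in CANDIDATES]
    return max(variants, key=lm_logprob)

# toy usage with a dummy scorer that happens to prefer the question mark here
dummy_lm = lambda text: 1.0 if "?" in text else 0.0
print(best_punctuation("how are you", "I", dummy_lm))  # "how are you? I"
```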
1192
+ 0:38:33.527 --> 0:38:39.928
1193
+ You might have something like if you have
1194
+ very long pauses, you anyway.
1195
+
1196
+ 0:38:40.340 --> 0:38:49.345
1197
+ So this is a very easy model, which only calculates
1198
+ some language model probabilities, and however
1199
+
1200
+ 0:38:49.345 --> 0:38:57.440
1201
+ the advantages of course are: And then, of
1202
+ course, in general, so what we will look into
1203
+
1204
+ 0:38:57.440 --> 0:39:05.535
1205
+ here is that maybe interesting is that most
1206
+ of the systems, also the advanced ones, are really
1207
+
1208
+ 0:39:05.535 --> 0:39:08.719
1209
+ mainly focused purely on the text.
1210
+
1211
+ 0:39:09.289 --> 0:39:19.237
1212
+ If you think about how to insert punctuation
1213
+ marks, maybe your first idea would have been
1214
+
1215
+ 0:39:19.237 --> 0:39:22.553
1216
+ we can use pause information.
1217
+
1218
+ 0:39:23.964 --> 0:39:30.065
1219
+ However, interestingly, most systems that are
1220
+ used are really focusing on the text.
1221
+
1222
+ 0:39:31.151 --> 0:39:34.493
1223
+ There are several reasons.
1224
+
1225
+ 0:39:34.369 --> 0:39:44.149
1226
+ One is that it's easier to get training data
1227
+ so you only need pure text data.
1228
+
1229
+ 0:39:46.806 --> 0:40:03.221
1230
+ The next way you can do it is you can make
1231
+ it as a sequence labeling task or something like
1232
+
1233
+ 0:40:03.221 --> 0:40:04.328
1234
+ that.
1235
+
1236
+ 0:40:04.464 --> 0:40:11.734
1237
+ Then you have labels like there is nothing, there is a comma, a period,
1238
+ and there is a
1239
+
1240
+ 0:40:11.651 --> 0:40:15.015
1241
+ question mark.
1242
+
1243
+ 0:40:15.315 --> 0:40:31.443
1244
+ So you have the number of labels, the number
1245
+ of punctuation symbols you have for the basic
1246
+
1247
+ 0:40:31.443 --> 0:40:32.329
1248
+ one.
1249
+
1250
+ 0:40:32.892 --> 0:40:44.074
1251
+ Typically nowadays it would use something
1252
+ like BERT, and then you can train a system.
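A minimal sketch of that sequence-labeling setup, assuming a BERT token-classification model from Hugging Face transformers; the checkpoint name and label set are illustrative choices, and the head still needs fine-tuning on punctuated text:

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

LABELS = ["NONE", "COMMA", "PERIOD", "QUESTION"]   # punctuation predicted after each word

tok = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased",
                                                        num_labels=len(LABELS))

words = "hello how are you today".split()
enc = tok(words, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits        # (1, num_subwords, num_labels)
pred = logits.argmax(-1)[0]             # per-token label ids (untrained until fine-tuned)
```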
1253
+
1254
+ 0:40:48.168 --> 0:40:59.259
1255
+ Any questions to that then it would probably
1256
+ be no contrary, you know, or not.
1257
+
1258
+ 0:41:00.480 --> 0:41:03.221
1259
+ Yeah, you have definitely a label imbalance.
1260
+
1261
+ 0:41:04.304 --> 0:41:12.405
1262
+ Think that works relatively well and haven't
1263
+ seen that.
1264
+
1265
+ 0:41:12.260 --> 0:41:21.087
1266
+ It's not a completely crazy label, maybe twenty
1267
+ times more.
1268
+
1269
+ 0:41:21.561 --> 0:41:29.636
1270
+ It can and especially for the more rare things
1271
+ mean, the more rare things is question marks.
1272
+
1273
+ 0:41:30.670 --> 0:41:43.877
1274
+ At least for question marks you have typically
1275
+ very strong indicator words.
1276
+
1277
+ 0:41:47.627 --> 0:42:03.321
1278
+ And then what was done for quite a long time
1279
+ can we know how to do machine translation?
1280
+
1281
+ 0:42:04.504 --> 0:42:12.640
1282
+ So the idea is, can we just translate non
1283
+ punctuated English into punctuated English
1284
+
1285
+ 0:42:12.640 --> 0:42:14.650
1286
+ and do it correctly?
1287
+
1288
+ 0:42:15.855 --> 0:42:25.344
1289
+ So what you need is something like this type
1290
+ of data where the source doesn't have punctuation.
1291
+
1292
+ 0:42:25.845 --> 0:42:30.641
1293
+ Course: A year is already done.
1294
+
1295
+ 0:42:30.491 --> 0:42:36.490
1296
+ You have to make it a bit challenging.
1297
+
1298
+ 0:42:41.661 --> 0:42:44.550
1299
+ Yeah, that is true.
1300
+
1301
+ 0:42:44.405 --> 0:42:55.188
1302
+ If you think about the normal trained age,
1303
+ you have to do one thing more.
1304
+
1305
+ 0:42:55.043 --> 0:43:00.730
1306
+ Is it otherwise difficult to predict?
1307
+
1308
+ 0:43:05.745 --> 0:43:09.277
1309
+ Here it's already this already looks different
1310
+ than normal training data.
1311
+
1312
+ 0:43:09.229 --> 0:43:09.901
1313
+ What is the.
1314
+
1315
+ 0:43:10.350 --> 0:43:15.305
1316
+ People want to use this transcript of speech.
1317
+
1318
+ 0:43:15.198 --> 0:43:19.509
1319
+ We'll probably go to our text editors.
1320
+
1321
+ 0:43:19.419 --> 0:43:25.906
1322
+ Yes, that is all already quite too difficult.
1323
+
1324
+ 0:43:26.346 --> 0:43:33.528
1325
+ Mean, that's making things a lot better with
1326
+ the first and easiest thing is you have to
1327
+
1328
+ 0:43:33.528 --> 0:43:35.895
1329
+ randomly cut your sentences.
1330
+
1331
+ 0:43:35.813 --> 0:43:43.310
1332
+ So if you take just me normally we have one
1333
+ sentence per line and if you take this as your
1334
+
1335
+ 0:43:43.310 --> 0:43:44.546
1336
+ training data.
1337
+
1338
+ 0:43:44.924 --> 0:43:47.857
1339
+ And that is, of course, not very helpful.
1340
+
1341
+ 0:43:48.208 --> 0:44:01.169
1342
+ So in order to build the training corpus for
1343
+ doing punctuation you randomly cut your sentences
1344
+
1345
+ 0:44:01.169 --> 0:44:08.264
1346
+ and then you can remove all your punctuation
1347
+ marks.
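A small sketch of how such training pairs could be built; the segment lengths, the punctuation set, and the optional lowercasing are illustrative assumptions:

```python
import random
import re

def make_pairs(sentences, min_len=5, max_len=20):
    """Build (unpunctuated, punctuated) pairs from normal text via random segmentation."""
    words = " ".join(sentences).split()
    pairs, i = [], 0
    while i < len(words):
        n = random.randint(min_len, max_len)
        target = " ".join(words[i:i + n])                  # keeps the original punctuation
        source = re.sub(r"[.,;:!?]", "", target).lower()   # punctuation stripped, lowercased
        pairs.append((source, target))
        i += n
    return pairs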
1348
+
1349
+ 0:44:08.528 --> 0:44:21.598
1350
+ Because of course there is no longer to do
1351
+ when you have some random segments in your
1352
+
1353
+ 0:44:21.598 --> 0:44:22.814
1354
+ system.
1355
+
1356
+ 0:44:25.065 --> 0:44:37.984
1357
+ And then you can, for example, if you then
1358
+ have generated your punctuation marks before
1359
+
1360
+ 0:44:37.984 --> 0:44:41.067
1361
+ going to the system.
1362
+
1363
+ 0:44:41.221 --> 0:44:54.122
1364
+ And that is an important thing, which we like
1365
+ to see is more challenging for end systems.
1366
+
1367
+ 0:44:53.979 --> 0:45:00.146
1368
+ We can change the segmentation, so maybe.
1369
+
1370
+ 0:45:00.040 --> 0:45:06.417
1371
+ You can, then if you're combining these things
1372
+ you can change the segmentation here, so.
1373
+
1374
+ 0:45:06.406 --> 0:45:18.178
1375
+ While you have ten new ten segments in your,
1376
+ you might only have five ones in your anymore.
1377
+
1378
+ 0:45:18.050 --> 0:45:18.972
1379
+ Then.
1380
+
1381
+ 0:45:19.259 --> 0:45:33.172
1382
+ Which might be more useful or helpful in because
1383
+ you have to reorder things and so on.
1384
+
1385
+ 0:45:33.273 --> 0:45:43.994
1386
+ And if you think of the wrong segmentation
1387
+ then you cannot reorder things from the beginning
1388
+
1389
+ 0:45:43.994 --> 0:45:47.222
1390
+ to the end of the sentence.
1391
+
1392
+ 0:45:49.749 --> 0:45:57.998
1393
+ ErrorsOkay, so much about segmentation, do
1394
+ you have any more questions about that?
1395
+
1396
+ 0:46:02.522 --> 0:46:21.299
1397
+ Then there is one additional thing you can
1398
+ do, and that is when we refer to the idea.
1399
+
1400
+ 0:46:21.701 --> 0:46:29.356
1401
+ And when you get input there might be some
1402
+ arrows in there, so it might not be perfect.
1403
+
1404
+ 0:46:29.889 --> 0:46:36.322
1405
+ So the question is, can we adapt to that?
1406
+
1407
+ 0:46:36.169 --> 0:46:45.360
1408
+ And can the system be improved by saying that
1409
+ it can some.
1410
+
1411
+ 0:46:45.265 --> 0:46:50.591
1412
+ So that is as aware that before there is a.
1413
+
1414
+ 0:46:50.490 --> 0:46:55.449
1415
+ The ASR output might not be the best one.
1416
+
1417
+ 0:46:55.935 --> 0:47:01.961
1418
+ There are different ways of dealing with them.
1419
+
1420
+ 0:47:01.833 --> 0:47:08.118
1421
+ You can use not just the best hypothesis but an n-best list.
1422
+
1423
+ 0:47:08.408 --> 0:47:16.711
1424
+ So the idea is that you're not only telling
1425
+ the system this is the transcript, but here
1426
+
1427
+ 0:47:16.711 --> 0:47:18.692
1428
+ I'm not going to be.
1429
+
1430
+ 0:47:19.419 --> 0:47:30.748
1431
+ Or that you can try to make it more robust
1432
+ towards errors from an ASR system so that.
1433
+
1434
+ 0:47:32.612 --> 0:47:48.657
1435
+ Interesting what is often done is hope convince
1436
+ you it might be a good idea to deal.
1437
+
1438
+ 0:47:48.868 --> 0:47:57.777
1439
+ The interesting thing is if you're looking
1440
+ into a lot of systems, this is often ignored,
1441
+
1442
+ 0:47:57.777 --> 0:48:04.784
1443
+ so they are not adapting their MT system to
1444
+ this type of ASR system.
1445
+
1446
+ 0:48:05.345 --> 0:48:15.232
1447
+ So it's not really doing any handling of errors,
1448
+ and the interesting thing is often works as
1449
+
1450
+ 0:48:15.232 --> 0:48:15.884
1451
+ good.
1452
+
1453
+ 0:48:16.516 --> 0:48:23.836
1454
+ And one reason is, of course, one reason is
1455
+ if the ASR system does an error it is often like
1456
+
1457
+ 0:48:23.836 --> 0:48:31.654
1458
+ a challenging situation, and then it
1459
+ is really hard for the MT system to detect.
1460
+
1461
+ 0:48:31.931 --> 0:48:38.080
1462
+ If it would be easy for the system to detect
1463
+ the error you would integrate this information
1464
+
1465
+ 0:48:38.080 --> 0:48:44.296
1466
+ into: That is not always the case, but that
1467
+ of course makes it a bit challenging, and that's
1468
+
1469
+ 0:48:44.296 --> 0:48:49.776
1470
+ why there is a lot of systems where it's not
1471
+ explicitly handled how to deal with.
1472
+
1473
+ 0:48:52.912 --> 0:49:06.412
1474
+ But of course it might be good, so one thing
1475
+ is you can give it an n-best list and you can
1476
+
1477
+ 0:49:06.412 --> 0:49:09.901
1478
+ translate every entry.
1479
+
1480
+ 0:49:10.410 --> 0:49:17.705
1481
+ And then you have two scores like the MT probability
1482
+ and the ASR probability.
1483
+
1484
+ 0:49:18.058 --> 0:49:25.695
1485
+ Combine them and then generate or output the
1486
+ output from what has the best combined.
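A rough sketch of that combined rescoring; the score helpers and the interpolation weight are placeholders standing in for the real ASR and MT models:

```python
def rescore(nbest, asr_logprob, translate_with_score, lam=0.5):
    """nbest: list of ASR hypotheses; asr_logprob: dict hypothesis -> log P_ASR;
    translate_with_score: assumed helper returning (translation, log P_MT)."""
    best = None
    for hyp in nbest:
        translation, mt_score = translate_with_score(hyp)
        score = lam * asr_logprob[hyp] + (1 - lam) * mt_score   # log-linear combination
        if best is None or score > best[0]:
            best = (score, hyp, translation)
    return best   # (combined score, chosen transcript, its translation)
```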
1487
+
1488
+ 0:49:26.366 --> 0:49:29.891
1489
+ And then it might no longer be the best.
1490
+
1491
+ 0:49:29.805 --> 0:49:38.144
1492
+ It might be like we had in beam search, so this
1493
+ has the best score, but this has a better combined.
1494
+
1495
+ 0:49:39.059 --> 0:49:46.557
1496
+ The problem sometimes works, but the problem
1497
+ is that the MT system might then tend to
1498
+
1499
+ 0:49:46.557 --> 0:49:52.777
1500
+ just translate not the correct sentence but
1501
+ the one easier to translate.
1502
+
1503
+ 0:49:53.693 --> 0:50:03.639
1504
+ You can also generate a more compact representation
1505
+ of this n-best list by having this type of
1506
+
1507
+ 0:50:03.639 --> 0:50:04.467
1508
+ graphs.
1509
+
1510
+ 0:50:05.285 --> 0:50:22.952
1511
+ Lattices: So then you could like try to do
1512
+ a graph to text translation so you can translate.
1513
+
1514
+ 0:50:22.802 --> 0:50:26.582
1515
+ Where like all possibilities, by the way our
1516
+ systems are invented.
1517
+
1518
+ 0:50:26.906 --> 0:50:31.485
1519
+ So it can be like a hostage, a conference
1520
+ with some probabilities.
1521
+
1522
+ 0:50:31.591 --> 0:50:35.296
1523
+ So the highest probability is here.
1524
+
1525
+ 0:50:35.193 --> 0:50:41.986
1526
+ Conference is being recorded, but there are
1527
+ other possibilities.
1528
+
1529
+ 0:50:42.302 --> 0:50:53.054
1530
+ And you can take all of this information out
1531
+ there with your probabilities.
1532
+
1533
+ 0:50:59.980 --> 0:51:07.614
1534
+ But we'll see this type of error propagation
1535
+ that if you have an error that this might then
1536
+
1537
+ 0:51:07.614 --> 0:51:15.165
1538
+ propagate to MT errors, and this is one of the main
1539
+ reasons why people looked into other ways of
1540
+
1541
+ 0:51:15.165 --> 0:51:17.240
1542
+ doing it and not having.
1543
+
1544
+ 0:51:19.219 --> 0:51:28.939
1545
+ Advantages and DisadvantagesBy generally a
1546
+ cascaded combination, as we've seen it, it
1547
+
1548
+ 0:51:28.939 --> 0:51:39.224
1549
+ has several advantages: The biggest maybe is
1550
+ the data availability so we can train systems
1551
+
1552
+ 0:51:39.224 --> 0:51:42.615
1553
+ for the different components.
1554
+
1555
+ 0:51:42.822 --> 0:51:47.228
1556
+ So you can train your individual components
1557
+ on relatively large data sets.
1558
+
1559
+ 0:51:47.667 --> 0:51:58.207
1560
+ A modular system where you can improve each
1561
+ individual model and if there's new development
1562
+
1563
+ 0:51:58.207 --> 0:52:01.415
1564
+ and models you can improve.
1565
+
1566
+ 0:52:01.861 --> 0:52:11.603
1567
+ There are several advantages, but of course
1568
+ there are also some disadvantages: The most
1569
+
1570
+ 0:52:11.603 --> 0:52:19.574
1571
+ common thing is that there is what is referred
1572
+ to as error propagation.
1573
+
1574
+ 0:52:19.463 --> 0:52:28.223
1575
+ If the ASR makes an error, probably your output
1576
+ will then directly have an error.
1577
+
1578
+ 0:52:28.868 --> 0:52:41.740
1579
+ Typically it's like if there's an error in
1580
+ the ASR system, it's easier to like ignore in the
1581
+
1582
+ 0:52:41.740 --> 0:52:46.474
1583
+ transcript than in the MT output.
1584
+
1585
+ 0:52:46.967 --> 0:52:49.785
1586
+ What do that mean?
1587
+
1588
+ 0:52:49.637 --> 0:53:01.211
1589
+ It's complicated, so if you have German, the
1590
+ ASR does an error, and instead.
1591
+
1592
+ 0:53:01.101 --> 0:53:05.976
1593
+ Then most probably you'll ignore it or you'll
1594
+ still know what it was said.
1595
+
1596
+ 0:53:05.911 --> 0:53:11.817
1597
+ Maybe you even don't notice because you'll
1598
+ fastly read over it and don't see that there's
1599
+
1600
+ 0:53:11.817 --> 0:53:12.998
1601
+ one letter wrong.
1602
+
1603
+ 0:53:13.673 --> 0:53:25.291
1604
+ However, if you translate this one in an English
1605
+ sentence about speeches, there's something
1606
+
1607
+ 0:53:25.291 --> 0:53:26.933
1608
+ about wines.
1609
+
1610
+ 0:53:27.367 --> 0:53:37.238
1611
+ So it's a lot easier typically to read over
1612
+ like errors in the transcript than reading over them in
1613
+
1614
+ 0:53:37.238 --> 0:53:38.569
1615
+ the speech.
1616
+
1617
+ 0:53:40.120 --> 0:53:45.863
1618
+ But there are additional challenges in cascaded
1619
+ systems.
1620
+
1621
+ 0:53:46.066 --> 0:53:52.667
1622
+ So secondly we have seen that we optimize
1623
+ each component individually so you have a separate
1624
+
1625
+ 0:53:52.667 --> 0:53:59.055
1626
+ optimization and that doesn't mean that the
1627
+ overall performance is really the best at the
1628
+
1629
+ 0:53:59.055 --> 0:53:59.410
1630
+ end.
1631
+
1632
+ 0:53:59.899 --> 0:54:07.945
1633
+ And we have tried to do that by already saying
1634
+ yes.
1635
+
1636
+ 0:54:07.790 --> 0:54:17.694
1637
+ You need to adapt them a bit to work good
1638
+ together, but still.
1639
+
1640
+ 0:54:20.280 --> 0:54:24.185
1641
+ Secondly, like that, there's a computational
1642
+ complexity.
1643
+
1644
+ 0:54:24.116 --> 0:54:30.315
1645
+ You always need to run an ASR system and an
1646
+ MTT system, and especially if you think about
1647
+
1648
+ 0:54:30.315 --> 0:54:32.864
1649
+ it, it should be fast and real time.
1650
+
1651
+ 0:54:32.795 --> 0:54:37.067
1652
+ It's challenging to always run two systems
1653
+ and not a single.
1654
+
1655
+ 0:54:38.038 --> 0:54:45.245
1656
+ And one final thing which you might have not
1657
+ directly thought of, but most of the world's
1658
+
1659
+ 0:54:45.245 --> 0:54:47.407
1660
+ languages do not have any.
1661
+
1662
+ 0:54:48.108 --> 0:55:01.942
1663
+ So if you have a language which doesn't have
1664
+ any script, then of course if you want to translate
1665
+
1666
+ 0:55:01.942 --> 0:55:05.507
1667
+ it you cannot first use.
1668
+
1669
+ 0:55:05.905 --> 0:55:13.705
1670
+ So in order to do this, the approach, as mentioned
1671
+ before already, is to
1672
+
1673
+ 0:55:13.585 --> 0:55:24.265
1674
+ Build somehow a system which takes the audio
1675
+ and directly generates text in the target.
1676
+
1677
+ 0:55:26.006 --> 0:55:41.935
1678
+ And there is quite big opportunity for that
1679
+ because before that there was very different
1680
+
1681
+ 0:55:41.935 --> 0:55:44.082
1682
+ technology.
1683
+
1684
+ 0:55:44.644 --> 0:55:55.421
1685
+ However, since we are using neuromachine translation
1686
+ encoded decoder models, the interesting thing
1687
+
1688
+ 0:55:55.421 --> 0:56:00.429
1689
+ is that we are using very similar technology.
1690
+
1691
+ 0:56:00.360 --> 0:56:06.047
1692
+ It's like in both cases very similar architecture.
1693
+
1694
+ 0:56:05.935 --> 0:56:09.284
1695
+ The main difference is once.
1696
+
1697
+ 0:56:09.649 --> 0:56:17.143
1698
+ But generally how it's done is very similar,
1699
+ and therefore of course it might be put everything
1700
+
1701
+ 0:56:17.143 --> 0:56:22.140
1702
+ together, and that is what is referred to as
1703
+ end-to-end speech.
1704
+
1705
+ 0:56:22.502 --> 0:56:31.411
1706
+ So that means we're having one large neural
1707
+ network and decoded voice system, but we put
1708
+
1709
+ 0:56:31.411 --> 0:56:34.914
1710
+ an audio in one language and then.
1711
+
1712
+ 0:56:36.196 --> 0:56:43.106
1713
+ We can then have a system which directly does
1714
+ the full process.
1715
+
1716
+ 0:56:42.998 --> 0:56:46.457
1717
+ We don't have to care anymore.
1718
+
1719
+ 0:56:48.048 --> 0:57:02.615
1720
+ So if you think of it as before, so we have
1721
+ this decoder, and that's the two separate.
1722
+
1723
+ 0:57:02.447 --> 0:57:04.805
1724
+ We have the.
1725
+
1726
+ 0:57:05.085 --> 0:57:18.044
1727
+ And instead of going via the discrete text
1728
+ representation in the Suez language, we can
1729
+
1730
+ 0:57:18.044 --> 0:57:21.470
1731
+ go via the continuous.
1732
+
1733
+ 0:57:21.681 --> 0:57:26.027
1734
+ Of course, the hope is, by not doing this
1735
+ discretization in between.
1736
+
1737
+ 0:57:26.146 --> 0:57:30.275
1738
+ We don't have a problem at doing errors.
1739
+
1740
+ 0:57:30.174 --> 0:57:32.797
1741
+ We can only cover later.
1742
+
1743
+ 0:57:32.772 --> 0:57:47.849
1744
+ But we can encode here the variability or
1745
+ so that we have and then only define the decision.
1746
+
1747
+ 0:57:51.711 --> 0:57:54.525
1748
+ And so.
1749
+
1750
+ 0:57:54.274 --> 0:58:02.253
1751
+ What we're doing is we're having very similar
1752
+ technique.
1753
+
1754
+ 0:58:02.113 --> 0:58:12.194
1755
+ We're having still the decoder model where
1756
+ we're coming from the main.
1757
+
1758
+ 0:58:12.552 --> 0:58:24.098
1759
+ Instead of getting discrete tokens in there
1760
+ as we have subwords, we always encoded that
1761
+
1762
+ 0:58:24.098 --> 0:58:26.197
1763
+ in one pattern.
1764
+
1765
+ 0:58:26.846 --> 0:58:42.505
1766
+ The problem is that this is in continuous,
1767
+ so we have to check how we can work with continuous
1768
+
1769
+ 0:58:42.505 --> 0:58:43.988
1770
+ signals.
1771
+
1772
+ 0:58:47.627 --> 0:58:55.166
1773
+ Mean, the first thing in your system is when
1774
+ you do your disc freeze and code it.
1775
+
1776
+ 0:59:02.402 --> 0:59:03.888
1777
+ A newer machine translation.
1778
+
1779
+ 0:59:03.837 --> 0:59:05.041
1780
+ You're getting a word.
1781
+
1782
+ 0:59:04.989 --> 0:59:06.300
1783
+ It's one hot, some not.
1784
+
1785
+ 0:59:21.421 --> 0:59:24.678
1786
+ The first layer of the machine translation.
1787
+
1788
+ 0:59:27.287 --> 0:59:36.147
1789
+ Yes, you do the word embedding, so then you
1790
+ have a continuous thing.
1791
+
1792
+ 0:59:36.019 --> 0:59:40.132
1793
+ So if you know get continuous.
1794
+
1795
+ 0:59:40.961 --> 0:59:46.316
1796
+ Deal with it the same way, so we'll see not
1797
+ a big of a challenge.
1798
+
1799
+ 0:59:46.235 --> 0:59:48.672
1800
+ What is more challenging is.
1801
+
1802
+ 0:59:49.349 --> 1:00:04.498
1803
+ So the audio signal is ten times longer or
1804
+ so, like more time steps you have.
1805
+
1806
+ 1:00:04.764 --> 1:00:10.332
1807
+ And so that is, of course, any challenge how
1808
+ we can deal with this type of long sequence.
1809
+
1810
+ 1:00:11.171 --> 1:00:13.055
1811
+ The advantage is a bit.
1812
+
1813
+ 1:00:12.976 --> 1:00:17.867
1814
+ The long sequence is only at the input and
1815
+ not at the output.
1816
+
1817
+ 1:00:17.789 --> 1:00:24.938
1818
+ So when you remember for the efficiency, for
1819
+ example, like a long sequence are especially
1820
+
1821
+ 1:00:24.938 --> 1:00:29.228
1822
+ challenging in the decoder, but also for the
1823
+ encoder.
1824
+
1825
+ 1:00:31.371 --> 1:00:33.595
1826
+ So how it is this?
1827
+
1828
+ 1:00:33.478 --> 1:00:40.619
1829
+ How can we process audio into an speech translation
1830
+ system?
1831
+
1832
+ 1:00:41.501 --> 1:00:51.856
1833
+ And you can follow mainly what is done in
1834
+ an system, so you have the audio signal.
1835
+
1836
+ 1:00:52.172 --> 1:00:59.135
1837
+ Then you measure your amplitude at every time
1838
+ step.
1839
+
1840
+ 1:00:59.001 --> 1:01:04.361
1841
+ It's typically something like killing.
1842
+
1843
+ 1:01:04.384 --> 1:01:13.893
1844
+ And then you're doing this, this windowing,
1845
+ so that you get a signal of a length twenty
1846
+
1847
+ 1:01:13.893 --> 1:01:22.430
1848
+ to thirty milliseconds, and you have all these windows
1849
+ so that you measure them.
1850
+
1851
+ 1:01:22.342 --> 1:01:32.260
1852
+ A simple gear, and then you look at these
1853
+ time signals of seconds.
1854
+
1855
+ 1:01:32.432 --> 1:01:36.920
1856
+ So in the end then if it is ten seconds, that is ten
1857
+ thousand milliseconds.
1858
+
1859
+ 1:01:36.844 --> 1:01:39.737
1860
+ You have for every ten milliseconds.
1861
+
1862
+ 1:01:40.000 --> 1:01:48.309
1863
+ Some type of representation which type of
1864
+ representation you can generate from that,
1865
+
1866
+ 1:01:48.309 --> 1:01:49.286
1867
+ but that.
1868
+
1869
+ 1:01:49.649 --> 1:02:06.919
1870
+ So instead of having no letter or word, you
1871
+ have no representations for every 10mm of your
1872
+
1873
+ 1:02:06.919 --> 1:02:08.437
1874
+ system.
1875
+
1876
+ 1:02:08.688 --> 1:02:13.372
1877
+ How we record that now your thirty second
1878
+ window here there is different ways.
1879
+
1880
+ 1:02:16.176 --> 1:02:31.891
1881
+ Was a traditional way of how people have done
1882
+ that from an audio signal what frequencies
1883
+
1884
+ 1:02:31.891 --> 1:02:34.010
1885
+ are in the.
1886
+
1887
+ 1:02:34.114 --> 1:02:44.143
1888
+ So to do that you can compute these mel-frequency
1889
+ cepstral coefficients, so you can use Fourier transformations.
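A possible sketch of this classic feature extraction using torchaudio's MFCC transform; the file name is hypothetical and the window and hop sizes are common defaults, not values from the lecture:

```python
import torchaudio

wave, sr = torchaudio.load("audio.wav")                # hypothetical input file
mfcc = torchaudio.transforms.MFCC(
    sample_rate=sr, n_mfcc=40,
    melkwargs={"n_fft": 400, "hop_length": 160},       # ~25 ms window, 10 ms shift at 16 kHz
)(wave)
# mfcc has shape (channels, 40, num_frames): one 40-dim vector per ~10 ms frame
```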
1890
+
1891
+ 1:02:44.324 --> 1:02:47.031
1892
+ Which frequencies are there?
1893
+
1894
+ 1:02:46.938 --> 1:02:53.568
1895
+ You know that the letters are different by
1896
+ the different frequencies.
1897
+
1898
+ 1:02:53.813 --> 1:03:04.243
1899
+ And then if you're doing that, use the matte
1900
+ to covers for your window we have before.
1901
+
1902
+ 1:03:04.624 --> 1:03:15.086
1903
+ So for each of these windows: You will calculate
1904
+ what frequencies in there and then get features
1905
+
1906
+ 1:03:15.086 --> 1:03:20.047
1907
+ for this window and features for this window.
1908
+
1909
+ 1:03:19.980 --> 1:03:28.028
1910
+ These are the frequencies that occur there
1911
+ and that help you to model which letters are
1912
+
1913
+ 1:03:28.028 --> 1:03:28.760
1914
+ spoken.
1915
+
1916
+ 1:03:31.611 --> 1:03:43.544
1917
+ More recently, instead of doing the traditional
1918
+ signal processing, you can also replace that
1919
+
1920
+ 1:03:43.544 --> 1:03:45.853
1921
+ by deep learning.
1922
+
1923
+ 1:03:46.126 --> 1:03:56.406
1924
+ So that we are using a self-supervised approach
1925
+ from language model to generate features that
1926
+
1927
+ 1:03:56.406 --> 1:03:58.047
1928
+ describe what.
1929
+
1930
+ 1:03:58.358 --> 1:03:59.821
1931
+ So you have your.
1932
+
1933
+ 1:03:59.759 --> 1:04:07.392
1934
+ All your signal again, and then for each chunk
1935
+ to do your convolutional neural networks to
1936
+
1937
+ 1:04:07.392 --> 1:04:07.811
1938
+ get.
1939
+
1940
+ 1:04:07.807 --> 1:04:23.699
1941
+ First representation here is a transformer
1942
+ network here, and in the end it's similar to
1943
+
1944
+ 1:04:23.699 --> 1:04:25.866
1945
+ a language.
1946
+
1947
+ 1:04:25.705 --> 1:04:30.238
1948
+ And you try to predict what was masked
1949
+ here.
1950
+
1951
+ 1:04:30.670 --> 1:04:40.806
1952
+ So that is in a way similar that you also
1953
+ try to learn a good representation of all these
1954
+
1955
+ 1:04:40.806 --> 1:04:51.281
1956
+ audio signals by predicting: And then you don't
1957
+ do the signal processing base, but have this
1958
+
1959
+ 1:04:51.281 --> 1:04:52.745
1960
+ way to make.
1961
+
1962
+ 1:04:52.812 --> 1:04:59.430
1963
+ But in all the things that you have to remember
1964
+ what is most important for you, and to end
1965
+
1966
+ 1:04:59.430 --> 1:05:05.902
1967
+ system is, of course, that you in the end get
1968
+ for every minute ten milliseconds, you get
1969
+
1970
+ 1:05:05.902 --> 1:05:11.283
1971
+ a representation of this audio signal, which
1972
+ is again a vector, and that.
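A sketch of the self-supervised alternative, using a pretrained wav2vec 2.0 checkpoint as an example feature extractor; the model name and the silent dummy input are illustrative only:

```python
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

raw_speech = [0.0] * 16000                          # placeholder: 1 s of audio at 16 kHz
inputs = extractor(raw_speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    frames = model(**inputs).last_hidden_state      # (1, ~49 frames, 768): one vector per ~20 ms
```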
1973
+
1974
+ 1:05:11.331 --> 1:05:15.365
1975
+ And then you can use your normal encoder to
1976
+ code your model to do this research.
1977
+
1978
+ 1:05:21.861 --> 1:05:32.694
1979
+ So that is all which directly has to be changed,
1980
+ and then you can build your first base.
1981
+
1982
+ 1:05:33.213 --> 1:05:37.167
1983
+ You do the audio processing.
1984
+
1985
+ 1:05:37.031 --> 1:05:49.153
1986
+ You of course need data which is like Audio
1987
+ and English and Text in German and then you
1988
+
1989
+ 1:05:49.153 --> 1:05:50.668
1990
+ can train.
1991
+
1992
+ 1:05:53.333 --> 1:05:57.854
1993
+ And interestingly, it works at the beginning.
1994
+
1995
+ 1:05:57.756 --> 1:06:03.263
1996
+ The systems were maybe a bit worse, but we
1997
+ saw really.
1998
+
1999
+ 1:06:03.964 --> 1:06:11.803
2000
+ This is like from the biggest workshop where
2001
+ people like compared different systems.
2002
+
2003
+ 1:06:11.751 --> 1:06:17.795
2004
+ Special challenge on comparing Cascaded to
2005
+ end to end systems and you see two thousand
2006
+
2007
+ 1:06:17.795 --> 1:06:18.767
2008
+ and eighteen.
2009
+
2010
+ 1:06:18.698 --> 1:06:25.068
2011
+ We had quite a huge gap between the Cascaded
2012
+ and end to end systems and then it got nearer
2013
+
2014
+ 1:06:25.068 --> 1:06:27.937
2015
+ and earlier in starting in two thousand.
2016
+
2017
+ 1:06:27.907 --> 1:06:33.619
2018
+ Twenty the performance was mainly the same,
2019
+ so there was no clear difference anymore.
2020
+
2021
+ 1:06:34.014 --> 1:06:42.774
2022
+ So this is, of course, writing a bit of hope
2023
+ saying if we better learn how to build these
2024
+
2025
+ 1:06:42.774 --> 1:06:47.544
2026
+ end-to-end systems, they might really perform better.
2027
+
2028
+ 1:06:49.549 --> 1:06:52.346
2029
+ However, a bit.
2030
+
2031
+ 1:06:52.452 --> 1:06:59.018
2032
+ This satisfying this is how this all continues,
2033
+ and this is not only in two thousand and twenty
2034
+
2035
+ 1:06:59.018 --> 1:07:04.216
2036
+ one, but even nowadays we can say there is
2037
+ no clear performance difference.
2038
+
2039
+ 1:07:04.148 --> 1:07:10.919
2040
+ It's not like the one model is better than
2041
+ the other, but we are seeing very similar performance.
2042
+
2043
+ 1:07:11.391 --> 1:07:19.413
2044
+ So the question is what is the difference?
2045
+
2046
+ 1:07:19.227 --> 1:07:29.119
2047
+ Of course, this can only be achieved by new
2048
+ tricks.
2049
+
2050
+ 1:07:30.570 --> 1:07:35.658
2051
+ Yes and no, that's what we will mainly look
2052
+ into now.
2053
+
2054
+ 1:07:35.564 --> 1:07:39.335
2055
+ How can we make use of other types of.
2056
+
2057
+ 1:07:39.359 --> 1:07:53.236
2058
+ In that case you can achieve some performance
2059
+ by using different types of training so you
2060
+
2061
+ 1:07:53.236 --> 1:07:55.549
2062
+ can also make.
2063
+
2064
+ 1:07:55.855 --> 1:08:04.961
2065
+ So if you are training or preparing the systems
2066
+ only on very small corpora where you have as
2067
+
2068
+ 1:08:04.961 --> 1:08:10.248
2069
+ much data than you have for the individual
2070
+ ones then.
2071
+
2072
+ 1:08:10.550 --> 1:08:22.288
2073
+ So that is the biggest challenge of an end
2074
+ system that you have small corpora and therefore.
2075
+
2076
+ 1:08:24.404 --> 1:08:30.479
2077
+ Of course, there is several advantages so
2078
+ you can give access to the audio information.
2079
+
2080
+ 1:08:30.750 --> 1:08:42.046
2081
+ So that's, for example, interesting if you
2082
+ think about it, you might not have modeled
2083
+
2084
+ 1:08:42.046 --> 1:08:45.198
2085
+ everything in the text.
2086
+
2087
+ 1:08:45.067 --> 1:08:50.324
2088
+ So remember when we talk about biases.
2089
+
2090
+ 1:08:50.230 --> 1:08:55.448
2091
+ Male or female, and that of course is not
2092
+ in the text any more, but in the audio signal
2093
+
2094
+ 1:08:55.448 --> 1:08:56.515
2095
+ it's still there.
2096
+
2097
+ 1:08:58.078 --> 1:09:03.108
2098
+ It also allows you to talk about that on Thursday
2099
+ when you talk about latency.
2100
+
2101
+ 1:09:03.044 --> 1:09:08.871
2102
+ You have a bit better chance if you do an
2103
+ end to end system to get a lower latency because
2104
+
2105
+ 1:09:08.871 --> 1:09:14.378
2106
+ you only have one system and you don't have
2107
+ two systems which might have to wait for.
2108
+
2109
+ 1:09:14.934 --> 1:09:20.046
2110
+ And having one system might be also a bit
2111
+ easier management.
2112
+
2113
+ 1:09:19.962 --> 1:09:23.149
2114
+ See that two systems work and so on.
2115
+
2116
+ 1:09:26.346 --> 1:09:41.149
2117
+ The biggest challenge of end systems is the
2118
+ data, so as you correctly pointed out, typically
2119
+
2120
+ 1:09:41.149 --> 1:09:42.741
2121
+ there is.
2122
+
2123
+ 1:09:43.123 --> 1:09:45.829
2124
+ There is some data for Ted.
2125
+
2126
+ 1:09:45.733 --> 1:09:47.399
2127
+ People did that.
2128
+
2129
+ 1:09:47.301 --> 1:09:52.792
2130
+ They took the English audio with all the translations.
2131
+
2132
+ 1:09:53.273 --> 1:10:02.423
2133
+ But in general there is a lot less, so we'll
2134
+ look into how you can use other data sources.
2135
+
2136
+ 1:10:05.305 --> 1:10:10.934
2137
+ Audio TranslationAnd secondly, the second
2138
+ challenge is that we have to deal with audio.
2139
+
2140
+ 1:10:11.431 --> 1:10:22.163
2141
+ For example, in input length, and therefore
2142
+ it's also important to handle this in your
2143
+
2144
+ 1:10:22.163 --> 1:10:27.590
2145
+ network and maybe have dedicated solutions.
2146
+
2147
+ 1:10:31.831 --> 1:10:40.265
2148
+ So in general we have this challenge that
2149
+ we have a lot of text and translation and audio
2150
+
2151
+ 1:10:40.265 --> 1:10:43.076
2152
+ transcript data by quite few.
2153
+
2154
+ 1:10:43.643 --> 1:10:50.844
2155
+ So what can we do in one trick?
2156
+
2157
+ 1:10:50.619 --> 1:11:00.750
2158
+ You already know a bit from other research.
2159
+
2160
+ 1:11:02.302 --> 1:11:14.325
2161
+ Exactly so what you can do is you can, for
2162
+ example, use a text-to-speech system to generate
2163
+
2164
+ 1:11:14.325 --> 1:11:19.594
2165
+ an audio of the source language, and then.
2166
+
2167
+ 1:11:21.341 --> 1:11:33.780
2168
+ There has been a bit motivated by what we
2169
+ have seen in back translation, which was very
2170
+
2171
+ 1:11:33.780 --> 1:11:35.476
2172
+ successful.
2173
+
2174
+ 1:11:38.758 --> 1:11:54.080
2175
+ However, it's a bit more challenging because
2176
+ it is often very different from real audio.
2177
+
2178
+ 1:11:54.314 --> 1:12:07.131
2179
+ So often if you build a system only trained
2180
+ on synthetic audio, then generalizing to real audio data
2181
+
2182
+ 1:12:07.131 --> 1:12:10.335
2183
+ is quite challenging.
2184
+
2185
+ 1:12:10.910 --> 1:12:20.927
2186
+ And therefore here the synthetic data generation
2187
+ is significantly more challenging than when.
2188
+
2189
+ 1:12:20.981 --> 1:12:27.071
2190
+ Because if you read a text, it's maybe bad
2191
+ translation.
2192
+
2193
+ 1:12:26.962 --> 1:12:33.163
2194
+ It's hard, but it's a real text or a text
2195
+ generated by.
2196
+
2197
+ 1:12:35.835 --> 1:12:42.885
2198
+ But it's a valid solution, and for example
2199
+ we use that also for say current systems.
2200
+
2201
+ 1:12:43.923 --> 1:12:53.336
2202
+ Of course you can also do a bit of forward
2203
+ translation that is done so that you take data.
2204
+
2205
+ 1:12:53.773 --> 1:13:02.587
2206
+ But then the problem is that your reference
2207
+ is not always correct, and you remember when
2208
+
2209
+ 1:13:02.587 --> 1:13:08.727
2210
+ we talked about back translation, it's a bit
2211
+ of an advantage.
2212
+
2213
+ 1:13:09.229 --> 1:13:11.930
2214
+ But both can be done and both have been done.
2215
+
2216
+ 1:13:12.212 --> 1:13:20.277
2217
+ So you can think about this picture again.
2218
+
2219
+ 1:13:20.089 --> 1:13:30.221
2220
+ You can take this data and generate the audio
2221
+ to it.
2222
+
2223
+ 1:13:30.750 --> 1:13:39.007
2224
+ However, it is only synthetic of what can
2225
+ be used for the voice handling technology for:
2226
+
2227
+ 1:13:39.007 --> 1:13:47.078
2228
+ But you have not, I mean, yet you get text
2229
+ to speech, but the voice cloning would need
2230
+
2231
+ 1:13:47.078 --> 1:13:47.913
2232
+ a voice.
2233
+
2234
+ 1:13:47.821 --> 1:13:53.113
2235
+ You can use, of course, and then it's nothing
2236
+ else than a normal.
2237
+
2238
+ 1:13:54.594 --> 1:14:03.210
2239
+ But still think there are better than both,
2240
+ but there are some characteristics of that
2241
+
2242
+ 1:14:03.210 --> 1:14:05.784
2243
+ which is quite different.
2244
+
2245
+ 1:14:07.327 --> 1:14:09.341
2246
+ But yeah, it's getting better.
2247
+
2248
+ 1:14:09.276 --> 1:14:13.499
2249
+ That is definitely true, and then this might
2250
+ get more and more.
2251
+
2252
+ 1:14:16.596 --> 1:14:21.885
2253
+ Here make sure it's a good person and our
2254
+ own systems because we try to train and.
2255
+
2256
+ 1:14:21.881 --> 1:14:24.356
2257
+ And it's like a feedback mood.
2258
+
2259
+ 1:14:24.277 --> 1:14:28.669
2260
+ There's anything like the Dutch English model
2261
+ that's.
2262
+
2263
+ 1:14:28.648 --> 1:14:33.081
2264
+ Yeah, you of course need a decent amount of
2265
+ real data.
2266
+
2267
+ 1:14:33.001 --> 1:14:40.228
2268
+ But I mean, as I said, so there is always
2269
+ an advantage if you have this synthetics thing
2270
+
2271
+ 1:14:40.228 --> 1:14:44.045
2272
+ only on the input side and not on the outside.
2273
+
2274
+ 1:14:44.464 --> 1:14:47.444
2275
+ That you at least always generate correct
2276
+ outcomes.
2277
+
2278
+ 1:14:48.688 --> 1:14:54.599
2279
+ That's different in a language case because
2280
+ they have input and the output and it's not
2281
+
2282
+ 1:14:54.599 --> 1:14:55.002
2283
+ like.
2284
+
2285
+ 1:14:58.618 --> 1:15:15.815
2286
+ The other idea is to integrate additional
2287
+ sources so you can have more model sharing.
2288
+
2289
+ 1:15:16.376 --> 1:15:23.301
2290
+ But you can use these components also in the
2291
+ system.
2292
+
2293
+ 1:15:23.171 --> 1:15:28.662
2294
+ Typically the text decoder and the text.
2295
+
2296
+ 1:15:29.169 --> 1:15:41.845
2297
+ And so the other way of languaging is to join
2298
+ a train or somehow train all these tasks.
2299
+
2300
+ 1:15:43.403 --> 1:15:54.467
2301
+ The first and easy thing to do is multi task
2302
+ training so the idea is you take these components
2303
+
2304
+ 1:15:54.467 --> 1:16:02.038
2305
+ and train these two components and train the
2306
+ speech translation.
2307
+
2308
+ 1:16:02.362 --> 1:16:13.086
2309
+ So then, for example, all your encoders used
2310
+ by the speech translation system can also gain
2311
+
2312
+ 1:16:13.086 --> 1:16:14.951
2313
+ from the large.
2314
+
2315
+ 1:16:14.975 --> 1:16:24.048
2316
+ So everything can gain a bit of emphasis,
2317
+ but it can partly gain in there quite a bit.
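A schematic sketch of such multi-task training with a shared speech encoder and shared text decoder; all modules, batch fields, and the `.loss()` helpers are placeholders, not a specific framework's API:

```python
def multi_task_loss(batch, speech_enc, text_enc, asr_dec, mt_dec):
    """Each *_enc/*_dec is a placeholder module exposing a .loss(encoded, reference) helper."""
    loss = 0.0
    if "st" in batch:    # speech translation: audio -> target text (speech encoder + MT decoder)
        loss += mt_dec.loss(speech_enc(batch["st"]["audio"]), batch["st"]["tgt_text"])
    if "asr" in batch:   # ASR: audio -> source text (shares the speech encoder)
        loss += asr_dec.loss(speech_enc(batch["asr"]["audio"]), batch["asr"]["src_text"])
    if "mt" in batch:    # text MT: source text -> target text (shares the MT decoder)
        loss += mt_dec.loss(text_enc(batch["mt"]["src_text"]), batch["mt"]["tgt_text"])
    return loss
```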
2318
+
2319
+ 1:16:27.407 --> 1:16:39.920
2320
+ The other idea is to do it in a pre-training
2321
+ phase.
2322
+
2323
+ 1:16:40.080 --> 1:16:50.414
2324
+ And then you take the end coder and the text
2325
+ decoder and trade your model on that.
2326
+
2327
+ 1:16:54.774 --> 1:17:04.895
2328
+ Finally, there is also what is referred to
2329
+ as knowledge distillation, so there you have
2330
+
2331
+ 1:17:04.895 --> 1:17:11.566
2332
+ to remember if you learn from a probability
2333
+ distribution.
2334
+
2335
+ 1:17:11.771 --> 1:17:24.371
2336
+ So what you can do then is you have your system
2337
+ and if you then have your audio and text input
2338
+
2339
+ 1:17:24.371 --> 1:17:26.759
2340
+ you can use your.
2341
+
2342
+ 1:17:27.087 --> 1:17:32.699
2343
+ And then get a more rich signal that you'll
2344
+ not only know this is the word, but you have
2345
+
2346
+ 1:17:32.699 --> 1:17:33.456
2347
+ a complete.
2348
+
2349
+ 1:17:34.394 --> 1:17:41.979
2350
+ Example is typically also done because, of
2351
+ course, if you have speech translation data, it is often the case
2352
+
2353
+ 1:17:41.979 --> 1:17:49.735
2354
+ that you don't only have source language audio
2355
+ and target language text, but then you also
2356
+
2357
+ 1:17:49.735 --> 1:17:52.377
2358
+ have the source language text.
2359
+
2360
+ 1:17:53.833 --> 1:18:00.996
2361
+ Get a good idea of the text editor and the
2362
+ artist design.
2363
+
2364
+ 1:18:00.872 --> 1:18:16.051
2365
+ Now have to be aligned so that: Otherwise
2366
+ they wouldn't be able to determine which degree
2367
+
2368
+ 1:18:16.051 --> 1:18:17.906
2369
+ they'd be.
2370
+
2371
+ 1:18:18.178 --> 1:18:25.603
2372
+ What you're doing in knowledge distillation
2373
+ is you run your MT and then you get your probability
2374
+
2375
+ 1:18:25.603 --> 1:18:32.716
2376
+ distribution for all the words and you use
2377
+ that to train and that is not only more helpful
2378
+
2379
+ 1:18:32.716 --> 1:18:34.592
2380
+ than only getting back.
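A minimal sketch of such a distillation loss, where the teacher distribution comes from the text MT model run on the source transcript and the student is the end-to-end speech translation model:

```python
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, temperature=1.0):
    """Both tensors: (batch, seq_len, vocab). Teacher = MT output distribution per target word."""
    t_probs = F.softmax(teacher_logits / temperature, dim=-1)
    s_logp = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(s_logp, t_probs, reduction="batchmean")
```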
2381
+
2382
+ 1:18:35.915 --> 1:18:44.427
2383
+ You can, of course, use the same decoder to
2384
+ be even similar.
2385
+
2386
+ 1:18:44.287 --> 1:18:49.732
2387
+ Otherwise you don't have exactly the.
2388
+
2389
+ 1:18:52.832 --> 1:19:03.515
2390
+ Is a good point making these tools, and generally
2391
+ in all these cases it's good to have more similar
2392
+
2393
+ 1:19:03.515 --> 1:19:05.331
2394
+ representations.
2395
+
2396
+ 1:19:05.224 --> 1:19:07.260
2397
+ You can transfer.
2398
+
2399
+ 1:19:07.607 --> 1:19:23.743
2400
+ If you hear your representation to give from
2401
+ the audio encoder and the text encoder are
2402
+
2403
+ 1:19:23.743 --> 1:19:27.410
2404
+ more similar, then.
2405
+
2406
+ 1:19:30.130 --> 1:19:39.980
2407
+ So here you have your text encoder in the
2408
+ target language and you can train it on large
2409
+
2410
+ 1:19:39.980 --> 1:19:40.652
2411
+ data.
2412
+
2413
+ 1:19:41.341 --> 1:19:45.994
2414
+ But of course you want to benefit also for
2415
+ this task because that's what your most interested.
2416
+
2417
+ 1:19:46.846 --> 1:19:59.665
2418
+ Of course, the most benefit for this task
2419
+ is if these two representations you give are
2420
+
2421
+ 1:19:59.665 --> 1:20:01.728
2422
+ more similar.
2423
+
2424
+ 1:20:02.222 --> 1:20:11.631
2425
+ Therefore, it's interesting to look into how
2426
+ can we make these two representations as similar
2427
+
2428
+ 1:20:11.631 --> 1:20:21.141
2429
+ as: The hope is that in the end you can't even
2430
+ do something like zero shot transfer, but while
2431
+
2432
+ 1:20:21.141 --> 1:20:25.945
2433
+ you only learn this one you can also deal with.
2434
+
2435
+ 1:20:30.830 --> 1:20:40.257
2436
+ So what you can do is you can look at these
2437
+ two representations.
2438
+
2439
+ 1:20:40.112 --> 1:20:42.876
2440
+ So once the text.
2441
+
2442
+ 1:20:43.003 --> 1:20:51.184
2443
+ And you can either put them into the text
2444
+ decoder to the encoder.
2445
+
2446
+ 1:20:51.060 --> 1:20:53.487
2447
+ We have seen both.
2448
+
2449
+ 1:20:53.359 --> 1:21:03.617
2450
+ You can think: If you want to build an A's
2451
+ and to insist on you can either take the audio
2452
+
2453
+ 1:21:03.617 --> 1:21:06.580
2454
+ encoder and see how deep.
2455
+
2456
+ 1:21:08.748 --> 1:21:21.915
2457
+ However, you have these two representations
2458
+ and you want to make them more similar.
2459
+
2460
+ 1:21:21.759 --> 1:21:23.655
2461
+ One thing.
2462
+
2463
+ 1:21:23.863 --> 1:21:32.797
2464
+ Here we have, like you said, for every ten
2465
+ milliseconds we have a representation.
2466
+
2467
+ 1:21:35.335 --> 1:21:45.763
2468
+ So what people may have done, for example,
2469
+ is to remove redundant information so you can:
2470
+
2471
+ 1:21:45.763 --> 1:21:56.308
2472
+ So you can use your system to put India based
2473
+ on letter or words and then average over the
2474
+
2475
+ 1:21:56.308 --> 1:21:58.394
2476
+ words or letters.
2477
+
2478
+ 1:21:59.179 --> 1:22:07.965
2479
+ So that the number of representations from
2480
+ the encoder is the same as you would get from.
2481
+
2482
+ 1:22:12.692 --> 1:22:20.919
2483
+ Okay, that much to data do have any more questions
2484
+ first about that.
2485
+
2486
+ 1:22:27.207 --> 1:22:39.507
2487
+ Then we'll finish with the audio processing
2488
+ and highlight a bit why this is challenging,
2489
+
2490
+ 1:22:39.507 --> 1:22:52.864
2491
+ so here's: One test here has one thousand eight
2492
+ hundred sentences, so there are words or characters.
2493
+
2494
+ 1:22:53.954 --> 1:22:59.336
2495
+ If you look how many all your features, so
2496
+ how many samples there is like one point five
2497
+
2498
+ 1:22:59.336 --> 1:22:59.880
2499
+ million.
2500
+
2501
+ 1:23:00.200 --> 1:23:10.681
2502
+ So you have ten times more audio features than you
2503
+ have characters, and then again five times
2504
+
2505
+ 1:23:10.681 --> 1:23:11.413
2506
+ more.
2507
+
2508
+ 1:23:11.811 --> 1:23:23.934
2509
+ So you have the sequence length of the audio
2510
+ as long as you have for words, and that is
2511
+
2512
+ 1:23:23.934 --> 1:23:25.788
2513
+ a challenge.
2514
+
2515
+ 1:23:26.086 --> 1:23:34.935
2516
+ So the question is what can you do to make
2517
+ the sequence a bit shorter and not have this?
2518
+
2519
+ 1:23:38.458 --> 1:23:48.466
2520
+ The one thing is you can try to reduce the
2521
+ dimensionality in your encoder.
2522
+
2523
+ 1:23:48.343 --> 1:23:50.821
2524
+ There's different.
2525
+
2526
+ 1:23:50.991 --> 1:24:04.302
2527
+ So, for example, you can just sum up always
2528
+ over some frames or you can do a concatenation.
2529
+
2530
+ 1:24:04.804 --> 1:24:12.045
2531
+ Or you do a linear projection, or you even take
2532
+ not every feature but only every fifth or something?
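A minimal sketch of one such length reduction, averaging every k consecutive frames; k = 4 is an arbitrary choice for illustration:

```python
import torch

def downsample(features: torch.Tensor, k: int = 4) -> torch.Tensor:
    """Average every k consecutive frames: (batch, time, dim) -> (batch, time // k, dim)."""
    b, t, d = features.shape
    t = (t // k) * k                                    # drop a possible remainder
    return features[:, :t].reshape(b, t // k, k, d).mean(dim=2)
```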
2533
+
2534
+ 1:24:12.492 --> 1:24:23.660
2535
+ So this way you can very easily reduce your
2536
+ number of features in there, and there has
2537
+
2538
+ 1:24:23.660 --> 1:24:25.713
2539
+ been different.
2540
+
2541
+ 1:24:26.306 --> 1:24:38.310
2542
+ There's also what you can do with things like
2543
+ a convolutional layer.
2544
+
2545
+ 1:24:38.136 --> 1:24:43.883
2546
+ If you skip over what you can,.
2547
+
2548
+ 1:24:47.327 --> 1:24:55.539
2549
+ And then, in addition to the audio, the other
2550
+ problem is higher variability.
2551
+
2552
+ 1:24:55.432 --> 1:25:04.641
2553
+ So if you have a text you can: But there are
2554
+ very different ways of saying that you can
2555
+
2556
+ 1:25:04.641 --> 1:25:09.874
2557
+ distinguish whether say a sentence or your
2558
+ voice.
2559
+
2560
+ 1:25:10.510 --> 1:25:21.224
2561
+ That of course makes it more challenging because
2562
+ now you get different inputs and while they
2563
+
2564
+ 1:25:21.224 --> 1:25:22.837
2565
+ were in text.
2566
+
2567
+ 1:25:23.263 --> 1:25:32.360
2568
+ So that makes especially for limited data
2569
+ things more challenging and you want to somehow
2570
+
2571
+ 1:25:32.360 --> 1:25:35.796
2572
+ learn that this is not important.
2573
+
2574
+ 1:25:36.076 --> 1:25:39.944
2575
+ So there is the idea again okay.
2576
+
2577
+ 1:25:39.827 --> 1:25:47.566
2578
+ Can we doing some type of data augmentation
2579
+ to better deal with?
2580
+
2581
+ 1:25:48.908 --> 1:25:55.735
2582
+ And again people can mainly use what has been
2583
+ done in and try to do the same things.
2584
+
2585
+ 1:25:56.276 --> 1:26:02.937
2586
+ You can try to do a bit of noise and speech
2587
+ perturbation so playing the audio like slower
2588
+
2589
+ 1:26:02.937 --> 1:26:08.563
2590
+ and a bit faster to get more samples then and
2591
+ you can train on all of them.
2592
+
2593
+ 1:26:08.489 --> 1:26:14.929
2594
+ What is very important and very successful
2595
+ recently is what is called SpecAugment.
2596
+
2597
+ 1:26:15.235 --> 1:26:25.882
2598
+ The idea is that you directly work on all
2599
+ your features and you can try to mask them
2600
+
2601
+ 1:26:25.882 --> 1:26:29.014
2602
+ and that gives you more.
2603
+
2604
+ 1:26:29.469 --> 1:26:41.717
2605
+ What do they mean with masking so this is
2606
+ your audio feature and then there is different?
2607
+
2608
+ 1:26:41.962 --> 1:26:47.252
2609
+ You can do what is referred to as mask and
2610
+ a time masking.
2611
+
2612
+ 1:26:47.162 --> 1:26:50.482
2613
+ That means you just set some masks.
2614
+
2615
+ 1:26:50.730 --> 1:26:58.003
2616
+ And since then you should be still able to
2617
+ deal with it because you can normally.
2618
+
2619
+ 1:26:57.937 --> 1:27:05.840
2620
+ Also without that you are getting more robust
2621
+ and not and you can handle that because then
2622
+
2623
+ 1:27:05.840 --> 1:27:10.877
2624
+ many symbols which have different time look
2625
+ more similar.
2626
+
2627
+ 1:27:11.931 --> 1:27:22.719
2628
+ You are not only doing that for time masking
2629
+ but also for frequency masking so that if you
2630
+
2631
+ 1:27:22.719 --> 1:27:30.188
2632
+ have here the frequency channels you mask a
2633
+ frequency channel.
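A toy version of this time and frequency masking on a (time, frequency) feature matrix; the mask widths are illustrative, not the values from the SpecAugment paper or the lecture:

```python
import torch

def spec_augment(feats: torch.Tensor, max_t: int = 20, max_f: int = 8) -> torch.Tensor:
    """feats: (time, freq) matrix; zero out one random time block and one frequency block."""
    feats = feats.clone()
    t_len, f_len = feats.shape
    t0 = int(torch.randint(0, max(1, t_len - max_t), (1,)))
    f0 = int(torch.randint(0, max(1, f_len - max_f), (1,)))
    feats[t0:t0 + int(torch.randint(1, max_t + 1, (1,))), :] = 0.0   # time mask
    feats[:, f0:f0 + int(torch.randint(1, max_f + 1, (1,)))] = 0.0   # frequency mask
    return feats
```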
2634
+
2635
+ 1:27:30.090 --> 1:27:33.089
2636
+ Thereby being able to better recognize these
2637
+ things.
2638
+
2639
+ 1:27:35.695 --> 1:27:43.698
2640
+ This we have had an overview of the two main
2641
+ approaches for speech translation that is on
2642
+
2643
+ 1:27:43.698 --> 1:27:51.523
2644
+ the one hand cascaded speech translation and
2645
+ on the other hand we talked about end-to-end
2646
+
2647
+ 1:27:51.523 --> 1:27:53.302
2648
+ speech translation.
2649
+
2650
+ 1:27:53.273 --> 1:28:02.080
2651
+ It's like how to combine things and how they
2652
+ work together, and for end-to-end speech translation.
2653
+
2654
+ 1:28:02.362 --> 1:28:06.581
2655
+ Here was data challenges and a bit about long
2656
+ sequences.
2657
+
2658
+ 1:28:07.747 --> 1:28:09.304
2659
+ We have any more questions.
2660
+
2661
+ 1:28:11.451 --> 1:28:19.974
2662
+ Can you really describe the change in cascading
2663
+ from translation to text to speech because
2664
+
2665
+ 1:28:19.974 --> 1:28:22.315
2666
+ thought the translation.
2667
+
2668
+ 1:28:25.745 --> 1:28:30.201
2669
+ Yes, so mean that works again the easiest
2670
+ thing.
2671
+
2672
+ 1:28:30.111 --> 1:28:32.954
2673
+ What of course is challenging?
2674
+
2675
+ 1:28:32.863 --> 1:28:40.753
2676
+ What can be challenging is how to make that
2677
+ more lively and like that pronunciation?
2678
+
2679
+ 1:28:40.680 --> 1:28:47.369
2680
+ And yeah, which things are put more important,
2681
+ how to put things like that into.
2682
+
2683
+ 1:28:47.627 --> 1:28:53.866
2684
+ In the normal text, otherwise it would sound
2685
+ very monotone.
2686
+
2687
+ 1:28:53.762 --> 1:28:57.404
2688
+ You want to add this information.
2689
+
2690
+ 1:28:58.498 --> 1:29:02.656
2691
+ That is maybe one thing to make it a bit more
2692
+ emotional.
2693
+
2694
+ 1:29:02.583 --> 1:29:04.920
2695
+ That is maybe one thing which.
2696
+
2697
+ 1:29:05.305 --> 1:29:13.448
2698
+ But you are right there and out of the box.
2699
+
2700
+ 1:29:13.263 --> 1:29:20.670
2701
+ If you have everything works decently.
2702
+
2703
+ 1:29:20.800 --> 1:29:30.507
2704
+ Still, especially if you have a very monotone
2705
+ voice, so think these are quite some open challenges.
2706
+
2707
+ 1:29:30.750 --> 1:29:35.898
2708
+ Maybe another open challenge is that it's
2709
+ not so much for the end product, but for the
2710
+
2711
+ 1:29:35.898 --> 1:29:37.732
2712
+ development is very important.
2713
+
2714
+ 1:29:37.673 --> 1:29:40.100
2715
+ It's very hard to evaluate the quality.
2716
+
2717
+ 1:29:40.740 --> 1:29:48.143
2718
+ So you cannot doubt that there is a way about
2719
+ most systems are currently evaluated by human
2720
+
2721
+ 1:29:48.143 --> 1:29:49.109
2722
+ evaluation.
2723
+
2724
+ 1:29:49.589 --> 1:29:54.474
2725
+ So you cannot try hundreds of things and run
2726
+ your BLEU score and get this score.
2727
+
2728
+ 1:29:54.975 --> 1:30:00.609
2729
+ So therefore no means very important to have
2730
+ some type of evaluation metric and that is
2731
+
2732
+ 1:30:00.609 --> 1:30:01.825
2733
+ quite challenging.
2734
+
2735
+ 1:30:08.768 --> 1:30:15.550
2736
+ And thanks for listening, and we'll have the
2737
+ second part of speech translation on Thursday.
2738
+
demo_data/lectures/Lecture-18-18.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7158cf58687ceeb69cae55cb9786cecc77ea95e9afcc0b29251b8b9cfe54cdb5
3
+ size 125329284
demo_data/lectures/Lecture-19-21.07.2023/English.vtt ADDED
@@ -0,0 +1,2860 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.121 --> 0:00:12.579
4
+ IntroductionOkay, so welcome to today's lecture,
5
+ on Tuesday we started to talk about speech
6
+
7
+ 0:00:12.579 --> 0:00:14.198
8
+ translation.
9
+
10
+ 0:00:14.634 --> 0:00:27.037
11
+ And the idea is hopefully an idea of the basic
12
+ ideas we have in speech translation, the two
13
+
14
+ 0:00:27.037 --> 0:00:29.464
15
+ major approaches.
16
+
17
+ 0:00:29.829 --> 0:00:41.459
18
+ And the other one is the end system where
19
+ we have one large system which is everything
20
+
21
+ 0:00:41.459 --> 0:00:42.796
22
+ together.
23
+
24
+ 0:00:43.643 --> 0:00:58.459
25
+ Until now we mainly focus on text output that
26
+ we'll see today, but you can extend these ideas
27
+
28
+ 0:00:58.459 --> 0:01:01.138
29
+ to other speech.
30
+
31
+ 0:01:01.441 --> 0:01:08.592
32
+ But since it's also like a machine translation
33
+ lecture, you of course mainly focus a bit on
34
+
35
+ 0:01:08.592 --> 0:01:10.768
36
+ the translation challenges.
37
+
38
+ 0:01:12.172 --> 0:01:25.045
39
+ And what is the main focus of today's lecture
40
+ is to look into why that is challenging speech
41
+
42
+ 0:01:25.045 --> 0:01:26.845
43
+ translation.
44
+
45
+ 0:01:27.627 --> 0:01:33.901
46
+ So a bit more focus on what is now really
47
+ the difference to all you and how we can address.
48
+
49
+ 0:01:34.254 --> 0:01:39.683
50
+ SegmentationWe'll start there by with the
51
+ segmentation problem.
52
+
53
+ 0:01:39.598 --> 0:01:45.992
54
+ We had that already of bits, but especially
55
+ for end-to-end.
56
+
57
+ 0:01:46.386 --> 0:01:57.253
58
+ So the problem is that until now it was easy
59
+ to segment the input into sentences and then
60
+
61
+ 0:01:57.253 --> 0:02:01.842
62
+ translate each sentence individually.
63
+
64
+ 0:02:02.442 --> 0:02:17.561
65
+ When you're now translating audio, the challenge
66
+ is that you have just a sequence of audio input
67
+
68
+ 0:02:17.561 --> 0:02:20.055
69
+ and there's no.
70
+
71
+ 0:02:21.401 --> 0:02:27.834
72
+ So you have this difference that your audio
73
+ is a continuous stream, but the text is typically
74
+
75
+ 0:02:27.834 --> 0:02:28.930
76
+ sentence based.
77
+
78
+ 0:02:28.861 --> 0:02:31.614
79
+ So how can you match this gap in there?
80
+
81
+ 0:02:31.545 --> 0:02:37.645
82
+ We'll see that is really essential, and if
83
+ you're not using a decent good system there,
84
+
85
+ 0:02:37.645 --> 0:02:41.250
86
+ then you can lose a lot of quality and performance.
87
+
88
+ 0:02:41.641 --> 0:02:44.267
89
+ That is what also meant before.
90
+
91
+ 0:02:44.185 --> 0:02:51.702
92
+ So if you have a more complex system out of
93
+ several units, it's really essential that they
94
+
95
+ 0:02:51.702 --> 0:02:56.659
96
+ all work together and it's very easy to lose
97
+ significantly.
98
+
99
+ 0:02:57.497 --> 0:03:13.029
100
+ The second challenge we'll talk about is disfluencies,
101
+ so the style of speaking is very different
102
+
103
+ 0:03:13.029 --> 0:03:14.773
104
+ from text.
105
+
106
+ 0:03:15.135 --> 0:03:24.727
107
+ So if you translate or TedTalks, that's normally
108
+ very good speakers.
109
+
110
+ 0:03:24.588 --> 0:03:30.152
111
+ They will give you a very fluent text.
112
+
113
+ 0:03:30.670 --> 0:03:36.692
114
+ When you want to translate a lecture, it might
115
+ be more difficult or rednested.
116
+
117
+ 0:03:37.097 --> 0:03:39.242
118
+ Mean people are not well that well.
119
+
120
+ 0:03:39.182 --> 0:03:42.282
121
+ They should be prepared in giving the lecture
122
+ and.
123
+
124
+ 0:03:42.362 --> 0:03:48.241
125
+ But it's not that I mean, typically a lecture
126
+ will have like rehearsal like five times before
127
+
128
+ 0:03:48.241 --> 0:03:52.682
129
+ he is giving this lecture, and then like will
130
+ it completely be fluent?
131
+
132
+ 0:03:52.619 --> 0:03:56.089
133
+ He might at some point notice all this is
134
+ not perfect.
135
+
136
+ 0:03:56.026 --> 0:04:00.064
137
+ I want to rephrase, and he'll have to think
138
+ during the lecture.
139
+
140
+ 0:04:00.300 --> 0:04:04.049
141
+ Might be also good that he's thinking, so
142
+ he's not going too fast and things like.
143
+
144
+ 0:04:05.305 --> 0:04:07.933
145
+ If you then go to the other extreme, it's
146
+ more meetings.
147
+
148
+ 0:04:08.208 --> 0:04:15.430
149
+ If you have a lively discussion, of course,
150
+ people will interrupt, they will restart, they
151
+
152
+ 0:04:15.430 --> 0:04:22.971
153
+ will think while they speak, and you know that
154
+ sometimes you tell people first think and speak
155
+
156
+ 0:04:22.971 --> 0:04:26.225
157
+ because they are changing their opinion.
158
+
159
+ 0:04:26.606 --> 0:04:31.346
160
+ So the question of how can you deal with this?
161
+
162
+ 0:04:31.245 --> 0:04:37.499
163
+ And there again it might be solutions for
164
+ that, or at least.
165
+
166
+ 0:04:39.759 --> 0:04:46.557
167
+ Then for the output we will look into simultaneous
168
+ translation that is at least not very important
169
+
170
+ 0:04:46.557 --> 0:04:47.175
171
+ in text.
172
+
173
+ 0:04:47.107 --> 0:04:53.696
174
+ There might be some cases but normally you
175
+ have all text available and then you're translating
176
+
177
+ 0:04:53.696 --> 0:04:54.043
178
+ and.
179
+
180
+ 0:04:54.394 --> 0:05:09.220
181
+ While for speech translation, since it's often
182
+ a life interaction, then of course it's important.
183
+
184
+ 0:05:09.149 --> 0:05:12.378
185
+ Otherwise it's hard to follow.
186
+
187
+ 0:05:12.274 --> 0:05:19.464
188
+ You see what said five minutes ago and the
189
+ slide is not as helpful.
190
+
191
+ 0:05:19.739 --> 0:05:35.627
192
+ You have to wait very long before you can
193
+ answer because you have to first wait for what
194
+
195
+ 0:05:35.627 --> 0:05:39.197
196
+ is happening there.
197
+
198
+ 0:05:40.660 --> 0:05:46.177
199
+ And finally, we can talk a bit about presentation.
200
+
201
+ 0:05:46.069 --> 0:05:54.724
202
+ For example, mentioned that if you're generating
203
+ subtitles, it's not possible.
204
+
205
+ 0:05:54.854 --> 0:06:01.110
206
+ So in professional subtitles there are clear
207
+ rules.
208
+
209
+ 0:06:00.989 --> 0:06:05.632
210
+ Subtitle has to be shown for seconds.
211
+
212
+ 0:06:05.510 --> 0:06:08.935
213
+ It's maximum of two lines.
214
+
215
+ 0:06:09.549 --> 0:06:13.156
216
+ Because otherwise it's getting too long, it's
217
+ not able to read it anymore, and so.
218
+
219
+ 0:06:13.613 --> 0:06:19.826
220
+ So if you want to achieve that, of course,
221
+ you might have to adjust and select what you
222
+
223
+ 0:06:19.826 --> 0:06:20.390
224
+ really.
225
+
226
+ 0:06:23.203 --> 0:06:28.393
227
+ The first date starts with the segmentation.
228
+
229
+ 0:06:28.277 --> 0:06:36.353
230
+ On the one end it's an issue while training,
231
+ on the other hand it's.
232
+
233
+ 0:06:38.678 --> 0:06:47.781
234
+ What is the problem so when we train it's
235
+ relatively easy to separate our data into sentence
236
+
237
+ 0:06:47.781 --> 0:06:48.466
238
+ level.
239
+
240
+ 0:06:48.808 --> 0:07:02.241
241
+ So if you have your example, you have the
242
+ audio and the text, then you typically know
243
+
244
+ 0:07:02.241 --> 0:07:07.083
245
+ that this sentence is aligned.
246
+
247
+ 0:07:07.627 --> 0:07:16.702
248
+ You can use these time information to cut
249
+ your audio and then you can train and then.
250
+
251
+ 0:07:18.018 --> 0:07:31.775
252
+ Because what we need for an end-to-end model
253
+ is to be an output chart, in this case an audio
254
+
255
+ 0:07:31.775 --> 0:07:32.822
256
+ chart.
257
+
258
+ 0:07:33.133 --> 0:07:38.551
259
+ And even if this is a long speech, it's easy
260
+ then since we have this time information to
261
+
262
+ 0:07:38.551 --> 0:07:39.159
263
+ separate.
264
+
265
+ 0:07:39.579 --> 0:07:43.866
266
+ But we are using therefore, of course, the
267
+ target side information.
268
+
269
+ 0:07:45.865 --> 0:07:47.949
270
+ The problem is now in runtime.
271
+
272
+ 0:07:47.881 --> 0:07:49.367
273
+ This is not possible.
274
+
275
+ 0:07:49.300 --> 0:07:55.273
276
+ Here we can do that based on the punctuation
277
+ marks and the sentence segmentation on the
278
+
279
+ 0:07:55.273 --> 0:07:57.921
280
+ target side because that is splitting.
281
+
282
+ 0:07:57.853 --> 0:08:02.131
283
+ But during transcript, during translation
284
+ it is not possible.
285
+
286
+ 0:08:02.442 --> 0:08:10.297
287
+ Because there is just a long audio signal,
288
+ and of course if you have your test data to
289
+
290
+ 0:08:10.297 --> 0:08:15.263
291
+ split it into: That has been done for some
292
+ experiments.
293
+
294
+ 0:08:15.173 --> 0:08:22.834
295
+ It's fine, but it's not a realistic scenario
296
+ because if you really apply it in real world,
297
+
298
+ 0:08:22.834 --> 0:08:25.949
299
+ we won't have a manual segmentation.
300
+
301
+ 0:08:26.266 --> 0:08:31.838
302
+ If a human has to do that then he can do the
303
+ translation so you want to have a full automatic
304
+
305
+ 0:08:31.838 --> 0:08:32.431
306
+ pipeline.
307
+
308
+ 0:08:32.993 --> 0:08:38.343
309
+ So the question is how can we deal with this
310
+ type of you know?
311
+
312
+ 0:09:09.309 --> 0:09:20.232
313
+ So the question is how can we deal with this
314
+ time of situation and how can we segment the
315
+
316
+ 0:09:20.232 --> 0:09:23.024
317
+ audio into some units?
318
+
319
+ 0:09:23.863 --> 0:09:32.079
320
+ And here is one further really big advantage
321
+ of a cascaded system: Because how is this done
322
+
323
+ 0:09:32.079 --> 0:09:34.336
324
+ in a cascade of systems?
325
+
326
+ 0:09:34.245 --> 0:09:38.484
327
+ We are splitting the audio with some features
328
+ we are doing.
329
+
330
+ 0:09:38.414 --> 0:09:42.042
331
+ We can use similar ones which we'll discuss
332
+ later.
333
+
334
+ 0:09:41.970 --> 0:09:43.840
335
+ Then we run the ASR.
336
+
337
+ 0:09:43.768 --> 0:09:48.801
338
+ We have the transcript and then we can do
339
+ what we talked last about.
340
+
341
+ 0:09:49.069 --> 0:10:02.260
342
+ So if you have this is an audio signal and
343
+ the training data it was good.
344
+
345
+ 0:10:02.822 --> 0:10:07.951
346
+ So here we have a big advantage.
347
+
348
+ 0:10:07.795 --> 0:10:16.758
349
+ We can use a different segmentation for the
350
+ and for the.
351
+
352
+ 0:10:16.601 --> 0:10:21.323
353
+ Why is that a big advantage?
354
+
355
+ 0:10:23.303 --> 0:10:34.067
356
+ Will say for the MT task it is more important
357
+ because we can then do the sentence transformation.
358
+
359
+ 0:10:34.955 --> 0:10:37.603
360
+ See and Yeah, We Can Do the Same Thing.
361
+
362
+ 0:10:37.717 --> 0:10:40.226
363
+ To save us, why is it not as important for
364
+ us?
365
+
366
+ 0:10:40.173 --> 0:10:40.819
367
+ Are maybe.
368
+
369
+ 0:10:43.363 --> 0:10:48.589
370
+ We don't need that much context.
371
+
372
+ 0:10:48.430 --> 0:11:01.101
373
+ We only try to restrict the word, but the
374
+ context to consider is mainly small.
375
+
376
+ 0:11:03.283 --> 0:11:11.419
377
+ Would agree with it in more context, but there
378
+ is one more important: its.
379
+
380
+ 0:11:11.651 --> 0:11:16.764
381
+ The ASR is monotone, so there's no reordering.
382
+
383
+ 0:11:16.645 --> 0:11:22.455
384
+ The second part of the signal is no reordering.
385
+
386
+ 0:11:22.334 --> 0:11:23.559
387
+ We have.
388
+
389
+ 0:11:23.683 --> 0:11:29.147
390
+ And of course if we are doing that we cannot
391
+ really reorder across boundaries between segments.
392
+
393
+ 0:11:29.549 --> 0:11:37.491
394
+ It might be challenging if we split the words
395
+ so that it's not perfect for so that.
396
+
397
+ 0:11:37.637 --> 0:11:40.846
398
+ But we need to do quite long range reordering.
399
+
400
+ 0:11:40.777 --> 0:11:47.035
401
+ If you think about the German where the verb
402
+ has moved, and now the English verb is in one
403
+
404
+ 0:11:47.035 --> 0:11:50.198
405
+ part, but the end of the sentence is another.
406
+
407
+ 0:11:50.670 --> 0:11:59.427
408
+ And of course this advantage we have now here
409
+ that if we have a segment we have.
410
+
411
+ 0:12:01.441 --> 0:12:08.817
412
+ And that this segmentation is important.
413
+
414
+ 0:12:08.638 --> 0:12:15.300
415
+ Here are some motivations for that.
416
+
417
+ 0:12:15.675 --> 0:12:25.325
418
+ What you are doing is you are taking the reference
419
+ text and you are segmenting.
420
+
421
+ 0:12:26.326 --> 0:12:30.991
422
+ And then, of course, your segments are exactly
423
+ yeah cute.
424
+
425
+ 0:12:31.471 --> 0:12:42.980
426
+ If you're now using different segmentation
427
+ strategies, you're losing significantly in blue
428
+
429
+ 0:12:42.980 --> 0:12:44.004
430
+ points.
431
+
432
+ 0:12:43.876 --> 0:12:50.400
433
+ If the segmentation is bad, you have a lot
434
+ worse.
435
+
436
+ 0:12:52.312 --> 0:13:10.323
437
+ And interesting, here you ought to see how
438
+ it was a human, but people have in a competition.
439
+
440
+ 0:13:10.450 --> 0:13:22.996
441
+ You can see that by working on the segmentation
442
+ and using better segmentation you can improve
443
+
444
+ 0:13:22.996 --> 0:13:25.398
445
+ your performance.
446
+
447
+ 0:13:26.006 --> 0:13:29.932
448
+ So it's really essential.
449
+
450
+ 0:13:29.781 --> 0:13:41.714
451
+ One other interesting thing is if you're looking
452
+ into the difference between.
453
+
454
+ 0:13:42.082 --> 0:13:49.145
455
+ So it really seems to be more important to
456
+ have a good segmentation for an end-to-end system.
457
+
458
+ 0:13:49.109 --> 0:13:56.248
459
+ For an end-to-end system because there you
460
+ can't re-segment while it is less important
461
+
462
+ 0:13:56.248 --> 0:13:58.157
463
+ for a cascaded system.
464
+
465
+ 0:13:58.074 --> 0:14:05.049
466
+ Of course, it's still important, but the difference
467
+ between the two segmentations.
468
+
469
+ 0:14:06.466 --> 0:14:18.391
470
+ It was a shared task some years ago like it's
471
+ just one system from different.
472
+
473
+ 0:14:22.122 --> 0:14:31.934
474
+ So the question is how can we deal with this
475
+ in speech translation and what people look
476
+
477
+ 0:14:31.934 --> 0:14:32.604
478
+ into?
479
+
480
+ 0:14:32.752 --> 0:14:48.360
481
+ Now we want to use different techniques to
482
+ split the audio signal into segments.
483
+
484
+ 0:14:48.848 --> 0:14:54.413
485
+ You have the disadvantage that you can't change
486
+ it.
487
+
488
+ 0:14:54.306 --> 0:15:00.409
489
+ Therefore, some of the quality might be more
490
+ important.
491
+
492
+ 0:15:00.660 --> 0:15:15.678
493
+ But in both cases, of course, the A's are
494
+ better if you have a good segmentation.
495
+
496
+ 0:15:17.197 --> 0:15:23.149
497
+ So any idea, how would you have this task
498
+ now split this audio?
499
+
500
+ 0:15:23.056 --> 0:15:26.221
501
+ What type of tool would you use?
502
+
503
+ 0:15:28.648 --> 0:15:41.513
504
+ The fuse was a new network to segment half
505
+ for instance supervise.
506
+
507
+ 0:15:41.962 --> 0:15:44.693
508
+ Yes, that's exactly already the better system.
509
+
510
+ 0:15:44.635 --> 0:15:50.376
511
+ So for long time people have done more simple
512
+ things because we'll come to that a bit challenging
513
+
514
+ 0:15:50.376 --> 0:15:52.250
515
+ as creating or having the data.
516
+
517
+ 0:15:53.193 --> 0:16:00.438
518
+ The first thing is you use some tool out of
519
+ the box like voice activity detection which
520
+
521
+ 0:16:00.438 --> 0:16:07.189
522
+ has been there as a whole research field so
523
+ people find when somebody's speaking.
524
+
525
+ 0:16:07.647 --> 0:16:14.952
526
+ And then you use that in this different threshold
527
+ you always have the probability that somebody's
528
+
529
+ 0:16:14.952 --> 0:16:16.273
530
+ speaking or not.
531
+
532
+ 0:16:17.217 --> 0:16:19.889
533
+ Then you split your signal.
534
+
535
+ 0:16:19.794 --> 0:16:26.763
536
+ It will not be perfect, but you transcribe
537
+ or translate each component.
538
+
539
+ 0:16:28.508 --> 0:16:39.337
540
+ But as you see, a supervised classification
541
+ task is even better, and that is now the most
542
+
543
+ 0:16:39.337 --> 0:16:40.781
544
+ common use.
545
+
546
+ 0:16:41.441 --> 0:16:49.909
547
+ The supervisor is doing that as a supervisor
548
+ classification and then you'll try to use this
549
+
550
+ 0:16:49.909 --> 0:16:50.462
551
+ type.
552
+
553
+ 0:16:50.810 --> 0:16:53.217
554
+ We're going into a bit more detail on how
555
+ to do that.
556
+
557
+ 0:16:53.633 --> 0:17:01.354
558
+ So what you need to do first is, of course,
559
+ you have to have some labels whether this is
560
+
561
+ 0:17:01.354 --> 0:17:03.089
562
+ an end of sentence.
563
+
564
+ 0:17:03.363 --> 0:17:10.588
565
+ You do that by using the alignment between
566
+ the segments and the audio.
567
+
568
+ 0:17:10.487 --> 0:17:12.021
569
+ You have the.
570
+
571
+ 0:17:12.212 --> 0:17:15.365
572
+ The two people have not for each word, so
573
+ these time stamps.
574
+
575
+ 0:17:15.312 --> 0:17:16.891
576
+ This word is said this time.
577
+
578
+ 0:17:17.157 --> 0:17:27.935
579
+ This word is said by what you typically have
580
+ from this time to time to time.
581
+
582
+ 0:17:27.795 --> 0:17:34.657
583
+ We have the second segment, the second segment.
584
+
585
+ 0:17:35.195 --> 0:17:39.051
586
+ Which is also used to train for example your
587
+ ASR system and everything.
588
+
589
+ 0:17:41.661 --> 0:17:53.715
590
+ Based on that you can label each frame in
591
+ there so if you have a green or blue that is
592
+
593
+ 0:17:53.715 --> 0:17:57.455
594
+ our speech segment so you.
595
+
596
+ 0:17:58.618 --> 0:18:05.690
597
+ And these labels will then later help you,
598
+ but you extract exactly these types of.
599
+
600
+ 0:18:07.067 --> 0:18:08.917
601
+ There's one big challenge.
602
+
603
+ 0:18:08.848 --> 0:18:15.113
604
+ If you have two sentences which are directly
605
+ connected to each other, then if you're doing
606
+
607
+ 0:18:15.113 --> 0:18:18.693
608
+ this labeling, you would not have a break in
609
+ later.
610
+
611
+ 0:18:18.624 --> 0:18:23.513
612
+ If you tried to extract that, there should
613
+ be something great or not.
614
+
615
+ 0:18:23.943 --> 0:18:31.955
616
+ So what you typically do is in the last frame.
617
+
618
+ 0:18:31.785 --> 0:18:41.334
619
+ You mark as outside, although it's not really
620
+ outside.
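A minimal sketch of the frame-labelling step just described, assuming segments come as (start, end) times in seconds and labels are produced per fixed 10 ms frame; marking the final frame of every segment as outside is the trick mentioned above for keeping a boundary between directly adjacent segments. Illustrative only, not the lecture's actual tooling.

```python
# Illustrative sketch: build per-frame speech/non-speech training labels
# from time-aligned segments (assumed format: (start_sec, end_sec)).
FRAME_SEC = 0.01  # 10 ms frames (an assumption for this sketch)

def frame_labels(segments, total_sec):
    """One label per frame: 1 = inside a speech segment, 0 = outside."""
    n_frames = int(total_sec / FRAME_SEC)
    labels = [0] * n_frames
    last_frames = []
    for start, end in segments:
        first = int(start / FRAME_SEC)
        last = min(int(end / FRAME_SEC), n_frames - 1)
        for i in range(first, last + 1):
            labels[i] = 1
        last_frames.append(last)
    # Second pass: mark the final frame of every segment as "outside" so
    # that two directly adjacent segments still produce a visible break.
    for last in last_frames:
        labels[last] = 0
    return labels

if __name__ == "__main__":
    labs = frame_labels([(0.0, 2.5), (2.5, 4.0)], total_sec=5.0)
    print(labs[248:253])  # -> [1, 1, 0, 1, 1]: break between adjacent segments
```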
621
+
622
+ 0:18:43.463 --> 0:18:46.882
623
+ Yes, I guess you could also do that in more
624
+ of a below check.
625
+
626
+ 0:18:46.827 --> 0:18:48.653
627
+ I mean, this is the most simple.
628
+
629
+ 0:18:48.598 --> 0:18:51.431
630
+ It's like inside outside, so it's related
631
+ to that.
632
+
633
+ 0:18:51.376 --> 0:18:54.894
634
+ Of course, you could have an extra startup
635
+ segment, and so on.
636
+
637
+ 0:18:54.838 --> 0:18:57.370
638
+ I guess this is just to make it more simple.
639
+
640
+ 0:18:57.314 --> 0:19:00.159
641
+ You only have two labels, not a three-class problem.
642
+
643
+ 0:19:00.103 --> 0:19:02.380
644
+ But yeah, you could do similar things.
645
+
646
+ 0:19:12.432 --> 0:19:20.460
647
+ Has caused down the roads to problems because
648
+ it could be an important part of a segment
649
+
650
+ 0:19:20.460 --> 0:19:24.429
651
+ which has some meaning and we do something.
652
+
653
+ 0:19:24.339 --> 0:19:28.400
654
+ The good thing is frames are normally very.
655
+
656
+ 0:19:28.688 --> 0:19:37.586
657
+ Like some milliseconds, so normally if you
658
+ remove some milliseconds you can still understand
659
+
660
+ 0:19:37.586 --> 0:19:38.734
661
+ everything.
662
+
663
+ 0:19:38.918 --> 0:19:46.999
664
+ Mean the speech signal is very repetitive,
665
+ and so you have information a lot of times.
666
+
667
+ 0:19:47.387 --> 0:19:50.730
668
+ That's why we talked along there last time
669
+ they could try to shrink the steak and.
670
+
671
+ 0:19:51.031 --> 0:20:00.995
672
+ If you now have a short sequence where there
673
+ is like which would be removed and that's not
674
+
675
+ 0:20:00.995 --> 0:20:01.871
676
+ really.
677
+
678
+ 0:20:02.162 --> 0:20:06.585
679
+ Yeah, but it's not a full letter is missing.
680
+
681
+ 0:20:06.487 --> 0:20:11.011
682
+ It's like only the last ending of the vocal.
683
+
684
+ 0:20:11.751 --> 0:20:15.369
685
+ Think it doesn't really happen.
686
+
687
+ 0:20:15.256 --> 0:20:23.057
688
+ We have our audio signal and we have these
689
+ gags that are not above.
690
+
691
+ 0:20:23.883 --> 0:20:29.288
692
+ With this blue rectangulars the inside speech
693
+ segment and with the guess it's all set yes.
694
+
695
+ 0:20:29.669 --> 0:20:35.736
696
+ So then you have the full signal and you're
697
+ meaning now labeling your task as a blue or
698
+
699
+ 0:20:35.736 --> 0:20:36.977
700
+ white prediction.
701
+
702
+ 0:20:36.908 --> 0:20:39.202
703
+ So that is your prediction task.
704
+
705
+ 0:20:39.133 --> 0:20:44.975
706
+ You have the audio signal only and your prediction
707
+ task is like label one or zero.
708
+
709
+ 0:20:45.305 --> 0:20:55.585
710
+ Once you do that then based on this labeling
711
+ you can extract each segment again like each
712
+
713
+ 0:20:55.585 --> 0:20:58.212
714
+ consecutive blue area.
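A small sketch of the extraction step just described: turning the per-frame speech/non-speech decisions back into (start, end) segments, i.e. each consecutive "blue" area becomes one segment and the non-speech parts in between are dropped before translation. The 10 ms frame size is an assumption for the sketch.

```python
# Illustrative sketch: per-frame speech decisions -> (start_sec, end_sec) segments.
FRAME_SEC = 0.01

def frames_to_segments(is_speech):
    segments, start = [], None
    for i, speech in enumerate(is_speech):
        if speech and start is None:
            start = i                      # a new speech area begins
        elif not speech and start is not None:
            segments.append((start * FRAME_SEC, i * FRAME_SEC))
            start = None
    if start is not None:                  # speech running until the end
        segments.append((start * FRAME_SEC, len(is_speech) * FRAME_SEC))
    return segments

print(frames_to_segments([0, 1, 1, 1, 0, 0, 1, 1]))
# -> [(0.01, 0.04), (0.06, 0.08)]
```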
715
+
716
+ 0:20:58.798 --> 0:21:05.198
717
+ See then removed maybe the non-speaking part
718
+ already and duo speech translation only on
719
+
720
+ 0:21:05.198 --> 0:21:05.998
721
+ the parts.
722
+
723
+ 0:21:06.786 --> 0:21:19.768
724
+ Which is good because the training would have
725
+ done similarly.
726
+
727
+ 0:21:20.120 --> 0:21:26.842
728
+ So on the noise in between you never saw in
729
+ the training, so it's good to throw it away.
730
+
731
+ 0:21:29.649 --> 0:21:34.930
732
+ One challenge, of course, is now if you're
733
+ doing that, what is your input?
734
+
735
+ 0:21:34.860 --> 0:21:40.664
736
+ You cannot do the sequence labeling normally
737
+ on the whole talk, so it's too long.
738
+
739
+ 0:21:40.593 --> 0:21:46.738
740
+ So if you're doing this prediction of the
741
+ label, you also have a window for which you
742
+
743
+ 0:21:46.738 --> 0:21:48.239
744
+ do the segmentation.
745
+
746
+ 0:21:48.788 --> 0:21:54.515
747
+ And that's the baseline we have in the punctuation
748
+ prediction.
749
+
750
+ 0:21:54.422 --> 0:22:00.392
751
+ If we don't have good borders, random splits
752
+ are normally good.
753
+
754
+ 0:22:00.299 --> 0:22:03.939
755
+ So what we do now is split the audio.
756
+
757
+ 0:22:04.344 --> 0:22:09.134
758
+ So that would be our input, and then the part
759
+ three would be our labels.
760
+
761
+ 0:22:09.269 --> 0:22:15.606
762
+ This green would be the input and here we
763
+ want, for example, blue labels and then white.
764
+
765
+ 0:22:16.036 --> 0:22:20.360
766
+ Here only do labors and here at the beginning
767
+ why maybe at the end why.
768
+
769
+ 0:22:21.401 --> 0:22:28.924
770
+ So thereby you have now a fixed window always
771
+ for which you're doing than this task of predicting.
772
+
773
+ 0:22:33.954 --> 0:22:43.914
774
+ How you build your classifier that is based
775
+ again.
776
+
777
+ 0:22:43.719 --> 0:22:52.512
778
+ We had this wave to be mentioned last week.
779
+
780
+ 0:22:52.752 --> 0:23:00.599
781
+ So in training you use labels to say whether
782
+ it's in speech or outside speech.
783
+
784
+ 0:23:01.681 --> 0:23:17.740
785
+ Inference: You give them always the chance
786
+ and then predict whether this part like each
787
+
788
+ 0:23:17.740 --> 0:23:20.843
789
+ label is afraid.
790
+
791
+ 0:23:23.143 --> 0:23:29.511
792
+ Bit more complicated, so one challenge is
793
+ if you randomly split off cognition, losing
794
+
795
+ 0:23:29.511 --> 0:23:32.028
796
+ your context for the first brain.
797
+
798
+ 0:23:31.954 --> 0:23:38.693
799
+ It might be very hard to predict whether this
800
+ is now in or out of, and also for the last.
801
+
802
+ 0:23:39.980 --> 0:23:48.449
803
+ You often need a bit of context whether this
804
+ is audio or not, and at the beginning.
805
+
806
+ 0:23:49.249 --> 0:23:59.563
807
+ So what you do is you put the audio in twice.
808
+
809
+ 0:23:59.339 --> 0:24:08.538
810
+ You want to do it with splits and then.
811
+
812
+ 0:24:08.788 --> 0:24:15.996
813
+ It is shown you have shifted the two offsets,
814
+ so one is predicted with the other offset.
815
+
816
+ 0:24:16.416 --> 0:24:23.647
817
+ And then averaging the probabilities so that
818
+ at each time you have, at least for one of
819
+
820
+ 0:24:23.647 --> 0:24:25.127
821
+ the predictions,.
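A sketch of the two-pass inference just described: the frame classifier is run on fixed windows twice, the second time shifted by half a window, and the probabilities are averaged so that every frame is far from a window border in at least one pass. `speech_prob` is an assumed stand-in for the real classifier and is expected to return one probability per frame of its input window.

```python
# Illustrative sketch of windowed frame classification with two offsets.
WINDOW = 2000           # frames per window (assumed)
OFFSET = WINDOW // 2    # shift of the second pass

def windowed_probs(frames, speech_prob, shift=0):
    probs = [0.0] * len(frames)
    for start in range(-shift, len(frames), WINDOW):
        lo, hi = max(start, 0), min(start + WINDOW, len(frames))
        if lo < hi:
            probs[lo:hi] = speech_prob(frames[lo:hi])
    return probs

def averaged_probs(frames, speech_prob):
    first = windowed_probs(frames, speech_prob, shift=0)
    second = windowed_probs(frames, speech_prob, shift=OFFSET)
    # every frame is far from a window border in at least one of the passes
    return [(a + b) / 2 for a, b in zip(first, second)]

# toy check with a dummy classifier
print(averaged_probs(list(range(4500)), lambda w: [0.9] * len(w))[:3])
```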
822
+
823
+ 0:24:25.265 --> 0:24:36.326
824
+ Because at the end of the second it might
825
+ be very hard to predict whether this is now
826
+
827
+ 0:24:36.326 --> 0:24:39.027
828
+ speech or nonspeech.
829
+
830
+ 0:24:39.939 --> 0:24:47.956
831
+ Think it is a high parameter, but you are
832
+ not optimizing it, so you just take two shifts.
833
+
834
+ 0:24:48.328 --> 0:24:54.636
835
+ Of course try a lot of different shifts and
836
+ so on.
837
+
838
+ 0:24:54.512 --> 0:24:59.649
839
+ The thing is it's mainly a problem here.
840
+
841
+ 0:24:59.523 --> 0:25:04.412
842
+ If you don't do two outsets you have.
843
+
844
+ 0:25:05.105 --> 0:25:14.761
845
+ You could get better by doing that, but would
846
+ be skeptical if it really matters, and also
847
+
848
+ 0:25:14.761 --> 0:25:18.946
849
+ have not seen any experience in doing.
850
+
851
+ 0:25:19.159 --> 0:25:27.629
852
+ Guess you're already good, you have maybe
853
+ some errors in there and you're getting.
854
+
855
+ 0:25:31.191 --> 0:25:37.824
856
+ So with this you have your segmentation.
857
+
858
+ 0:25:37.663 --> 0:25:44.228
859
+ However, there is a problem in between.
860
+
861
+ 0:25:44.064 --> 0:25:49.158
862
+ Once the model is wrong then.
863
+
864
+ 0:25:49.789 --> 0:26:01.755
865
+ The normal thing would be the first thing
866
+ that you take some threshold and that you always
867
+
868
+ 0:26:01.755 --> 0:26:05.436
869
+ label everything in speech.
870
+
871
+ 0:26:06.006 --> 0:26:19.368
872
+ The problem is when you are just doing this
873
+ one threshold that you might have.
874
+
875
+ 0:26:19.339 --> 0:26:23.954
876
+ Those are the challenges.
877
+
878
+ 0:26:23.777 --> 0:26:31.168
879
+ Short segments mean you have no context.
880
+
881
+ 0:26:30.988 --> 0:26:35.503
882
+ The policy will be bad.
883
+
884
+ 0:26:37.077 --> 0:26:48.954
885
+ Therefore, people use this probabilistic divide and
886
+ conquer algorithm, so the main idea is to start
887
+
888
+ 0:26:48.954 --> 0:26:56.744
889
+ with the whole segment, and now you split the
890
+ whole segment.
891
+
892
+ 0:26:57.397 --> 0:27:09.842
893
+ Then you split there and then you continue
894
+ until each segment is smaller than the maximum
895
+
896
+ 0:27:09.842 --> 0:27:10.949
897
+ length.
898
+
899
+ 0:27:11.431 --> 0:27:23.161
900
+ But you can ignore some splits, and if you
901
+ split one segment into two parts you first
902
+
903
+ 0:27:23.161 --> 0:27:23.980
904
+ trim.
905
+
906
+ 0:27:24.064 --> 0:27:40.197
907
+ So normally it's not only one signal position,
908
+ it's a longer area of non-voice, so you try
909
+
910
+ 0:27:40.197 --> 0:27:43.921
911
+ to find this longer.
912
+
913
+ 0:27:43.943 --> 0:27:51.403
914
+ Now your large segment is split into two smaller
915
+ segments.
916
+
917
+ 0:27:51.277 --> 0:27:56.085
918
+ Now you are checking these segments.
919
+
920
+ 0:27:56.296 --> 0:28:04.683
921
+ So if they are very, very short, it might
922
+ be good not to spin at this point because you're
923
+
924
+ 0:28:04.683 --> 0:28:05.697
925
+ ending up.
926
+
927
+ 0:28:06.006 --> 0:28:09.631
928
+ And this way you continue all the time, and
929
+ then hopefully you'll have a good stretch.
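A sketch of the divide-and-conquer splitting just described: the whole recording is recursively split at the most pause-like position until every piece is below a maximum length, and splits that would create very short pieces are avoided. As noted below, this needs the full audio, so it is an offline procedure; the thresholds here are assumptions for the sketch.

```python
# Illustrative sketch: recursive splitting on per-frame non-speech scores.
MAX_LEN = 2000   # maximum segment length in frames (assumed)
MIN_LEN = 200    # never create segments shorter than this (assumed)

def _split(pause_score, lo, hi, out):
    if hi - lo <= MAX_LEN:
        out.append((lo, hi))
        return
    # candidate split points keep both halves at least MIN_LEN long
    best = max(range(lo + MIN_LEN, hi - MIN_LEN), key=lambda i: pause_score[i])
    _split(pause_score, lo, best, out)
    _split(pause_score, best, hi, out)

def divide_and_conquer(pause_score):
    out = []
    _split(pause_score, 0, len(pause_score), out)
    return out

import random
random.seed(0)
print(divide_and_conquer([random.random() for _ in range(7000)]))
```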
930
+
931
+ 0:28:10.090 --> 0:28:19.225
932
+ So, of course, there's one challenge with
933
+ this approach: if you think about it later,
934
+
935
+ 0:28:19.225 --> 0:28:20.606
936
+ low latency.
937
+
938
+ 0:28:25.405 --> 0:28:31.555
939
+ So in this case you have to have the full
940
+ audio available.
941
+
942
+ 0:28:32.132 --> 0:28:38.112
943
+ So you cannot continuously do that mean if
944
+ you would do it just always.
945
+
946
+ 0:28:38.029 --> 0:28:45.589
947
+ If the probability is higher you split but
948
+ in this case you try to find a global optimal.
949
+
950
+ 0:28:46.706 --> 0:28:49.134
951
+ A heuristic body.
952
+
953
+ 0:28:48.999 --> 0:28:58.130
954
+ You find a global solution for your whole
955
+ tar and not a local one.
956
+
957
+ 0:28:57.993 --> 0:29:02.223
958
+ Where's the system most sure?
959
+
960
+ 0:29:02.802 --> 0:29:12.467
961
+ So that's a bit of a challenge here, but the
962
+ advantage of course is that in the end you
963
+
964
+ 0:29:12.467 --> 0:29:14.444
965
+ have no segments.
966
+
967
+ 0:29:17.817 --> 0:29:23.716
968
+ Any more questions like this.
969
+
970
+ 0:29:23.519 --> 0:29:36.696
971
+ Then the next thing is we also need to evaluate
972
+ in this scenario.
973
+
974
+ 0:29:37.097 --> 0:29:44.349
975
+ So know machine translation is quite a long
976
+ way.
977
+
978
+ 0:29:44.201 --> 0:29:55.305
979
+ History now was the beginning of the semester,
980
+ but hope you can remember.
981
+
982
+ 0:29:55.675 --> 0:30:09.214
983
+ Might be with blue score, might be with comment
984
+ or similar, but you need to have.
985
+
986
+ 0:30:10.310 --> 0:30:22.335
987
+ But this assumes that you have this one-to-one
988
+ match, so you always have an output and machine
989
+
990
+ 0:30:22.335 --> 0:30:26.132
991
+ translation, which is nicely.
992
+
993
+ 0:30:26.506 --> 0:30:34.845
994
+ So then it might be that our output has four
995
+ segments, while our reference output has only
996
+
997
+ 0:30:34.845 --> 0:30:35.487
998
+ three.
999
+
1000
+ 0:30:36.756 --> 0:30:40.649
1001
+ And now is, of course, questionable like what
1002
+ should we compare in our metric.
1003
+
1004
+ 0:30:44.704 --> 0:30:53.087
1005
+ So it's no longer directly possible to directly
1006
+ do that because what should you compare?
1007
+
1008
+ 0:30:53.413 --> 0:31:00.214
1009
+ Just have four segments there and three segments
1010
+ there, and of course it seems to be that.
1011
+
1012
+ 0:31:00.920 --> 0:31:06.373
1013
+ The first one it likes to the first one when
1014
+ you see I can't speak Spanish, but you're an
1015
+
1016
+ 0:31:06.373 --> 0:31:09.099
1017
+ audience of the guests who is already there.
1018
+
1019
+ 0:31:09.039 --> 0:31:14.472
1020
+ So even like just a woman, the blue comparing
1021
+ wouldn't work, so you need to do something
1022
+
1023
+ 0:31:14.472 --> 0:31:17.158
1024
+ about that to take this type of evaluation.
1025
+
1026
+ 0:31:19.019 --> 0:31:21.727
1027
+ Still any suggestions what you could do.
1028
+
1029
+ 0:31:25.925 --> 0:31:44.702
1030
+ How can you calculate a blue score because
1031
+ you don't have one you want to see?
1032
+
1033
+ 0:31:45.925 --> 0:31:49.365
1034
+ Here you put another layer which spies to
1035
+ add in the second.
1036
+
1037
+ 0:31:51.491 --> 0:31:56.979
1038
+ It's even not aligning only, but that's one
1039
+ solution, so you need to align and re-segment.
1040
+
1041
+ 0:31:57.177 --> 0:32:06.886
1042
+ Because even if you have no alignment so this
1043
+ to this and this to that you see that it's
1044
+
1045
+ 0:32:06.886 --> 0:32:12.341
1046
+ not good because the audio would compare to
1047
+ that.
1048
+
1049
+ 0:32:13.453 --> 0:32:16.967
1050
+ That we'll discuss is even one simpler solution.
1051
+
1052
+ 0:32:16.896 --> 0:32:19.065
1053
+ Yes, it's a simpler solution.
1054
+
1055
+ 0:32:18.993 --> 0:32:23.086
1056
+ It's called document based blue or something
1057
+ like that.
1058
+
1059
+ 0:32:23.013 --> 0:32:25.720
1060
+ So you just take the full document.
1061
+
1062
+ 0:32:26.566 --> 0:32:32.630
1063
+ For some matrix it's good and it's not clear
1064
+ how good it is to the other, but there might
1065
+
1066
+ 0:32:32.630 --> 0:32:32.900
1067
+ be.
1068
+
1069
+ 0:32:33.393 --> 0:32:36.454
1070
+ Think of more simple metrics like blue.
1071
+
1072
+ 0:32:36.377 --> 0:32:40.358
1073
+ Do you have any idea what could be a disadvantage?
1074
+
1075
+ 0:32:49.249 --> 0:32:56.616
1076
+ Blue is matching n-grams so you start with
1077
+ the original.
1078
+
1079
+ 0:32:56.487 --> 0:33:01.274
1080
+ You check how many n-grams are in here.
1081
+
1082
+ 0:33:01.901 --> 0:33:11.233
1083
+ If you're not doing that on the full document,
1084
+ you can also match n-grams from here to here.
1085
+
1086
+ 0:33:11.751 --> 0:33:15.680
1087
+ So you can match things very far away.
1088
+
1089
+ 0:33:15.579 --> 0:33:21.323
1090
+ Start doing translation and you just randomly
1091
+ randomly.
1092
+
1093
+ 0:33:22.142 --> 0:33:27.938
1094
+ And that, of course, could be a bit of a disadvantage
1095
+ or like is a problem, and therefore people
1096
+
1097
+ 0:33:27.938 --> 0:33:29.910
1098
+ also look into the segmentation.
1099
+
1100
+ 0:33:29.850 --> 0:33:34.655
1101
+ But I've recently seen some things, so document
1102
+ level scores are also normally.
1103
+
1104
+ 0:33:34.594 --> 0:33:39.924
1105
+ If you have a relatively high quality system
1106
+ or state of the art, then they also have a
1107
+
1108
+ 0:33:39.924 --> 0:33:41.802
1109
+ good correlation of the human.
1110
+
1111
+ 0:33:46.546 --> 0:33:59.241
1112
+ So how are we doing that so we are putting
1113
+ end of sentence boundaries in there and then.
1114
+
1115
+ 0:33:59.179 --> 0:34:07.486
1116
+ Alignment based on a similar Levenshtein distance,
1117
+ so at a distance between our output and the
1118
+
1119
+ 0:34:07.486 --> 0:34:09.077
1120
+ reference output.
1121
+
1122
+ 0:34:09.449 --> 0:34:13.061
1123
+ And here is our boundary.
1124
+
1125
+ 0:34:12.922 --> 0:34:23.484
1126
+ We map the boundary based on the alignment,
1127
+ so in the Levenshtein alignment you only have.
1128
+
1129
+ 0:34:23.803 --> 0:34:36.036
1130
+ And then, like all the words that are before,
1131
+ it might be since there is not a random.
1132
+
1133
+ 0:34:36.336 --> 0:34:44.890
1134
+ Mean it should be, but it can happen things
1135
+ like that, and it's not clear where.
1136
+
1137
+ 0:34:44.965 --> 0:34:49.727
1138
+ At the break, however, they are typically
1139
+ not that bad because they are words which are
1140
+
1141
+ 0:34:49.727 --> 0:34:52.270
1142
+ not matching between reference and hypothesis.
1143
+
1144
+ 0:34:52.216 --> 0:34:56.871
1145
+ So normally it doesn't really matter that
1146
+ much because they are anyway not matching.
1147
+
1148
+ 0:34:57.657 --> 0:35:05.888
1149
+ And then you take the new MT output and
1150
+ use that to calculate your metric.
1151
+
1152
+ 0:35:05.785 --> 0:35:12.576
1153
+ Then it's again a perfect alignment for which
1154
+ you can calculate.
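A sketch of the evaluation re-segmentation just described: the hypothesis words are aligned to the reference words and the reference segment boundaries are projected onto the hypothesis, so sentence-level metrics can be computed again. `difflib` stands in here for the edit-distance alignment; dedicated tools (e.g. mwerSegmenter) implement this more carefully.

```python
import difflib

def resegment(hyp_words, ref_segments):
    """Split hyp_words into len(ref_segments) pieces following the reference boundaries."""
    ref_words = [w for seg in ref_segments for w in seg]
    sm = difflib.SequenceMatcher(a=ref_words, b=hyp_words, autojunk=False)
    ref2hyp = {0: 0, len(ref_words): len(hyp_words)}
    for i, j, size in sm.get_matching_blocks():
        for k in range(size):
            ref2hyp[i + k] = j + k
    cuts, pos, last = [0], 0, 0
    for seg in ref_segments[:-1]:
        pos += len(seg)
        anchor = max(p for p in ref2hyp if p <= pos)   # aligned word before the boundary
        cut = min(max(ref2hyp[anchor] + (pos - anchor), last), len(hyp_words))
        cuts.append(cut)
        last = cut
    cuts.append(len(hyp_words))
    return [hyp_words[a:b] for a, b in zip(cuts, cuts[1:])]

ref = [s.split() for s in ["we compare against the reference",
                           "which has three segments here"]]
hyp = "we compare against reference which has got three segments here".split()
print(resegment(hyp, ref))
```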
1155
+
1156
+ 0:35:14.714 --> 0:35:19.229
1157
+ Any idea you could do it the other way around.
1158
+
1159
+ 0:35:19.133 --> 0:35:23.361
1160
+ You could re-segment your reference to the.
1161
+
1162
+ 0:35:29.309 --> 0:35:30.368
1163
+ Which one would you select?
1164
+
1165
+ 0:35:34.214 --> 0:35:43.979
1166
+ I think segmenting the assertive also is much
1167
+ more natural because the reference sentence
1168
+
1169
+ 0:35:43.979 --> 0:35:46.474
1170
+ is the fixed solution.
1171
+
1172
+ 0:35:47.007 --> 0:35:52.947
1173
+ Yes, that's the right motivation if you do
1174
+ think about blue or so.
1175
+
1176
+ 0:35:52.858 --> 0:35:57.647
1177
+ Additionally important if you change your
1178
+ reference.
1179
+
1180
+ 0:35:57.857 --> 0:36:07.175
1181
+ You might have a different number of diagrams
1182
+ or diagrams because the sentences are different
1183
+
1184
+ 0:36:07.175 --> 0:36:08.067
1185
+ lengths.
1186
+
1187
+ 0:36:08.068 --> 0:36:15.347
1188
+ Here your five system, you're always comparing
1189
+ it to the same system, and you don't compare
1190
+
1191
+ 0:36:15.347 --> 0:36:16.455
1192
+ to different.
1193
+
1194
+ 0:36:16.736 --> 0:36:22.317
1195
+ The only different base of segmentation, but
1196
+ still it could make some do.
1197
+
1198
+ 0:36:25.645 --> 0:36:37.129
1199
+ DisfluenciesGood, that's all about sentence
1200
+ segmentation, then a bit about disfluencies
1201
+
1202
+ 0:36:37.129 --> 0:36:40.130
1203
+ and what there really.
1204
+
1205
+ 0:36:42.182 --> 0:36:51.138
1206
+ So as said in daily life, you're not speaking
1207
+ like very nice full sentences every.
1208
+
1209
+ 0:36:51.471 --> 0:36:53.420
1210
+ He was speaking powerful sentences.
1211
+
1212
+ 0:36:53.365 --> 0:36:54.451
1213
+ We do repetitions.
1214
+
1215
+ 0:36:54.834 --> 0:37:00.915
1216
+ It's especially if it's more interactive,
1217
+ so in meetings, phone calls and so on.
1218
+
1219
+ 0:37:00.840 --> 0:37:04.521
1220
+ If you have multiple speakers, they also break.
1221
+
1222
+ 0:37:04.724 --> 0:37:16.651
1223
+ Each other, and then if you keep them, they
1224
+ are harder to translate because most of your
1225
+
1226
+ 0:37:16.651 --> 0:37:17.991
1227
+ training.
1228
+
1229
+ 0:37:18.278 --> 0:37:30.449
1230
+ It's also very difficult to read, so we'll
1231
+ have some examples there to transcribe everything
1232
+
1233
+ 0:37:30.449 --> 0:37:32.543
1234
+ as it was said.
1235
+
1236
+ 0:37:33.473 --> 0:37:36.555
1237
+ What type of things are there?
1238
+
1239
+ 0:37:37.717 --> 0:37:42.942
1240
+ So you have all these filler words.
1241
+
1242
+ 0:37:42.797 --> 0:37:47.363
1243
+ These are very easy to remove.
1244
+
1245
+ 0:37:47.216 --> 0:37:52.964
1246
+ You can just use regular expressions.
1247
+
1248
+ 0:37:53.433 --> 0:38:00.139
1249
+ Is getting more difficult with some other
1250
+ type of filler words.
1251
+
1252
+ 0:38:00.034 --> 0:38:03.391
1253
+ In German you have this or in.
1254
+
1255
+ 0:38:04.024 --> 0:38:08.473
1256
+ And these ones you cannot just remove by regular
1257
+ expression.
1258
+
1259
+ 0:38:08.400 --> 0:38:15.032
1260
+ You shouldn't remove all yacht from a text
1261
+ because it might be very important information
1262
+
1263
+ 0:38:15.032 --> 0:38:15.769
1264
+ for well.
1265
+
1266
+ 0:38:15.715 --> 0:38:19.995
1267
+ It may be not as important as you are, but
1268
+ still it might be very important.
1269
+
1270
+ 0:38:20.300 --> 0:38:24.215
1271
+ So just removing them is there already more
1272
+ difficult.
1273
+
1274
+ 0:38:26.586 --> 0:38:29.162
1275
+ Then you have these repetitions.
1276
+
1277
+ 0:38:29.084 --> 0:38:32.580
1278
+ You have something like mean saw him there.
1279
+
1280
+ 0:38:32.500 --> 0:38:33.619
1281
+ There was a.
1282
+
1283
+ 0:38:34.334 --> 0:38:41.001
1284
+ And while for the first one that might be
1285
+ very easy to remove because you just look for
1286
+
1287
+ 0:38:41.001 --> 0:38:47.821
1288
+ double, the thing is that the repetition might
1289
+ not be exactly the same, so there is there
1290
+
1291
+ 0:38:47.821 --> 0:38:48.199
1292
+ was.
1293
+
1294
+ 0:38:48.124 --> 0:38:54.110
1295
+ So there is already getting a bit more complicated,
1296
+ of course still possible.
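A small sketch of the rule-based cleanup mentioned above: simple filler tokens can be stripped with regular expressions, and exact immediate repetitions can be collapsed. As the lecture warns, such rules have to stay conservative, and restarts that are not exact copies already need more than this; the token list and patterns here are assumptions.

```python
import re

# assumed filler inventory; real systems would use a language-specific list
FILLERS = re.compile(r"\b(?:uh|uhm|um|er|ah)\b[,.]?\s*", re.IGNORECASE)
# exact immediate repetition of one or two words, e.g. "a ticket a ticket"
REPEAT = re.compile(r"\b(\w+(?:\s+\w+)?)\s+\1\b", re.IGNORECASE)

def clean(text):
    text = FILLERS.sub("", text)
    prev = None
    while prev != text:          # collapse repetitions until nothing changes
        prev = text
        text = REPEAT.sub(r"\1", text)
    return re.sub(r"\s+", " ", text).strip()

print(clean("I uh want a ticket a ticket to Houston"))
# -> "I want a ticket to Houston"
```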
1297
+
1298
+ 0:38:54.614 --> 0:39:01.929
1299
+ You can remove Denver so the real sense would
1300
+ be like to have a ticket to Houston.
1301
+
1302
+ 0:39:02.882 --> 0:39:13.327
1303
+ But there the detection, of course, is getting
1304
+ more challenging as you want to get rid of.
1305
+
1306
+ 0:39:13.893 --> 0:39:21.699
1307
+ You don't have the data, of course, which
1308
+ makes all the tasks harder, but you probably
1309
+
1310
+ 0:39:21.699 --> 0:39:22.507
1311
+ want to.
1312
+
1313
+ 0:39:22.417 --> 0:39:24.774
1314
+ That's really meaningful.
1315
+
1316
+ 0:39:24.684 --> 0:39:26.063
1317
+ Current isn't.
1318
+
1319
+ 0:39:25.972 --> 0:39:31.124
1320
+ That is now a really good point and it's really
1321
+ there.
1322
+
1323
+ 0:39:31.051 --> 0:39:34.785
1324
+ The thing about what is your final task?
1325
+
1326
+ 0:39:35.155 --> 0:39:45.526
1327
+ If you want to have a transcript reading it,
1328
+ I'm not sure if we have another example.
1329
+
1330
+ 0:39:45.845 --> 0:39:54.171
1331
+ So there it's nicer if you have a clean transcript
1332
+ and if you see subtitles in, they're also not
1333
+
1334
+ 0:39:54.171 --> 0:39:56.625
1335
+ having all the repetitions.
1336
+
1337
+ 0:39:56.537 --> 0:40:03.812
1338
+ It's the nice way to shorten but also getting
1339
+ the structure you cannot even make.
1340
+
1341
+ 0:40:04.064 --> 0:40:11.407
1342
+ So in this situation, of course, they might
1343
+ give you information.
1344
+
1345
+ 0:40:11.296 --> 0:40:14.749
1346
+ There is a lot of stuttering.
1347
+
1348
+ 0:40:15.015 --> 0:40:22.835
1349
+ So in this case agree it might be helpful
1350
+ in some way, but meaning reading all the disfluencies
1351
+
1352
+ 0:40:22.835 --> 0:40:25.198
1353
+ is getting really difficult.
1354
+
1355
+ 0:40:25.116 --> 0:40:28.051
1356
+ If you have the next one, we have.
1357
+
1358
+ 0:40:28.308 --> 0:40:31.630
1359
+ That's a very long text.
1360
+
1361
+ 0:40:31.497 --> 0:40:35.824
1362
+ You need a bit of time to parse it.
1363
+
1364
+ 0:40:35.689 --> 0:40:39.479
1365
+ This one is not important.
1366
+
1367
+ 0:40:40.480 --> 0:40:48.461
1368
+ It might be nice if you can start reading
1369
+ from here.
1370
+
1371
+ 0:40:48.310 --> 0:40:52.012
1372
+ Let's have a look here.
1373
+
1374
+ 0:40:51.858 --> 0:40:54.798
1375
+ Try to read this.
1376
+
1377
+ 0:40:57.297 --> 0:41:02.725
1378
+ You can understand it, but think you need
1379
+ a bit of time to really understand what was.
1380
+
1381
+ 0:41:11.711 --> 0:41:21.480
1382
+ And now we have the same text, but you have
1383
+ highlighted in bold, and not only read the
1384
+
1385
+ 0:41:21.480 --> 0:41:22.154
1386
+ bold.
1387
+
1388
+ 0:41:23.984 --> 0:41:25.995
1389
+ And ignore everything which is not bold.
1390
+
1391
+ 0:41:30.250 --> 0:41:49.121
1392
+ Would assume it's easier to read just the
1393
+ bold part more faster and more faster.
1394
+
1395
+ 0:41:50.750 --> 0:41:57.626
1396
+ Yeah, it might be, but I'm not sure we have
1397
+ a master thesis of that.
1398
+
1399
+ 0:41:57.526 --> 0:41:59.624
1400
+ If seen my videos,.
1401
+
1402
+ 0:42:00.000 --> 0:42:09.875
1403
+ Of the recordings, I also have it more likely
1404
+ that it's like a fluent speak and I'm not like
1405
+
1406
+ 0:42:09.875 --> 0:42:12.318
1407
+ doing the hesitations.
1408
+
1409
+ 0:42:12.652 --> 0:42:23.764
1410
+ Don't know if somebody else has looked into
1411
+ the Cusera video, but notice that.
1412
+
1413
+ 0:42:25.005 --> 0:42:31.879
1414
+ For these videos spoke every minute, three
1415
+ times or something, and then people were there
1416
+
1417
+ 0:42:31.879 --> 0:42:35.011
1418
+ and cutting things and making hopefully.
1419
+
1420
+ 0:42:35.635 --> 0:42:42.445
1421
+ And therefore if you want to more achieve
1422
+ that, of course, no longer exactly what was
1423
+
1424
+ 0:42:42.445 --> 0:42:50.206
1425
+ happening, but if it more looks like a professional
1426
+ video, then you would have to do that and cut
1427
+
1428
+ 0:42:50.206 --> 0:42:50.998
1429
+ that out.
1430
+
1431
+ 0:42:50.919 --> 0:42:53.535
1432
+ But yeah, there are definitely.
1433
+
1434
+ 0:42:55.996 --> 0:42:59.008
1435
+ We're also going to do this thing again.
1436
+
1437
+ 0:42:58.935 --> 0:43:02.317
1438
+ First turn is like I'm going to have a very.
1439
+
1440
+ 0:43:02.422 --> 0:43:07.449
1441
+ Which in the end they start to slow down just
1442
+ without feeling as though they're.
1443
+
1444
+ 0:43:07.407 --> 0:43:10.212
1445
+ It's a good point for the next.
1446
+
1447
+ 0:43:10.124 --> 0:43:13.561
1448
+ There is not the one perfect solution.
1449
+
1450
+ 0:43:13.473 --> 0:43:20.656
1451
+ There's some work on disfluency removal,
1452
+ but of course there's also disability.
1453
+
1454
+ 0:43:20.567 --> 0:43:27.397
1455
+ Removal is not that easy, so do you just remove
1456
+ that's in order everywhere.
1457
+
1458
+ 0:43:27.607 --> 0:43:29.708
1459
+ But how much like cleaning do you do?
1460
+
1461
+ 0:43:29.652 --> 0:43:31.368
1462
+ It's more a continuous thing.
1463
+
1464
+ 0:43:31.811 --> 0:43:38.211
1465
+ Is it more really you only remove stuff or
1466
+ are you also into rephrasing and here is only
1467
+
1468
+ 0:43:38.211 --> 0:43:38.930
1469
+ removing?
1470
+
1471
+ 0:43:39.279 --> 0:43:41.664
1472
+ But maybe you want to rephrase it.
1473
+
1474
+ 0:43:41.596 --> 0:43:43.234
1475
+ That's hearing better.
1476
+
1477
+ 0:43:43.503 --> 0:43:49.185
1478
+ So then it's going into what people are doing
1479
+ in style transfer.
1480
+
1481
+ 0:43:49.097 --> 0:43:52.422
1482
+ We are going from a speech style to.
1483
+
1484
+ 0:43:52.872 --> 0:44:07.632
1485
+ So there is more continuum, and of course
1486
+ Airconditioner is not the perfect solution,
1487
+
1488
+ 0:44:07.632 --> 0:44:10.722
1489
+ but exactly what.
1490
+
1491
+ 0:44:15.615 --> 0:44:19.005
1492
+ Yeah, we're challenging.
1493
+
1494
+ 0:44:18.869 --> 0:44:30.216
1495
+ You have examples where the direct copy is
1496
+ not as hard or is not exactly the same.
1497
+
1498
+ 0:44:30.080 --> 0:44:35.415
1499
+ That is, of course, more challenging.
1500
+
1501
+ 0:44:41.861 --> 0:44:49.889
1502
+ If it's getting really mean why it's so challenging,
1503
+ if it's really spontaneous even for the speaker,
1504
+
1505
+ 0:44:49.889 --> 0:44:55.634
1506
+ you need maybe even the video to really get
1507
+ that and at least the audio.
1508
+
1509
+ 0:45:01.841 --> 0:45:06.025
1510
+ Yeah what it also depends on.
1511
+
1512
+ 0:45:06.626 --> 0:45:15.253
1513
+ The purpose, of course, and very important
1514
+ thing is the easiest tasks just to removing.
1515
+
1516
+ 0:45:15.675 --> 0:45:25.841
1517
+ Of course you have to be very careful because
1518
+ if you remove some of the not, it's normally
1519
+
1520
+ 0:45:25.841 --> 0:45:26.958
1521
+ not much.
1522
+
1523
+ 0:45:27.227 --> 0:45:33.176
1524
+ But if you remove too much, of course, that's
1525
+ very, very bad because you're losing important.
1526
+
1527
+ 0:45:33.653 --> 0:45:46.176
1528
+ And this might be even more challenging if
1529
+ you think about rarer and unseen words.
1530
+
1531
+ 0:45:46.226 --> 0:45:56.532
1532
+ So when doing this removal, it's important
1533
+ to be careful and normally more conservative.
1534
+
1535
+ 0:46:03.083 --> 0:46:15.096
1536
+ Of course, also you have to again see if you're
1537
+ doing that now in a two step approach, not
1538
+
1539
+ 0:46:15.096 --> 0:46:17.076
1540
+ an end to end.
1541
+
1542
+ 0:46:16.944 --> 0:46:20.777
1543
+ So first you need a remote.
1544
+
1545
+ 0:46:21.501 --> 0:46:30.230
1546
+ But you have to somehow think of it in the whole
1547
+ pipeline.
1548
+
1549
+ 0:46:30.074 --> 0:46:36.936
1550
+ If you learn text or remove disfluencies,.
1551
+
1552
+ 0:46:36.796 --> 0:46:44.070
1553
+ But it might be that the ASR system is outputing
1554
+ something else or that it's more of an ASR
1555
+
1556
+ 0:46:44.070 --> 0:46:44.623
1557
+ error.
1558
+
1559
+ 0:46:44.864 --> 0:46:46.756
1560
+ So um.
1561
+
1562
+ 0:46:46.506 --> 0:46:52.248
1563
+ Just for example, if you do it based on language
1564
+ modeling scores, it might be that you're just
1565
+
1566
+ 0:46:52.248 --> 0:46:57.568
1567
+ the language modeling score because the ASR has
1568
+ done some errors, so you really have to see
1569
+
1570
+ 0:46:57.568 --> 0:46:59.079
1571
+ the combination of that.
1572
+
1573
+ 0:46:59.419 --> 0:47:04.285
1574
+ And for example, we had like partial words.
1575
+
1576
+ 0:47:04.174 --> 0:47:06.441
1577
+ They are like some.
1578
+
1579
+ 0:47:06.328 --> 0:47:08.827
1580
+ We didn't have that.
1581
+
1582
+ 0:47:08.908 --> 0:47:18.248
1583
+ So these feelings cannot be that you start
1584
+ in the middle of the word and then you switch
1585
+
1586
+ 0:47:18.248 --> 0:47:19.182
1587
+ because.
1588
+
1589
+ 0:47:19.499 --> 0:47:23.214
1590
+ And of course, in text in perfect transcript,
1591
+ that's very easy to recognize.
1592
+
1593
+ 0:47:23.166 --> 0:47:24.374
1594
+ That's not a real word.
1595
+
1596
+ 0:47:24.904 --> 0:47:37.198
1597
+ However, when you really do it into an system,
1598
+ he will normally detect some type of word because
1599
+
1600
+ 0:47:37.198 --> 0:47:40.747
1601
+ he only can help the words.
1602
+
1603
+ 0:47:50.050 --> 0:48:03.450
1604
+ Example: We should think so if you have this
1605
+ in the transcript it's easy to detect as a
1606
+
1607
+ 0:48:03.450 --> 0:48:05.277
1608
+ disfluency.
1609
+
1610
+ 0:48:05.986 --> 0:48:11.619
1611
+ And then, of course, it's more challenging
1612
+ in a real world example where you have.
1613
+
1614
+ 0:48:12.492 --> 0:48:27.834
1615
+ Style TransferNow to the approaches one thing
1616
+ is to really put it in between so you put your
1617
+
1618
+ 0:48:27.834 --> 0:48:29.814
1619
+ A's system.
1620
+
1621
+ 0:48:31.391 --> 0:48:45.139
1622
+ So what your task is like, so you have this
1623
+ text and the outputs in this text.
1624
+
1625
+ 0:48:45.565 --> 0:48:49.605
1626
+ There is different formulations of that.
1627
+
1628
+ 0:48:49.507 --> 0:48:54.535
1629
+ You might not be able to do everything like
1630
+ that.
1631
+
1632
+ 0:48:55.195 --> 0:49:10.852
1633
+ Or do you also allow, for example, rephrasing
1634
+ for reordering so in text you might have the
1635
+
1636
+ 0:49:10.852 --> 0:49:13.605
1637
+ word correctly.
1638
+
1639
+ 0:49:13.513 --> 0:49:24.201
1640
+ But the easiest thing is you only do it more
1641
+ like removing, so some things can be removed.
1642
+
1643
+ 0:49:29.049 --> 0:49:34.508
1644
+ Any ideas how to do that this is output.
1645
+
1646
+ 0:49:34.375 --> 0:49:41.036
1647
+ You have training data so we have training
1648
+ data.
1649
+
1650
+ 0:49:47.507 --> 0:49:55.869
1651
+ To put in with the spoon you can eat it even
1652
+ after it is out, but after the machine has.
1653
+
1654
+ 0:50:00.000 --> 0:50:05.511
1655
+ Was wearing rocks, so you have not just the
1656
+ shoes you remove but wearing them as input,
1657
+
1658
+ 0:50:05.511 --> 0:50:07.578
1659
+ as disfluent text and as output.
1660
+
1661
+ 0:50:07.515 --> 0:50:09.152
1662
+ It should be fueled text.
1663
+
1664
+ 0:50:09.089 --> 0:50:15.168
1665
+ It can be before or after recycling as you
1666
+ said, but you have this type of task, so technically
1667
+
1668
+ 0:50:15.168 --> 0:50:20.043
1669
+ how would you address this type of task when
1670
+ you have to solve this type of.
1671
+
1672
+ 0:50:24.364 --> 0:50:26.181
1673
+ That's exactly so.
1674
+
1675
+ 0:50:26.086 --> 0:50:28.802
1676
+ That's one way of doing it.
1677
+
1678
+ 0:50:28.705 --> 0:50:33.072
1679
+ It's a translation task and you train your.
1680
+
1681
+ 0:50:33.913 --> 0:50:34.683
1682
+ Can do.
1683
+
1684
+ 0:50:34.587 --> 0:50:42.859
1685
+ Then, of course, the bit of the challenge
1686
+ is that you automatically allow rephrasing
1687
+
1688
+ 0:50:42.859 --> 0:50:43.540
1689
+ stuff.
1690
+
1691
+ 0:50:43.943 --> 0:50:52.240
1692
+ Which of the one end is good so you have more
1693
+ opportunities but it might be also a bad thing
1694
+
1695
+ 0:50:52.240 --> 0:50:58.307
1696
+ because if you have more opportunities you
1697
+ have more opportunities.
1698
+
1699
+ 0:51:01.041 --> 0:51:08.300
1700
+ If you want to prevent that, it can also do
1701
+ more simple labeling, so for each word your
1702
+
1703
+ 0:51:08.300 --> 0:51:10.693
1704
+ label should not be removed.
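A sketch of the labelling formulation just described: instead of free generation, every input token gets a keep/delete label. Training labels can be derived from parallel disfluent/cleaned text with a word alignment; `difflib` is used here as a simple stand-in for that alignment, so this is illustrative rather than the lecture's actual setup.

```python
import difflib

def keep_delete_labels(disfluent, fluent):
    """Return one 'keep'/'delete' label per disfluent token."""
    labels = ["delete"] * len(disfluent)
    sm = difflib.SequenceMatcher(a=disfluent, b=fluent, autojunk=False)
    for i, _, size in sm.get_matching_blocks():
        for k in range(size):
            labels[i + k] = "keep"
    return labels

disfluent = "I want a ticket to Denver uh to Houston".split()
fluent = "I want a ticket to Houston".split()
print(list(zip(disfluent, keep_delete_labels(disfluent, fluent))))
```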
1705
+
1706
+ 0:51:12.132 --> 0:51:17.658
1707
+ People have also looked into parsing.
1708
+
1709
+ 0:51:17.530 --> 0:51:29.098
1710
+ You remember maybe the parse trees at the beginning
1711
+ like the structure because the ideas.
1712
+
1713
+ 0:51:29.649 --> 0:51:45.779
1714
+ There's also more unsupervised approaches
1715
+ where you then phrase it as a style transfer
1716
+
1717
+ 0:51:45.779 --> 0:51:46.892
1718
+ task.
1719
+
1720
+ 0:51:50.310 --> 0:51:58.601
1721
+ At the last point since we have that yes,
1722
+ it has also been done in an end-to-end fashion
1723
+
1724
+ 0:51:58.601 --> 0:52:06.519
1725
+ so that it's really you have as input the audio
1726
+ signal and output you have than the.
1727
+
1728
+ 0:52:06.446 --> 0:52:10.750
1729
+ The text, without influence, is a clearly
1730
+ clear text.
1731
+
1732
+ 0:52:11.131 --> 0:52:19.069
1733
+ You model every single total, which of course
1734
+ has a big advantage.
1735
+
1736
+ 0:52:18.950 --> 0:52:25.706
1737
+ You can use these paralinguistic features,
1738
+ pauses, and.
1739
+
1740
+ 0:52:25.705 --> 0:52:34.091
1741
+ If you switch so you start something then
1742
+ oh it doesn't work continue differently so.
1743
+
1744
+ 0:52:34.374 --> 0:52:42.689
1745
+ So you can easily use in a fashion while in
1746
+ a cascade approach.
1747
+
1748
+ 0:52:42.559 --> 0:52:47.500
1749
+ As we saw there you have text input.
1750
+
1751
+ 0:52:49.990 --> 0:53:02.389
1752
+ But on the one end we have again, and in the
1753
+ more extreme case the problem before was endless.
1754
+
1755
+ 0:53:02.258 --> 0:53:06.961
1756
+ Of course there is even less data.
1757
+
1758
+ 0:53:11.611 --> 0:53:12.837
1759
+ Good.
1760
+
1761
+ 0:53:12.633 --> 0:53:30.817
1762
+ This is all about the input to a very more
1763
+ person, or maybe if you think about YouTube.
1764
+
1765
+ 0:53:32.752 --> 0:53:34.989
1766
+ Talk so this could use be very exciting.
1767
+
1768
+ 0:53:36.296 --> 0:53:42.016
1769
+ Is more viewed as style transferred.
1770
+
1771
+ 0:53:41.861 --> 0:53:53.149
1772
+ You can use ideas from machine translation
1773
+ where you have one language.
1774
+
1775
+ 0:53:53.713 --> 0:53:57.193
1776
+ So there is ways of trying to do this type
1777
+ of style transfer.
1778
+
1779
+ 0:53:57.637 --> 0:54:02.478
1780
+ Think is definitely also very promising to
1781
+ make it more and more fluent in a business.
1782
+
1783
+ 0:54:03.223 --> 0:54:17.974
1784
+ Because one major issue about all the previous
1785
+ ones is that you need training data and then
1786
+
1787
+ 0:54:17.974 --> 0:54:21.021
1788
+ you need training.
1789
+
1790
+ 0:54:21.381 --> 0:54:32.966
1791
+ So I mean, think that we are only really of
1792
+ data that we have for English.
1793
+
1794
+ 0:54:32.811 --> 0:54:39.457
1795
+ Maybe there is a very few data in German.
1796
+
1797
+ 0:54:42.382 --> 0:54:49.680
1798
+ Low Latency SpeechOkay, then let's talk about
1799
+ low latency speech.
1800
+
1801
+ 0:54:50.270 --> 0:55:05.158
1802
+ So the idea is if we are doing live translation
1803
+ of a talker, so we want to start out.
1804
+
1805
+ 0:55:05.325 --> 0:55:23.010
1806
+ This is possible because there is typically
1807
+ some kind of monotony in many languages.
1808
+
1809
+ 0:55:24.504 --> 0:55:29.765
1810
+ And this is also what, for example, human
1811
+ interpreters are doing to have a really low
1812
+
1813
+ 0:55:29.765 --> 0:55:30.071
1814
+ lag.
1815
+
1816
+ 0:55:30.750 --> 0:55:34.393
1817
+ They are even going further.
1818
+
1819
+ 0:55:34.268 --> 0:55:40.928
1820
+ They guess what will be the ending of the
1821
+ sentence.
1822
+
1823
+ 0:55:41.421 --> 0:55:51.120
1824
+ Then they can already continue, although it's
1825
+ not sad it might be needed, but that is even
1826
+
1827
+ 0:55:51.120 --> 0:55:53.039
1828
+ more challenging.
1829
+
1830
+ 0:55:54.714 --> 0:55:58.014
1831
+ Why is it so difficult?
1832
+
1833
+ 0:55:57.876 --> 0:56:09.799
1834
+ There is this trade-off: on the one end for
1835
+ a good quality you want to have more context because
1836
+
1837
+ 0:56:09.799 --> 0:56:14.513
1838
+ we learn if we have more context.
1839
+
1840
+ 0:56:15.015 --> 0:56:24.033
1841
+ And therefore to have more context you have
1842
+ to wait as long as possible.
1843
+
1844
+ 0:56:23.911 --> 0:56:27.693
1845
+ The best is to have the full.
1846
+
1847
+ 0:56:28.168 --> 0:56:35.244
1848
+ On the other hand, you want to have a low
1849
+ latency for the user to wait to generate as
1850
+
1851
+ 0:56:35.244 --> 0:56:35.737
1852
+ soon.
1853
+
1854
+ 0:56:36.356 --> 0:56:47.149
1855
+ So if you're doing no situation you have to
1856
+ find the best way to start in order to have
1857
+
1858
+ 0:56:47.149 --> 0:56:48.130
1859
+ a good.
1860
+
1861
+ 0:56:48.728 --> 0:56:52.296
1862
+ There's no longer the perfect solution.
1863
+
1864
+ 0:56:52.207 --> 0:56:56.847
1865
+ People will also evaluate what is the translation.
1866
+
1867
+ 0:56:57.657 --> 0:57:09.942
1868
+ While it's challenging in German to English,
1869
+ German has this very nice thing where the prefix
1870
+
1871
+ 0:57:09.942 --> 0:57:16.607
1872
+ of the word can be put at the end of the sentence.
1873
+
1874
+ 0:57:17.137 --> 0:57:24.201
1875
+ And you only know if the person registers
1876
+ or cancels his station at the end of the sentence.
1877
+
1878
+ 0:57:24.985 --> 0:57:33.690
1879
+ So if you want to start the translation in
1880
+ English you need to know at this point is the.
1881
+
1882
+ 0:57:35.275 --> 0:57:39.993
1883
+ So you would have to wait until the end of
1884
+ the year.
1885
+
1886
+ 0:57:39.904 --> 0:57:42.934
1887
+ That's not really what you want.
1888
+
1889
+ 0:57:43.843 --> 0:57:45.795
1890
+ What happened.
1891
+
1892
+ 0:57:47.207 --> 0:58:09.887
1893
+ Other solutions of doing that are: Have been
1894
+ motivating like how we can do that subject
1895
+
1896
+ 0:58:09.887 --> 0:58:16.073
1897
+ object or subject work.
1898
+
1899
+ 0:58:16.496 --> 0:58:24.582
1900
+ In German it's not always subject, but there
1901
+ are relative sentence where you have that,
1902
+
1903
+ 0:58:24.582 --> 0:58:25.777
1904
+ so it needs.
1905
+
1906
+ 0:58:28.808 --> 0:58:41.858
1907
+ How we can do that is, we'll look today into
1908
+ three ways of doing that.
1909
+
1910
+ 0:58:41.674 --> 0:58:46.277
1911
+ The one is to mitigate.
1912
+
1913
+ 0:58:46.766 --> 0:58:54.824
1914
+ And then the other idea is to do retranslating,
1915
+ and there you can now use the text output.
1916
+
1917
+ 0:58:54.934 --> 0:59:02.302
1918
+ So the idea is you translate, and if you later
1919
+ notice it was wrong then you can retranslate
1920
+
1921
+ 0:59:02.302 --> 0:59:03.343
1922
+ and correct.
1923
+
1924
+ 0:59:03.803 --> 0:59:14.383
1925
+ Or you can do what is called extremely coding,
1926
+ so you can generically.
1927
+
1928
+ 0:59:17.237 --> 0:59:30.382
1929
+ Let's start with the optimization, so if you
1930
+ have a sentence, it may reach a conference,
1931
+
1932
+ 0:59:30.382 --> 0:59:33.040
1933
+ and in this time.
1934
+
1935
+ 0:59:32.993 --> 0:59:39.592
1936
+ So you have a good translation quality while
1937
+ still having low latency.
1938
+
1939
+ 0:59:39.699 --> 0:59:50.513
1940
+ You have an extra model which does your segmentation
1941
+ before, but your aim is not to have a segmentation.
1942
+
1943
+ 0:59:50.470 --> 0:59:53.624
1944
+ But you can somehow measure in training data.
1945
+
1946
+ 0:59:53.555 --> 0:59:59.841
1947
+ If do these types of segment lengths, that's
1948
+ my latency and that's my translation quality,
1949
+
1950
+ 0:59:59.841 --> 1:00:02.811
1951
+ and then you can try to search a good way.
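A toy sketch of the tuning loop just described: on held-out data, try several maximum segment lengths, measure translation quality and latency for each, and pick a good operating point. `run_system` is a hypothetical stand-in that segments the dev audio with the given maximum length, runs the fixed speech-translation system, and returns (quality, average latency); the numbers are made up.

```python
def pick_segment_length(run_system, candidates=(5, 10, 15, 20, 30), max_latency=5.0):
    best_len, best_quality = None, float("-inf")
    for max_len in candidates:
        quality, latency = run_system(max_len)
        print(f"max_len={max_len:>2}s  quality={quality:.1f}  latency={latency:.1f}s")
        if latency <= max_latency and quality > best_quality:
            best_len, best_quality = max_len, quality
    return best_len

# toy stand-in: longer segments help quality but cost latency
print(pick_segment_length(lambda L: (20 + 0.3 * L, 0.4 * L)))
```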
1952
+
1953
+ 1:00:03.443 --> 1:00:20.188
1954
+ If you're doing that one, it's an extra component,
1955
+ so you can use your system as it was.
1956
+
1957
+ 1:00:22.002 --> 1:00:28.373
1958
+ The other idea is to directly output the first
1959
+ hypothesis always, so always when you have
1960
+
1961
+ 1:00:28.373 --> 1:00:34.201
1962
+ text or audio we translate, and if we then
1963
+ have more context available we can update.
1964
+
1965
+ 1:00:35.015 --> 1:00:50.195
1966
+ So imagine before, if get an eye register
1967
+ and there's a sentence continued, then.
1968
+
1969
+ 1:00:50.670 --> 1:00:54.298
1970
+ So you change the output.
1971
+
1972
+ 1:00:54.159 --> 1:01:07.398
1973
+ Of course, that might be also leading to bad
1974
+ user experience if you always flicker and change
1975
+
1976
+ 1:01:07.398 --> 1:01:09.229
1977
+ your output.
1978
+
1979
+ 1:01:09.669 --> 1:01:15.329
1980
+ It's a bit like human interpreters, who also are able
1981
+ to correct, so they're doing a more long text.
1982
+
1983
+ 1:01:15.268 --> 1:01:20.829
1984
+ If they are guessing how to continue to say
1985
+ and then he's saying something different, they
1986
+
1987
+ 1:01:20.829 --> 1:01:22.480
1988
+ also have to correct themselves.
1989
+
1990
+ 1:01:22.418 --> 1:01:26.795
1991
+ So here, since it's not audio output, we can even
1992
+ change what we have said.
1993
+
1994
+ 1:01:26.733 --> 1:01:29.632
1995
+ Yes, that's exactly what we have implemented.
1996
+
1997
+ 1:01:31.431 --> 1:01:49.217
1998
+ So how that works is, we are aware, and then
1999
+ we translate it, and if we get more input like
2000
+
2001
+ 1:01:49.217 --> 1:01:51.344
2002
+ you, then.
2003
+
2004
+ 1:01:51.711 --> 1:02:00.223
2005
+ And so we can always continue to do that and
2006
+ improve the transcript that we have.
2007
+
2008
+ 1:02:00.480 --> 1:02:07.729
2009
+ So in the end we have the lowest possible
2010
+ latency because we always output what is possible.
2011
+
2012
+ 1:02:07.651 --> 1:02:14.368
2013
+ On the other hand, introducing a bit of a
2014
+ new problem is: There's another challenge when
2015
+
2016
+ 1:02:14.368 --> 1:02:20.104
2017
+ we first used it: this approach was first used
2018
+ for the older statistical systems, and it worked fine.
2019
+
2020
+ 1:02:20.029 --> 1:02:21.353
2021
+ When you switch to NMT,
2022
+
2023
+ 1:02:21.283 --> 1:02:25.573
2024
+ you see one problem: it is generating even
2025
+ more flickering.
2026
+
2027
+ 1:02:25.503 --> 1:02:28.880
2028
+ The problem is the normal machine translation.
2029
+
2030
+ 1:02:29.669 --> 1:02:35.414
2031
+ It has implicitly learned that the output always
2032
+ ends with a dot, and it's always a full sentence.
2033
+
2034
+ 1:02:36.696 --> 1:02:42.466
2035
+ And this was even more important somewhere
2036
+ in the model than really what is in the input.
2037
+
2038
+ 1:02:42.983 --> 1:02:55.910
2039
+ So if you give him a partial sentence, it
2040
+ will still generate a full sentence.
2041
+
2042
+ 1:02:55.747 --> 1:02:58.214
2043
+ So encourage.
2044
+
2045
+ 1:02:58.298 --> 1:03:05.821
2046
+ It's like trying to just continue it somehow
2047
+ to a full sentence and if it's doing better
2048
+
2049
+ 1:03:05.821 --> 1:03:10.555
2050
+ guessing stuff then you have to even have more
2051
+ changes.
2052
+
2053
+ 1:03:10.890 --> 1:03:23.944
2054
+ So here we have a trained mismatch and that's
2055
+ maybe more a general important thing that the
2056
+
2057
+ 1:03:23.944 --> 1:03:28.910
2058
+ model might learn something a bit different.
2059
+
2060
+ 1:03:29.289 --> 1:03:32.636
2061
+ It's always ending with a dot, so you don't
2062
+ just guess something in general.
2063
+
2064
+ 1:03:33.053 --> 1:03:35.415
2065
+ So we have here a train-test mismatch.
2066
+
2067
+ 1:03:38.918 --> 1:03:41.248
2068
+ Given we have a train-test mismatch:
2069
+
2070
+ 1:03:41.184 --> 1:03:43.710
2071
+ What is the best way to address that?
2072
+
2073
+ 1:03:46.526 --> 1:03:51.934
2074
+ That's exactly right, so we have to, like,
2075
+ train also on that.
2076
+
2077
+ 1:03:52.692 --> 1:03:55.503
2078
+ The problem is for partial sentences:
2079
+
2080
+ 1:03:55.431 --> 1:03:59.612
2081
+ there's no training data, so it's hard to
2082
+ find examples of them.
2083
+
2084
+ 1:04:00.580 --> 1:04:06.531
2085
+ However, it's quite easy to generate artificial
2086
+ partial sentences, at least for the source.
2087
+
2088
+ 1:04:06.926 --> 1:04:15.367
2089
+ So you just take, you take all the prefixes
2090
+ of the source data.
2091
+
2092
+ 1:04:17.017 --> 1:04:22.794
2093
+ The problem of course is then a bit: what
2094
+ is the corresponding target?
2095
+
2096
+ 1:04:22.699 --> 1:04:30.846
2097
+ If you have a sentence prefix like "I encourage all of",
2098
+ what should be the right target for that?
2099
+
2100
+ 1:04:31.491 --> 1:04:45.381
2101
+ And the constraints on the one hand, it should
2102
+ be as long as possible, so you don't always have
2103
+
2104
+ 1:04:45.381 --> 1:04:47.541
2105
+ a long delay.
2106
+
2107
+ 1:04:47.687 --> 1:04:55.556
2108
+ On the other hand, it should also be a prefix
2109
+ of the full translation, and it should be not
2110
+
2111
+ 1:04:55.556 --> 1:04:57.304
2112
+ too much inventing.
2113
+
2114
+ 1:04:58.758 --> 1:05:02.170
2115
+ A very easy solution works fine.
2116
+
2117
+ 1:05:02.066 --> 1:05:05.421
2118
+ You can just do it length-based.
2119
+
2120
+ 1:05:05.316 --> 1:05:09.617
2121
+ For two thirds of the source, you also take two thirds of the target.
2122
+
2123
+ 1:05:10.070 --> 1:05:19.626
2124
+ It is then implicitly learning to guess a bit,
2125
+ if you think about the beginning of example.
2126
+
2127
+ 1:05:20.000 --> 1:05:30.287
2128
+ For this one, if you do two thirds, or like half in
2129
+ this case, the target would be "I register".
2130
+
2131
+ 1:05:30.510 --> 1:05:39.289
2132
+ So you're doing a bit of implicit guessing,
2133
+ and if it's getting wrong you have rewriting,
2134
+
2135
+ 1:05:39.289 --> 1:05:43.581
2136
+ but you're doing a good amount of guessing.
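The length-ratio heuristic described in the last few segments can be sketched roughly as follows; this is only an illustration of the idea, and the function name and the strictly proportional split are assumptions, not the lecture's exact recipe.

```python
# Rough sketch of building artificial partial-sentence training data:
# for every source prefix, keep a proportional share of the target,
# and keep the full sentence pair so the model still learns to finish.
def make_prefix_pairs(src_words, tgt_words):
    pairs = [(src_words, tgt_words)]              # the full sentence pair
    for k in range(1, len(src_words)):            # every source prefix
        frac = k / len(src_words)
        j = round(frac * len(tgt_words))          # proportional target length
        pairs.append((src_words[:k], tgt_words[:j]))
    return pairs
```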
2137
+
2138
+ 1:05:49.849 --> 1:05:53.950
2139
+ In addition, this would be like how it looks
2140
+ like if it was like.
2141
+
2142
+ 1:05:53.888 --> 1:05:58.301
2143
+ If it wasn't a housing game, then the target
2144
+ could be something like.
2145
+
2146
+ 1:05:58.979 --> 1:06:02.513
2147
+ One problem is, if you just do that this
2148
+ way.
2149
+
2150
+ 1:06:02.438 --> 1:06:04.622
2151
+ Then most of your training data are partial sentences.
2152
+
2153
+ 1:06:05.245 --> 1:06:11.983
2154
+ And in the end you're interested in the overall
2155
+ translation quality, so for full sentence.
2156
+
2157
+ 1:06:11.909 --> 1:06:18.998
2158
+ So if you train on that, it will mainly learn
2159
+ how to translate prefixes because ninety percent
2160
+
2161
+ 1:06:18.998 --> 1:06:21.536
2162
+ or more of your data is prefixed.
2163
+
2164
+ 1:06:22.202 --> 1:06:31.636
2165
+ That's why we'll see that it's better to do
2166
+ like a ratio.
2167
+
2168
+ 1:06:31.473 --> 1:06:39.285
2169
+ So half your training data are full sentences.
2170
+
2171
+ 1:06:39.759 --> 1:06:47.693
2172
+ Because if you're doing this, well, you see
2173
+ that you get a prefix for every word and only one full sentence.
2174
+
2175
+ 1:06:48.048 --> 1:06:52.252
2176
+ You also see that nicely here here are both.
2177
+
2178
+ 1:06:52.158 --> 1:06:56.551
2179
+ These are the BLEU scores and you see the baseline.
2180
+
2181
+ 1:06:58.518 --> 1:06:59.618
2182
+ Is this one?
2183
+
2184
+ 1:06:59.534 --> 1:07:03.284
2185
+ It has a good quality because it's trained.
2186
+
2187
+ 1:07:03.198 --> 1:07:11.371
2188
+ If you now train with all the partial sentences,
2189
+ it focuses more on how to translate partial
2190
+
2191
+ 1:07:11.371 --> 1:07:12.318
2192
+ sentences.
2193
+
2194
+ 1:07:12.752 --> 1:07:17.840
2195
+ Because all the partial sentences will at
2196
+ some point be removed, because at the end you
2197
+
2198
+ 1:07:17.840 --> 1:07:18.996
2199
+ translate the full.
2200
+
2201
+ 1:07:20.520 --> 1:07:24.079
2202
+ There's many tasks to read, but you have the
2203
+ same performances.
2204
+
2205
+ 1:07:24.504 --> 1:07:26.938
2206
+ On the other hand, you see here the other
2207
+ problem.
2208
+
2209
+ 1:07:26.890 --> 1:07:28.657
2210
+ This is how many words got updated.
2211
+
2212
+ 1:07:29.009 --> 1:07:31.579
2213
+ You want to have as few updates as possible.
2214
+
2215
+ 1:07:31.522 --> 1:07:34.892
2216
+ Updates mean you need to remove things which have once
2217
+ been shown.
2218
+
2219
+ 1:07:35.255 --> 1:07:40.538
2220
+ This is quite high for the baseline.
2221
+
2222
+ 1:07:40.395 --> 1:07:50.535
2223
+ If you know the partials that are going down,
2224
+ they should be removed.
2225
+
2226
+ 1:07:51.151 --> 1:07:58.648
2227
+ And then for moody tasks you have a bit like
2228
+ the best note of swim.
2229
+
2230
+ 1:08:02.722 --> 1:08:05.296
2231
+ Any more questions to this type of approach?
2232
+
2233
+ 1:08:09.309 --> 1:08:20.760
2234
+ The last thing is what to do if you want to do streaming decoding.
2235
+
2236
+ 1:08:21.541 --> 1:08:23.345
2237
+ Again, it's a bit the application
2238
+
2239
+ 1:08:23.287 --> 1:08:25.271
2240
+ scenario that decides what you really want.
2241
+
2242
+ 1:08:25.213 --> 1:08:30.135
2243
+ As you said, we sometimes use this updating,
2244
+ and for text output it'd be very nice.
2245
+
2246
+ 1:08:30.077 --> 1:08:35.203
2247
+ But imagine if you want to audio output, of
2248
+ course you can't change it anymore because
2249
+
2250
+ 1:08:35.203 --> 1:08:37.854
2251
+ on one side you cannot change what was said.
2252
+
2253
+ 1:08:37.795 --> 1:08:40.860
2254
+ So in this time you more need like a fixed
2255
+ output.
2256
+
2257
+ 1:08:41.121 --> 1:08:47.440
2258
+ And then this style of streaming decoding is interesting.
2259
+
2260
+ 1:08:47.323 --> 1:08:55.586
2261
+ Where you, for example, get sourced, the seagullins
2262
+ are so stoked in.
2263
+
2264
+ 1:08:55.468 --> 1:09:00.901
2265
+ Then you decide oh, now it's better to wait.
2266
+
2267
+ 1:09:01.041 --> 1:09:14.643
2268
+ So you somehow need to have this type of additional
2269
+ information.
2270
+
2271
+ 1:09:15.295 --> 1:09:23.074
2272
+ Here you have to decide: should I now output
2273
+ a token, or should I wait for more input?
2274
+
2275
+ 1:09:26.546 --> 1:09:32.649
2276
+ So you have to do these additional labels like
2277
+ wait, wait, output, output, wait and so
2278
+
2279
+ 1:09:32.649 --> 1:09:32.920
2280
+ on.
2281
+
2282
+ 1:09:33.453 --> 1:09:38.481
2283
+ There are different ways of doing that.
2284
+
2285
+ 1:09:38.355 --> 1:09:45.773
2286
+ You can have an additional model that does
2287
+ this decision.
2288
+
2289
+ 1:09:46.166 --> 1:09:53.669
2290
+ And then have a higher quality or better to
2291
+ continue and then have a lower latency in this
2292
+
2293
+ 1:09:53.669 --> 1:09:54.576
2294
+ different.
2295
+
2296
+ 1:09:55.215 --> 1:09:59.241
2297
+ Surprisingly, a very easy strategy also works
2298
+ sometimes quite well.
2299
+
2300
+ 1:10:03.043 --> 1:10:10.981
2301
+ And that is the so-called wait-k policy,
2302
+ and the idea is there at least for text to
2303
+
2304
+ 1:10:10.981 --> 1:10:14.623
2305
+ text translation that is working well.
2306
+
2307
+ 1:10:14.530 --> 1:10:22.376
2308
+ It's like you wait for k words and then you
2309
+ always output one word for each new input word.
2310
+
2311
+ 1:10:22.682 --> 1:10:28.908
2312
+ So you wait only at the beginning
2313
+ of the sentence, and every time a new word
2314
+
2315
+ 1:10:28.908 --> 1:10:29.981
2316
+ is coming in, you output one word.
2317
+
2318
+ 1:10:31.091 --> 1:10:39.459
2319
+ So you have the same times to beat as input,
2320
+ so you're not lagging more and more, but you
2321
+
2322
+ 1:10:39.459 --> 1:10:41.456
2323
+ have enough context.
2324
+
2325
+ 1:10:43.103 --> 1:10:49.283
2326
+ Of course this for example for the unmarried
2327
+ will not solve it perfectly but if you have
2328
+
2329
+ 1:10:49.283 --> 1:10:55.395
2330
+ a bit of local reordering inside your window of k tokens
2331
+ that you can manage very well and then it's
2332
+
2333
+ 1:10:55.395 --> 1:10:57.687
2334
+ a very simple solution but it's.
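As a rough illustration of this fixed schedule, a wait-k loop for the text-to-text case might look like the sketch below; `translate_step` is a hypothetical decoder call and the flushing at the end is simplified.

```python
# Minimal wait-k schedule: wait for k source tokens, then alternate
# READ one source token / WRITE one target token; flush at the end.
def wait_k(source_stream, k, translate_step, max_extra=50):
    read, written = [], []
    for token in source_stream:
        read.append(token)                                 # READ one source token
        if len(read) >= k:
            written.append(translate_step(read, written))  # WRITE one target token
    for _ in range(max_extra):                             # simplified flush after source ends
        nxt = translate_step(read, written)
        if nxt == "</s>":
            break
        written.append(nxt)
    return written
```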
2335
+
2336
+ 1:10:57.877 --> 1:11:00.481
2337
+ The other one was dynamic.
2338
+
2339
+ 1:11:00.385 --> 1:11:06.944
2340
+ Depending on the context you can decide how
2341
+ long you want to wait.
2342
+
2343
+ 1:11:07.687 --> 1:11:21.506
2344
+ It also only works if you have a similar amount
2345
+ of tokens, so if your target is very short
2346
+
2347
+ 1:11:21.506 --> 1:11:22.113
2348
+ of.
2349
+
2350
+ 1:11:22.722 --> 1:11:28.791
2351
+ That's why it's also more challenging for
2352
+ audio input because the speaking rate is changing
2353
+
2354
+ 1:11:28.791 --> 1:11:29.517
2355
+ and so on.
2356
+
2357
+ 1:11:29.451 --> 1:11:35.582
2358
+ You would have to do something like I'll output
2359
+ a word for every second of audio or something
2360
+
2361
+ 1:11:35.582 --> 1:11:35.982
2362
+ like.
2363
+
2364
+ 1:11:36.636 --> 1:11:45.459
2365
+ The problem is that the audio speaking speed
2366
+ is not fixed but varies quite a lot, and therefore.
2367
+
2368
+ 1:11:50.170 --> 1:11:58.278
2369
+ Therefore, what you can also do is you can
2370
+ use a similar solution than we had before with
2371
+
2372
+ 1:11:58.278 --> 1:11:59.809
2373
+ the retranslation.
2374
+
2375
+ 1:12:00.080 --> 1:12:02.904
2376
+ You remember we were re-decoded all the time.
2377
+
2378
+ 1:12:03.423 --> 1:12:12.253
2379
+ And you can do something similar in this case
2380
+ except that you add something in that you're
2381
+
2382
+ 1:12:12.253 --> 1:12:16.813
2383
+ saying: oh, if I re-decode, I'm not always free.
2384
+
2385
+ 1:12:16.736 --> 1:12:22.065
2386
+ Can decode as I want, but you can do this
2387
+ target prefix decoding, so what you say is
2388
+
2389
+ 1:12:22.065 --> 1:12:23.883
2390
+ in your achievement section.
2391
+
2392
+ 1:12:23.820 --> 1:12:26.830
2393
+ You can easily say generate a translation
2394
+ but:
2395
+
2396
+ 1:12:27.007 --> 1:12:29.810
2397
+ The translation has to start with the prefix.
2398
+
2399
+ 1:12:31.251 --> 1:12:35.350
2400
+ How can you do that?
2401
+
2402
+ 1:12:39.839 --> 1:12:49.105
2403
+ In the decoder exactly you start, so if you
2404
+ do beam search you select always the most probable.
2405
+
2406
+ 1:12:49.349 --> 1:12:57.867
2407
+ And now you say oh, I'm not selecting the
2408
+ most probable one, but this one is forced, so in
2409
+
2410
+ 1:12:57.867 --> 1:13:04.603
2411
+ the first step have to take this one, in the
2412
+ second start decoding.
2413
+
2414
+ 1:13:04.884 --> 1:13:09.387
2415
+ And then you're making sure that your output
2416
+ always starts with this prefix.
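A greedy version of this prefix-constrained decoding could look like the following sketch; `next_token_scores` is a hypothetical model call returning token log-probabilities, not part of any specific toolkit.

```python
# Force the committed prefix during the first decoding steps,
# then continue with normal (here: greedy) decoding.
def decode_with_prefix(source, prefix, next_token_scores, eos="</s>", max_len=200):
    output = []
    for step in range(max_len):
        if step < len(prefix):
            token = prefix[step]                     # forced: reproduce the prefix
        else:
            scores = next_token_scores(source, output)
            token = max(scores, key=scores.get)      # free: most probable next token
        if token == eos:
            break
        output.append(token)
    return output
```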
2417
+
2418
+ 1:13:10.350 --> 1:13:18.627
2419
+ And then you can use your immediate retranslation,
2420
+ but you're no longer changing the output.
2421
+
2422
+ 1:13:19.099 --> 1:13:31.595
2423
+ Out as it works, so it may get a speech signal
2424
+ and input, and it is not outputing any.
2425
+
2426
+ 1:13:32.212 --> 1:13:45.980
2427
+ So then if you got you get a translation maybe
2428
+ and then you decide yes output.
2429
+
2430
+ 1:13:46.766 --> 1:13:54.250
2431
+ And then you're translating as one as two
2432
+ as sweet as four, but now you say generate
2433
+
2434
+ 1:13:54.250 --> 1:13:55.483
2435
+ only outputs.
2436
+
2437
+ 1:13:55.935 --> 1:14:07.163
2438
+ And then you're translating and maybe you're
2439
+ deciding on and now a good translation.
2440
+
2441
+ 1:14:07.031 --> 1:14:08.891
2442
+ Then you're.
2443
+
2444
+ 1:14:09.749 --> 1:14:29.984
2445
+ Yes, but don't get to worry about what the
2446
+ effect is.
2447
+
2448
+ 1:14:30.050 --> 1:14:31.842
2449
+ We're generating your target text.
2450
+
2451
+ 1:14:32.892 --> 1:14:36.930
2452
+ But we're not always outputing the full target
2453
+ text now.
2454
+
2455
+ 1:14:36.859 --> 1:14:43.693
2456
+ What we are having is we have here some strategy
2457
+ to decide: Oh, is a system already sure enough
2458
+
2459
+ 1:14:43.693 --> 1:14:44.405
2460
+ about it?
2461
+
2462
+ 1:14:44.334 --> 1:14:49.374
2463
+ If it's sure enough and it has all the information,
2464
+ we can output it.
2465
+
2466
+ 1:14:49.302 --> 1:14:50.746
2467
+ And then the next.
2468
+
2469
+ 1:14:51.291 --> 1:14:55.931
2470
+ If we say here it is sometimes better not to
2471
+ output yet, we won't output it already.
2472
+
2473
+ 1:14:57.777 --> 1:15:06.369
2474
+ And thereby the hope is that in the example the model
2475
+ should not yet output "I register" because it
2476
+
2477
+ 1:15:06.369 --> 1:15:10.568
2478
+ does not yet know if that is the case or not.
2479
+
2480
+ 1:15:13.193 --> 1:15:18.039
2481
+ Output strategies: So what we have to discuss
2482
+ is what is a good output strategy.
2483
+
2484
+ 1:15:18.658 --> 1:15:20.070
2485
+ So you could do.
2486
+
2487
+ 1:15:19.987 --> 1:15:23.808
2488
+ The output strategy could be something like.
2489
+
2490
+ 1:15:23.743 --> 1:15:39.871
2491
+ If you think of wait-k, this is an output
2492
+ strategy where you always output one word per input word.
2493
+
2494
+ 1:15:40.220 --> 1:15:44.990
2495
+ Good, and you can view your weight in a similar
2496
+ way as.
2497
+
2498
+ 1:15:45.265 --> 1:15:55.194
2499
+ But now, of course, we can also look at other
2500
+ output strategies where it's more generic and
2501
+
2502
+ 1:15:55.194 --> 1:15:59.727
2503
+ it's deciding whether in some situations.
2504
+
2505
+ 1:16:01.121 --> 1:16:12.739
2506
+ And one thing that works quite well is referred
2507
+ to as local agreement, and that means you're
2508
+
2509
+ 1:16:12.739 --> 1:16:13.738
2510
+ always.
2511
+
2512
+ 1:16:14.234 --> 1:16:26.978
2513
+ Then you're looking what is the same thing
2514
+ between my current translation and the one
2515
+
2516
+ 1:16:26.978 --> 1:16:28.756
2517
+ I did before.
2518
+
2519
+ 1:16:29.349 --> 1:16:31.201
2520
+ So let's do that again in six hours.
2521
+
2522
+ 1:16:31.891 --> 1:16:45.900
2523
+ So your input is a first audio segment and
2524
+ your title text is all model trains.
2525
+
2526
+ 1:16:46.346 --> 1:16:53.231
2527
+ Then you're getting six opposites, one and
2528
+ two, and this time the output is all models.
2529
+
2530
+ 1:16:54.694 --> 1:17:08.407
2531
+ You see trains are different, but both of
2532
+ them agree that it's all so in those cases.
2533
+
2534
+ 1:17:09.209 --> 1:17:13.806
2535
+ So we can hopefully be quite sure that it really
2536
+ starts with all.
2537
+
2538
+ 1:17:15.155 --> 1:17:22.604
2539
+ So now we say we're output all, so at this
2540
+ time instead we'll output all, although before.
2541
+
2542
+ 1:17:23.543 --> 1:17:27.422
2543
+ We are getting one, two, three as input.
2544
+
2545
+ 1:17:27.327 --> 1:17:35.703
2546
+ This time we have a prefix, so now we are
2547
+ only allowing translations to start with all.
2548
+
2549
+ 1:17:35.608 --> 1:17:42.939
2550
+ We cannot change that anymore, so we now need
2551
+ to generate some translation.
2552
+
2553
+ 1:17:43.363 --> 1:17:46.323
2554
+ And then it can be that it's now: all models
2555
+ are run.
2556
+
2557
+ 1:17:47.927 --> 1:18:01.908
2558
+ Then we compare here and see this agrees on
2559
+ all models so we can output all models.
2560
+
2561
+ 1:18:02.882 --> 1:18:07.356
2562
+ So thereby we can dynamically decide: if the
2563
+ model is very unsure,
2564
+
2565
+ 1:18:07.288 --> 1:18:10.180
2566
+ it always outputs something different.
2567
+
2568
+ 1:18:11.231 --> 1:18:24.872
2569
+ Then we'll wait longer; if it's more sure of
2570
+ the same thing, we hopefully don't need to wait as long.
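Put together, the local agreement strategy sketched in this example can be written roughly as below; `retranslate` is a hypothetical function that re-decodes all audio received so far, constrained to start with the already committed words.

```python
# Commit only the longest common prefix of two consecutive hypotheses
# (and only the part that extends what was already shown).
def longest_common_prefix(a, b):
    out = []
    for x, y in zip(a, b):
        if x != y:
            break
        out.append(x)
    return out

def local_agreement(audio_chunks, retranslate):
    committed, previous = [], []
    for i in range(1, len(audio_chunks) + 1):
        hypothesis = retranslate(audio_chunks[:i], prefix=committed)
        agreed = longest_common_prefix(previous, hypothesis)
        if len(agreed) > len(committed):
            committed = agreed                       # stable words can be output
        previous = hypothesis
    return committed
```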
2571
+
2572
+ 1:18:30.430 --> 1:18:40.238
2573
+ Is it clear again that the signal wouldn't
2574
+ be able to detect?
2575
+
2576
+ 1:18:43.203 --> 1:18:50.553
2577
+ The hope it is because if it's not sure of,
2578
+ of course, it in this kind would have to switch
2579
+
2580
+ 1:18:50.553 --> 1:18:51.671
2581
+ all the time.
2582
+
2583
+ 1:18:56.176 --> 1:19:01.375
2584
+ So if it would be the first step to register
2585
+ and the second time to cancel and they may
2586
+
2587
+ 1:19:01.375 --> 1:19:03.561
2588
+ register again, they wouldn't do it.
2589
+
2590
+ 1:19:03.502 --> 1:19:08.348
2591
+ Of course, it is very short because in register
2592
+ a long time, then it can't deal.
2593
+
2594
+ 1:19:08.568 --> 1:19:23.410
2595
+ That's why there's two parameters that you
2596
+ can use and which might be important: how often you re-decode.
2597
+
2598
+ 1:19:23.763 --> 1:19:27.920
2599
+ So you do it like every one second, every
2600
+ five seconds or something like that.
2601
+
2602
+ 1:19:28.648 --> 1:19:37.695
2603
+ The more often you do it, the lower your latency will be
2604
+ because your wait is less long, but also
2605
+
2606
+ 1:19:37.695 --> 1:19:39.185
2607
+ you might do more changes.
2608
+
2609
+ 1:19:40.400 --> 1:19:50.004
2610
+ So that is the one thing and the other thing
2611
+ is for words you might do everywhere, but if
2612
+
2613
+ 1:19:50.004 --> 1:19:52.779
2614
+ you think about audio it.
2615
+
2616
+ 1:19:53.493 --> 1:20:04.287
2617
+ And the other question you can do like the
2618
+ agreement, so the model is sure.
2619
+
2620
+ 1:20:04.145 --> 1:20:10.255
2621
+ If you say have to agree, then hopefully.
2622
+
2623
+ 1:20:10.650 --> 1:20:21.369
2624
+ What we saw is think there has been a really
2625
+ normally good performance and otherwise your
2626
+
2627
+ 1:20:21.369 --> 1:20:22.441
2628
+ latency.
2629
+
2630
+ 1:20:22.963 --> 1:20:42.085
2631
+ Okay, we'll just make more tests and we'll
2632
+ get the confidence.
2633
+
2634
+ 1:20:44.884 --> 1:20:47.596
2635
+ Have to completely agree with that.
2636
+
2637
+ 1:20:47.520 --> 1:20:52.968
2638
+ So when this was done, that was our first
2639
+ idea of using the confidence.
2640
+
2641
+ 1:20:52.892 --> 1:21:00.206
2642
+ The problem, and that's my assumption, is
2643
+ that modeling the model confidence is
2644
+
2645
+ 1:21:00.206 --> 1:21:03.940
2646
+ not that easy, and they are often overconfident.
2647
+
2648
+ 1:21:04.324 --> 1:21:17.121
2649
+ In the paper there is this type also where
2650
+ you try to use the confidence in some way to
2651
+
2652
+ 1:21:17.121 --> 1:21:20.465
2653
+ decide the confidence.
2654
+
2655
+ 1:21:21.701 --> 1:21:26.825
2656
+ But that gave worse results, and that's why
2657
+ we looked into that.
2658
+
2659
+ 1:21:27.087 --> 1:21:38.067
2660
+ So it's a very good idea, I think, but it seems
2661
+ not to work, at least how it was implemented.
2662
+
2663
+ 1:21:38.959 --> 1:21:55.670
2664
+ There is one approach that maybe goes more in this direction,
2665
+ which is very new.
2666
+
2667
+ 1:21:55.455 --> 1:22:02.743
2668
+ In this one, if the last word is attending mainly
2669
+ to the end of the audio.
2670
+
2671
+ 1:22:02.942 --> 1:22:04.934
2672
+ Then you maybe should not output it yet.
2673
+
2674
+ 1:22:05.485 --> 1:22:15.539
2675
+ Because they might think there is something
2676
+ more missing that you need to know, so they
2677
+
2678
+ 1:22:15.539 --> 1:22:24.678
2679
+ look at the attention and only output parts
2680
+ which do not attend to the end of the audio signal.
2681
+
2682
+ 1:22:25.045 --> 1:22:40.175
2683
+ So there is, of course, a lot of ways how
2684
+ you can do it better or easier in some way.
2685
+
2686
+ 1:22:41.901 --> 1:22:53.388
2687
+ Instead tries to predict the next word with
2688
+ a large language model, and then for text translation
2689
+
2690
+ 1:22:53.388 --> 1:22:54.911
2691
+ you predict.
2692
+
2693
+ 1:22:55.215 --> 1:23:01.177
2694
+ Then you translate all of them and decide
2695
+ if there is a change so you can even earlier
2696
+
2697
+ 1:23:01.177 --> 1:23:02.410
2698
+ do your decision.
2699
+
2700
+ 1:23:02.362 --> 1:23:08.714
2701
+ The idea is that if we continue and then this
2702
+ will be to a change in the translation, then
2703
+
2704
+ 1:23:08.714 --> 1:23:10.320
2705
+ we should have opened.
2706
+
2707
+ 1:23:10.890 --> 1:23:18.302
2708
+ So it's more doing your estimate about possible
2709
+ continuations of the source instead of looking
2710
+
2711
+ 1:23:18.302 --> 1:23:19.317
2712
+ at previous.
2713
+
2714
+ 1:23:23.783 --> 1:23:31.388
2715
+ All that works is a bit here like one example.
2716
+
2717
+ 1:23:31.227 --> 1:23:39.644
2718
+ It has a legacy baselines and you are not
2719
+ putting.
2720
+
2721
+ 1:23:40.040 --> 1:23:47.041
2722
+ And you see in this case you have worse BLEU
2723
+ scores here.
2724
+
2725
+ 1:23:46.923 --> 1:23:51.673
2726
+ For equal one you have better latency.
2727
+
2728
+ 1:23:52.032 --> 1:24:01.123
2729
+ Does anybody have an idea
2730
+ of what could be challenging there or when?
2731
+
2732
+ 1:24:05.825 --> 1:24:20.132
2733
+ One problem of these models is hallucinations,
2734
+ and often very long output has a negative impact.
2735
+
2736
+ 1:24:24.884 --> 1:24:30.869
2737
+ If you don't remove the last four words but
2738
+ your model now starts to hallucinate and invent
2739
+
2740
+ 1:24:30.869 --> 1:24:37.438
2741
+ just a lot of new stuff then yeah you're removing
2742
+ the last four words of that but if it has invented
2743
+
2744
+ 1:24:37.438 --> 1:24:41.406
2745
+ ten words and you're still outputting six of
2746
+ these invented.
2747
+
2748
+ 1:24:41.982 --> 1:24:48.672
2749
+ Typically once it starts hallucination generating
2750
+ some output, it's quite long, so then it's
2751
+
2752
+ 1:24:48.672 --> 1:24:50.902
2753
+ no longer enough to just hold.
2754
+
2755
+ 1:24:51.511 --> 1:24:57.695
2756
+ And then, of course, a bit better if you compare
2757
+ to the previous ones.
2758
+
2759
+ 1:24:57.608 --> 1:25:01.530
2760
+ Their destinations are typically different.
2761
+
2762
+ 1:25:07.567 --> 1:25:25.939
2763
+ Yes, so we don't talk about the details, but
2764
+ for outputs, for presentations, there's different
2765
+
2766
+ 1:25:25.939 --> 1:25:27.100
2767
+ ways.
2768
+
2769
+ 1:25:27.347 --> 1:25:36.047
2770
+ So you want to have maximum two lines, maximum
2771
+ forty-two characters per line, and the reading
2772
+
2773
+ 1:25:36.047 --> 1:25:40.212
2774
+ speed is a maximum of twenty-one characters per second.
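As a toy illustration of these constraints (the exact limits are the ones just mentioned and may differ between guidelines), a subtitle candidate could be checked like this:

```python
# Check one subtitle block against the constraints mentioned above:
# at most 2 lines, 42 characters per line, 21 characters per second.
def subtitle_ok(lines, duration_s, max_lines=2, max_chars=42, max_cps=21):
    if len(lines) > max_lines:
        return False
    if any(len(line) > max_chars for line in lines):
        return False
    total_chars = sum(len(line) for line in lines)
    return total_chars <= max_cps * max(duration_s, 0.1)
```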
2775
+
2776
+ 1:25:40.981 --> 1:25:43.513
2777
+ How to Do That We Can Skip.
2778
+
2779
+ 1:25:43.463 --> 1:25:46.804
2780
+ Then you can generate something like that.
2781
+
2782
+ 1:25:46.886 --> 1:25:53.250
2783
+ Another challenge is, of course, that you
2784
+ not only need to generate the translation,
2785
+
2786
+ 1:25:53.250 --> 1:25:59.614
2787
+ but for subtlyning you also want to generate
2788
+ when to put breaks and what to display.
2789
+
2790
+ 1:25:59.619 --> 1:26:06.234
2791
+ Because it cannot be full sentences, as said
2792
+ here, if you have like maximum twenty four
2793
+
2794
+ 1:26:06.234 --> 1:26:10.443
2795
+ characters per line, that's not always a full
2796
+ sentence.
2797
+
2798
+ 1:26:10.368 --> 1:26:12.250
2799
+ So how can you make it?
2800
+
2801
+ 1:26:13.093 --> 1:26:16.253
2802
+ And then for speech there's not even a hint
2803
+ of wisdom.
2804
+
2805
+ 1:26:18.398 --> 1:26:27.633
2806
+ So what we have done today is yeah, we looked
2807
+ into maybe three challenges: We have this segmentation,
2808
+
2809
+ 1:26:27.633 --> 1:26:33.065
2810
+ which is a challenge both in evaluation and
2811
+ in the decoder.
2812
+
2813
+ 1:26:32.974 --> 1:26:40.604
2814
+ We talked about disfluencies and we talked
2815
+ about simultaneous translations and how to
2816
+
2817
+ 1:26:40.604 --> 1:26:42.911
2818
+ address these challenges.
2819
+
2820
+ 1:26:43.463 --> 1:26:45.507
2821
+ Any more questions.
2822
+
2823
+ 1:26:48.408 --> 1:26:52.578
2824
+ Good, then with new content
2825
+
2826
+ 1:26:52.396 --> 1:26:58.100
2827
+ We are done for this semester.
2828
+
2829
+ 1:26:57.916 --> 1:27:04.913
2830
+ You can keep your knowledge in that.
2831
+
2832
+ 1:27:04.744 --> 1:27:09.405
2833
+ Repetition where we can try to repeat a bit
2834
+ what we've done all over the semester.
2835
+
2836
+ 1:27:10.010 --> 1:27:13.776
2837
+ I will prepare a bit of repetition of what I think
2838
+ is important.
2839
+
2840
+ 1:27:14.634 --> 1:27:21.441
2841
+ But of course it is also the chance for you to
2842
+ ask specific questions.
2843
+
2844
+ 1:27:21.341 --> 1:27:25.447
2845
+ It's not clear to me how things relate.
2846
+
2847
+ 1:27:25.745 --> 1:27:34.906
2848
+ So if you have any specific questions, please
2849
+ come to me or send me an email or so, then
2850
+
2851
+ 1:27:34.906 --> 1:27:36.038
2852
+ I'm happy.
2853
+
2854
+ 1:27:36.396 --> 1:27:46.665
2855
+ If I should focus on something really in depth, it
2856
+ might be good not to come and send me an email
2857
+
2858
+ 1:27:46.665 --> 1:27:49.204
2859
+ on Wednesday evening.
2860
+
demo_data/lectures/Lecture-19-21.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627fd6a73ed6853821cd58c2fc9e938a7844998ed51c4163f2d0a4771dc5c156
3
+ size 130103518
demo_data/nips-2021/25953/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Sliced Mutual Information: A Scalable Measure of Statistical Dependence"
3
+ }
demo_data/nips-2021/25953/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,581 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:13.140
4
+ Hi everyone, my name is Ziv Goldfeld and this is a joint work with Kristjan Greenewald about
5
+
6
+ 00:13.140 --> 00:18.200
7
+ sliced mutual information, which is a new measure of statistical dependence that has
8
+
9
+ 00:18.200 --> 00:22.520
10
+ some nice scalability properties to high dimensional settings.
11
+
12
+ 00:22.520 --> 00:26.540
13
+ And to get started, I think we're all familiar with classic mutual information that is defined
14
+
15
+ 00:26.540 --> 00:30.920
16
+ between let's say continuous high dimensional random variables, which is the regime that
17
+
18
+ 00:30.920 --> 00:36.240
19
+ we'll mostly be interested in, like so, basically the KL divergence between their joint distributions
20
+
21
+ 00:36.240 --> 00:39.040
22
+ and the product of their marginals.
23
+
24
+ 00:39.040 --> 00:44.520
25
+ And mutual information is indeed this fundamental measure of dependence that enjoys many good
26
+
27
+ 00:44.520 --> 00:50.060
28
+ properties such that the fact that it nullifies if and only if our random variables are independent,
29
+
30
+ 00:50.060 --> 00:55.200
31
+ it is invariant to bijections and it admits several useful representations, decompositions,
32
+
33
+ 00:55.200 --> 00:56.600
34
+ variational forms, etc.
35
+
36
+ 00:56.600 --> 01:02.440
37
+ And in fact, it can be even obtained axiomatically as the unique functional of the joint distribution
38
+
39
+ 01:02.440 --> 01:07.760
40
+ that satisfies some natural informativeness conditions.
41
+
42
+ 01:07.760 --> 01:11.120
43
+ And as such, mutual information has seen a variety of applications in information theory
44
+
45
+ 01:11.120 --> 01:13.680
46
+ and statistics more recently in machine learning.
47
+
48
+ 01:13.680 --> 01:18.920
49
+ But the problem is that all this nice structure comes with a hefty price, since computing
50
+
51
+ 01:18.920 --> 01:24.520
52
+ mutual information in high dimensions or estimating it from samples is very, very hard, effectively
53
+
54
+ 01:24.520 --> 01:25.520
55
+ infeasible.
56
+
57
+ 01:25.520 --> 01:30.240
58
+ And this is the so-called curse of dimensionality and sort of the problem that we try to tackle
59
+
60
+ 01:30.240 --> 01:31.400
61
+ in this work.
62
+
63
+ 01:31.400 --> 01:37.040
64
+ And to address this difficulty, what we propose is sliced mutual information, which is, like
65
+
66
+ 01:37.040 --> 01:42.520
67
+ I said, a new measure of statistical dependence, not necessarily a proxy of mutual information
68
+
69
+ 01:42.520 --> 01:48.820
70
+ as such, but rather an alternative notion, which is defined as this average of scalar
71
+
72
+ 01:48.820 --> 01:54.640
73
+ mutual information terms between projections of our high dimensional variables onto randomly
74
+
75
+ 01:54.640 --> 01:58.520
76
+ chosen directions from the corresponding unit spheres.
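In symbols (notation assumed here, with σ denoting the uniform measure on the unit sphere), the quantity just described reads:

```latex
\mathrm{SI}(X;Y)
  = \mathbb{E}_{\theta,\phi}\!\left[ I\!\left(\theta^{\top}X;\,\phi^{\top}Y\right) \right]
  = \int_{\mathbb{S}^{d_x-1}}\!\int_{\mathbb{S}^{d_y-1}}
      I\!\left(\theta^{\top}X;\,\phi^{\top}Y\right)\, d\sigma(\theta)\, d\sigma(\phi).
```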
77
+
78
+ 01:58.520 --> 02:03.520
79
+ And it's of course inspired by the recent popularization of slicing techniques for statistical
80
+
81
+ 02:03.520 --> 02:07.480
82
+ divergences, in particular the Wasserstein, the sliced Wasserstein distance is a great
83
+
84
+ 02:07.480 --> 02:08.480
85
+ example.
86
+
87
+ 02:08.480 --> 02:14.440
88
+ But the way it works for sliced mutual information is roughly so, well, let's say that this is
89
+
90
+ 02:14.440 --> 02:19.120
91
+ our first high dimensional variable X and this is its distribution.
92
+
93
+ 02:19.120 --> 02:22.480
94
+ What you do is draw a projection direction uniformly from the sphere.
95
+
96
+ 02:22.480 --> 02:26.960
97
+ You then project this random variable onto that direction, do the same for your other
98
+
99
+ 02:26.960 --> 02:28.200
100
+ random variable.
101
+
102
+ 02:28.200 --> 02:34.360
103
+ And now for these two projected scalar new variables, we just compute the mutual information
104
+
105
+ 02:34.360 --> 02:38.560
106
+ between them and average everything over the choice of direction.
107
+
108
+ 02:38.560 --> 02:40.600
109
+ So that's basically the definition.
110
+
111
+ 02:40.600 --> 02:45.880
112
+ And with that, the goal of this work is effectively to show that sliced mutual information is
113
+
114
+ 02:45.880 --> 02:50.080
115
+ both a meaningful and a scalable mutual information alternative.
116
+
117
+ 02:50.080 --> 02:56.200
118
+ Meaningful, well, in the sense that it preserves many of the desired properties that make mutual
119
+
120
+ 02:56.200 --> 03:00.240
121
+ information appealing to begin with and scalable in the sense that it alleviates the set of
122
+
123
+ 03:00.240 --> 03:03.800
124
+ computational and statistical difficulties.
125
+
126
+ 03:03.800 --> 03:04.800
127
+ All right.
128
+
129
+ 03:04.800 --> 03:11.080
130
+ Yeah, and to address this first point, let me show you that, well, despite those one
131
+
132
+ 03:11.080 --> 03:15.800
133
+ dimensional projections, sliced mutual information indeed inherits many of the properties of
134
+
135
+ 03:15.800 --> 03:17.700
136
+ classic mutual information.
137
+
138
+ 03:17.700 --> 03:23.740
139
+ So we have, well, of course, non-negativity, but furthermore, identification of independence.
140
+
141
+ 03:23.740 --> 03:28.960
142
+ We have an entropy decomposition for an appropriate definition of sliced entropy.
143
+
144
+ 03:28.960 --> 03:31.840
145
+ We can represent it as a KL divergence, a sliced KL divergence.
146
+
147
+ 03:31.840 --> 03:38.920
148
+ To be more precise, we have a chain rule tensorization for independent copies, as well as a Donsker-Varadhan-like
149
+
150
+ 03:38.920 --> 03:44.840
151
+ variational form that can be readily used for neural estimation of sliced mutual information.
152
+
153
+ 03:44.840 --> 03:49.720
154
+ We actually make use of that in some of our empirical results.
155
+
156
+ 03:49.720 --> 03:53.400
157
+ And well, I mean, you are more than welcome to check the paper or visit us as a poster
158
+
159
+ 03:53.400 --> 03:55.280
160
+ if you want to know more about any of these.
161
+
162
+ 03:55.280 --> 04:00.480
163
+ But really, the upshot here is that much of the classic structure is still there after
164
+
165
+ 04:00.480 --> 04:02.360
166
+ the slicing.
167
+
168
+ 04:02.360 --> 04:06.240
169
+ Now another interesting feature of sliced mutual information comes to light when you
170
+
171
+ 04:06.240 --> 04:10.400
172
+ think of it in the context of the famous data processing inequality.
173
+
174
+ 04:10.400 --> 04:15.560
175
+ And for starters, recall that classic mutual information satisfies the DPI, which in particular
176
+
177
+ 04:15.560 --> 04:21.440
178
+ means that if you process either of your random variables with a deterministic function, say
179
+
180
+ 04:21.440 --> 04:27.400
181
+ this f over here, you can only lose the informativeness in the classic sense.
182
+
183
+ 04:27.400 --> 04:33.360
184
+ Now sliced mutual information plays differently with processing and can in some sense benefit
185
+
186
+ 04:33.360 --> 04:39.280
187
+ from nice transformations that, let's say, give rise to some nicer manifold for your
188
+
189
+ 04:39.280 --> 04:40.280
190
+ random variable.
191
+
192
+ 04:40.280 --> 04:43.880
193
+ And to understand this, keep in mind that, well, first of all, sliced mutual information
194
+
195
+ 04:43.880 --> 04:47.320
196
+ only looks at projections of random variables.
197
+
198
+ 04:47.320 --> 04:52.720
199
+ And it may very well be the case that some transformations of x, let's say, have more
200
+
201
+ 04:52.720 --> 04:58.480
202
+ informative projections about y than x itself.
203
+
204
+ 04:58.480 --> 05:01.080
205
+ And here's a simple example to that effect.
206
+
207
+ 05:01.080 --> 05:06.120
208
+ So consider a two-dimensional isotropic Gaussian x, so two coordinates, x1 and x2.
209
+
210
+ 05:06.120 --> 05:10.440
211
+ And let's take y to be, for example, its first coordinate.
212
+
213
+ 05:10.440 --> 05:15.440
214
+ Now if you look at the mutual information between two fixed projections of x and y,
215
+
216
+ 05:15.440 --> 05:18.600
217
+ well, projection does nothing to y, right, because it's a scalar.
218
+
219
+ 05:18.600 --> 05:20.400
220
+ But it does affect x.
221
+
222
+ 05:20.400 --> 05:24.520
223
+ And if you look at the mutual information between two projections of x and y, you quickly
224
+
225
+ 05:24.520 --> 05:31.120
226
+ realize that x1 really plays the role of the signal here, whereas x2 behaves like noise.
227
+
228
+ 05:31.120 --> 05:36.120
229
+ And therefore, any transformation that will effectively improve your signal-to-noise ratio,
230
+
231
+ 05:36.120 --> 05:42.520
232
+ for example, like this g sub a over here, where a is less than 1, will indeed give rise
233
+
234
+ 05:42.520 --> 05:45.880
235
+ to a higher sliced mutual information value.
236
+
237
+ 05:45.880 --> 05:50.300
238
+ So all in all, sliced mutual information can be increased from processing, which means
239
+
240
+ 05:50.300 --> 05:54.440
241
+ that, well, in particular, it validates the data processing inequality and is different
242
+
243
+ 05:54.440 --> 05:56.840
244
+ from classic mutual information in that sense.
245
+
246
+ 05:56.840 --> 06:03.120
247
+ But interestingly, and as I will show you shortly, this is actually a quite useful thing
248
+
249
+ 06:03.120 --> 06:08.400
250
+ to have, for example, for feature extraction tasks, because we can use sliced mutual information
251
+
252
+ 06:08.400 --> 06:14.240
253
+ effectively to maximize it in order to extract informative features and land on those nicer
254
+
255
+ 06:14.240 --> 06:17.660
256
+ manifolds that I mentioned a moment ago.
257
+
258
+ 06:17.660 --> 06:22.280
259
+ And here's an example theorem that kind of makes this statement precise or formal, where
260
+
261
+ 06:22.280 --> 06:28.120
262
+ we consider the maximization of sliced mutual information over linear transformations of
263
+
264
+ 06:28.120 --> 06:29.920
265
+ our random variables.
266
+
267
+ 06:29.920 --> 06:34.200
268
+ And this would, of course, not affect classic mutual information at all.
269
+
270
+ 06:34.200 --> 06:39.160
271
+ But what we can show is that for sliced mutual information, this maximization ends up extracting
272
+
273
+ 06:39.160 --> 06:44.960
274
+ the two most informative projection directions for you, which in particular will be encoded
275
+
276
+ 06:44.960 --> 06:52.200
277
+ in the optimizing matrices, these A sub x star and A sub y star.
278
+
279
+ 06:52.200 --> 06:55.240
280
+ And of course, there's nothing special about this particular setup.
281
+
282
+ 06:55.240 --> 07:00.720
283
+ And we can establish similar results for, well, first of all, rank-constrained matrices
284
+
285
+ 07:00.720 --> 07:06.720
286
+ that as opposed to what's shown here would extract the, let's say, our most informative
287
+
288
+ 07:06.720 --> 07:08.840
289
+ features or projection directions.
290
+
291
+ 07:08.840 --> 07:11.120
292
+ In the paper, we also extend this result to shallow neural networks.
293
+
294
+ 07:11.120 --> 07:17.840
295
+ And in fact, our argument can be easily extended to cover additional nonlinear cases as well.
296
+
297
+ 07:17.840 --> 07:21.440
298
+ OK, so that's pretty much for structural properties.
299
+
300
+ 07:21.440 --> 07:25.400
301
+ But like I said at the beginning, the real premise of this framework is overcoming the
302
+
303
+ 07:25.400 --> 07:26.400
304
+ curse of dimensionality.
305
+
306
+ 07:26.400 --> 07:32.640
307
+ And let me show you that this is indeed the case, that sliced mutual information is or
308
+
309
+ 07:32.640 --> 07:38.640
310
+ can be estimated in a scalable manner, effectively by combining your favorite scalar mutual information
311
+
312
+ 07:38.640 --> 07:42.200
313
+ estimator with a simple Monte Carlo average step.
314
+
315
+ 07:42.200 --> 07:43.480
316
+ And this is how it works.
317
+
318
+ 07:43.480 --> 07:48.260
319
+ So let's say we're giving n IID samples from our high-dimensional random variables.
320
+
321
+ 07:48.260 --> 07:53.400
322
+ And we're further given a scalar mutual information estimator that achieves, say, error delta
323
+
324
+ 07:53.400 --> 08:00.240
325
+ of n when applied to n IID samples of some pair of one-dimensional variables, a and b.
326
+
327
+ 08:00.240 --> 08:02.040
328
+ OK, so let's say we have these.
329
+
330
+ 08:02.040 --> 08:08.760
331
+ Now, to estimate sliced mutual information, first thing to do is sample, let's say, m
332
+
333
+ 08:08.760 --> 08:14.680
334
+ random projections from the corresponding spheres in an IID fashion, at which point
335
+
336
+ 08:14.680 --> 08:22.400
337
+ we will take our high-dimensional n samples and project them onto each of these m random
338
+
339
+ 08:22.400 --> 08:24.960
340
+ projections that we've generated.
341
+
342
+ 08:24.960 --> 08:30.780
343
+ And the thing to observe here is that the resulting n times n data set of these projections
344
+
345
+ 08:30.780 --> 08:35.220
346
+ is nothing but IID samples from the corresponding projected distribution, which is the right
347
+
348
+ 08:35.220 --> 08:39.400
349
+ thing to have here if what you're trying to estimate is sliced mutual information.
350
+
351
+ 08:39.400 --> 08:43.860
352
+ So having that, I mean, at this point, per projection direction, we can apply the scalar
353
+
354
+ 08:43.860 --> 08:49.400
355
+ mutual information estimator and then just take one big, happy Monte Carlo average of
356
+
357
+ 08:49.400 --> 08:52.040
358
+ the entire thing over the different projection directions.
359
+
360
+ 08:52.040 --> 08:55.600
361
+ And this would give rise to the proposed sliced mutual information estimator.
362
+
363
+ 08:55.600 --> 08:59.780
364
+ Now, you can compute this thing very easily, because at the end of the day, it's an average
365
+
366
+ 08:59.780 --> 09:03.000
367
+ of scalar mutual information estimates.
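A minimal sketch of this Monte Carlo slicing estimator is given below; `scalar_mi_estimate` stands in for any off-the-shelf one-dimensional mutual information estimator and is an assumption, not the paper's code.

```python
import numpy as np

# Average a 1-D mutual information estimate over m random projection directions.
def sliced_mi_estimate(x, y, m, scalar_mi_estimate, rng=None):
    rng = rng or np.random.default_rng(0)
    n, dx = x.shape
    _, dy = y.shape
    total = 0.0
    for _ in range(m):
        theta = rng.normal(size=dx); theta /= np.linalg.norm(theta)  # uniform on the sphere
        phi = rng.normal(size=dy); phi /= np.linalg.norm(phi)
        total += scalar_mi_estimate(x @ theta, y @ phi)              # 1-D MI of the projections
    return total / m
```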
368
+
369
+ 09:03.000 --> 09:09.120
370
+ And as far as performance guarantees, we can show that so long that the per-sliced mutual
371
+
372
+ 09:09.120 --> 09:15.840
373
+ information is bounded, the uniform absolute error of this estimator scales like 1 over
374
+
375
+ 09:15.840 --> 09:22.240
376
+ the root of m, the number of our Monte Carlo samples, plus the error of the scalar mutual
377
+
378
+ 09:22.240 --> 09:23.240
379
+ information estimator.
380
+
381
+ 09:23.240 --> 09:26.520
382
+ And I'm just restating this informally over here.
383
+
384
+ 09:26.520 --> 09:31.240
385
+ And what this all in all shows is that sliced mutual information can therefore be estimated
386
+
387
+ 09:31.240 --> 09:37.760
388
+ at the rate of the scalar mutual information estimation problem plus this m to the minus half Monte
389
+
390
+ 09:37.760 --> 09:38.760
391
+ Carlo penalty.
392
+
393
+ 09:38.760 --> 09:43.440
394
+ And the thing is that under appropriate smoothness assumptions, the one-dimensional rate is in
395
+
396
+ 09:43.440 --> 09:45.200
397
+ fact parametric.
398
+
399
+ 09:45.200 --> 09:49.720
400
+ And therefore, if you just match the size of your data set and the number of Monte Carlo
401
+
402
+ 09:49.720 --> 09:54.640
403
+ samples, just equate n and m, the sliced mutual information between high-dimensional variables
404
+
405
+ 09:54.640 --> 09:59.360
406
+ can be estimated at the parametric n to the minus half rate, perhaps up to some logarithmic
407
+
408
+ 09:59.360 --> 10:00.360
409
+ factors.
410
+
411
+ 10:00.360 --> 10:06.360
412
+ And this is, of course, a significant speed up and stands in sharp contrast to the slow,
413
+
414
+ 10:06.360 --> 10:12.040
415
+ exponentially bad in dimension, curse of dimensionality rate for classic mutual information.
416
+
417
+ 10:12.040 --> 10:17.200
418
+ Yeah, now this scalability makes, in fact, running empirical experiments with sliced
419
+
420
+ 10:17.200 --> 10:18.720
421
+ mutual information quite a breeze.
422
+
423
+ 10:18.720 --> 10:24.160
424
+ So let me quickly show you some sort of proof of concept experiments, let's say.
425
+
426
+ 10:24.160 --> 10:28.280
427
+ And the first one just relies on the fact that, well, SMI, sliced mutual information
428
+
429
+ 10:28.280 --> 10:29.840
430
+ can identify independence.
431
+
432
+ 10:29.840 --> 10:34.440
433
+ And therefore, we examine it as a figure of merit for independence testing, basically
434
+
435
+ 10:34.440 --> 10:38.640
436
+ by thresholding the computed sliced mutual information value.
437
+
438
+ 10:38.640 --> 10:42.000
439
+ And the results that we have obtained, of course, we've compared them with the same
440
+
441
+ 10:42.000 --> 10:45.360
442
+ test, but based on classic mutual information.
443
+
444
+ 10:45.360 --> 10:50.320
445
+ And this figure over here shows that for a bunch of different settings, well, it presents
446
+
447
+ 10:50.320 --> 10:55.040
448
+ the area under the ROC curve as a function of the number of samples, the standard way
449
+
450
+ 10:55.040 --> 10:59.160
451
+ to represent the quality of an independence test.
452
+
453
+ 10:59.160 --> 11:02.920
454
+ And you basically want this number to be 1, which corresponds to an omniscient test.
455
+
456
+ 11:02.920 --> 11:07.520
457
+ And what we observe is that sliced mutual information performs consistently well across
458
+
459
+ 11:07.520 --> 11:13.080
460
+ different setups and across different dimensions, whereas the performance of the mutual information,
461
+
462
+ 11:13.080 --> 11:18.280
463
+ the classic mutual information-based test, quickly degrades as dimension grows.
464
+
465
+ 11:18.280 --> 11:23.280
466
+ Now, on top of that, let me also demonstrate how sliced mutual information can be used
467
+
468
+ 11:23.280 --> 11:24.680
469
+ for feature extraction.
470
+
471
+ 11:24.680 --> 11:29.780
472
+ And here, what we want to do is maximize the sliced mutual information between linear transformations
473
+
474
+ 11:29.780 --> 11:37.160
475
+ of x and y that are now chosen to be IID samples from the same MNIST class, which we restrict
476
+
477
+ 11:37.160 --> 11:39.240
478
+ to be either 0 or 1.
479
+
480
+ 11:39.240 --> 11:42.840
481
+ And the choice of class is also random, so basically just a fair coin flip.
482
+
483
+ 11:42.840 --> 11:47.280
484
+ And by observing that sliced mutual information between x and y is at most 1 bit, I mean,
485
+
486
+ 11:47.280 --> 11:52.560
487
+ it's always upper bounded by mutual information, which equals a single bit in this case, basically
488
+
489
+ 11:52.560 --> 11:57.320
490
+ the class label, the way to understand what we're doing here is that we're looking for
491
+
492
+ 11:57.320 --> 12:03.400
493
+ the linear feature that is most informative for classifying or determining this class
494
+
495
+ 12:03.400 --> 12:04.760
496
+ label.
497
+
498
+ 12:04.760 --> 12:08.200
499
+ And interestingly enough, this is what this procedure ends up learning, where the figure
500
+
501
+ 12:08.200 --> 12:15.040
502
+ shows basically the first two rows of the optimal A matrix that we obtained, rearranged
503
+
504
+ 12:15.040 --> 12:17.480
505
+ in the dimension of an MNIST image.
506
+
507
+ 12:17.480 --> 12:22.720
508
+ And this really looks like a match filter, if you're familiar, which, when applied to
509
+
510
+ 12:22.720 --> 12:27.480
511
+ the samples, would indeed be able to tell you whether the sample came from the 0 class
512
+
513
+ 12:27.480 --> 12:28.640
514
+ or not.
515
+
516
+ 12:28.640 --> 12:33.680
517
+ And as far as for the value itself, well, the maximized sliced mutual information value
518
+
519
+ 12:33.680 --> 12:39.800
520
+ ends up being roughly 0.7, which is quite close to the 1 bit upper bound, and is much,
521
+
522
+ 12:39.800 --> 12:44.400
523
+ much larger than what you would get if you would not learn A, and let's say just instantiate
524
+
525
+ 12:44.400 --> 12:49.480
526
+ it as a matrix with IID entries drawn according to some distribution.
527
+
528
+ 12:49.480 --> 12:53.640
529
+ And this is just to say that something meaningful indeed being learned here, and something meaningful
530
+
531
+ 12:53.640 --> 13:00.160
532
+ indeed happens when you maximize the sliced mutual information as your optimization objective.
533
+
534
+ 13:00.160 --> 13:03.400
535
+ OK, so yeah, that's basically it.
536
+
537
+ 13:03.400 --> 13:09.160
538
+ And just to recap, we introduced sliced mutual information, which is this average of scalar
539
+
540
+ 13:09.160 --> 13:12.160
541
+ mutual information terms between one-dimensional projections.
542
+
543
+ 13:12.160 --> 13:15.880
544
+ We've seen that it preserves much of the structure of classic mutual information.
545
+
546
+ 13:15.880 --> 13:22.280
547
+ It can be efficiently computed and estimated from samples, and can also be, in fact, increased
548
+
549
+ 13:22.280 --> 13:28.280
550
+ by our processing if, indeed, your processing gives rise to more informative projections.
551
+
552
+ 13:28.280 --> 13:32.960
553
+ And we've presented some proof of concept applications to independence testing, to feature
554
+
555
+ 13:32.960 --> 13:33.960
556
+ extraction.
557
+
558
+ 13:33.960 --> 13:35.800
559
+ We have a couple of more in the paper.
560
+
561
+ 13:35.800 --> 13:36.960
562
+ But let me say this.
563
+
564
+ 13:36.960 --> 13:41.480
565
+ While this is mostly theoretical work, and a large-scale empirical exploration is sort
566
+
567
+ 13:41.480 --> 13:46.640
568
+ of beyond its scope, we firmly believe that sliced mutual information will be extremely
569
+
570
+ 13:46.640 --> 13:51.360
571
+ useful for various such tasks, and are very excited to look into this in the future.
572
+
573
+ 13:51.360 --> 13:52.680
574
+ And yeah, with that, I'll stop.
575
+
576
+ 13:52.680 --> 13:57.220
577
+ Thank you guys for listening, and do visit us at the poster, and check out the paper
578
+
579
+ 13:57.220 --> 14:12.560
580
+ if you would like to know more.
581
+
demo_data/nips-2021/25953/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f4968133dc8ada5fd9bf717fcd61a91049cd3c3034553cb6c2490f292c8a42
3
+ size 90905227
demo_data/nips-2021/25957/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Shared Independent Component Analysis for Multi-Subject Neuroimaging"
3
+ }
demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,539 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:14.000
4
+ Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay.
5
+
6
+ 00:14.000 --> 00:18.480
7
+ I'm in the Inria Parietal team and my supervisor is Bertrand Thirion.
8
+
9
+ 00:18.480 --> 00:24.600
10
+ Today I'll talk about shared independent component analysis for multi-subject neuroimaging.
11
+
12
+ 00:24.600 --> 00:31.400
13
+ This is a joint work with Pierre Ablin, Alexandre Gramfort, Bertrand Thirion and Aapo Hyvärinen.
14
+
15
+ 00:31.400 --> 00:36.360
16
+ First let us consider two sources that are emitting a signal that is recorded by two
17
+
18
+ 00:36.360 --> 00:37.360
19
+ sensors.
20
+
21
+ 00:37.360 --> 00:43.120
22
+ This can be seen as a simplified model of magnetoencephalography where brain sources
23
+
24
+ 00:43.120 --> 00:46.000
25
+ are recorded by magnetometers.
26
+
27
+ 00:46.000 --> 00:50.200
28
+ Because propagation time can be neglected, the signal recorded by the sensors can be
29
+
30
+ 00:50.200 --> 00:55.840
31
+ seen as a linear mixture of the signal emitted by the sources.
32
+
33
+ 00:55.840 --> 00:59.600
34
+ S is a set of sources that are assumed to be independent.
35
+
36
+ 00:59.600 --> 01:06.400
37
+ X are the recordings and A describes how the sources are mixed to produce the recordings.
38
+
39
+ 01:06.400 --> 01:12.120
40
+ At first sight this model may seem ill-defined because if we permute two columns in A and
41
+
42
+ 01:12.120 --> 01:19.600
43
+ permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing
44
+
45
+ 01:19.600 --> 01:25.360
46
+ matrix A' that describes X just as well as A and S.
47
+
48
+ 01:25.360 --> 01:30.360
49
+ And similarly if we scale the column of A by some constant, one column of A by some
50
+
51
+ 01:30.360 --> 01:34.920
52
+ constant and the corresponding source by the same constant, we'll also get an equivalent
53
+
54
+ 01:34.920 --> 01:35.920
55
+ description of X.
56
+
57
+ 01:35.920 --> 01:44.840
58
+ However, these scale and permutation indeterminacies are the only one if the sources contain at
59
+
60
+ 01:44.840 --> 01:46.840
61
+ most one Gaussian component.
62
+
63
+ 01:46.840 --> 01:52.040
64
+ Let us consider the more general problem where you have multiple subjects that are exposed
65
+
66
+ 01:52.040 --> 01:54.560
67
+ to the same stimuli.
68
+
69
+ 01:54.560 --> 02:00.640
70
+ We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2,
71
+
72
+ 02:00.640 --> 02:04.560
73
+ and different noise levels, N1 and N2.
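Written out (notation assumed), the group ICA model just described is:

```latex
x_i = A_i s + n_i, \qquad i = 1,\dots,m,
```

with s the shared independent sources, A_i the subject-specific mixing matrix and n_i the subject-specific noise.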
74
+
75
+ 02:04.560 --> 02:08.720
76
+ The interpretation is that they have shared sources because they have shared connective
77
+
78
+ 02:08.720 --> 02:09.720
79
+ processes.
80
+
81
+ 02:09.720 --> 02:15.120
82
+ They have different mixing matrices because they have different spatial topography.
83
+
84
+ 02:15.120 --> 02:20.600
85
+ And they have different noises because we want to model inter-subject variability.
86
+
87
+ 02:20.600 --> 02:22.480
88
+ This model is called group ICA.
89
+
90
+ 02:22.480 --> 02:27.840
91
+ There are many methods to provide a solution for the group ICA problem.
92
+
93
+ 02:27.840 --> 02:34.560
94
+ A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects
95
+
96
+ 02:34.560 --> 02:42.520
97
+ feature-wise and then perform a PCA, a principal component analysis, on the stacked data.
98
+
99
+ 02:42.520 --> 02:47.520
100
+ And therefore you obtain reduced data and apply independent component analysis on the
101
+
102
+ 02:47.520 --> 02:50.520
103
+ reduced data to obtain a set of sources.
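A small sketch of this stack-PCA-ICA pipeline with scikit-learn is shown below; it is an illustration of the described recipe, not the authors' implementation.

```python
import numpy as np
from sklearn.decomposition import PCA, FastICA

# Stack subjects feature-wise, reduce with PCA, then run ICA on the reduced data.
def concat_ica(subject_data, n_components):
    # subject_data: list of arrays of shape (n_features_i, n_samples)
    stacked = np.vstack(subject_data)                              # (sum n_features_i, n_samples)
    reduced = PCA(n_components=n_components).fit_transform(stacked.T).T
    sources = FastICA(n_components=n_components).fit_transform(reduced.T).T
    return sources                                                 # (n_components, n_samples)
```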
104
+
105
+ 02:50.520 --> 02:55.960
106
+ Another formulation is introduced by Varoquaux in 2010 and is called CanICA.
107
+
108
+ 02:55.960 --> 03:01.320
109
+ You just replace the principal component analysis with a multiset CCA, so a multiset canonical
110
+
111
+ 03:01.320 --> 03:06.120
112
+ correlation analysis, where you have to solve a generalized eigenvalue problem.
113
+
114
+ 03:06.120 --> 03:12.800
115
+ There are many different formulations of multiset CCA, but this one with a generalized eigenvalue
116
+
117
+ 03:12.800 --> 03:15.560
118
+ problem is the fastest to solve.
119
+
120
+ 03:15.560 --> 03:17.840
121
+ CanICA and ConcatICA have a lot of advantages.
122
+
123
+ 03:17.840 --> 03:21.000
124
+ First, they are very fast to fit.
125
+
126
+ 03:21.000 --> 03:23.320
127
+ And second, they are simple to implement.
128
+
129
+ 03:23.320 --> 03:26.920
130
+ These are the two reasons why they are so popular in neuroimaging.
131
+
132
+ 03:26.920 --> 03:30.160
133
+ However, they do not optimize the proper likelihood.
134
+
135
+ 03:30.160 --> 03:35.680
136
+ Therefore they do not benefit from the advantages of such estimators, such as asymptotic efficiency.
137
+
138
+ 03:35.680 --> 03:41.480
139
+ There is a lot of other related work that does optimize the proper likelihood.
140
+
141
+ 03:41.480 --> 03:46.240
142
+ I want to mention the independent vector analysis, which is a very powerful framework introduced
143
+
144
+ 03:46.240 --> 03:48.760
145
+ by Li in 2008.
146
+
147
+ 03:48.760 --> 03:54.560
148
+ The unified approach of Guo in 2008, which we will also talk about later.
149
+
150
+ 03:54.560 --> 04:01.040
151
+ The approach of Shen in 2015, which also allows one to perform dimension reduction.
152
+
153
+ 04:01.040 --> 04:08.320
154
+ And the multi-view ICA that was introduced by our team last year.
155
+
156
+ 04:08.320 --> 04:15.200
157
+ I want to quickly say that it's not obvious to design a likelihood-based approach that
158
+
159
+ 04:15.200 --> 04:17.400
160
+ is tractable.
161
+
162
+ 04:17.400 --> 04:23.680
163
+ And with this example of the Gaussian mixture noisy ICA by Bermond and Cardoso, we'll see
164
+
165
+ 04:23.680 --> 04:31.400
166
+ that the standard approach leads to intractable algorithms.
167
+
168
+ 04:31.400 --> 04:37.080
169
+ The model we take here is the same as the group ICA, but we assume that the noise is
170
+
171
+ 04:37.080 --> 04:40.120
172
+ Gaussian with the same variance for all subjects.
173
+
174
+ 04:40.120 --> 04:47.600
175
+ We'll also assume that the sources follow a Gaussian mixture model.
176
+
177
+ 04:47.600 --> 04:53.040
178
+ And we further assume that the weights of the Gaussian mixtures are known.
179
+
180
+ 04:53.040 --> 04:56.360
181
+ We can solve such model via expectation maximization.
182
+
183
+ 04:56.360 --> 05:01.400
184
+ And if we write the E-step, we'll get a closed form that involves a large sum.
185
+
186
+ 05:01.400 --> 05:09.040
187
+ Because of its large size, this sum, and therefore the EM algorithm, is intractable whenever
188
+
189
+ 05:09.040 --> 05:11.600
190
+ Q and K are large.
191
+
192
+ 05:11.600 --> 05:17.520
193
+ Our contribution is Shared ICA, which we call ShICA for short, where the data of subject
194
+
195
+ 05:17.520 --> 05:23.080
196
+ i are modeled as a linear mixture of noisy sources, and the noise here is not on the
197
+
198
+ 05:23.080 --> 05:24.080
199
+ sensor, but on the sources.
200
+
201
+ 05:24.080 --> 05:30.000
202
+ The noise is Gaussian with a variance that can be different for each subject and different
203
+
204
+ 05:30.000 --> 05:31.000
205
+ for each component.
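In equation form (a sketch based only on the description above, not a quote from the paper), the model for subject i reads:

    x_i = A_i\,(s + n_i), \qquad n_i \sim \mathcal{N}(0, \Sigma_i), \quad \Sigma_i \ \text{diagonal},

so the Gaussian noise n_i sits on the sources, with a variance that may differ across subjects and components.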
206
+
207
+ 05:31.000 --> 05:37.800
208
+ S are assumed to be independent, but in contrast to almost all existing work, some components
209
+
210
+ 05:37.800 --> 05:38.800
211
+ can be Gaussian.
212
+
213
+ 05:38.800 --> 05:41.600
214
+ We have a few blanket assumptions.
215
+
216
+ 05:41.600 --> 05:45.840
217
+ We assume that the data are centered, that the mixing matrices are invertible, that the
218
+
219
+ 05:45.840 --> 05:50.680
220
+ sources have identical variance, and that the number of subjects is greater than 3.
221
+
222
+ 05:50.680 --> 05:54.000
223
+ We have two algorithms to solve the ShICA model.
224
+
225
+ 05:54.000 --> 06:01.520
226
+ We have ShICA-J, which is a fast algorithm based on multiset CCA, and ShICA-ML, a
227
+
228
+ 06:01.520 --> 06:04.000
229
+ maximum likelihood approach.
230
+
231
+ 06:04.000 --> 06:07.600
232
+ In ShICA, there are two ways to recover the parameters.
233
+
234
+ 06:07.600 --> 06:12.880
235
+ Either the sources are non-Gaussian, in which case we can use classical ICA results to recover
236
+
237
+ 06:12.880 --> 06:15.720
238
+ the unmixing matrices.
239
+
240
+ 06:15.720 --> 06:20.120
241
+ When the components are Gaussian, then we need something else, and what we use here
242
+
243
+ 06:20.120 --> 06:22.480
244
+ is noise diversity.
245
+
246
+ 06:22.480 --> 06:28.320
247
+ When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix
248
+
249
+ 06:28.320 --> 06:34.120
250
+ and the noise covariance up to a permutation and sign indeterminacy.
251
+
252
+ 06:34.120 --> 06:38.240
253
+ Note that the noise diversity in Gaussian components is also a necessary condition.
254
+
255
+ 06:38.240 --> 06:42.680
256
+ If it does not hold, then ShICA is not identifiable.
257
+
258
+ 06:42.680 --> 06:48.520
259
+ Let us now focus on this theorem that is at the core of the ShICA-J algorithm.
260
+
261
+ 06:48.520 --> 06:53.520
262
+ Namely it shows that we can solve group ICA with multiset CCA.
263
+
264
+ 06:53.520 --> 06:58.880
265
+ So assume the data follow the ShICA model, and consider the multiset CCA framed as a
266
+
267
+ 06:58.880 --> 07:00.920
268
+ generalized eigenvalue problem.
269
+
270
+ 07:00.920 --> 07:08.080
271
+ This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by
272
+
273
+ 07:08.080 --> 07:13.560
274
+ second-order statistics, and D is formed by the diagonal blocks in C.
275
+
276
+ 07:13.560 --> 07:19.880
277
+ And so if we solve this eigenvalue problem and take the first k leading eigenvectors,
278
+
279
+ 07:19.880 --> 07:26.520
280
+ we can recover the correct unmixing matrix from them, up to a permutation and a scaling.
281
+
282
+ 07:26.520 --> 07:32.000
283
+ And this can only be done if the k first eigenvalues are distinct.
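A minimal numerical sketch of this step, assuming the block matrices C and D have already been built from the second-order statistics (SciPy's generalized symmetric eigensolver is used here purely for illustration):

    import numpy as np
    from scipy.linalg import eigh

    def multiset_cca_leading(C, D, k):
        # Solve the generalized eigenvalue problem C v = lambda D v.
        eigvals, eigvecs = eigh(C, D)
        order = np.argsort(eigvals)[::-1]   # eigh returns ascending eigenvalues; take the k largest
        return eigvals[order[:k]], eigvecs[:, order[:k]]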
284
+
285
+ 07:32.000 --> 07:34.320
286
+ Note that the distinct eigenvalue condition is also necessary.
287
+
288
+ 07:34.320 --> 07:40.480
289
+ If two eigenvalues are the same, then this adds an indeterminacy, and therefore
290
+
291
+ 07:40.480 --> 07:42.280
292
+ we cannot solve group ICA.
293
+
294
+ 07:42.280 --> 07:48.640
295
+ Note also that the condition that some eigenvalues need to be distinct is stronger than the noise
296
+
297
+ 07:48.640 --> 07:54.080
298
+ diversity condition we have in the identifiability theorem.
299
+
300
+ 07:54.080 --> 07:59.360
301
+ And therefore we can exhibit an example which is identifiable, but on which multiset CCA
302
+
303
+ 07:59.360 --> 08:00.360
304
+ will fail.
305
+
306
+ 08:00.360 --> 08:04.800
307
+ And I refer you to the paper for more details on this.
308
+
309
+ 08:04.800 --> 08:10.160
310
+ So in our theorem, in order to recover the correct unmixing matrix, we need to have access
311
+
312
+ 08:10.160 --> 08:12.480
313
+ to the second-order statistics.
314
+
315
+ 08:12.480 --> 08:18.860
316
+ However, in practice, we only have access to them, up to some sampling noise.
317
+
318
+ 08:18.860 --> 08:24.520
319
+ And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in
320
+
321
+ 08:24.520 --> 08:31.160
322
+ the second-order statistics can lead to a high deviation of the recovered unmixing matrix.
323
+
324
+ 08:31.160 --> 08:38.080
325
+ Now to show this in practice, we take three subjects, two components, and noise covariance
326
+
327
+ 08:38.080 --> 08:47.440
328
+ matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon.
329
+
330
+ 08:47.440 --> 08:52.440
331
+ And we compare the solution of multiset CCA on the true covariance matrices and on the
332
+
333
+ 08:52.440 --> 08:59.520
334
+ perturbed covariance matrix, where the perturbation scale is given by delta.
335
+
336
+ 08:59.520 --> 09:07.240
337
+ And for different values of epsilon, 10^-4, 10^-3, 10^-2, 10^-1, we show how the performance
338
+
339
+ 09:07.240 --> 09:14.720
340
+ of the algorithm, so the Amari distance between the true unmixing matrix and the estimated
341
+
342
+ 09:14.720 --> 09:20.880
343
+ unmixing matrix, varies when the perturbation scale increases.
344
+
345
+ 09:20.880 --> 09:26.600
346
+ And we see that when the eigengap is very small, so 10^-4, the violet curve, then even
347
+
348
+ 09:26.600 --> 09:31.440
349
+ with a very small perturbation, you can get to a very bad Amari distance.
350
+
351
+ 09:31.440 --> 09:35.720
352
+ The black dashed curve is the chance-level performance.
353
+
354
+ 09:35.720 --> 09:41.200
355
+ Luckily, there is a large gap between the k-th eigenvalue and the (k+1)-th one.
356
+
357
+ 09:41.200 --> 09:46.120
358
+ This means that in practice, the span of the k leading eigenvectors is approximately preserved.
359
+
360
+ 09:46.120 --> 09:53.600
361
+ We can recover the true unmixing matrix from the unmixing matrix estimated by multiset
362
+
363
+ 09:53.600 --> 09:56.520
364
+ CCA, just by multiplying by a matrix Q.
365
+
366
+ 09:56.520 --> 10:02.640
367
+ And in order to estimate Q, we make use of the fact that the unmixed data should have
368
+
369
+ 10:02.640 --> 10:03.640
370
+ a diagonal covariance.
371
+
372
+ 10:03.640 --> 10:09.680
373
+ This leads us to a joint diagonalization problem that we can solve efficiently.
374
+
375
+ 10:09.680 --> 10:14.480
376
+ So if we take the experiments we've done on the previous slide, the results are still
377
+
378
+ 10:14.480 --> 10:15.480
379
+ shown here.
380
+
381
+ 10:15.480 --> 10:21.640
382
+ You can see the violet curves, which are very sensitive to perturbations.
383
+
384
+ 10:21.640 --> 10:29.360
385
+ And so if we apply joint diagonalization, all these curves move, and they join the dashed
386
+
387
+ 10:29.360 --> 10:30.360
388
+ curve on the bottom.
389
+
390
+ 10:30.360 --> 10:34.720
391
+ And therefore, it's much better, because now the new curves that are represented by the
392
+
393
+ 10:34.720 --> 10:42.920
394
+ dashed line are less sensitive to perturbations.
395
+
396
+ 10:42.920 --> 10:47.920
397
+ So now we've obtained the correct unmixing matrix, but up to a scaling.
398
+
399
+ 10:47.920 --> 10:55.040
400
+ And so we need an additional step to find the correct scaling, and another one to find
401
+
402
+ 10:55.040 --> 11:00.680
403
+ the other parameters that are still unestimated, which are the noise covariances.
404
+
405
+ 11:00.680 --> 11:04.000
406
+ And luckily, it's very easy to find the noise covariance.
407
+
408
+ 11:04.000 --> 11:06.280
409
+ We can do this via an EM algorithm.
410
+
411
+ 11:06.280 --> 11:11.920
412
+ The E-step and the M-step are in closed form, and this yields a very fast algorithm.
413
+
414
+ 11:11.920 --> 11:15.200
415
+ But ShICA-J is not a maximum likelihood estimator.
416
+
417
+ 11:15.200 --> 11:22.600
418
+ So now we will focus on ShICA-ML, which is our maximum likelihood estimator.
419
+
420
+ 11:22.600 --> 11:31.240
421
+ So I won't go too much into details on this, but we optimize this via an EM using a Gaussian
422
+
423
+ 11:31.240 --> 11:33.480
424
+ mixture assumption on the sources.
425
+
426
+ 11:33.480 --> 11:35.960
427
+ We assume that the weights are known.
428
+
429
+ 11:35.960 --> 11:41.480
430
+ What I just want to showcase here is that the E-step of the algorithm, the one that
431
+
432
+ 11:41.480 --> 11:46.000
433
+ gives you the expectation of the sources given the data, and the variance of the sources
434
+
435
+ 11:46.000 --> 11:50.760
436
+ given the data, only involves the sum of size 2.
437
+
438
+ 11:50.760 --> 11:57.320
439
+ So previously we had a sum that had an exponential number of terms, and here we don't have that
440
+
441
+ 11:57.320 --> 11:58.320
442
+ anymore.
443
+
444
+ 11:58.320 --> 12:02.920
445
+ So the E-step is much faster than what we had before, and therefore the EM algorithm
446
+
447
+ 12:02.920 --> 12:07.200
448
+ here is tractable, whereas it was not the case before.
449
+
450
+ 12:07.200 --> 12:11.440
451
+ I first want to present our synthetic experiment where we generate data according to the ShICA-ML
452
+
453
+ 12:11.440 --> 12:13.200
454
+ and ShICA-J models.
455
+
456
+ 12:13.200 --> 12:18.560
457
+ In case A, we have only Gaussian components, but we have noise diversity, and therefore
458
+
459
+ 12:18.560 --> 12:24.240
460
+ methods that use noise diversity to recover the sources, such as ShICA-ML and ShICA-J,
461
+
462
+ 12:24.240 --> 12:25.240
463
+ perform best.
464
+
465
+ 12:25.240 --> 12:34.000
466
+ In the second case, we have only non-Gaussian components and no noise diversity, so methods
467
+
468
+ 12:34.000 --> 12:41.520
469
+ that use non-Gaussianity perform well, such as CanICA, ShICA-ML, or MultiViewICA.
470
+
471
+ 12:41.520 --> 12:45.200
472
+ And in the last case, half of the components are Gaussian with noise diversity, and the
473
+
474
+ 12:45.200 --> 12:49.000
475
+ other half are non-Gaussian but without noise diversity.
476
+
477
+ 12:49.000 --> 12:53.000
478
+ And in this case, only ShICA-ML is able to correctly recover the sources.
479
+
480
+ 12:53.000 --> 12:57.960
481
+ MultiViewICA does not fail completely, but it is not as good as ShICA-ML.
482
+
483
+ 12:57.960 --> 13:00.400
484
+ Let us now talk about our experiments on real data.
485
+
486
+ 13:00.400 --> 13:05.080
487
+ We have this reconstruction experiment on fMRI data where subjects are exposed to a
488
+
489
+ 13:05.080 --> 13:07.920
490
+ naturalistic stimuli such as movie watching.
491
+
492
+ 13:07.920 --> 13:15.320
493
+ We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the
494
+
495
+ 13:15.320 --> 13:22.320
496
+ 20% left of the movie, we compute the common sources, and from these common sources computed
497
+
498
+ 13:22.320 --> 13:28.800
499
+ using 80% of the subjects, we try to reconstruct the data of the remaining 20% of the subjects.
500
+
501
+ 13:28.800 --> 13:33.880
502
+ We compute the R2 score within regions of interest between the reconstructed data and
503
+
504
+ 13:33.880 --> 13:39.480
505
+ the true data, and plot them as a function of the number of components used.
506
+
507
+ 13:39.480 --> 13:43.000
508
+ As we see, ShICA-ML outperforms all of the other methods.
509
+
510
+ 13:43.000 --> 13:47.400
511
+ As a take-home message, ShICA is a powerful framework to extract shared sources.
512
+
513
+ 13:47.400 --> 13:52.840
514
+ ShICA-J is a fast approach to fit the model, but it only uses second-order information.
515
+
516
+ 13:52.840 --> 13:58.800
517
+ In contrast, ShICA-ML is a bit slower, but it is able to use non-Gaussianity in addition
518
+
519
+ 13:58.800 --> 14:00.960
520
+ to second-order information.
521
+
522
+ 14:00.960 --> 14:03.840
523
+ In practice, ShICA-ML yields the best results.
524
+
525
+ 14:03.840 --> 14:05.960
526
+ The methods we've introduced work on reduced data.
527
+
528
+ 14:05.960 --> 14:11.160
529
+ It would be interesting to know how to reduce the data so that they perform optimally.
530
+
531
+ 14:11.160 --> 14:15.400
532
+ Another way to improve our results would be to learn the density of the shared sources
533
+
534
+ 14:15.400 --> 14:19.480
535
+ in ShICA-ML instead of having it fixed.
536
+
537
+ 14:19.480 --> 14:23.400
538
+ Thanks for listening, and have a good day!
539
+
demo_data/nips-2021/25957/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0539c1b965a157ce62df522fef5ea03cdec6198f5995fefa04cfddf947861fd
3
+ size 93633719
demo_data/nips-2021/25958/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "ParK: Sound and Efficient Kernel Ridge Regression by Feature Space Partitions"
3
+ }
demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,374 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:07.000
4
+ Hello everyone, I'm Luigi Carratino, and this is a joint work with Stefano Vigogna,
5
+
6
+ 00:07.000 --> 00:10.000
7
+ Daniele Calandriello, and Lorenzo Rosasco.
8
+
9
+ 00:10.000 --> 00:16.000
10
+ The problem that we study in this work is a standard regression problem, where we want
11
+
12
+ 00:16.000 --> 00:24.000
13
+ to estimate an unknown function f star,
14
+
15
+ 00:24.000 --> 00:34.000
16
+ given n pairs of points, x's and y's, where the y's are noisy evaluations of the function
17
+
18
+ 00:34.000 --> 00:38.000
19
+ f star at the input points x's.
20
+
21
+ 00:41.000 --> 00:46.000
22
+ A well-established method to learn nonlinear functions is kernel ridge regression.
23
+
24
+ 00:46.000 --> 00:53.000
25
+ The basic idea is to map the input points into a higher dimensional space, where linear
26
+
27
+ 00:53.000 --> 00:59.000
28
+ relationships can be learned that then translate in nonlinear ones in the input space.
29
+
30
+ 01:01.000 --> 01:07.000
31
+ To formalize this, we can think about solving a standard empirical risk minimization problem
32
+
33
+ 01:07.000 --> 01:12.000
34
+ regularized over a spatial function which is a reproducing kernel Hilbert space.
35
+
36
+ 01:14.000 --> 01:20.000
37
+ Numerically speaking, the solution of this type of problem boils down to solving a linear
38
+
39
+ 01:20.000 --> 01:26.000
40
+ system. Particularly, we can see here that the linear system is going to be Kc equal
41
+
42
+ 01:26.000 --> 01:33.000
43
+ y, where K is the kernel matrix evaluated in all the pairs of points of the training
44
+
45
+ 01:33.000 --> 01:39.000
46
+ sets, c are the weights that we aim to learn, and y's are the output points.
47
+
48
+ 01:40.000 --> 01:45.000
49
+ We know that this method is optimal from a statistical point of view, but a drawback
50
+
51
+ 01:45.000 --> 01:52.000
52
+ is that it suffers from computational scalability. In fact, in terms of time complexity, if we
53
+
54
+ 01:52.000 --> 01:57.000
55
+ have n training points and we want to solve the linear system directly, we'll have to
56
+
57
+ 01:57.000 --> 02:03.000
58
+ invert the matrix K, and this will cost us n cubed in time.
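As a minimal sketch of this linear system (illustrative only; the RBF kernel, the ridge term n*lam*I and the parameter names are assumptions, while the talk writes the system simply as K c = y):

    import numpy as np

    def krr_fit(X, y, lam=1e-3, gamma=1.0):
        # Gaussian (RBF) kernel matrix evaluated on all pairs of training points
        sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
        K = np.exp(-gamma * sq_dists)
        # Direct solve of (K + n*lam*I) c = y, which costs O(n^3) in time
        n = X.shape[0]
        return np.linalg.solve(K + n * lam * np.eye(n), y)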
59
+
60
+ 02:06.000 --> 02:11.000
61
+ Multiple ways of accelerating this process have been proposed over time.
62
+
63
+ 02:11.000 --> 02:17.000
64
+ The first one is to solve the methods iteratively instead of inverting directly the matrix K.
65
+
66
+ 02:18.000 --> 02:25.000
67
+ This allows us to only have matrix vector multiplications, and so the overall cost of
68
+
69
+ 02:25.000 --> 02:30.000
70
+ an iterative method to solve this linear system is going to be Tn squared.
71
+
72
+ 02:31.000 --> 02:39.000
73
+ Another method is the one known as sketching, where we can see this as subsampling the linear
74
+
75
+ 02:39.000 --> 02:46.000
76
+ system, in particular subsampling columns of this linear system, where we can take m
77
+
78
+ 02:46.000 --> 02:52.000
79
+ columns of the linear system uniformly at random to get a smaller one, and the cost
80
+
81
+ 02:52.000 --> 02:55.000
82
+ of this will be m squared n.
83
+
84
+ 02:57.000 --> 03:04.000
85
+ Another method instead is splitting. This allows us to divide the main problem into
86
+
87
+ 03:04.000 --> 03:12.000
88
+ many, in this case Q, subproblems, each of which can be solved independently and so
89
+
90
+ 03:12.000 --> 03:20.000
91
+ potentially can be distributed. So we can have a cost which boils down to n over Q to
92
+
93
+ 03:20.000 --> 03:22.000
94
+ the power of 3.
95
+
96
+ 03:25.000 --> 03:30.000
97
+ Combinations of these methods have been proposed in the literature. In particular, if
98
+
99
+ 03:30.000 --> 03:35.000
100
+ we combine iterating and sketching, we can get a solver that can solve the problem in
101
+
102
+ 03:35.000 --> 03:38.000
103
+ a time complexity of Tmn.
104
+
105
+ 03:40.000 --> 03:47.000
106
+ If instead we combine sketching and splitting, we can get a solver that can be computed
107
+
108
+ 03:47.000 --> 03:51.000
109
+ in m squared times n over Q.
110
+
111
+ 03:51.000 --> 03:59.000
112
+ And in this work, we try to blend all these techniques to derive a new algorithm, which
113
+
114
+ 03:59.000 --> 04:09.000
115
+ we will call ParK, that can achieve a time complexity of T m times n over Q to the power
116
+
117
+ 04:09.000 --> 04:10.000
118
+ of 2.
119
+
120
+ 04:12.000 --> 04:18.000
121
+ So as we just said, in this work, we propose a new large-scale kernel regression solver
122
+
123
+ 04:18.000 --> 04:22.000
124
+ that combines the computational benefits of iteration, sketching, and splitting.
125
+
126
+ 04:23.000 --> 04:27.000
127
+ Notice, though, that these are approximation techniques and they may come at the cost of
128
+
129
+ 04:27.000 --> 04:35.000
130
+ accuracy. But we are able to show that this new algorithm is able to preserve generalization
131
+
132
+ 04:35.000 --> 04:37.000
133
+ under suitable partitions.
134
+
135
+ 04:38.000 --> 04:44.000
136
+ Now also notice that instead of general splitting, we are going to need to focus on a
137
+
138
+ 04:44.000 --> 04:48.000
139
+ particular type, which is partitions.
140
+
141
+ 04:48.000 --> 04:53.000
142
+ So we introduce a new principled partition scheme for kernel methods.
143
+
144
+ 04:56.000 --> 05:01.000
145
+ We now look at the difference between data splitting and space partitioning.
146
+
147
+ 05:01.000 --> 05:08.000
148
+ Given a set of points, the procedure of splitting takes groups of points at random and assign
149
+
150
+ 05:08.000 --> 05:10.000
151
+ them to different splits or clusters.
152
+
153
+ 05:10.000 --> 05:14.000
154
+ In this picture, for example, we divide the points in four splits.
155
+
156
+ 05:15.000 --> 05:21.000
157
+ Partitioning instead divides the space in different cells, and then the points are implicitly
158
+
159
+ 05:21.000 --> 05:25.000
160
+ assigned to a particular cluster based on which cell they belong to.
161
+
162
+ 05:27.000 --> 05:32.000
163
+ Notice that with the splitting methods, we don't consider local information while we
164
+
165
+ 05:32.000 --> 05:37.000
166
+ perform the splitting, but we do when we perform partitioning.
167
+
168
+ 05:37.000 --> 05:42.000
169
+ Now, from this picture, the concept of partitioning a space seems pretty straightforward.
170
+
171
+ 05:43.000 --> 05:48.000
172
+ However, when you start considering high dimensional feature space, subtle problems can
173
+
174
+ 05:48.000 --> 05:49.000
175
+ appear.
176
+
177
+ 05:50.000 --> 05:55.000
178
+ So first, as a recap, remember that there are two important spaces to consider in our
179
+
180
+ 05:55.000 --> 05:56.000
181
+ regression problem.
182
+
183
+ 05:57.000 --> 06:04.000
184
+ The input space X with its input features, and the kernel space H,
185
+
186
+ 06:04.000 --> 06:10.000
187
+ which potentially has many more implicit features.
188
+
189
+ 06:13.000 --> 06:17.000
190
+ Traditionally, partition methods are applied directly to the input space.
191
+
192
+ 06:18.000 --> 06:24.000
193
+ For example, a classical approach is to select a subset of points as centroids and then
194
+
195
+ 06:24.000 --> 06:30.000
196
+ partition the space in cells by assigning each portion of the space to the closest centroid,
197
+
198
+ 06:30.000 --> 06:32.000
199
+ which is called a Voronoi partition.
200
+
201
+ 06:32.000 --> 06:38.000
202
+ Since we are in the input space, closest here is defined according to a simple Euclidean
203
+
204
+ 06:38.000 --> 06:39.000
205
+ distance.
206
+
207
+ 06:40.000 --> 06:45.000
208
+ However, remember that our target function and our whole regression does not happen
209
+
210
+ 06:45.000 --> 06:51.000
211
+ directly on the input data space, but rather on the data mapped in the feature space.
212
+
213
+ 06:52.000 --> 06:58.000
214
+ And after we apply our feature map to the data, the concept of closest and the partition
215
+
216
+ 06:58.000 --> 06:59.000
217
+ can radically change.
218
+
219
+ 06:59.000 --> 07:05.000
220
+ For example, here on the right, we choose a kernel space associated with a cosine similarity
221
+
222
+ 07:06.000 --> 07:12.000
223
+ and again plot how the centroids partition the input space, but this time we chose closest
224
+
225
+ 07:12.000 --> 07:14.000
226
+ according to the new cosine distance.
227
+
228
+ 07:15.000 --> 07:20.000
229
+ The resulting partition is very different from the Euclidean one as it captures the
230
+
231
+ 07:20.000 --> 07:22.000
232
+ non-linearity of the kernel function.
233
+
234
+ 07:22.000 --> 07:28.000
235
+ In the paper, we discuss how this difference can impact the regression and we identified
236
+
237
+ 07:28.000 --> 07:34.000
238
+ sufficient conditions that the partition should satisfy in order to guarantee good generalization
239
+
240
+ 07:34.000 --> 07:35.000
241
+ of the learning process.
242
+
243
+ 07:37.000 --> 07:43.000
244
+ Crucially, we will see that these guarantees depend not on how the input space is partitioned,
245
+
246
+ 07:43.000 --> 07:45.000
247
+ but rather how the feature space is partitioned.
248
+
249
+ 07:45.000 --> 07:51.000
250
+ As a consequence, for our ParK method, we focus on choosing centroids solely using the
251
+
252
+ 07:51.000 --> 07:53.000
253
+ kernel version of the distance.
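A small sketch of what "closest in the kernel space" means in practice (illustrative; the kernel function k and the data layout are assumptions). The squared feature-space distance is computed with the kernel trick:

    import numpy as np

    def kernel_sq_distance(k, x, c):
        # || phi(x) - phi(c) ||^2 = k(x, x) - 2 k(x, c) + k(c, c)
        return k(x, x) - 2.0 * k(x, c) + k(c, c)

    def assign_to_cell(k, x, centroids):
        # Index of the feature-space Voronoi cell that x falls into
        return int(np.argmin([kernel_sq_distance(k, x, c) for c in centroids]))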
254
+
255
+ 07:57.000 --> 08:00.000
256
+ We are now ready to present in more detail how the ParK algorithm works.
257
+
258
+ 08:01.000 --> 08:07.000
259
+ First of all, ParK partitions the feature space into Q Voronoi cells, and the first thing
260
+
261
+ 08:07.000 --> 08:16.000
262
+ to do is to identify the centroids in the feature space that allows us to describe the
263
+
264
+ 08:16.000 --> 08:17.000
265
+ Voronoi cells.
266
+
267
+ 08:19.000 --> 08:25.000
268
+ Then inside each Voronoi cell, we learn a local estimator using an iterated and sketched
269
+
270
+ 08:25.000 --> 08:27.000
271
+ version of kernel ridge regression.
272
+
273
+ 08:30.000 --> 08:36.000
274
+ And then at prediction time, when a new sample arrives, we can use the Q Voronoi cells
275
+
276
+ 08:36.000 --> 08:38.000
277
+ to identify the cell that the new sample belongs to.
278
+
279
+ 08:40.000 --> 08:47.000
280
+ We use the local estimator corresponding to the Voronoi cell that the new point falls
281
+
282
+ 08:47.000 --> 08:48.000
283
+ on.
284
+
285
+ 08:52.000 --> 08:57.000
286
+ The generalization error of standard kernel ridge regression without partitioning can
287
+
288
+ 08:57.000 --> 09:02.000
289
+ be upper bounded by two terms, a bias term and a variance term.
290
+
291
+ 09:02.000 --> 09:10.000
292
+ In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
293
+
294
+ 09:10.000 --> 09:11.000
295
+ term and a variance term.
296
+
297
+ 09:11.000 --> 09:16.000
298
+ But this time, these two terms are weighted and they are weighted by a certain quantity
299
+
300
+ 09:16.000 --> 09:25.000
301
+ that depends on an angle theta, which is the minimum angle between all the subspaces of
302
+
303
+ 09:25.000 --> 09:26.000
304
+ the partitions.
305
+
306
+ 09:26.000 --> 09:33.000
307
+ For example, when all the subspaces are orthogonal between each other, we recover the exact same
308
+
309
+ 09:33.000 --> 09:36.000
310
+ generalization error of standard kernel ridge regression.
311
+
312
+ 09:38.000 --> 09:45.000
313
+ But we are also able to show that for angles which are small enough, we are able to obtain
314
+
315
+ 09:45.000 --> 09:50.000
316
+ a generalization error which is of the same order of standard kernel ridge regression.
317
+
318
+ 09:50.000 --> 09:54.000
319
+ These theoretical results suggest us how to construct a good partition.
320
+
321
+ 09:54.000 --> 10:00.000
322
+ So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
323
+
324
+ 10:00.000 --> 10:01.000
325
+ between the Voronoi cells.
326
+
327
+ 10:01.000 --> 10:06.000
328
+ And in particular, we use the Schur complement to measure the orthogonality.
329
+
330
+ 10:10.000 --> 10:16.000
331
+ We also use the Schur complement to measure the orthogonality of the Voronoi centroids.
332
+
333
+ 10:16.000 --> 10:20.000
334
+ And in particular, we use the Schur complement to measure the orthogonality.
335
+
336
+ 10:24.000 --> 10:28.000
337
+ Given all these ingredients, we are now able to measure the computational complexity of
338
+
339
+ 10:28.000 --> 10:32.000
340
+ ParK, which has a time complexity that is the sum of two terms.
341
+
342
+ 10:33.000 --> 10:40.000
343
+ A first term, q squared n log n, which is the cost of computing the centroids with the
344
+
345
+ 10:40.000 --> 10:41.000
346
+ just mentioned procedure.
347
+
348
+ 10:41.000 --> 10:46.000
349
+ And a second term, T m times n over Q squared, which is the cost of computing the most expensive
350
+
351
+ 10:46.000 --> 10:47.000
352
+ local estimator.
353
+
354
+ 10:51.000 --> 10:57.000
355
+ Empirically, we performed experiments on data sets of millions and billions of points,
356
+
357
+ 10:57.000 --> 11:01.000
358
+ and we compared with the currently fastest global kernel methods and with some other
359
+
360
+ 11:01.000 --> 11:02.000
361
+ splitting kernel methods.
362
+
363
+ 11:03.000 --> 11:08.000
364
+ We can see that ParK is the only method that manages to match the accuracy of the global
365
+
366
+ 11:08.000 --> 11:11.000
367
+ estimator.
368
+
369
+ 11:11.000 --> 11:13.000
370
+ Thank you all for your attention.
371
+
372
+ 11:13.000 --> 11:40.000
373
+ And please come to the poster with all your questions and for more details.
374
+
demo_data/nips-2021/25958/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefd926545331be9df0497e824634fa23129d26c9c9e7fdbe67c0382b98b4556
3
+ size 22931245
demo_data/nips-2021/25959/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Adversarial Feature Desensitization"
3
+ }
demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,353 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:13.120
4
+ Hello, my name is Pouya Bashivan and I'm going to tell you about our paper titled
5
+
6
+ 00:13.120 --> 00:18.720
7
+ Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators
8
+
9
+ 00:18.720 --> 00:24.400
10
+ at Mila, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim,
11
+
12
+ 00:24.400 --> 00:32.160
13
+ Kartik Ahuja, Mojtaba Faramarzi, Touraj Laleh, Blake Richards and Irina Rish. A common assumption in
14
+
15
+ 00:32.160 --> 00:36.560
16
+ machine learning is that the train and test samples come from the same distribution.
17
+
18
+ 00:37.200 --> 00:42.960
19
+ While this is a reasonable assumption under most circumstances, it is intentionally violated in the
20
+
21
+ 00:42.960 --> 00:49.600
22
+ regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
23
+
24
+ 00:49.600 --> 00:55.600
25
+ perturbations that cause the input to be misclassified. In the case of white box attacks,
26
+
27
+ 00:55.600 --> 01:01.600
28
+ the model itself is transparent to the attacker and the attacker uses it to identify the possible
29
+
30
+ 01:01.600 --> 01:07.760
31
+ inputs that would lead to misclassifications. A famous example of this is the image of a panda
32
+
33
+ 01:07.760 --> 01:13.360
34
+ that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
35
+
36
+ 01:13.360 --> 01:19.840
37
+ gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
38
+
39
+ 01:19.840 --> 01:25.280
40
+ and unless the classifier is specifically trained to be robust against these attacks,
41
+
42
+ 01:25.280 --> 01:28.720
43
+ the attacks could completely break down the classifier's performance.
44
+
45
+ 01:30.240 --> 01:35.600
46
+ This issue becomes even more critical when we consider the vast usage of these machine learning
47
+
48
+ 01:35.600 --> 01:41.040
49
+ systems in our societies. For example, the possible security concerns that rise in face
50
+
51
+ 01:41.040 --> 01:46.720
52
+ recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
53
+
54
+ 01:48.080 --> 01:54.000
55
+ So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
56
+
57
+ 01:54.000 --> 02:00.080
58
+ feature learning function f that projects inputs x to latent space with feature space z
59
+
60
+ 02:01.600 --> 02:08.720
61
+ and a classifier that uses the latent code z to predict the correct class label y hat.
62
+
63
+ 02:08.720 --> 02:14.480
64
+ The perturbation function or the attack generates a perturbed sample x prime
65
+
66
+ 02:14.480 --> 02:21.520
67
+ within the epsilon neighborhood of the input x, which we're showing here as b of x and epsilon.
68
+
69
+ 02:22.160 --> 02:28.880
70
+ By maximizing the classification objective, the opposite of how we normally optimize the classifier's
71
+
72
+ 02:28.880 --> 02:36.720
73
+ parameter. Many methods have been proposed to defend the models against adversarial attacks.
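A minimal PyTorch-style sketch of such an inner maximization (a generic one-step FGSM attack for illustration, not the specific attacks used in the paper; model, loss_fn and epsilon are assumed to be given):

    import torch

    def fgsm_perturb(model, loss_fn, x, y, epsilon):
        # Move x in the direction that increases the classification loss,
        # staying inside the L-infinity ball of radius epsilon around x.
        x_adv = x.clone().detach().requires_grad_(True)
        loss_fn(model(x_adv), y).backward()
        return (x_adv + epsilon * x_adv.grad.sign()).detach()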
74
+
75
+ 02:36.720 --> 02:42.640
76
+ Two of these methods that have withstood the test of time so far are the adversarial training
77
+
78
+ 02:43.200 --> 02:50.160
79
+ by Madry et al., which proposes a defense method by solving a minimax optimization problem
80
+
81
+ 02:50.160 --> 02:56.000
82
+ that involves finding an adversarial input by maximizing the classification loss in the inner
83
+
84
+ 02:56.000 --> 03:03.840
85
+ loop, followed by training the classifier to minimize the classification loss on these adversarial inputs.
86
+
87
+ 03:03.840 --> 03:09.920
88
+ This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
89
+
90
+ 03:10.560 --> 03:15.440
91
+ The adversarial training method essentially learns to separate the distributions of adversarial
92
+
93
+ 03:15.440 --> 03:22.400
94
+ examples belonging to different classes. The second method is the trades method by Zhang et al,
95
+
96
+ 03:22.400 --> 03:27.440
97
+ which proposes to push the decision boundary of the classifier away from the data.
98
+
99
+ 03:27.440 --> 03:32.480
100
+ Trades achieves this by introducing a regularization term to the original learning
101
+
102
+ 03:32.480 --> 03:38.320
103
+ objective for classification that penalizes the mismatch between the predicted label
104
+
105
+ 03:38.320 --> 03:44.400
106
+ for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
107
+
108
+ 03:44.400 --> 03:50.000
109
+ this procedure, where now the defense method learns to separate the distributions of clean examples
110
+
111
+ 03:50.000 --> 03:54.400
112
+ belonging to different classes while minimizing the loss of the classifier.
113
+
114
+ 03:54.400 --> 03:59.920
115
+ The third method is the trade method by Wang et al, which proposes to push the decision boundary
116
+
117
+ 03:59.920 --> 04:06.880
118
+ of the classifier to the inner loop followed by a classifier training to minimizing the
119
+
120
+ 04:06.880 --> 04:13.120
121
+ classification loss on these adversarial inputs. The third method is the trade method by Zhang et al,
122
+
123
+ 04:13.120 --> 04:18.720
124
+ which proposes to push the decision boundary of the classifier to the inner loop followed by a
125
+
126
+ 04:18.720 --> 04:27.840
127
+ classifier training to minimizing the classification loss on these adversarial inputs to the inner
128
+
129
+ 04:27.840 --> 04:34.640
130
+ loop. The third method is the trade method by Wang et al, which proposes to push the decision
131
+
132
+ 04:34.640 --> 04:39.920
133
+ boundary of the classifier to minimizing the classification loss. The fourth method is the
134
+
135
+ 04:39.920 --> 04:45.600
136
+ trade method by Wang et al, which proposes to push the decision boundary of the classifier
137
+
138
+ 04:45.600 --> 04:52.160
139
+ for a source domain, but we want the classifier to also perform the same task on a related target
140
+
141
+ 04:52.160 --> 05:00.960
142
+ domain that we might not have enough data for or that the generating procedure for sampling
143
+
144
+ 05:00.960 --> 05:09.440
145
+ domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the
146
+
147
+ 05:09.440 --> 05:15.840
148
+ question of under what conditions can we adapt a classifier trained on the source domain for use
149
+
150
+ 05:15.840 --> 05:23.920
151
+ in the target domain. Here we consider the original clean distributions as the source domain and the
152
+
153
+ 05:23.920 --> 05:31.280
154
+ distribution of adversarial images generated from those images as the target domain. Although here
155
+
156
+ 05:31.280 --> 05:38.240
157
+ the target domain continuously evolves because the adversarial examples are based on the current
158
+
159
+ 05:38.240 --> 05:46.000
160
+ state of the model at each time step. And similar to the domain adaptation theory, our goal here
161
+
162
+ 05:46.000 --> 05:52.960
163
+ is to learn how to perform well on both source and target domains, meaning the natural and
164
+
165
+ 05:52.960 --> 06:02.240
166
+ adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
167
+
168
+ 06:02.240 --> 06:08.960
169
+ what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
170
+
171
+ 06:08.960 --> 06:14.880
172
+ feature learning function f that projects inputs x to latent space or feature space z and the
173
+
174
+ 06:14.880 --> 06:23.040
175
+ classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
176
+
177
+ 06:23.040 --> 06:31.440
178
+ and adversarial examples as input domains dx and d' x and their induced feature distributions
179
+
180
+ 06:31.440 --> 06:42.560
181
+ which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
182
+
183
+ 06:42.560 --> 06:50.320
184
+ as the classification error over the domains dz and d' z, what we are going to refer to as the
185
+
186
+ 06:50.320 --> 06:58.880
187
+ clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bound
188
+
189
+ 06:58.880 --> 07:04.320
190
+ on the adversarial error in terms of the natural error and the distance between the two domains.
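As a worked equation, the bound referred to here has the Ben-David et al. form (written from the general theory, so the exact constants in the paper may differ):

    \epsilon'_Z(h) \;\le\; \epsilon_Z(h) \;+\; \tfrac{1}{2}\, d_{\mathcal{H}\Delta\mathcal{H}}(D_Z, D'_Z) \;+\; \lambda,

where \lambda is the error of the best joint hypothesis on the two domains.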
191
+
192
+ 07:05.120 --> 07:11.680
193
+ Fortunately, from the prior work, we know that h delta h distance, which measures the distance
194
+
195
+ 07:11.680 --> 07:17.440
196
+ between two domains, can be estimated using the classifier trained to discriminate between the
197
+
198
+ 07:17.440 --> 07:26.080
199
+ two domains. Now our defense method called adversarial feature desensitization essentially
200
+
201
+ 07:26.080 --> 07:34.720
202
+ minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
203
+
204
+ 07:34.720 --> 07:40.560
205
+ has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
206
+
207
+ 07:42.240 --> 07:49.280
208
+ For this, we first update the parameters theta and phi in the feature learning function f and
209
+
210
+ 07:49.280 --> 07:56.320
211
+ task classifier c to minimize the classification loss on the natural domain. This is shown with
212
+
213
+ 07:56.320 --> 08:01.920
214
+ green arrows and green boxes marked 1 on both the equation and on the diagram.
215
+
216
+ 08:04.000 --> 08:10.400
217
+ Secondly, we estimate the h delta h distance using an additional domain discriminator
218
+
219
+ 08:10.960 --> 08:17.600
220
+ network that predicts the domain identity from the latent code z. We update the domain
221
+
222
+ 08:17.600 --> 08:24.720
223
+ discriminator parameters psi to minimize the domain classification loss. And finally,
224
+
225
+ 08:24.720 --> 08:31.680
226
+ in the third step, we update the feature learning network parameters theta to maximize the domain
227
+
228
+ 08:31.680 --> 08:39.600
229
+ classification loss in an adversarial way. These two steps are marked with red arrows in the figure
230
+
231
+ 08:39.600 --> 08:48.960
232
+ and red boxes on the equation. Similar to previous two methods, adversarial training and trades that
233
+
234
+ 08:48.960 --> 08:55.760
235
+ I showed you, we here we can also graphically demonstrate this procedure. In our method AFD,
236
+
237
+ 08:55.760 --> 09:01.040
238
+ we learn to separate the classes from the distributions of clean examples while at the
239
+
240
+ 09:01.040 --> 09:07.840
241
+ same time we optimize a domain classifier that learns the boundary between the clean and adversarial
242
+
243
+ 09:07.840 --> 09:14.560
244
+ examples for each class. And finally, we push the adversarial examples to the opposite side of that
245
+
246
+ 09:14.560 --> 09:22.400
247
+ boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations
248
+
249
+ 09:22.400 --> 09:30.480
250
+ and hence the name adversarial feature desensitization. We tested our method on four
251
+
252
+ 09:30.480 --> 09:35.840
253
+ data sets and compared them with a number of other baselines including with adversarial training and
254
+
255
+ 09:35.840 --> 09:43.760
256
+ trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
257
+
258
+ 09:43.760 --> 09:50.880
259
+ Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovsky et al.
260
+
261
+ 09:52.000 --> 09:57.840
262
+ In the table, we evaluated all methods on several white box and black box attacks with
263
+
264
+ 09:57.840 --> 10:07.360
265
+ nominal strengths for each data set. Overall, our method AFD and especially AFDWGAN showed superior
266
+
267
+ 10:07.360 --> 10:15.200
268
+ performance against most attacks in most data sets. However, AFD was behind trades on several attacks
269
+
270
+ 10:15.200 --> 10:20.720
271
+ especially on CIFAR-100 and TinyImageNet data set that had more classes in it.
272
+
273
+ 10:20.720 --> 10:26.080
274
+ We also looked into robustness across attack methods and attack strengths, which we controlled with the parameter
275
+
276
+ 10:26.080 --> 10:32.800
277
+ epsilon. The diagrams on the right show the robust accuracy for each defense method across
278
+
279
+ 10:32.800 --> 10:41.200
280
+ eight attack methods and various epsilon values for each of them. Overall, our results in these
281
+
282
+ 10:41.200 --> 10:48.240
283
+ diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
284
+
285
+ 10:48.240 --> 10:55.200
286
+ across attack strengths. To quantify these differences, we also computed the area under
287
+
288
+ 10:55.200 --> 11:00.000
289
+ the curve for each method for each attack and summarized them in a table on the left.
290
+
291
+ 11:00.880 --> 11:06.800
292
+ As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
293
+
294
+ 11:06.800 --> 11:15.680
295
+ compared to other baselines. If you remember from previous slides, the domain adaptation theory
296
+
297
+ 11:15.680 --> 11:22.400
298
+ predicted a bound on the adversarial error which can also be turned into a bound on the generalization
299
+
300
+ 11:22.400 --> 11:30.320
301
+ gap between natural and adversarial attacks. We empirically tested this prediction in our experiments
302
+
303
+ 11:30.320 --> 11:37.600
304
+ under two settings. Under the first setting, we varied the epsilon value for the PGD L-infinity
305
+
306
+ 11:37.600 --> 11:45.600
307
+ attack which was used during the training. And
308
+
309
+ 11:45.600 --> 11:51.120
310
+ under the second setting, we used a diverse set of attacks and various attack strengths for each of them.
311
+
312
+ 11:52.000 --> 11:58.480
313
+ And under both scenarios, we found that the domain discriminator, which was originally trained on a
314
+
315
+ 11:58.480 --> 12:05.280
316
+ particular attack and attack strength, in our case the PGD L-infinity attack with a fixed epsilon
317
+
318
+ 12:05.280 --> 12:10.960
319
+ for each data set, could well predict the generalization gap to unseen attacks and
320
+
321
+ 12:10.960 --> 12:18.000
322
+ different attack magnitudes. This suggests that the adversarial training against a domain classifier
323
+
324
+ 12:18.000 --> 12:24.000
325
+ like that used in our proposed method could potentially lead to robust models with better
326
+
327
+ 12:24.000 --> 12:33.520
328
+ generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
329
+
330
+ 12:33.520 --> 12:39.200
331
+ and attack strengths, it occasionally was worse compared to other baselines, especially in data
332
+
333
+ 12:39.200 --> 12:45.760
334
+ sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
335
+
336
+ 12:46.320 --> 12:51.680
337
+ domain classifiers in these data sets and leaves much space for future work on
338
+
339
+ 12:51.680 --> 12:57.120
340
+ investigating the effect of domain classifiers on the robustness of feature learning functions.
341
+
342
+ 12:58.080 --> 13:04.400
343
+ Also, AFD required more backward computations compared to some of the other baselines
344
+
345
+ 13:04.400 --> 13:11.120
346
+ such as adversarial training, and as a result, its training time was on average about 31%
347
+
348
+ 13:11.120 --> 13:17.680
349
+ longer than adversarial training. We invite you to read our paper for more details and please
350
+
351
+ 13:17.680 --> 13:34.720
352
+ get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it.
353
+
demo_data/nips-2021/25959/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76fac80c58c0fd077be83cb3d4b052aaf70c0128d8884b24f83a34a9f9c72fe3
3
+ size 86886949
demo_data/nips-2021/25962/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Locally differentially private estimation of functionals of discrete distributions"
3
+ }
demo_data/nips-2021/25962/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,155 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:14.000
4
+ Hello everyone, I am Yann Issartel and I am going to present to you a work on the estimation
5
+
6
+ 00:14.000 --> 00:18.000
7
+ of functionals under some particular privacy constraints.
8
+
9
+ 00:18.000 --> 00:24.000
10
+ This is joint work with my postdoc advisor, Professor Cristina Butucea.
11
+
12
+ 00:24.000 --> 00:30.000
13
+ We are interested in the power sum functional, which is the sum of the probabilities associated
14
+
15
+ 00:30.000 --> 00:37.000
16
+ with a discrete distribution, raised to the power gamma, where gamma is a positive real number.
17
+
18
+ 00:37.000 --> 00:46.000
19
+ So, this power sum functional is an example of an information measure that arises in different fields
20
+
21
+ 00:46.000 --> 00:54.000
22
+ such as statistics, machine learning, information theory, neuroscience, and so on.
23
+
24
+ 00:54.000 --> 01:00.000
25
+ So here is the standard statistical problem, where the objective is to estimate the power sum functional
26
+
27
+ 01:00.000 --> 01:10.000
28
+ based on n i.i.d. samples, X1, X2 up to Xn, drawn from a discrete distribution P with alphabet size K.
29
+
30
+ 01:10.000 --> 01:19.000
31
+ A widely used approach is the plug-in estimator, where one uses an estimator of the parameter P
32
+
33
+ 01:19.000 --> 01:25.000
34
+ to build an estimator of the functional, through the plug-in principle.
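A minimal sketch of this plug-in estimator in the non-private setting (illustrative only; variable names are not from the talk):

    import numpy as np

    def plugin_power_sum(samples, K, gamma):
        # samples: integer array with values in {0, ..., K-1}
        counts = np.bincount(samples, minlength=K)
        p_hat = counts / counts.sum()          # empirical frequencies
        return np.sum(p_hat ** gamma)          # plug-in estimate of sum_k p_k^gamma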
35
+
36
+ 01:25.000 --> 01:32.000
37
+ This approach is not only simple and intuitive, but it is also theoretically sound,
38
+
39
+ 01:32.000 --> 01:38.000
40
+ since it satisfies asymptotic efficiency and non-asymptotic near-optimality.
41
+
42
+ 01:38.000 --> 01:45.000
43
+ The interesting question in our paper is whether this plug-in approach
44
+
45
+ 01:45.000 --> 01:50.000
46
+ still works in a non-standard setting, where a privacy constraint is imposed,
47
+
48
+ 01:50.000 --> 01:55.000
49
+ and more precisely, the local differential privacy setup.
50
+
51
+ 01:55.000 --> 02:06.000
52
+ This means that we impose a strong notion of privacy, where we do not have access to the original, sensitive data, the Xi.
53
+
54
+ 02:06.000 --> 02:12.000
55
+ Instead, we only have access to a privatized version of Xi.
56
+
57
+ 02:12.000 --> 02:22.000
58
+ Here is the representation of a simple mechanism that is not interactive.
59
+
60
+ 02:22.000 --> 02:30.000
61
+ The term local here reflects the fact that the mechanism Qi only sees the data point Xi.
62
+
63
+ 02:30.000 --> 02:38.000
64
+ In other words, there is no trusted third party that has access to all the sensitive data.
65
+
66
+ 02:38.000 --> 02:48.000
67
+ This is a simple non-interactive privacy mechanism, but of course, we are also interested in more sophisticated mechanisms,
68
+
69
+ 02:48.000 --> 02:55.000
70
+ in particular the sequentially interactive mechanism, where each Qi sees the previously released private data,
71
+
72
+ 02:55.000 --> 03:00.000
73
+ in addition to the data point Xi.
74
+
75
+ 03:00.000 --> 03:10.000
76
+ In this non-standard setting, we return to the original problem of estimating the power sum functional,
77
+
78
+ 03:10.000 --> 03:15.000
79
+ where we only have access to privatized versions of X1 up to Xn.
80
+
81
+ 03:15.000 --> 03:26.000
82
+ Our first contribution is to give a tight and non-asymptotic characterization of the error of the plug-in estimator of the power sum.
83
+
84
+ 03:26.000 --> 03:33.000
85
+ This result shows that the plug-in estimator of the power sum is not optimal.
86
+
87
+ 03:33.000 --> 03:41.000
88
+ This contrasts with the performance of the power sum plug-in estimator in the standard statistical problem.
89
+
90
+ 03:41.000 --> 03:50.000
91
+ The message here is that good estimators in the standard setup are not always good estimators in the local privacy setup.
92
+
93
+ 03:50.000 --> 04:00.000
94
+ Our second contribution is a correction of the plug-in estimator through a careful truncation of the small probabilities.
95
+
96
+ 04:00.000 --> 04:06.000
97
+ This correction leads to a significant reduction of the error risk.
98
+
99
+ 04:06.000 --> 04:13.000
100
+ In particular, the risk becomes independent of the alphabet size K when K is large.
101
+
102
+ 04:13.000 --> 04:22.000
103
+ This second contribution, however, relies on a simple non-interactive privacy mechanism.
104
+
105
+ 04:22.000 --> 04:29.000
106
+ In the second part of the paper, we consider a more sophisticated sequentially interactive mechanism,
107
+
108
+ 04:29.000 --> 04:40.000
109
+ for which we build a two-step procedure that allows us to reduce the risk by a logarithmic factor.
110
+
111
+ 04:40.000 --> 04:45.000
112
+ Finally, at the end of the paper, we provide a universal lower bound on the error risk
113
+
114
+ 04:45.000 --> 04:51.000
115
+ with respect to all estimators and all non-interactive and sequentially interactive mechanisms.
116
+
117
+ 04:51.000 --> 04:56.000
118
+ Unfortunately, this lower bound is matching only in certain cases,
119
+
120
+ 04:56.000 --> 05:02.000
121
+ which leaves us with some very important open questions about this problem.
122
+
123
+ 05:02.000 --> 05:10.000
124
+ I think that this first work on functional estimation in the context of local privacy
125
+
126
+ 05:10.000 --> 05:14.000
127
+ gives you at least three key takeaways.
128
+
129
+ 05:14.000 --> 05:23.000
130
+ The first takeaway is the need to design statistical procedures carefully for the local privacy setting,
131
+
132
+ 05:23.000 --> 05:31.000
133
+ since this is a setup where a good estimator in the standard framework does not necessarily work.
134
+
135
+ 05:31.000 --> 05:38.000
136
+ The second takeaway is that the plug-in type approach analyzed in this paper
137
+
138
+ 05:38.000 --> 05:43.000
139
+ serves as a benchmark for future work and more sophisticated procedures.
140
+
141
+ 05:43.000 --> 05:51.000
142
+ And the last takeaway is that our analysis of the plug-in type approach and of non-interactive mechanisms
143
+
144
+ 05:51.000 --> 05:56.000
145
+ reveals regimes where the estimation problem is hard,
146
+
147
+ 05:56.000 --> 06:01.000
148
+ and we hope this encourages people to bring new developments here.
149
+
150
+ 06:01.000 --> 06:08.000
151
+ Thank you all, and for more details, please check our paper online.
152
+
153
+ 06:08.000 --> 06:22.000
154
+ Bye!
155
+