diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..05ebbf6f5aa84c62da8f15b97a99e10974d19581
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,40 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+video.mp4 filter=lfs diff=lfs merge=lfs -text
+*.psd filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+demo_data/lectures/*/*.mp4 filter=lfs diff=lfs merge=lfs -text
+demo_data/*/.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..24290bf23229ffe2d6e1501bc9815770e28ac062
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+---
+title: Chaptering Demo (YTSeg & MiniSeg)
+emoji: ⚡
+colorFrom: blue
+colorTo: blue
+sdk: streamlit
+sdk_version: 1.32.2
+app_file: app.py
+pinned: false
+license: other
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..b00a09392c52c98e5478c7ad1aa4591bca601699
--- /dev/null
+++ b/app.py
@@ -0,0 +1,256 @@
+import itertools
+import json
+import re
+from functools import partial
+from pathlib import Path
+
+import pandas as pd
+import requests
+import streamlit as st
+
+from generate_text_api import SummarizerGenerator
+from model_inferences.utils.files import get_captions_from_vtt, get_transcript
+
+USE_PARAGRAPHING_MODEL = True
+
+def get_sublist_by_flattened_index(A, i):
+    # Map a flattened item index i to (containing sublist, index of that sublist in A).
+    current_index = 0
+    for sublist_idx, sublist in enumerate(A):
+        sublist_length = len(sublist)
+        if current_index <= i < current_index + sublist_length:
+            return sublist, sublist_idx
+        current_index += sublist_length
+    return None, None
+
+
+def get_talk_metadata(video_id):
+    url = "https://www.ted.com/graphql"
+
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "x-operation-name": "Transcript",
+    }
+
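+    # Illustrative sketch (not verified against TED's GraphQL API): the query below
+    # requests title, presenterDisplayName and nativeDownloads, and the callers of
+    # this function assume a response shaped roughly like
+    #   {"data": {"video": {"title": "...",
+    #                       "presenterDisplayName": "...",
+    #                       "nativeDownloads": {"medium": "https://..."}}}}
+    # where "nativeDownloads" may be null, in which case the talk is skipped.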
+    data = {
+        "query": """
+        query GetTalk($videoId: ID!) {
+          video(id: $videoId) {
+            title,
+            presenterDisplayName,
+            nativeDownloads {medium}
+            }
+        }
+        """,
+        "variables": {
+            "videoId": video_id,
+        },
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+
+    if response.status_code == 200:
+        result = response.json()
+        return result
+    else:
+        print(f"Error: {response.status_code}, {response.text}")
+
+class OfflineTextSegmenterClient:
+    # Minimal HTTP client for the chapterizing / paragraphing segmentation service.
+    def __init__(self, host_url):
+        self.host_url = host_url.rstrip("/") + "/segment"
+
+    def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
+        payload = {
+            'text': text,
+            'captions': captions,
+            'generate_titles': generate_titles,
+            "prefix_titles": True,
+            "threshold": threshold,
+        }
+
+        headers = {
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
+        return {'segments': response["segments"], 'titles': response["titles"], 'sentences': response["sentences"]}
+
+class Toc:
+    # Collects headings while rendering and fills a placeholder with a linked table of contents.
+
+    def __init__(self):
+        self._items = []
+        self._placeholder = None
+
+    def title(self, text):
+        self._markdown(text, "h1")
+
+    def header(self, text):
+        self._markdown(text, "h2", " " * 2)
+
+    def subheader(self, text):
+        self._markdown(text, "h3", " " * 4)
+
+    def placeholder(self, sidebar=False):
+        self._placeholder = st.sidebar.empty() if sidebar else st.empty()
+
+    def generate(self):
+        if self._placeholder:
+            self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
+
+    def _markdown(self, text, level, space=""):
+        key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())
+        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
+        self._items.append(f"{space}* [{text}](#{key})")
+
+endpoint = "http://hiaisc.isl.iar.kit.edu/summarize/summarize_stream"
+
+client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapterize")
+if USE_PARAGRAPHING_MODEL:
+    paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
+summarizer = SummarizerGenerator(endpoint)
+
+
+def replace_newlines(text):
+    updated_text = re.sub(r'\n+', r'\n\n', text)
+    return updated_text
+
+def generate_summary(summarizer, generated_text_box, input_, prefix=""):
+    all_generated_text = prefix
+    for generated_text in summarizer.generate_summary_stream(input_):
+        all_generated_text += replace_newlines(generated_text)
+        generated_text_box.info(all_generated_text)
+        print(all_generated_text)
+    return all_generated_text.strip()
+
+st.header("Demo: Intelligent Recap")
+
+if not hasattr(st, 'global_state'):
+    st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
+    # NIPS 2021 Talks
+    transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
+    # get titles from metadata.json
+    transcripts_map = {}
+    for transcript_file in transcript_files:
+        base_path = transcript_file.parent
+        metadata = base_path / "metadata.json"
+        txt_file = base_path / "transcript_whisper_large-v2.txt"
+        with open(metadata) as f:
+            metadata = json.load(f)
+        title = metadata["title"]
+        transcript = get_transcript(txt_file)
+        captions = get_captions_from_vtt(transcript_file)
+        transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
+    st.global_state['NIPS 2021 Talks'] = transcripts_map
+
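+    # Illustrative note: each transcripts_map entry built in this block has the form
+    #   {"<title>": {"transcript": "<plain text>", "video": <path or URL>, ...}}
+    # where extra keys such as "captions" (NIPS talks) or "presenter" (TED talks)
+    # are only present for some sources and are treated as optional downstream.
+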
+    data = pd.read_json("demo_data/ted_talks.json")
+    video_ids = data.talk_id.tolist()
+    transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
+    transcripts_map = {}
+    for video_id, transcript in zip(video_ids, transcripts):
+        metadata = get_talk_metadata(video_id)
+        title = metadata["data"]["video"]["title"]
+        presenter = metadata["data"]["video"]["presenterDisplayName"]
+        print(metadata["data"])
+        if metadata["data"]["video"]["nativeDownloads"] is None:
+            continue
+        video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
+        transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
+    st.global_state['TED Talks'] = transcripts_map
+
+    def get_lecture_id(path):
+        return int(path.parts[-2].split('-')[1])
+
+    transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
+    sorted_path_list = sorted(transcript_files, key=get_lecture_id)
+
+    transcripts_map = {}
+    for transcript_file in sorted_path_list:
+        base_path = transcript_file.parent
+        lecture_id = base_path.parts[-1]
+        transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
+        video_path = Path(base_path, "video.mp4")
+        transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
+    st.global_state['KIT Lectures'] = transcripts_map
+
+type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
+
+transcripts_map = st.global_state[type_of_document]
+
+selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
+
+st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
+
+input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
+
+toc = Toc()
+
+summarization_todos = []
+
+with st.expander("Adjust Thresholds"):
+    threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
+    paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
+
+if st.button("Process Transcript"):
+    with st.sidebar:
+        st.header("Table of Contents")
+        toc.placeholder()
+
+    st.header(selected_talk, divider='rainbow')
+    # if 'presenter' in transcripts_map[selected_talk]:
+    #     st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
+
+    captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
+    result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
+    if USE_PARAGRAPHING_MODEL:
+        presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
+        paragraphs = presult['segments']
+    segments, titles, sentences = result['segments'], result['titles'], result['sentences']
+
+    if USE_PARAGRAPHING_MODEL:
+        prev_chapter_idx = 0
+        prev_paragraph_idx = 0
+        segment = []
+        for i, sentence in enumerate(sentences):
+            chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
+            paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
+
+            if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
+                print("Chapter / Chapter & Paragraph")
+                segment_text = " ".join(segment)
+                toc.subheader(titles[prev_chapter_idx])
+                if len(segment_text) > 450:
+                    generated_text_box = st.info("")
+                    summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, 
segment_text)) + st.write(segment_text) + segment = [] + elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx: + print("Paragraph") + segment.append("\n\n") + + segment.append(sentence) + + prev_chapter_idx = chapter_idx + prev_paragraph_idx = paragraph_idx + + segment_text = " ".join(segment) + toc.subheader(titles[prev_chapter_idx]) + generated_text_box = st.info("") + summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text)) + st.write(segment_text) + + else: + segments = [" ".join([sentence for sentence in segment]) for segment in segments] + for title, segment in zip(titles, segments): + toc.subheader(title) + generated_text_box = st.info("") + summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment)) + st.write(segment) + toc.generate() + +for summarization_todo in summarization_todos: + summarization_todo() diff --git a/demo_data/lectures/Lecture-01-18.04.2023/English.vtt b/demo_data/lectures/Lecture-01-18.04.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..beee34f353fae9bf89c684c47653d8f502e3b4f7 --- /dev/null +++ b/demo_data/lectures/Lecture-01-18.04.2023/English.vtt @@ -0,0 +1,2582 @@ +WEBVTT + +0:00:00.000 --> 0:00:10.115 +That easy to say this is a good translation +and this is a bad translation. + +0:00:10.115 --> 0:00:12.947 +How can we evaluate? + +0:00:13.413 --> 0:00:26.083 +We will put an emphasis on machine translation +because that is currently the state of the + +0:00:26.083 --> 0:00:26.787 +art. + +0:00:28.028 --> 0:00:35.120 +But we are now focused on the details of neural +networks where we are describing the basic + +0:00:35.120 --> 0:00:39.095 +ideas and how to use the info machine translation. + +0:00:39.095 --> 0:00:41.979 +This is not a neural network course. + +0:00:42.242 --> 0:00:49.574 +If you have some background in Neo Networks, +that is of course of an advantage, but it should + +0:00:49.574 --> 0:00:51.134 +not be a challenge. + +0:00:51.134 --> 0:00:58.076 +If you have not done the details, we'll shortly +cover the background and the main ideas. + +0:00:58.076 --> 0:01:00.338 +How can we use them for for? + +0:01:00.280 --> 0:01:06.880 +Machine translation: We will starve the first +two, three lectures with some like more traditional + +0:01:06.880 --> 0:01:12.740 +approaches how they work because they still +give some good intuition, some good ideas. + +0:01:12.872 --> 0:01:17.141 +And they help us to understand where our systems +might be better. + +0:01:17.657 --> 0:01:22.942 +And yeah, we have an innocence on really what +do we need to do to build a strong system. + +0:01:23.343 --> 0:01:35.534 +And then we have a part on experience where +it's about how to build the systems and how + +0:01:35.534 --> 0:01:37.335 +to apply it. + +0:01:39.799 --> 0:01:47.774 +For additional reading materials, so we have +the slides on the website. + +0:01:47.774 --> 0:01:55.305 +There is also links to papers which cover +the topic of the lecture. + +0:01:55.235 --> 0:01:58.436 +If You'd Like to Study Additional Books. + +0:01:59.559 --> 0:02:07.158 +Think the most relevant is this machine translation +from Philip Kurnan, which gives an introduction + +0:02:07.158 --> 0:02:09.210 +about machine translation. + +0:02:09.210 --> 0:02:15.897 +But this lecture is, of course, not a one +to one like we don't go through the book, but + +0:02:15.897 --> 0:02:17.873 +it covers related topics. 
+ +0:02:18.678 --> 0:02:25.094 +Is a previous version of that statistical +machine translation focusing on that part, + +0:02:25.094 --> 0:02:28.717 +and we cover some of that part rather than +all. + +0:02:28.717 --> 0:02:35.510 +If you want to have more basics about natural +language processing, this might be helpful. + +0:02:39.099 --> 0:02:53.738 +In addition, there is an online course on +machine translation which we also develop here + +0:02:53.738 --> 0:02:57.521 +at which is available. + +0:02:57.377 --> 0:03:04.894 +Input where you're, of course, free to use +that I might give you some other type of presentation + +0:03:04.894 --> 0:03:07.141 +of the lecture important is. + +0:03:07.141 --> 0:03:14.193 +It's, of course, a lot shorter and book doesn't +cover all the topics which you're covering + +0:03:14.193 --> 0:03:15.432 +in the lecture. + +0:03:15.655 --> 0:03:19.407 +So, of course, for the exam everything which +was in the lecture is important. + +0:03:19.679 --> 0:03:25.012 +This covers like the first half where don't +know exactly the first X lectures. + +0:03:26.026 --> 0:03:28.554 +Feel free to have a look at that. + +0:03:28.554 --> 0:03:29.596 +It's shorter. + +0:03:29.596 --> 0:03:36.438 +Maybe there's some of you interesting to have +very short videos or after the lecture single + +0:03:36.438 --> 0:03:39.934 +this topic I didn't understand want to repeat. + +0:03:40.260 --> 0:03:50.504 +Then this might be helpful, but it's important +that there is more content in the lecture. + +0:03:53.753 --> 0:04:02.859 +The exam will be minutes and oral exam and +just make an appointment and then. + +0:04:05.305 --> 0:04:09.735 +If you think this is a really cool topic, +want to hear more. + +0:04:09.735 --> 0:04:14.747 +There's two similars, one on advanced topics +in machine translation. + +0:04:15.855 --> 0:04:24.347 +Which is every Thursday and there is one which +was already on Monday. + +0:04:24.347 --> 0:04:34.295 +But if you're interested in speech translation +to contact us and there, I think,. + +0:04:34.734 --> 0:04:47.066 +Then there are other lectures, one more learning +by Professor Vival, and for us some of you + +0:04:47.066 --> 0:04:48.942 +have already. + +0:04:48.888 --> 0:04:55.496 +Lecture, which is related but of discovering +more general natural language processing than + +0:04:55.496 --> 0:04:57.530 +will be again available in. + +0:04:57.597 --> 0:05:07.108 +Winter semester, and then we are concentrating +on the task of machine translation and mighty. + +0:05:11.191 --> 0:05:14.630 +Yeah, and also there's an automatic speech +emission problem. + +0:05:16.616 --> 0:05:27.150 +And this is a bit what we are planning to +talk about in this semester. + +0:05:27.150 --> 0:05:30.859 +Today we have a general. + +0:05:31.371 --> 0:05:37.362 +Then on Thursday we are doing a bit of a different +lecture and that's about the linguistic. + +0:05:37.717 --> 0:05:42.475 +It may be quite different from what you're +more computer scientist, what you've done there, + +0:05:42.475 --> 0:05:43.354 +but don't worry. + +0:05:43.763 --> 0:05:49.051 +We're coming in a very basic thing that I +think it's important if you're dealing with + +0:05:49.051 --> 0:05:53.663 +natural language to have a bit of an understanding +of what language isn't. + +0:05:53.663 --> 0:05:59.320 +Maybe I've learned about that in high school, +but also for you this I guess some years ago. + +0:05:59.619 --> 0:06:07.381 +And so it's a bit of yeah, it better understand +also what other challenges there. 
+ +0:06:07.307 --> 0:06:16.866 +And especially since we are all dealing with +our mother time, it may be English, but there + +0:06:16.866 --> 0:06:25.270 +is a lot of interesting phenomena which would +not occur in these two languages. + +0:06:25.625 --> 0:06:30.663 +And therefore we'll also look a bit into what +are things which might happen in other languages. + +0:06:30.930 --> 0:06:35.907 +If we want to build machine translation, of +course we want to build machine Translation + +0:06:35.907 --> 0:06:36.472 +for many. + +0:06:38.178 --> 0:06:46.989 +Then we will see a lot of these machine learning +based how to get the data and process the data + +0:06:46.989 --> 0:06:47.999 +next week. + +0:06:48.208 --> 0:07:03.500 +And then we'll have one lecture about statistical +machine translation, which was the approach + +0:07:03.500 --> 0:07:06.428 +for twenty years. + +0:07:07.487 --> 0:07:17.308 +And then maybe surprisingly very early we'll +talk about evaluation and this is because evaluation + +0:07:17.308 --> 0:07:24.424 +is really essential for machine translation +and it's very challenging. + +0:07:24.804 --> 0:07:28.840 +To decide if machine translation output is +good or bad is really challenging. + +0:07:29.349 --> 0:07:38.563 +If you see another translation for a machine +to decide is not as difficult and even for + +0:07:38.563 --> 0:07:48.387 +a machine translation output and ask them to +rate, you'll get three different answers: And + +0:07:48.387 --> 0:07:55.158 +so it's worse to investigate it, and of course +it's also important to have that at the beginning + +0:07:55.158 --> 0:08:01.928 +because if we're later talking about some techniques, +it will be always saying this technique is + +0:08:01.928 --> 0:08:03.813 +better by x percent or so. + +0:08:04.284 --> 0:08:06.283 +And we'll also have a practical good course +of this. + +0:08:06.746 --> 0:08:16.553 +Then we're going to build language models +which are in point to translation models. + +0:08:16.736 --> 0:08:28.729 +After the half you have a basic understanding +of what and basic machine translation. + +0:08:29.029 --> 0:08:39.065 +And then on the second part of the lecture +we will cover more advanced topics. + +0:08:39.065 --> 0:08:42.369 +What are the challenging? + +0:08:43.463 --> 0:08:48.035 +One challenge is, of course, about additional +resources about data. + +0:08:48.208 --> 0:08:53.807 +So the question is how can we get more data +or better data and their different ways of + +0:08:53.807 --> 0:08:54.258 +doing? + +0:08:54.214 --> 0:09:00.230 +Our thralling data will look into our building +systems which not translate between one language + +0:09:00.230 --> 0:09:06.122 +but which translate between fifteen languages +and youth knowledge and share knowledge between + +0:09:06.122 --> 0:09:09.632 +the language so that for each pair they need +less data. + +0:09:11.751 --> 0:09:19.194 +And then we'll have something about efficiency. + +0:09:19.194 --> 0:09:27.722 +That is, of course, with more and more complex +models. + +0:09:27.647 --> 0:09:33.053 +Because then nobody can afford to do that, +so how can you build really efficient things? + +0:09:33.393 --> 0:09:38.513 +Who also like energy is getting more expensive +so it's even more important to build systems. + +0:09:39.419 --> 0:09:43.447 +We're Looking to Biases So. + +0:09:43.423 --> 0:09:50.364 +That is a machine translation quite interesting +because some information are represented different + +0:09:50.364 --> 0:09:51.345 +in languages. 
+ +0:09:51.345 --> 0:09:55.552 +So if you think about German, there is always +clear or not. + +0:09:55.552 --> 0:10:00.950 +But in a lot of situations, it's clear if +you talk about to teach her about. + +0:10:01.321 --> 0:10:03.807 +Another Person If It's Male or Female. + +0:10:04.204 --> 0:10:13.832 +From English to German you don't have this +information, so how do you generate that and + +0:10:13.832 --> 0:10:15.364 +what systems? + +0:10:15.515 --> 0:10:24.126 +Will just assume things and we'll see that +exactly this is happening, so in order to address + +0:10:24.126 --> 0:10:27.459 +these challenges and try to reduce. + +0:10:28.368 --> 0:10:35.186 +The main adaptation is what I said that beginning +systems are good at the task they are trained. + +0:10:35.186 --> 0:10:37.928 +But how can we adapt them to new task? + +0:10:38.959 --> 0:10:51.561 +Document level is doing more context and we +have two lectures about speech translation, + +0:10:51.561 --> 0:10:56.859 +so mostly before we are translating. + +0:10:57.117 --> 0:11:00.040 +Are now translating audio things. + +0:11:00.040 --> 0:11:05.371 +We have just additional challenges and these +we will address. + +0:11:10.450 --> 0:11:22.165 +So to the motivation, why should you work +on the theme translation and why should you + +0:11:22.165 --> 0:11:23.799 +put effort? + +0:11:24.224 --> 0:11:30.998 +So we want or we are living in a more global +society. + +0:11:30.998 --> 0:11:37.522 +You have now the chance to communicate with +people. + +0:11:37.897 --> 0:11:44.997 +And the danger of course is that languages +are dying, and more and more languages are + +0:11:44.997 --> 0:11:45.988 +going away. + +0:11:46.006 --> 0:11:53.669 +I think at least that some opportunity in +order to keep more languages is that we have + +0:11:53.669 --> 0:12:01.509 +technology solutions which help you to speak +in your language and still communicate with + +0:12:01.509 --> 0:12:04.592 +people who speak another language. + +0:12:04.864 --> 0:12:16.776 +And on the one hand there is the need and +more and more people want to speak in some + +0:12:16.776 --> 0:12:19.159 +other languages. + +0:12:19.759 --> 0:12:27.980 +For example, Iceland was really keen on getting +Icelandic into commercial systems and they + +0:12:27.980 --> 0:12:36.471 +even provided data and so on because they wanted +that their language is spoken longer and not + +0:12:36.471 --> 0:12:38.548 +just people switching. + +0:12:38.959 --> 0:12:47.177 +So there's even like yeah, they were spending +for promoting this language in order to have + +0:12:47.177 --> 0:12:55.125 +all these digital tools available for languages +which are not spoken by so many people. + +0:12:56.156 --> 0:13:07.409 +So it's questionable and it's not completely +clear technology always provides. + +0:13:10.430 --> 0:13:25.622 +If we think about machine translation, there +are different use cases in which you can use + +0:13:25.622 --> 0:13:26.635 +that. + +0:13:27.207 --> 0:13:36.978 +And this has some characteristics: So typically +in this case it is where machine translation + +0:13:36.978 --> 0:13:40.068 +was used first anybody. + +0:13:40.780 --> 0:13:50.780 +Because most youth outlets around the world +report at least some of the same events, like + +0:13:50.780 --> 0:13:58.669 +was probably covered around the world in a +lot of different languages. + +0:13:59.279 --> 0:14:08.539 +That is one point yes, so the training gator +is there. 
+ +0:14:08.539 --> 0:14:16.284 +That's definitely a good point here and then. + +0:14:17.717 --> 0:14:19.425 +Yes, there was my regional idea. + +0:14:19.425 --> 0:14:23.256 +The motivation program was a bit different +by you, but it's a good point. + +0:14:23.256 --> 0:14:26.517 +So on the one end you'll understand maybe +not perfect English. + +0:14:26.517 --> 0:14:30.762 +Also, it's for his personal use, so you're +using machine translation for you use. + +0:14:31.311 --> 0:14:37.367 +It's not as important that this is really +perfect written text, but you're more interested + +0:14:37.367 --> 0:14:38.564 +in understanding. + +0:14:38.858 --> 0:14:45.570 +Maybe it's more clearer if you think about +the other situation where it's about dissimination + +0:14:45.570 --> 0:14:48.926 +that means producing text in another language. + +0:14:48.926 --> 0:14:55.138 +So just imagine you have a website or you +have a restaurant and you want to offer your + +0:14:55.138 --> 0:14:55.566 +menu. + +0:14:56.476 --> 0:15:01.948 +And in this case maybe you want to have a +higher quality because in some of your. + +0:15:01.901 --> 0:15:06.396 +You're presenting something of yourself and +you want to have good quality. + +0:15:06.396 --> 0:15:11.490 +Just remember you're writing a letter and +if you're translating your letter then you + +0:15:11.490 --> 0:15:17.123 +don't want to have it full of mistakes because +it's somehow a bad, bad oppression but if it's + +0:15:17.123 --> 0:15:20.300 +assimilation it's about you getting the information. + +0:15:20.660 --> 0:15:25.564 +So here you want your disciplination, you're +producing texts for another language. + +0:15:26.006 --> 0:15:31.560 +And then you have the disadvantage that you +maybe want to have a higher quality. + +0:15:31.831 --> 0:15:43.432 +Therefore, typically there is less amount, +so normally you're getting more information + +0:15:43.432 --> 0:15:46.499 +than you're producing. + +0:15:49.109 --> 0:15:57.817 +Then of course there is a dynamic scenario +where there is some type of interaction and + +0:15:57.817 --> 0:16:07.099 +the one thing which is interesting about the +dialogue scenario is there is: So if you're + +0:16:07.099 --> 0:16:18.045 +translating a website you have all the data +available but in a dialogue scenario you. + +0:16:18.378 --> 0:16:23.655 +And we'll see that in speech recognition this +is a big challenge. + +0:16:23.655 --> 0:16:30.930 +Just to mention German where in German the +work is often more at the end, so each harmony. + +0:16:32.052 --> 0:16:36.343 +Know that you want to generate the English +sentence. + +0:16:36.343 --> 0:16:42.740 +Now you need to know if you cancel this registration +to produce a second word. + +0:16:42.740 --> 0:16:49.785 +So you have to either guess or do something +in order to provide the translation before + +0:16:49.785 --> 0:16:52.052 +the translation is already. + +0:16:57.817 --> 0:17:00.530 +The question, of course, is in the new world. + +0:17:00.530 --> 0:17:05.659 +I mean, of course, we can, on the one hand, +say we don't want to have English, but the + +0:17:05.659 --> 0:17:10.789 +question is do we really need that many languages +and how many are here at the moment? + +0:17:11.291 --> 0:17:20.248 +Does anybody have an idea how many languages +are spoken in the world? + +0:17:23.043 --> 0:17:26.510 +This is already the first big challenge. 
+ +0:17:26.510 --> 0:17:34.120 +What a language is and what no language is +is already difficult, and then maybe one point + +0:17:34.120 --> 0:17:40.124 +people have to argue first about written language +or spoken languages. + +0:17:40.400 --> 0:17:47.765 +For written languages I think that number +is still too low, but for a spoken language + +0:17:47.765 --> 0:17:53.879 +people normally think: So you see that it's +really a lot of languages which will be difficult + +0:17:53.879 --> 0:17:54.688 +to all happen. + +0:17:55.035 --> 0:18:00.662 +And these are just like you see Europe where +there's relatively few languages. + +0:18:00.662 --> 0:18:05.576 +You already have quite a lot of languages, +even walls and countries. + +0:18:06.126 --> 0:18:13.706 +Of course sometimes you share the language, +but then you have Briton or Gillesian vest + +0:18:13.706 --> 0:18:17.104 +where you have languages in a country. + +0:18:18.478 --> 0:18:24.902 +And yeah, of course, there's the question: +When does it start to be a language? + +0:18:24.902 --> 0:18:27.793 +And when is it more like a dialect? + +0:18:27.793 --> 0:18:28.997 +So is Catalan? + +0:18:28.997 --> 0:18:31.727 +Is Swiss German a known language? + +0:18:31.727 --> 0:18:33.253 +Or is it the same? + +0:18:33.293 --> 0:18:36.887 +So then, of course, it's are like Czech and +Slovakian. + +0:18:36.887 --> 0:18:42.704 +I know heard that people can understand each +other so they can just continue talking and + +0:18:42.704 --> 0:18:45.711 +understand by some of their own language and. + +0:18:46.026 --> 0:18:56.498 +Of course, it's partly also like about your +own nationality, so I think some people said + +0:18:56.498 --> 0:18:57.675 +creation. + +0:18:58.018 --> 0:19:04.957 +But think for a lot of people you shouldn't +say that they are part of being creation language. + +0:19:05.165 --> 0:19:10.876 +But you see therefore that it is not completely +clear that there is no hardwater between this + +0:19:10.876 --> 0:19:13.974 +and the new language, and this is a different +one. + +0:19:14.094 --> 0:19:19.403 +And of course it's getting more fluent when +you talk about scientific things. + +0:19:19.403 --> 0:19:25.189 +I guess sometimes it's no longer clear if +it's German or English because we start to + +0:19:25.189 --> 0:19:27.707 +use a lot of English terms in there. + +0:19:27.707 --> 0:19:31.519 +So of course there's interesting mixes which +will talk. + +0:19:33.193 --> 0:19:38.537 +So should everybody just speak English, and +these numbers are a bit older, have to admit: + +0:19:38.938 --> 0:19:47.124 +However, I don't think they're completely different +now and it says like how many people know in + +0:19:47.124 --> 0:19:54.718 +Europe can speak English for countries where +English is not the mothertown or for people. + +0:19:54.995 --> 0:20:06.740 +In some countries like smaller ones, for smaller +countries you have quite high numbers. + +0:20:07.087 --> 0:20:13.979 +However, there are many countries where you +have like twenty to thirty percent of the population, + +0:20:13.979 --> 0:20:16.370 +only being able to speak English. + +0:20:16.370 --> 0:20:22.559 +So if we would only do everything only in +English, we would exclude half the population + +0:20:22.559 --> 0:20:23.333 +of Europe. + +0:20:23.563 --> 0:20:30.475 +And therefore providing translations is very +important and therefore, for example, the European + +0:20:30.475 --> 0:20:35.587 +Parliament puts a really large amount of money +into doing translation. 
+ +0:20:35.695 --> 0:20:40.621 +So that's why you can speak in your mother +too in the European Parliament. + +0:20:40.621 --> 0:20:46.204 +Everybody like everyone elected there can +speak in there and they were translated to + +0:20:46.204 --> 0:20:52.247 +all the other languages and it's a huge effort +and so the question is can we do better with + +0:20:52.247 --> 0:20:52.838 +machine. + +0:20:53.493 --> 0:20:58.362 +And for other countries things are even more. + +0:20:58.362 --> 0:21:05.771 +They may be not worse, difficult, but they +are even more challenging. + +0:21:06.946 --> 0:21:13.764 +So there's even more diversity of languages +and it might be even more important to do machines. + +0:21:16.576 --> 0:21:31.034 +If you see how many people speak French, Portuguese +or English, it's relatively few compared to + +0:21:31.034 --> 0:21:33.443 +the population. + +0:21:33.813 --> 0:21:46.882 +So think that this should be around millions +would understand you, but all the others wouldn't. + +0:21:49.289 --> 0:21:54.877 +So it seems to be very important to provide +some taebo translation. + +0:21:54.877 --> 0:21:58.740 +It's a quite big industry as a European Union. + +0:21:58.740 --> 0:22:05.643 +This is already also quite long ago, but it +won't get less spent like in that year. + +0:22:05.643 --> 0:22:08.931 +One point three billion on translation. + +0:22:09.289 --> 0:22:21.315 +So it might be very helpful to have tools +in order to provide them, and as said, not + +0:22:21.315 --> 0:22:26.267 +all directions might be important. + +0:22:26.426 --> 0:22:35.059 +Is even not possible for students, so in the +European Parliament they don't have all combinations + +0:22:35.059 --> 0:22:36.644 +of the different. + +0:22:36.977 --> 0:22:42.210 +And language is so if they want to translate +from Maltese to Estonian or so. + +0:22:42.402 --> 0:22:47.361 +And maybe they have a translator for that, +but there are some directions which don't have + +0:22:47.361 --> 0:22:47.692 +that. + +0:22:47.692 --> 0:22:52.706 +Then they handle directly, but they would +translate first to French, German or or English, + +0:22:52.706 --> 0:22:57.721 +and then there would be a second translator +getting the translation and really translating + +0:22:57.721 --> 0:22:59.154 +to your Italian language. + +0:22:59.299 --> 0:23:06.351 +And it's not always English, so they are really +selecting what is most helpful. + +0:23:06.351 --> 0:23:13.931 +But you see that even in this small setup, +with this large amount of effort in there, + +0:23:13.931 --> 0:23:17.545 +there's not enough ability to translate. + +0:23:19.819 --> 0:23:21.443 +And of course this was text. + +0:23:21.443 --> 0:23:26.538 +Then you have a lot of other things where +you want to, for example, do speech translation. + +0:23:26.538 --> 0:23:31.744 +There is a lot of conferences which currently +are all held in English, which of course might + +0:23:31.744 --> 0:23:35.831 +also not be the best solution if you've gone +to some of the conferences. + +0:23:36.176 --> 0:23:45.964 +You might have heard some accented speech +where people speak a language that is very + +0:23:45.964 --> 0:23:49.304 +different from their mother. + +0:23:49.749 --> 0:23:52.059 +Might be difficult to understand. 
+ +0:23:52.212 --> 0:23:59.123 +We're currently having an effort for example +by ACL, which is the conference organized in + +0:23:59.123 --> 0:24:06.112 +this field to provide these translations into +ten hour languages so that also students who + +0:24:06.112 --> 0:24:06.803 +are not. + +0:24:06.746 --> 0:24:12.446 +That familiar English is able to read the +papers and watch the present case. + +0:24:16.416 --> 0:24:25.243 +So the question is what can you do here and +one interesting solution which we'll cover + +0:24:25.243 --> 0:24:26.968 +in this lecture? + +0:24:27.087 --> 0:24:38.112 +This always comes with a question: is it will +it replace the human? + +0:24:38.112 --> 0:24:40.382 +And yes, the. + +0:24:40.300 --> 0:24:49.300 +Idea, but the question doesn't really happen +and I'm any skeptical about that. + +0:24:49.300 --> 0:24:52.946 +So currently we are not seeing. + +0:24:53.713 --> 0:24:55.807 +So much more effort needed. + +0:24:55.807 --> 0:25:00.294 +Of course, machine translation is now used +as some type of. + +0:25:01.901 --> 0:25:11.785 +If you think about in the European Parliament, +they will have some humans doing their translation + +0:25:11.785 --> 0:25:18.060 +because: If you think about the chancel of +Germany trembling somewhere and quite sure + +0:25:18.060 --> 0:25:18.784 +you want,. + +0:25:19.179 --> 0:25:31.805 +And so it's more like we are augmenting the +possibilities to have more possibilities to + +0:25:31.805 --> 0:25:37.400 +provide translation and travel around. + +0:25:39.499 --> 0:25:53.650 +How can this technology help so machine translation +is one way of dealing with? + +0:25:54.474 --> 0:26:01.144 +Of course, there is other tasks which do even +without machine translation. + +0:26:01.144 --> 0:26:04.613 +Just think about summarize my lecture. + +0:26:04.965 --> 0:26:08.019 +Approaches doing that what they call end to +end. + +0:26:08.019 --> 0:26:11.635 +So you just put an English text and get a +German summary. + +0:26:11.635 --> 0:26:17.058 +However, a good baseline and an important +thing is to either first lecture into German + +0:26:17.058 --> 0:26:22.544 +and then do a summary art, first do a summary +in English and then translation language. + +0:26:23.223 --> 0:26:28.764 +Translation is very important in order to +different application scenarios. + +0:26:28.764 --> 0:26:33.861 +We have that dissemination dialogue but also +information extraction. + +0:26:33.861 --> 0:26:39.993 +So if you want to do like get information +not only from English websites but from. + +0:26:40.300 --> 0:26:42.427 +Very different websites. + +0:26:42.427 --> 0:26:46.171 +It's helpful to have this type of solution. + +0:26:50.550 --> 0:26:52.772 +Yeah, what can you translate? + +0:26:52.772 --> 0:26:59.660 +Of course, we will focus on text, as I said +for most of them, because it's about translation + +0:26:59.660 --> 0:27:06.178 +and anything first translates to text, and +then change to text, and then we can do text + +0:27:06.178 --> 0:27:07.141 +translation. + +0:27:09.189 --> 0:27:19.599 +And text is not equals text, so we can do +translation that is some of the most common. + +0:27:19.499 --> 0:27:27.559 +Is working on translation, so just imagine +you are developing your new. + +0:27:27.947 --> 0:27:34.628 +Nowadays you don't want to have to only be +available in English or German books in as + +0:27:34.628 --> 0:27:40.998 +many languages as possible, and if you use +the standard tools it's not that easy. 
+ +0:27:41.141 --> 0:27:50.666 +We have a different type of domain and there +again we have very few contexts. + +0:27:50.666 --> 0:27:56.823 +Normally we translate: To pick up an app you +have the menu and there's like safe. + +0:27:57.577 --> 0:28:02.535 +And then you only have safe. + +0:28:02.535 --> 0:28:14.845 +How should translate safe should it be written +or should it be spicing? + +0:28:16.856 --> 0:28:24.407 +Then, of course, if you have like files, it +might be that you have meta data to transport. + +0:28:26.466 --> 0:28:27.137 +Novels. + +0:28:27.137 --> 0:28:32.501 +Some work on that, but yeah, that's always +a typical criticism. + +0:28:32.501 --> 0:28:36.440 +You'll never be able to translate Shakespeare. + +0:28:36.656 --> 0:28:43.684 +Think this is somehow the last use case of +machine translation. + +0:28:43.684 --> 0:28:47.637 +For a translation of books there's. + +0:28:47.847 --> 0:28:57.047 +But the nice thing about machine translation +is that it can translate to things which are + +0:28:57.047 --> 0:29:05.327 +boring, so think about translating some bureaucrative +forms or some regulations. + +0:29:05.565 --> 0:29:11.302 +This is normally not very interesting, it's +very repetitive, so their automation works + +0:29:11.302 --> 0:29:11.697 +well. + +0:29:11.931 --> 0:29:17.519 +Of course, there is also translations on Paibos +images. + +0:29:17.519 --> 0:29:24.604 +I guess you point your camera to an object +where it translates things. + +0:29:25.005 --> 0:29:43.178 +And we'll cover that at the end, as said, +the speech translation. + +0:29:43.663 --> 0:29:46.795 +So you can't provide the translation of the +lecture. + +0:29:46.795 --> 0:29:50.518 +If I'm five slides further then you would +see the translation. + +0:29:50.518 --> 0:29:52.291 +It might not be very helpful. + +0:29:54.794 --> 0:29:57.062 +We are not speaking as we are written. + +0:29:57.062 --> 0:29:59.097 +It's again like a domain mismatch. + +0:29:59.359 --> 0:30:10.161 +So typically the sentences are not full sentences +and I'm saying this is not the right way to + +0:30:10.161 --> 0:30:19.354 +praise it and if you just read what was written +it might be hard to understand. + +0:30:23.803 --> 0:30:36.590 +We are focusing on the first application scenario +that is fully out of management. + +0:30:37.177 --> 0:30:46.373 +Of course, there are quite interesting application +scenarios for other things where it should + +0:30:46.373 --> 0:30:47.645 +be referred. + +0:30:47.867 --> 0:30:49.695 +Where it's no longer going to be. + +0:30:49.695 --> 0:30:52.436 +We have this tool and it works, but it's a +market. + +0:30:52.436 --> 0:30:57.381 +We have the machine translation system and +the human translator, and they somehow cooperate + +0:30:57.381 --> 0:30:59.853 +and try to be as fast as possible in doing +a. + +0:31:00.380 --> 0:31:12.844 +The easiest idea there would be the first +point you take the machine translation. + +0:31:13.553 --> 0:31:17.297 +That sometimes farther might not be the best +way of suing it. + +0:31:17.357 --> 0:31:25.308 +Any ideas or what else you could do, then +maybe the machine could aid the human and say + +0:31:25.308 --> 0:31:27.838 +I'm sure about this author. + +0:31:28.368 --> 0:31:32.319 +Yeah, very interesting, very good. + +0:31:32.319 --> 0:31:42.252 +Of course, the dangerous thing there is you +asking something from a machine translation + +0:31:42.252 --> 0:31:45.638 +system where it's really bad. 
+ +0:31:45.845 --> 0:31:50.947 +There is quality estimation that maybe it +will couple that in evaluation so in evaluation + +0:31:50.947 --> 0:31:55.992 +you know what is correct translation and you +have another output and you try to estimate + +0:31:55.992 --> 0:31:57.409 +how good is the quality. + +0:31:57.409 --> 0:32:02.511 +In quality estimation you don't have you only +have a source and time and good question is + +0:32:02.511 --> 0:32:03.531 +exactly this one. + +0:32:03.531 --> 0:32:05.401 +Is it a good translation or not? + +0:32:05.665 --> 0:32:12.806 +This might be easier because the system might +not know what translation is. + +0:32:13.053 --> 0:32:23.445 +Human is very good at that for machines that +are difficult, but of course that's an interesting + +0:32:23.445 --> 0:32:24.853 +application. + +0:32:25.065 --> 0:32:32.483 +Be more interactive so that you may be translating +if the human changes the fifth word. + +0:32:32.483 --> 0:32:36.361 +What does it mean for the remaining sentence? + +0:32:36.361 --> 0:32:38.131 +Do I need to change? + +0:32:38.131 --> 0:32:43.948 +There are also things like you don't have +to repeat the same errors. + +0:32:47.767 --> 0:32:57.651 +Hell our automated basemen, you only want +to correct at once and not at all positions. + +0:33:00.000 --> 0:33:21.784 +And then they ask, for example, so before +the translation is done they ask: I'm not directly + +0:33:21.784 --> 0:33:23.324 +aware of that. + +0:33:23.324 --> 0:33:33.280 +I think it's a good way of ending and I think +it's where, especially with more advanced dialogue + +0:33:33.280 --> 0:33:34.717 +strategy and. + +0:33:35.275 --> 0:33:38.831 +Currently think of most of the focus is like +at least determining. + +0:33:39.299 --> 0:33:45.646 +Don't have this information that is already +challenging, so there is quite some work on + +0:33:45.646 --> 0:33:49.541 +quality estimation that I'm missing your information. + +0:33:49.789 --> 0:33:53.126 +But is there something missing? + +0:33:53.126 --> 0:33:59.904 +It's really quite challenging and think that +is where currently. + +0:34:00.260 --> 0:34:05.790 +What is there is there is opportunities to +provide or there is models to directly provide + +0:34:05.790 --> 0:34:06.527 +additional? + +0:34:06.786 --> 0:34:13.701 +You can give them anything you have and provide +them. + +0:34:13.701 --> 0:34:21.129 +It's a similar situation if you're translating +to German. + +0:34:21.641 --> 0:34:31.401 +And it would just guess normally or do some +random guessing always means it's using some + +0:34:31.401 --> 0:34:36.445 +information which should not be really there. + +0:34:36.776 --> 0:34:46.449 +So then you can provide it with an additional +input or you should use formula or non formula. + +0:34:47.747 --> 0:35:04.687 +To know that this information is missing. + +0:35:04.544 --> 0:35:19.504 +Since you're not specifically modeling this, +it's likely that there is a gender difference + +0:35:19.504 --> 0:35:21.805 +in languages. + +0:35:26.046 --> 0:35:39.966 +One are we doing good search on machine translation, +so it's a very important part to ask in natural + +0:35:39.966 --> 0:35:42.860 +language processing. + +0:35:43.283 --> 0:35:49.234 +So of course you have a lot of computer science +thing in there and that's the backbone of. 
+ +0:35:49.569 --> 0:36:01.848 +However, task and understanding you can also +get from information like computational linguistics, + +0:36:01.848 --> 0:36:08.613 +which tell you about what language it's good +to know. + +0:36:08.989 --> 0:36:15.425 +Doesn't mean that in a computer we have to +bottle it exactly the same, but for example + +0:36:15.425 --> 0:36:22.453 +to know that there is something like morphology, +which means how words are built, and that for + +0:36:22.453 --> 0:36:24.746 +some languages it's very easy. + +0:36:24.746 --> 0:36:28.001 +In English there is nearly no worth coming. + +0:36:28.688 --> 0:36:35.557 +Well in Germany you already start for soon +you have like different forms and so on. + +0:36:36.316 --> 0:36:41.991 +And for other languages, for finish, it's +even more complicated with Basque. + +0:36:41.991 --> 0:36:44.498 +I think for some words more than. + +0:36:45.045 --> 0:36:52.098 +So knowing this, of course, gives you some +advice. + +0:36:52.098 --> 0:37:04.682 +How do I look at that now because we'll see +in the basic treat each word as an individual? + +0:37:06.106 --> 0:37:09.259 +Of course there is a lot of interest also +prone from industry. + +0:37:09.259 --> 0:37:10.860 +There is a lot of applications. + +0:37:11.191 --> 0:37:17.068 +There's research groups at Google, Facebook, +and Amazon. + +0:37:17.068 --> 0:37:26.349 +So there's quite a lot of interest in providing +that for German and English it is solved. + +0:37:26.546 --> 0:37:27.569 +Annoucing it's hard. + +0:37:27.569 --> 0:37:31.660 +We're saying that not hard, but of course +we haven't acquired high quality in them. + +0:37:32.212 --> 0:37:39.296 +But there's currently really a large trend +in building other systems for low research + +0:37:39.296 --> 0:37:40.202 +languages. + +0:37:40.480 --> 0:37:53.302 +So there are tasks on last year's task on +translating from Native American languages: + +0:37:53.193 --> 0:37:58.503 +Don't know yet but but five other languages, +so how can you translate from them? + +0:37:58.538 --> 0:38:05.074 +Then you don't have like millions of sentences, +but you might have only the Bible or some more + +0:38:05.074 --> 0:38:05.486 +data. + +0:38:05.486 --> 0:38:08.169 +Then the question is, what can you do? + +0:38:08.169 --> 0:38:09.958 +And how good can you get? + +0:38:14.794 --> 0:38:17.296 +One thing is very important. + +0:38:17.296 --> 0:38:25.751 +Of course, in a lot of A I is to measure the +quality and what you can measure is quite important. + +0:38:25.986 --> 0:38:37.213 +So that's why for many years of regular there +is different evaluation campaigns where people + +0:38:37.213 --> 0:38:38.178 +submit. + +0:38:39.419 --> 0:38:45.426 +We're often part of the statistical machine +translation original, yet now I think it's + +0:38:45.426 --> 0:38:51.019 +a machine translation where it's mostly about +European languages and used texts. + +0:38:51.051 --> 0:38:57.910 +The International Workshop of Spoken Language +Translation, which is translation about lectures + +0:38:57.910 --> 0:39:04.263 +which we are co organizing, and there is a +bovia as I said building strong systems this + +0:39:04.263 --> 0:39:04.696 +time. + +0:39:04.664 --> 0:39:11.295 +This has established translating conference +presentations from English into ten different + +0:39:11.295 --> 0:39:17.080 +languages: And then, of course, you have to +deal with things like special vocabulary. 
+ +0:39:17.037 --> 0:39:23.984 +You think about recurrent real networks are +terms like co-recurrent networks, convolutional + +0:39:23.984 --> 0:39:24.740 +networks. + +0:39:25.545 --> 0:39:29.917 +That might be more difficult to translate +and you also have to decide who I need to translate + +0:39:29.917 --> 0:39:33.359 +or should I keep it in English, and that's +not the same in each language. + +0:39:33.873 --> 0:39:37.045 +In German maybe mostly you keep it. + +0:39:37.045 --> 0:39:44.622 +I think in French people are typically like +wanting to translate as much as possible. + +0:39:44.622 --> 0:39:52.200 +These are then challenges and then, of course, +in Poland where it's also challenging. + +0:39:53.153 --> 0:39:59.369 +I think all of the speakers in the test that +are not native in your speakers, so you need + +0:39:59.369 --> 0:40:05.655 +to translate people with a German accent or +with a French accent or with a Japanese accent + +0:40:05.655 --> 0:40:09.178 +or an English accent, which poison has additional. + +0:40:12.272 --> 0:40:21.279 +Yes, so there is criticism always with new +technologies because people say will never + +0:40:21.279 --> 0:40:23.688 +translate Shakespeare. + +0:40:24.204 --> 0:40:26.845 +Partly agree with the second. + +0:40:26.845 --> 0:40:34.682 +Maybe it's not good at translating Shakespeare, +but there's many people working on that. + +0:40:35.255 --> 0:40:38.039 +Of course, the poison cookie is a challenge. + +0:40:38.858 --> 0:40:44.946 +The thing is here that the cookie chart that +you can't never be sure if the machine translation + +0:40:44.946 --> 0:40:47.546 +system doesn't really mistake somewhere. + +0:40:47.546 --> 0:40:53.316 +So if you can't be sure that there's no error +in there, how can you trust the translation? + +0:40:55.275 --> 0:41:01.892 +That is partly true, on the other hand, otherwise +you have to translate to a human translator + +0:41:01.892 --> 0:41:06.116 +and men who are sometimes overestimating human +performance. + +0:41:06.746 --> 0:41:15.111 +They are very good translators but under a +lot of pressure and not human translations. + +0:41:15.715 --> 0:41:22.855 +The question is: When can you trust it enough +anyway? + +0:41:22.855 --> 0:41:28.540 +You should be careful about trusting them. + +0:41:31.011 --> 0:41:38.023 +And I think some of them are too old now because +it has been shown that it is helpful to have + +0:41:38.023 --> 0:41:41.082 +some type of machine translation system. + +0:41:41.082 --> 0:41:47.722 +Of course, it is not buying the car, so typically +still a system is not working forever. + +0:41:48.048 --> 0:41:56.147 +If you want your dedicated system, which is +good for the task you are, they are typically + +0:41:56.147 --> 0:41:57.947 +not as generalized. + +0:41:58.278 --> 0:42:07.414 +That can translate news and chats, and I don't +know what. + +0:42:07.414 --> 0:42:12.770 +So typically if you want to show. + +0:42:12.772 --> 0:42:18.796 +It's not made for, it has not seen very well +and then you see a bad quality. + +0:42:19.179 --> 0:42:27.139 +But that's also like yeah, therefore you don't +build it. + +0:42:27.139 --> 0:42:42.187 +If you have a sports car and you are driving +off road you should: Yeah, you can also say + +0:42:42.187 --> 0:42:49.180 +the other way around trans machine translation +is already solved, and especially with more + +0:42:49.180 --> 0:42:50.487 +people think so. 
+ +0:42:50.750 --> 0:43:04.275 +However, there is an impressive performance +of machine translation, but it's not stated + +0:43:04.275 --> 0:43:06.119 +of the art. + +0:43:06.586 --> 0:43:11.811 +And yeah, they're good for some domains and +some languages that are even like already. + +0:43:12.572 --> 0:43:27.359 +Have Microsoft has a very super human performance +claiming that their machine translated system. + +0:43:27.467 --> 0:43:38.319 +However, there was one domain use and some +language in Spanish where there is a huge amount + +0:43:38.319 --> 0:43:45.042 +of training data and you can build a very strong +system. + +0:43:45.505 --> 0:43:48.605 +And you even don't have to go to these extreme +cases. + +0:43:48.688 --> 0:43:54.328 +We have worked on Canada, which is a language +in India spoken. + +0:43:54.328 --> 0:44:01.669 +I think by also around eighty million people +so similar to to German that it has. + +0:44:01.669 --> 0:44:07.757 +The quality is significantly worse, it has +significantly less data. + +0:44:08.108 --> 0:44:15.132 +There are still quite a lot of languages where +the quality is not, where you want to have. + +0:44:15.295 --> 0:44:17.971 +Scaling this is not as easy at this thing. + +0:44:17.971 --> 0:44:23.759 +That's why we're also interested in multilingual +systems with the hope that we don't have to + +0:44:23.759 --> 0:44:29.548 +build a system for each possible combination, +but we can build a system which can cover many + +0:44:29.548 --> 0:44:33.655 +tags, many languages and then also need less +data for each other. + +0:44:39.639 --> 0:44:51.067 +With invasion maybe some presentation of everything +is a bit cat that can say the most important. + +0:44:51.331 --> 0:45:09.053 +So machine translation started coming from +information theory in there was this: It's + +0:45:09.053 --> 0:45:13.286 +treating machine translation as encryption +or decryption. + +0:45:13.533 --> 0:45:21.088 +Don't understand it, want to have it in English, +treat it as if it's like encrypted English, + +0:45:21.088 --> 0:45:28.724 +and then apply my decryption algorithm, which +they were working a lot during the Second World + +0:45:28.724 --> 0:45:29.130 +War. + +0:45:29.209 --> 0:45:34.194 +And so if I cannot do this detruction then +this sings a song. + +0:45:34.934 --> 0:45:42.430 +And they based on that they had rules and +so on. + +0:45:42.430 --> 0:45:50.843 +So they had the judge Georgetown experiments +in where. + +0:45:51.691 --> 0:45:57.419 +From English and then they were like wow. + +0:45:57.419 --> 0:46:01.511 +This is solved in some years. + +0:46:01.511 --> 0:46:04.921 +Now we can do sentences. + +0:46:06.546 --> 0:46:18.657 +As you can imagine this didn't really work +out that way, so it's not really happening. + +0:46:18.657 --> 0:46:24.503 +The spirit is willing, but flesh is weak. + +0:46:24.444 --> 0:46:30.779 +Translated it to Russian and then to Germany +and then vodka is good but the meat is rotten. + +0:46:31.271 --> 0:46:39.694 +Think it never really happened this way, but +you can see you can imagine that something + +0:46:39.694 --> 0:46:49.533 +like that could happen, and then in in the +there was this report saying: It's more challenging + +0:46:49.533 --> 0:46:56.877 +than expected and the problem is that we have +to invest more. + +0:46:56.877 --> 0:47:02.801 +There's no benefit for doing machine translation. 
+ +0:47:04.044 --> 0:47:09.255 +At least in some other countries there was +a bit, but then for some time there wasn't + +0:47:09.255 --> 0:47:10.831 +that big out of progress. + +0:47:12.152 --> 0:47:26.554 +We have then in the' 70s there were some rule +based systems that would cover out some linguistic + +0:47:26.554 --> 0:47:28.336 +background. + +0:47:28.728 --> 0:47:34.013 +They are now doing very good machine translation, +but they had a really huge rule base. + +0:47:34.314 --> 0:47:43.538 +So they really have like handwritten roots +how to parse sentences, how to translate parse + +0:47:43.538 --> 0:47:45.587 +sentences to parse. + +0:47:46.306 --> 0:47:55.868 +When which word should be translated, these +rule based systems were quite strong for a + +0:47:55.868 --> 0:47:57.627 +very long time. + +0:47:57.917 --> 0:48:03.947 +So even in or so for some language fares and +some remains, it was better than a machine + +0:48:03.947 --> 0:48:04.633 +learning. + +0:48:05.505 --> 0:48:09.576 +Well, of course, there was a lot of effort +in and a lot of experts were building this. + +0:48:11.791 --> 0:48:13.170 +And then. + +0:48:13.053 --> 0:48:18.782 +The first statistical machine translations +were coming in the early nineties. + +0:48:18.782 --> 0:48:25.761 +There's the system by IBM will refer to them +as a T by the IBM models, which are quite famous, + +0:48:25.761 --> 0:48:32.886 +and they were used to film your machine translations +from the nineties nineties to two thousand. + +0:48:32.912 --> 0:48:35.891 +Fifteen or so people were working on the IBM +models. + +0:48:36.496 --> 0:48:44.608 +And that was the first way of doing a machine +translation with statisticals or machine learning. + +0:48:44.924 --> 0:48:52.143 +And it was possible through the French English +under a corpusol from the Canadian Parliament + +0:48:52.143 --> 0:48:59.516 +they also had proceedings in French and English +and people tried to use that to translate and. + +0:49:01.681 --> 0:49:06.919 +And yes, so that was than the start of statistical +machine translation. + +0:49:07.227 --> 0:49:17.797 +Is called a phrase page machine translation +was introduced where you could add more information + +0:49:17.797 --> 0:49:26.055 +in use longer chunks to translate and phrase +page translation was somehow. + +0:49:26.326 --> 0:49:27.603 +She'll Start Fourteen. + +0:49:27.767 --> 0:49:37.721 +With this straight space machine sensation +we saw the first commercial systems. + +0:49:38.178 --> 0:49:45.301 +And yeah, that was the first big advantage +where really you can see the machine translation. + +0:49:47.287 --> 0:49:55.511 +And neural machine translation was mainly +introduced. + +0:49:55.511 --> 0:50:07.239 +That means there was a shift from traditional +statistical modeling to using. + +0:50:07.507 --> 0:50:09.496 +And that was quite impressive. + +0:50:09.496 --> 0:50:11.999 +It was really within one or two years. + +0:50:11.999 --> 0:50:17.453 +The whole research community shifted from +what they had been working on since twenty + +0:50:17.453 --> 0:50:17.902 +years. + +0:50:17.902 --> 0:50:23.485 +And everybody was using this pattern, you +know networks, because just the performances + +0:50:23.485 --> 0:50:25.089 +were really really much. + +0:50:25.425 --> 0:50:35.048 +Especially they are what we also see now with +chat boards like the impressive thing. 
+ +0:50:35.135 --> 0:50:45.261 +That was very, very challenging if you see +machine translation before that, especially + +0:50:45.261 --> 0:50:47.123 +if the English. + +0:50:47.547 --> 0:50:53.352 +But if you were transmitting to German you +would see that the agreement so that it's there + +0:50:53.352 --> 0:50:58.966 +shown abound and dishewn and boima and this +didn't always really work perfect maybe for + +0:50:58.966 --> 0:51:04.835 +the short range of work but then it has to +be accusative and it's like far away then things + +0:51:04.835 --> 0:51:06.430 +didn't really work well. + +0:51:06.866 --> 0:51:13.323 +Now with new machine translation we have a +bit of a different problem: So the sentences + +0:51:13.323 --> 0:51:16.901 +are typically really nice. + +0:51:16.901 --> 0:51:24.056 +They are perfectly written not always but +very often. + +0:51:24.224 --> 0:51:36.587 +So that adequacy and their conveillance should +have the same meaning is typically the bigger. + +0:51:42.002 --> 0:51:46.039 +So how can we do so last? + +0:51:46.039 --> 0:51:54.889 +What are the things and how can we do machine +rendering? + +0:51:55.235 --> 0:52:01.297 +So we had first blue based systems, and as +a side systems we did that we manually created + +0:52:01.297 --> 0:52:01.769 +rules. + +0:52:01.861 --> 0:52:07.421 +And there were rules how to dissemvy real +ambiguities. + +0:52:07.421 --> 0:52:16.417 +For example, we had the word banks look at +the context and do rules like to decide when. + +0:52:17.197 --> 0:52:28.418 +How to translate the structure, but you know +how to transfer the structure that you work + +0:52:28.418 --> 0:52:33.839 +has to split it in German and move to the. + +0:52:35.295 --> 0:52:36.675 +Here's a difficult thing. + +0:52:36.675 --> 0:52:39.118 +My thing is you don't need any training data. + +0:52:39.118 --> 0:52:41.295 +It's not like now with machine learning. + +0:52:41.295 --> 0:52:46.073 +If you build a machine translation system, +the first question you should ask is do I have + +0:52:46.073 --> 0:52:46.976 +data to do that? + +0:52:46.976 --> 0:52:48.781 +Do I have parallel data to train? + +0:52:49.169 --> 0:52:50.885 +Here there's no data. + +0:52:50.885 --> 0:52:57.829 +It's like all trades, pencils and roads, but +the problem is people trading the roads and + +0:52:57.829 --> 0:52:59.857 +this needs to be experts. + +0:52:59.799 --> 0:53:06.614 +Understand at least the grammar in one language, +basically the grammar in both languages. + +0:53:06.614 --> 0:53:09.264 +It needs to be a real language to. + +0:53:10.090 --> 0:53:17.308 +Then we have the two corpus based machine +translation approaches, and then we use machine + +0:53:17.308 --> 0:53:22.682 +learning to learn how to translate from one +language to the other. + +0:53:22.882 --> 0:53:29.205 +We should find out ourselves what is the meaning +of individual words, which words translate + +0:53:29.205 --> 0:53:30.236 +to each other. + +0:53:30.236 --> 0:53:36.215 +The only information we give is the German +sentence, the English sentence, and then we + +0:53:36.215 --> 0:53:37.245 +look for many. + +0:53:37.697 --> 0:53:42.373 +So maybe you think there's a Bible for each +language. + +0:53:42.373 --> 0:53:44.971 +There shouldn't be a problem. + +0:53:45.605 --> 0:53:52.752 +But this is not the scale when we're talking +about. + +0:53:52.752 --> 0:54:05.122 +Small systems have maybe one hundred thousand +sentences when we're building large models. 
+ +0:54:05.745 --> 0:54:19.909 +The statistical models do statistics about +how the word screw occur and how often the + +0:54:19.909 --> 0:54:21.886 +word screw. + +0:54:22.382 --> 0:54:29.523 +While we were focused on it was currently +most of the cases referred to as neural communication. + +0:54:30.050 --> 0:54:44.792 +So in this case the idea is that you have +a neural model which is a big neural network. + +0:54:45.345 --> 0:54:55.964 +And for these machine drums there quite challenging +tasks. + +0:54:55.964 --> 0:55:03.883 +For example, this transformal architecture. + +0:55:03.903 --> 0:55:07.399 +Cast by Google in two thousand eight. + +0:55:08.028 --> 0:55:19.287 +Here want to ask the screw-based machine translation +of that part. + +0:55:22.862 --> 0:55:33.201 +Would say it's mainly rule based systems because +purely rule based systems maybe exist with + +0:55:33.201 --> 0:55:36.348 +some very exotic languages. + +0:55:36.776 --> 0:55:43.947 +Of course, the idea of investigating if we +have this type of rulers that might be still + +0:55:43.947 --> 0:55:45.006 +interesting. + +0:55:45.105 --> 0:55:52.090 +Maybe you can try to let someone force the +rules in there. + +0:55:52.090 --> 0:55:57.655 +You might use rules to create artificial data. + +0:55:57.557 --> 0:56:03.577 +That it might be helpful to have some concepts +which develop by bilinguistic researches to + +0:56:03.577 --> 0:56:09.464 +somehow interview that that's still an open +question is sometimes helpful, and of course + +0:56:09.464 --> 0:56:13.235 +is also interesting from more the analyzed +perspectives. + +0:56:13.235 --> 0:56:13.499 +So. + +0:56:13.793 --> 0:56:20.755 +Do the new networks have these types of concepts +of gender or anything? + +0:56:20.755 --> 0:56:23.560 +And can we test that though? + +0:56:30.330 --> 0:56:34.255 +Yes, and then the other way of describing +how this can be done. + +0:56:34.574 --> 0:56:52.021 +And then originally mainly for a rule based +system that can be used for a lot of scenarios. + +0:56:52.352 --> 0:57:04.135 +In real ways, the first world has really direct +translation systems that work for related languages. + +0:57:04.135 --> 0:57:11.367 +You mainly look at each word and replace the +word by the one. + +0:57:11.631 --> 0:57:22.642 +Another idea is that you first do some type +of animus on the source side, so for example + +0:57:22.642 --> 0:57:28.952 +you can create what is referred to as a path +tree. + +0:57:30.150 --> 0:57:36.290 +Or you can instead, and that is what is called +the lingua face approach. + +0:57:36.290 --> 0:57:44.027 +You take the short sentence and parse it into +a semantic representation, which is hopefully + +0:57:44.027 --> 0:57:44.448 +the. + +0:57:44.384 --> 0:57:50.100 +Only of the meaning of what is said and then +you can generate it to any other language because + +0:57:50.100 --> 0:57:55.335 +it has a meaning and then you can need a part +generation which can generate all other. + +0:57:57.077 --> 0:58:09.248 +The idea is somewhat nice to have this type +of interlingua, general representation of all + +0:58:09.248 --> 0:58:17.092 +meanings, and they always translate into the +interlingua. + +0:58:17.177 --> 0:58:19.189 +A Little World and It's Been Somewhere. 
+ +0:58:20.580 --> 0:58:26.684 +It shouldn't be a natural language because +it shouldn't have ambiguities so that's a big + +0:58:26.684 --> 0:58:32.995 +difference so the story and the tiger language +have ambiguities so the idea is they do some + +0:58:32.995 --> 0:58:39.648 +semantic representation or what does it mean +and so on and therefore it's very easy to generate. + +0:58:41.962 --> 0:58:45.176 +However, that is a challenge that this really +exists. + +0:58:45.176 --> 0:58:48.628 +You cannot define the language for anything +in the world. + +0:58:49.249 --> 0:58:56.867 +And that's why the Lingo-based approach typically +worked for small domains to do hotel reservation, + +0:58:56.867 --> 0:59:00.676 +but if you want to define the Lingo for anything. + +0:59:01.061 --> 0:59:07.961 +There have been approaches and semantics, +but it's yeah, it's not really possible CR. + +0:59:07.961 --> 0:59:15.905 +So approaches to this because I mean a seasonal +vector's face and bitch eyes and slaves everything + +0:59:15.905 --> 0:59:20.961 +that I mitonized that they all could end up +in the same space. + +0:59:21.821 --> 0:59:24.936 +That is not the question. + +0:59:24.936 --> 0:59:35.957 +If you talk about neural networks, it's direct +translation on the one you're putting in the + +0:59:35.957 --> 0:59:36.796 +input. + +0:59:36.957 --> 0:59:44.061 +And you can argue for both that we have been +making this representation language agnostic + +0:59:44.061 --> 0:59:45.324 +or independent. + +0:59:47.227 --> 0:59:52.912 +Until now we were able to make it less language +dependent but it's very hard to make it completely + +0:59:52.912 --> 0:59:54.175 +language independent. + +0:59:54.175 --> 0:59:59.286 +Maybe it's also not necessary and of course +if there's again the problem there's not all + +0:59:59.286 --> 1:00:04.798 +information and the source and the target there +is different types of information if you remove + +1:00:04.798 --> 1:00:05.602 +all language. + +1:00:05.585 --> 1:00:09.408 +Information might be that you have removed +too many information. + +1:00:10.290 --> 1:00:15.280 +Talk about this and there's a very interesting +research direction in which we are working + +1:00:15.280 --> 1:00:20.325 +on on the multilingual part because there is +especially the case if we have several source + +1:00:20.325 --> 1:00:25.205 +languages, several type of languages who try +to generate a representation in the middle + +1:00:25.205 --> 1:00:27.422 +which have the few language dependence. + +1:00:32.752 --> 1:00:46.173 +Yes, so for a direct base approach, so as +said the first one is dictionary based approach. + +1:00:46.806 --> 1:00:48.805 +Replace some words with other words. + +1:00:48.805 --> 1:00:51.345 +Then you have exactly the same same structure. + +1:00:51.771 --> 1:00:55.334 +Other problems are one to one correspondence. + +1:00:55.334 --> 1:01:01.686 +Some phrases are expressed with several words +in English, but one word in German. + +1:01:01.686 --> 1:01:03.777 +That's extremely the case. + +1:01:03.777 --> 1:01:07.805 +Just think about all our composites like the +Donau. + +1:01:08.608 --> 1:01:18.787 +Which is used very often as been referred +to as translation memory. + +1:01:18.787 --> 1:01:25.074 +It might seem very simple, but it's like. + +1:01:26.406 --> 1:01:33.570 +That means you might think of this not helpful +at all, but you know think about translating. + +1:01:33.513 --> 1:01:38.701 +The law text is more like the interactive +scenario for the human translator. 
+ +1:01:38.701 --> 1:01:44.091 +In law text there is a lot of repetition and +a lot of phrases occur very often. + +1:01:44.424 --> 1:01:55.412 +The translator has just a background of translation +memory and retrieve all this translation. + +1:01:55.895 --> 1:02:07.147 +There is even another benefit in addition +to less work: That is also precise in the way + +1:02:07.147 --> 1:02:19.842 +know this creates a small mistake in the North +Carolina. + +1:02:20.300 --> 1:02:22.584 +By especially its like consistence,. + +1:02:23.243 --> 1:02:32.954 +If you once translate the sentence this way +you again translate it and especially for some + +1:02:32.954 --> 1:02:36.903 +situations like a company they have. + +1:02:37.217 --> 1:02:47.695 +With this one, of course, you get more consistent +translations. + +1:02:47.695 --> 1:02:56.700 +Each one is a style where phrases maybe are +retrieved. + +1:03:01.861 --> 1:03:15.502 +Then we have these transfer based approaches +where we have three steps: Analysts remain + +1:03:15.502 --> 1:03:25.975 +that you check one synthetic structure, so +for example for morphology the basic. + +1:03:26.286 --> 1:03:37.277 +Then you will do a parstry or dependency structure +that this is the adjective of the balm. + +1:03:37.917 --> 1:03:42.117 +Then you can do the transfer where you transfer +the structure to the other. + +1:03:42.382 --> 1:03:46.633 +There you have to do, for example, it's re-ordering +because the satisfaction is different. + +1:03:46.987 --> 1:03:50.088 +In German, the adjective is before the noun. + +1:03:50.088 --> 1:03:52.777 +In Spanish, it's the other way around. + +1:03:52.777 --> 1:03:59.256 +You have first found and then that it's nice +and these types of rehonoring can be done there. + +1:03:59.256 --> 1:04:04.633 +You might have to do other things like passive +voice to exit voice and so on. + +1:04:05.145 --> 1:04:14.074 +And in some type of lexical transverse it +should like to me: And then you are doing the + +1:04:14.074 --> 1:04:16.014 +generation. + +1:04:16.014 --> 1:04:25.551 +Of course, you would do the agreement if it +is accusative. + +1:04:25.551 --> 1:04:29.430 +What type of adjective? + +1:04:30.090 --> 1:04:32.048 +Is some kind of saving. + +1:04:32.048 --> 1:04:39.720 +Of course, here, because the analyze has only +to be done in the source language, the transfer + +1:04:39.720 --> 1:04:41.679 +has to do on the pairs. + +1:04:41.679 --> 1:04:48.289 +But if you not look German, English and French +through all directions, you only. + +1:04:53.273 --> 1:04:59.340 +Then there is an interlingua card which is +really about the pure meaning, so you have + +1:04:59.340 --> 1:05:00.751 +a semantic grammar. + +1:05:01.061 --> 1:05:07.930 +To represent everything and one thing, one +nice implication is more extreme than before. + +1:05:07.930 --> 1:05:15.032 +You don't have the transfer anymore, so if +you add one language to it and you have already. + +1:05:15.515 --> 1:05:26.188 +If you add the one parting and the one generation +phase, you can now translate from: So you need + +1:05:26.188 --> 1:05:40.172 +components which do the and components which +do the generation, and then you can translate: + +1:05:41.001 --> 1:05:45.994 +You can also do other things like paraphrasing. + +1:05:45.994 --> 1:05:52.236 +You can translate back to the words language +and hopefully. 
+ +1:05:53.533 --> 1:06:05.013 +If you're sparkling trying to analyze it, +it was also down a lot for ungrammetical speech + +1:06:05.013 --> 1:06:11.518 +because the idea is you're in this representation. + +1:06:12.552 --> 1:06:18.679 +Of course, it's very much work and it's only +realistic for limited domains. + +1:06:20.000 --> 1:06:25.454 +Then we're, we're have the campus based approach. + +1:06:25.745 --> 1:06:32.486 +So we'll talk about a lot about peril layer +and what is really peril data is what you know + +1:06:32.486 --> 1:06:34.634 +from the Rosetta stone page. + +1:06:34.634 --> 1:06:41.227 +That is, you have a sewer sentence and you +have a target sentence and you know they need + +1:06:41.227 --> 1:06:42.856 +to watch translation. + +1:06:43.343 --> 1:06:46.651 +And that's important, so the alignment is +typically at a sentence level. + +1:06:46.987 --> 1:06:50.252 +So you know, for each sentence what is a translation? + +1:06:50.252 --> 1:06:55.756 +Not always perfect because maybe there's two +German sentences and one English, but at that + +1:06:55.756 --> 1:06:57.570 +level it's normally possible. + +1:06:57.570 --> 1:07:03.194 +At word level you can't do that because it's +a very complicated thing and sense level that's + +1:07:03.194 --> 1:07:04.464 +normally a relative. + +1:07:05.986 --> 1:07:12.693 +Some type of machine learning which tries +to learn dismapping between sentences on the + +1:07:12.693 --> 1:07:14.851 +English side and sentences. + +1:07:15.355 --> 1:07:22.088 +Of course this doesn't look like good mapping +too complex but you try to find something like + +1:07:22.088 --> 1:07:28.894 +that where it's a very nice mapping so there's +always the mixing things are met to each other + +1:07:28.894 --> 1:07:32.224 +and then if you have the English you can try. + +1:07:32.172 --> 1:07:36.900 +In another English sentence you can apply +the same mannering and hopefully adhere to + +1:07:36.900 --> 1:07:38.514 +the right sentence in terms. + +1:07:38.918 --> 1:07:41.438 +The big problem here. + +1:07:41.438 --> 1:07:44.646 +How can we find this model? + +1:07:44.646 --> 1:07:50.144 +How to map English centers into German centers? + +1:07:54.374 --> 1:08:08.492 +How we do that is that we are trying to maximize +the probability, so we have all the letterstone. + +1:08:09.109 --> 1:08:15.230 +Then we're having some type of model here +which takes the Suez language and translates + +1:08:15.230 --> 1:08:16.426 +it for a target. + +1:08:16.896 --> 1:08:34.008 +And then we are in our translation, and we +are adjusting our model in a way that the probability. + +1:08:34.554 --> 1:08:48.619 +How that is the idea behind it, how we are +pushed now, implement that is part of the bottle. + +1:08:51.131 --> 1:09:01.809 +And then if we want to do translation, what +we are doing is we are trying to find the translation. + +1:09:01.962 --> 1:09:06.297 +So we are scoring many possible translations. + +1:09:06.297 --> 1:09:12.046 +There is an infinite number of sentences that +we are trying. + +1:09:12.552 --> 1:09:18.191 +That may be a bit of a problem when we talk +about confidence because we are always trying + +1:09:18.191 --> 1:09:19.882 +to find the most probable. + +1:09:20.440 --> 1:09:28.241 +And then, of course, we are not really having +intrinsically the possibility to say, oh, I + +1:09:28.241 --> 1:09:31.015 +have no idea in this situation. + +1:09:31.015 --> 1:09:35.782 +But our general model is always about how +can we find? 
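[Editor's note] The training and search procedure described in the cues above has a compact standard formalization; a minimal sketch of it (the notation is ours, not from the lecture: f is a source sentence, e a target sentence, D the sentence-aligned parallel corpus, and θ the model parameters):

```latex
% Training: adjust the model so the reference translations become probable
\hat{\theta} = \arg\max_{\theta} \sum_{(f,e) \in D} \log P_{\theta}(e \mid f)

% Translation: search for the most probable target sentence for a new input f
\hat{e} = \arg\max_{e} P_{\hat{\theta}}(e \mid f)
```

Since the space of target sentences is unbounded, the argmax in the second line can only be approximated during decoding, which is also why the model has no built-in way of signalling "I have no idea here", as noted above.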
+ +1:09:40.440 --> 1:09:41.816 +Think It's. + +1:09:42.963 --> 1:09:44.242 +Get Four More Slides. + +1:09:46.686 --> 1:09:52.025 +So just high level, so for a proper space +this one we won't cover again. + +1:09:52.352 --> 1:10:00.808 +Its example based machine translation was +at the beginning of SMT. + +1:10:00.808 --> 1:10:08.254 +The idea is that you take subparts and combine +them again. + +1:10:08.568 --> 1:10:11.569 +So this will not be really covered here. + +1:10:11.569 --> 1:10:15.228 +Then the statistical machine translation we +will. + +1:10:17.077 --> 1:10:18.773 +Yeah, we will cover next week. + +1:10:19.079 --> 1:10:27.594 +The idea is there that we automatically now, +if we have the sentence alignment, we automatically. + +1:10:27.527 --> 1:10:34.207 +In the sentences, and then we can learn statistical +models of how probable words are translated + +1:10:34.207 --> 1:10:39.356 +to each other, and then the surge is that we +create different hypotheses. + +1:10:39.356 --> 1:10:45.200 +This could be a translation of this part, +this could be a translation of that part. + +1:10:45.200 --> 1:10:47.496 +We give a score to each of them. + +1:10:47.727 --> 1:10:51.584 +The statistical machine manual is where a +lot of work is done. + +1:10:51.584 --> 1:10:54.155 +How can we score how good translation is? + +1:10:54.494 --> 1:11:04.764 +The words can recur this type of structure, +how is it reordered, and then based on that + +1:11:04.764 --> 1:11:08.965 +we search for the best translation. + +1:11:12.252 --> 1:11:19.127 +Then yeah, that one what we'll cover most +of the time is is a neural, a model where we + +1:11:19.127 --> 1:11:21.102 +can use neural networks. + +1:11:21.102 --> 1:11:27.187 +The nice thing is between everything together +before we get some compliment. + +1:11:27.187 --> 1:11:30.269 +Each of them is trained independently. + +1:11:30.210 --> 1:11:34.349 +Which of course has a disadvantage that they +might not best work together. + +1:11:34.694 --> 1:11:36.601 +Here everything is trained together. + +1:11:36.601 --> 1:11:39.230 +The continuous representation will look into +that. + +1:11:39.339 --> 1:11:41.846 +That's very helpful soft. + +1:11:41.846 --> 1:11:50.426 +We then neonetworks are able to learn somehow +the relation between words and that's very + +1:11:50.426 --> 1:11:57.753 +helpful because then we can more easily deal +with words which didn't occur. + +1:12:00.000 --> 1:12:05.240 +One thing just to correlate that to interlingua +based. + +1:12:05.345 --> 1:12:07.646 +So we have this as an actual language. + +1:12:07.627 --> 1:12:11.705 +And if you do an interlingual based approach +but don't take an artificial. + +1:12:11.731 --> 1:12:17.814 +With no ambiguities, but with a natural language +that's referred to as pivot based in tea and + +1:12:17.814 --> 1:12:20.208 +can be done with all the approaches. + +1:12:20.208 --> 1:12:25.902 +So the ideas instead of directly translating +from German to French, you first translate + +1:12:25.902 --> 1:12:29.073 +from German to English and then from English +to. + +1:12:29.409 --> 1:12:40.954 +French where the big advantage is that you +might have a lot more data for these two directions + +1:12:40.954 --> 1:12:43.384 +than you have here. + +1:12:44.864 --> 1:12:54.666 +With this thank you and deserve more questions +and a bit late I'm sorry and then I'll see + +1:12:54.666 --> 1:12:55.864 +you again. 
+ diff --git a/demo_data/lectures/Lecture-01-18.04.2023/video.mp4 b/demo_data/lectures/Lecture-01-18.04.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..377cd10a6e20a54a4db19ed8351d2b72ea4d48a6 --- /dev/null +++ b/demo_data/lectures/Lecture-01-18.04.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f95bffd5a310af38b1ee51daef47a0af905687cbee799c161515f743cb30d0c +size 103388000 diff --git a/demo_data/lectures/Lecture-02-20.04.2023/English.vtt b/demo_data/lectures/Lecture-02-20.04.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..1f56b3e2002268e9c0a75f7503d9cf0a3fc860b1 --- /dev/null +++ b/demo_data/lectures/Lecture-02-20.04.2023/English.vtt @@ -0,0 +1,2984 @@ +WEBVTT + +0:00:01.561 --> 0:00:05.186 +Okay So Um. + +0:00:08.268 --> 0:00:17.655 +Welcome to today's presentation of the second +class and machine translation where we'll today + +0:00:17.655 --> 0:00:25.044 +do a bit of a specific topic and we'll talk +about linguistic backgrounds. + +0:00:26.226 --> 0:00:34.851 +Will cover their three different parts of +the lecture. + +0:00:35.615 --> 0:00:42.538 +We'll do first a very, very brief introduction +about linguistic background in a way that what + +0:00:42.538 --> 0:00:49.608 +is language, what are ways of describing language, +what are a bit serious behind it, very, very + +0:00:49.608 --> 0:00:50.123 +short. + +0:00:50.410 --> 0:00:57.669 +Don't know some of you have listened, think +to NLP in the last semester or so. + +0:00:58.598 --> 0:01:02.553 +So there we did a lot longer explanation. + +0:01:02.553 --> 0:01:08.862 +Here is just because we are not talking about +machine translation. + +0:01:09.109 --> 0:01:15.461 +So it's really focused on the parts which +are important when we talk about machine translation. + +0:01:15.755 --> 0:01:19.377 +Though for everybody who has listened to that +already, it's a bit of a repetition. + +0:01:19.377 --> 0:01:19.683 +Maybe. + +0:01:19.980 --> 0:01:23.415 +But it's really trying to look. + +0:01:23.415 --> 0:01:31.358 +These are properties of languages and how +can they influence translation. + +0:01:31.671 --> 0:01:38.928 +We'll use that in the second part to discuss +why is machine translation more from what we + +0:01:38.928 --> 0:01:40.621 +know about language. + +0:01:40.940 --> 0:01:47.044 +We will see that I mean there's two main things +is that the language might express ideas and + +0:01:47.044 --> 0:01:53.279 +information differently, and if they are expressed +different in different languages we have to + +0:01:53.279 --> 0:01:54.920 +do somehow the transfer. + +0:01:55.135 --> 0:02:02.771 +And it's not purely that we know there's words +used for it, but it's not that simple and very + +0:02:02.771 --> 0:02:03.664 +different. + +0:02:04.084 --> 0:02:10.088 +And the other problem we mentioned last time +about biases is that there's not always the + +0:02:10.088 --> 0:02:12.179 +same amount of information in. + +0:02:12.592 --> 0:02:18.206 +So it can be that there's some more information +in the one or you can't express that few information + +0:02:18.206 --> 0:02:19.039 +on the target. + +0:02:19.039 --> 0:02:24.264 +We had that also, for example, with the example +with the rice plant in Germany, we would just + +0:02:24.264 --> 0:02:24.820 +say rice. + +0:02:24.904 --> 0:02:33.178 +Or in English, while in other countries you +have to distinguish between rice plant or rice + +0:02:33.178 --> 0:02:33.724 +as a. 
+ +0:02:34.194 --> 0:02:40.446 +And then it's not always possible to directly +infer this on the surface. + +0:02:41.781 --> 0:02:48.501 +And if we make it to the last point otherwise +we'll do that next Tuesday or we'll partly + +0:02:48.501 --> 0:02:55.447 +do it only here is like we'll describe briefly +the three main approaches on a rule based so + +0:02:55.447 --> 0:02:59.675 +linguistic motivated ways of doing machine +translation. + +0:02:59.779 --> 0:03:03.680 +We mentioned them last time like the direct +translation. + +0:03:03.680 --> 0:03:10.318 +The translation by transfer the lingua interlingua +bass will do that a bit more in detail today. + +0:03:10.590 --> 0:03:27.400 +But very briefly because this is not a focus +of this class and then next week because. + +0:03:29.569 --> 0:03:31.757 +Why do we think this is important? + +0:03:31.757 --> 0:03:37.259 +On the one hand, of course, we are dealing +with natural language, so therefore it might + +0:03:37.259 --> 0:03:43.074 +be good to spend a bit of time in understanding +what we are really dealing with because this + +0:03:43.074 --> 0:03:45.387 +is challenging these other problems. + +0:03:45.785 --> 0:03:50.890 +And on the other hand, this was the first +way of how we're doing machine translation. + +0:03:51.271 --> 0:04:01.520 +Therefore, it's interesting to understand +what was the idea behind that and also to later + +0:04:01.520 --> 0:04:08.922 +see what is done differently and to understand +when some models. + +0:04:13.453 --> 0:04:20.213 +When we're talking about linguistics, we can +of course do that on different levels and there's + +0:04:20.213 --> 0:04:21.352 +different ways. + +0:04:21.521 --> 0:04:26.841 +On the right side here you are seeing the +basic levels of linguistics. + +0:04:27.007 --> 0:04:31.431 +So we have at the bottom the phonetics and +phonology. + +0:04:31.431 --> 0:04:38.477 +Phones will not cover this year because we +are mainly focusing on text input where we + +0:04:38.477 --> 0:04:42.163 +are directly having directors and then work. + +0:04:42.642 --> 0:04:52.646 +Then what we touch today, at least mention +what it is, is a morphology which is the first + +0:04:52.646 --> 0:04:53.424 +level. + +0:04:53.833 --> 0:04:59.654 +Already mentioned it a bit on Tuesday that +of course there are some languages where this + +0:04:59.654 --> 0:05:05.343 +is very, very basic and there is not really +a lot of rules of how you can build words. + +0:05:05.343 --> 0:05:11.099 +But since I assume you all have some basic +knowledge of German there is like a lot more + +0:05:11.099 --> 0:05:12.537 +challenges than that. + +0:05:13.473 --> 0:05:20.030 +You know, maybe if you're a native speaker +that's quite easy and everything is clear, + +0:05:20.030 --> 0:05:26.969 +but if you have to learn it like the endings +of a word, we are famous for doing compositar + +0:05:26.969 --> 0:05:29.103 +and putting words together. + +0:05:29.103 --> 0:05:31.467 +So this is like the first lab. + +0:05:32.332 --> 0:05:40.268 +Then we have the syntax, which is both on +the word and on the sentence level, and that's + +0:05:40.268 --> 0:05:43.567 +about the structure of the sentence. + +0:05:43.567 --> 0:05:46.955 +What are the functions of some words? + +0:05:47.127 --> 0:05:51.757 +You might remember part of speech text from +From Your High School Time. + +0:05:51.757 --> 0:05:57.481 +There is like noun and adjective and and things +like that and this is something helpful. 
+ +0:05:57.737 --> 0:06:03.933 +Just imagine in the beginning that it was +not only used for rule based but for statistical + +0:06:03.933 --> 0:06:10.538 +machine translation, for example, the reordering +between languages was quite a challenging task. + +0:06:10.770 --> 0:06:16.330 +Especially if you have long range reorderings +and their part of speech information is very + +0:06:16.330 --> 0:06:16.880 +helpful. + +0:06:16.880 --> 0:06:20.301 +You know, in German you have to move the word +the verb. + +0:06:20.260 --> 0:06:26.599 +To the second position, if you have Spanish +you have to change the noun and the adjective + +0:06:26.599 --> 0:06:30.120 +so information from part of speech could be +very. + +0:06:30.410 --> 0:06:38.621 +Then you have a syntax base structure where +you have a full syntax tree in the beginning + +0:06:38.621 --> 0:06:43.695 +and then it came into statistical machine translation. + +0:06:44.224 --> 0:06:50.930 +And it got more and more important for statistical +machine translation that you are really trying + +0:06:50.930 --> 0:06:53.461 +to model the whole syntax tree of a. + +0:06:53.413 --> 0:06:57.574 +Sentence in order to better match how to do +that in UM. + +0:06:57.574 --> 0:07:04.335 +In the target language, a bit yeah, the syntax +based statistical machine translation had a + +0:07:04.335 --> 0:07:05.896 +bitter of a problem. + +0:07:05.896 --> 0:07:08.422 +It got better and better and was. + +0:07:08.368 --> 0:07:13.349 +Just on the way of getting better in some +languages than traditional statistical models. + +0:07:13.349 --> 0:07:18.219 +But then the neural models came up and they +were just so much better in modelling that + +0:07:18.219 --> 0:07:19.115 +all implicitly. + +0:07:19.339 --> 0:07:23.847 +So that they are never were used in practice +so much. + +0:07:24.304 --> 0:07:34.262 +And then we'll talk about the semantics, so +what is the meaning of the words? + +0:07:34.262 --> 0:07:40.007 +Last time words can have different meanings. + +0:07:40.260 --> 0:07:46.033 +And yeah, how you represent meaning of cause +is very challenging. + +0:07:45.966 --> 0:07:53.043 +And normally that like formalizing this is +typically done in quite limited domains because + +0:07:53.043 --> 0:08:00.043 +like doing that for like all possible words +has not really been achieved yet in this very + +0:08:00.043 --> 0:08:00.898 +challenge. + +0:08:02.882 --> 0:08:09.436 +About pragmatics, so pragmatics is then what +is meaning in the context of the current situation. + +0:08:09.789 --> 0:08:16.202 +So one famous example is there, for example, +if you say the light is red. + +0:08:16.716 --> 0:08:21.795 +The traffic light is red so that typically +not you don't want to tell the other person + +0:08:21.795 --> 0:08:27.458 +if you're sitting in a car that it's surprising +oh the light is red but typically you're meaning + +0:08:27.458 --> 0:08:30.668 +okay you should stop and you shouldn't pass +the light. + +0:08:30.850 --> 0:08:40.994 +So the meaning of this sentence, the light, +is red in the context of sitting in the car. + +0:08:42.762 --> 0:08:51.080 +So let's start with the morphology so that +with the things we are starting there and one + +0:08:51.080 --> 0:08:53.977 +easy and first thing is there. + +0:08:53.977 --> 0:09:02.575 +Of course we have to split the sentence into +words or joint directors so that we have word. + +0:09:02.942 --> 0:09:09.017 +Because in most of our work we'll deal like +machine translation with some type of words. 
+ +0:09:09.449 --> 0:09:15.970 +In neuromachine translation, people are working +also on director based and subwords, but a + +0:09:15.970 --> 0:09:20.772 +basic unique words of the sentence is a very +important first step. + +0:09:21.421 --> 0:09:32.379 +And for many languages that is quite simple +in German, it's not that hard to determine + +0:09:32.379 --> 0:09:33.639 +the word. + +0:09:34.234 --> 0:09:46.265 +In tokenization, the main challenge is if +we are doing corpus-based methods that we are + +0:09:46.265 --> 0:09:50.366 +also dealing as normal words. + +0:09:50.770 --> 0:10:06.115 +And there of course it's getting a bit more +challenging. + +0:10:13.173 --> 0:10:17.426 +So that is maybe the main thing where, for +example, in Germany, if you think of German + +0:10:17.426 --> 0:10:19.528 +tokenization, it's easy to get every word. + +0:10:19.779 --> 0:10:26.159 +You split it at a space, but then you would +have the dots at the end join to the last word, + +0:10:26.159 --> 0:10:30.666 +and of course that you don't want because it's +a different word. + +0:10:30.666 --> 0:10:37.046 +The last word would not be go, but go dot, +but what you can do is split up the dots always. + +0:10:37.677 --> 0:10:45.390 +Can you really do that always or it might +be sometimes better to keep the dot as a point? + +0:10:47.807 --> 0:10:51.001 +For example, email addresses or abbreviations +here. + +0:10:51.001 --> 0:10:56.284 +For example, doctor, maybe it doesn't make +sense to split up the dot because then you + +0:10:56.284 --> 0:11:01.382 +would assume all year starts a new sentence, +but it's just the DR dot from doctor. + +0:11:01.721 --> 0:11:08.797 +Or if you have numbers like he's a seventh +person like the zipter, then you don't want + +0:11:08.797 --> 0:11:09.610 +to split. + +0:11:09.669 --> 0:11:15.333 +So there are some things where it could be +a bit more difficult, but it's not really challenging. + +0:11:16.796 --> 0:11:23.318 +In other languages it's getting a lot more +challenging, especially in Asian languages + +0:11:23.318 --> 0:11:26.882 +where often there are no spaces between words. + +0:11:27.147 --> 0:11:32.775 +So you just have the sequence of characters. + +0:11:32.775 --> 0:11:38.403 +The quick brown fox jumps over the lazy dog. + +0:11:38.999 --> 0:11:44.569 +And then it still might be helpful to work +on something like words. + +0:11:44.569 --> 0:11:48.009 +Then you need to have a bit more complex. + +0:11:48.328 --> 0:11:55.782 +And here you see we are again having our typical +problem. + +0:11:55.782 --> 0:12:00.408 +That means that there is ambiguity. + +0:12:00.600 --> 0:12:02.104 +So you're seeing here. + +0:12:02.104 --> 0:12:08.056 +We have exactly the same sequence of characters +or here, but depending on how we split it, + +0:12:08.056 --> 0:12:12.437 +it means he is your servant or he is the one +who used your things. + +0:12:12.437 --> 0:12:15.380 +Or here we have round eyes and take the air. + +0:12:15.895 --> 0:12:22.953 +So then of course yeah this type of tokenization +gets more important because you could introduce + +0:12:22.953 --> 0:12:27.756 +already arrows and you can imagine if you're +doing it here wrong. + +0:12:27.756 --> 0:12:34.086 +If you once do a wrong decision it's quite +difficult to recover from a wrong decision. + +0:12:34.634 --> 0:12:47.088 +And so in these cases looking about how we're +doing tokenization is an important issue. 
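[Editor's note] The segmentation ambiguity described above (scripts without spaces, where one character sequence allows several splits) is classically attacked with dictionary-based maximum matching; a minimal sketch, with a toy vocabulary of ours standing in for a real dictionary and the space-free "quick brown fox" string from the lecture as input:

```python
def max_match(text, vocab, max_len=5):
    """Greedy longest-match segmentation: at each position take the longest
    dictionary entry that starts there, falling back to a single character."""
    tokens, i = [], 0
    while i < len(text):
        for j in range(min(len(text), i + max_len), i, -1):
            if text[i:j] in vocab or j == i + 1:
                tokens.append(text[i:j])
                i = j
                break
    return tokens

# Hypothetical mini-dictionary, not real lecture data
vocab = {"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog"}
print(max_match("thequickbrownfoxjumpsoverthelazydog", vocab))
# ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
```

Because the algorithm is greedy, an early wrong split cannot be undone later, which is exactly the error-propagation problem raised in the lecture; real segmenters therefore score whole segmentations statistically or neurally instead.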
+ +0:12:47.127 --> 0:12:54.424 +And then it might be helpful to do things +like director based models where we treat each + +0:12:54.424 --> 0:12:56.228 +director as a symbol. + +0:12:56.228 --> 0:13:01.803 +For example, do this decision in the later +or never really do this? + +0:13:06.306 --> 0:13:12.033 +The other thing is that if we have words we +might, it might not be the optimal unit to + +0:13:12.033 --> 0:13:18.155 +work with because it can be that we should +look into the internal structure of words because + +0:13:18.155 --> 0:13:20.986 +if we have a morphological rich language,. + +0:13:21.141 --> 0:13:27.100 +That means we have a lot of different types +of words, and if you have a lot of many different + +0:13:27.100 --> 0:13:32.552 +types of words, it on the other hand means +of course each of these words we have seen + +0:13:32.552 --> 0:13:33.757 +very infrequently. + +0:13:33.793 --> 0:13:39.681 +So if you only have ten words and you have +a large corpus, each word occurs more often. + +0:13:39.681 --> 0:13:45.301 +If you have three million different words, +then each of them will occur less often. + +0:13:45.301 --> 0:13:51.055 +Hopefully you know, from machine learning, +it's helpful if you have seen each example + +0:13:51.055 --> 0:13:51.858 +very often. + +0:13:52.552 --> 0:13:54.524 +And so why does it help? + +0:13:54.524 --> 0:13:56.495 +Why does it help happen? + +0:13:56.495 --> 0:14:02.410 +Yeah, in some languages we have quite a complex +information inside a word. + +0:14:02.410 --> 0:14:09.271 +So here's a word from a finish talosanikiko +or something like that, and it means in my + +0:14:09.271 --> 0:14:10.769 +house to question. + +0:14:11.491 --> 0:14:15.690 +So you have all these information attached +to the word. + +0:14:16.036 --> 0:14:20.326 +And that of course in extreme case that's +why typically, for example, Finnish is the + +0:14:20.326 --> 0:14:20.831 +language. + +0:14:20.820 --> 0:14:26.725 +Where machine translation quality is less +good because generating all these different + +0:14:26.725 --> 0:14:33.110 +morphological variants is is a challenge and +the additional challenge is typically in finish + +0:14:33.110 --> 0:14:39.564 +not really low resource but for in low resource +languages you quite often have more difficult + +0:14:39.564 --> 0:14:40.388 +morphology. + +0:14:40.440 --> 0:14:43.949 +Mean English is an example of a relatively +easy one. + +0:14:46.066 --> 0:14:54.230 +And so in general we can say that words are +composed of more themes, and more themes are + +0:14:54.230 --> 0:15:03.069 +the smallest meaning carrying unit, so normally +it means: All morphine should have some type + +0:15:03.069 --> 0:15:04.218 +of meaning. + +0:15:04.218 --> 0:15:09.004 +For example, here does not really have a meaning. + +0:15:09.289 --> 0:15:12.005 +Bian has some type of meaning. + +0:15:12.005 --> 0:15:14.371 +It's changing the meaning. + +0:15:14.371 --> 0:15:21.468 +The NES has the meaning that it's making out +of an adjective, a noun, and happy. + +0:15:21.701 --> 0:15:31.215 +So each of these parts conveys some meaning, +but you cannot split them further up and have + +0:15:31.215 --> 0:15:32.156 +somehow. + +0:15:32.312 --> 0:15:36.589 +You see that of course a little bit more is +happening. + +0:15:36.589 --> 0:15:43.511 +Typically the Y is going into an E so there +can be some variation, but these are typical + +0:15:43.511 --> 0:15:46.544 +examples of what we have as morphines. 
+ +0:16:02.963 --> 0:16:08.804 +That is, of course, a problem and that's the +question why how you do your splitting. + +0:16:08.804 --> 0:16:15.057 +But that problem we have anyway always because +even full words can have different meanings + +0:16:15.057 --> 0:16:17.806 +depending on the context they're using. + +0:16:18.038 --> 0:16:24.328 +So we always have to somewhat have a model +which can infer or represent the meaning of + +0:16:24.328 --> 0:16:25.557 +the word in the. + +0:16:25.825 --> 0:16:30.917 +But you are right that this problem might +get even more severe if you're splitting up. + +0:16:30.917 --> 0:16:36.126 +Therefore, it might not be the best to go +for the very extreme and represent each letter + +0:16:36.126 --> 0:16:41.920 +and have a model which is only on letters because, +of course, a letter can have a lot of different + +0:16:41.920 --> 0:16:44.202 +meanings depending on where it's used. + +0:16:44.524 --> 0:16:50.061 +And yeah, there is no right solution like +what is the right splitting. + +0:16:50.061 --> 0:16:56.613 +It depends on the language and the application +on the amount of data you're having. + +0:16:56.613 --> 0:17:01.058 +For example, typically it means the fewer +data you have. + +0:17:01.301 --> 0:17:12.351 +The more splitting you should do, if you have +more data, then you can be better distinguish. + +0:17:13.653 --> 0:17:19.065 +Then there are different types of morphines: +So we have typically one stemmed theme: It's + +0:17:19.065 --> 0:17:21.746 +like house or tish, so the main meaning. + +0:17:21.941 --> 0:17:29.131 +And then you can have functional or bound +morphemes which can be f which can be prefix, + +0:17:29.131 --> 0:17:34.115 +suffix, infix or circumfix so it can be before +can be after. + +0:17:34.114 --> 0:17:39.416 +It can be inside or it can be around it, something +like a coughed there. + +0:17:39.416 --> 0:17:45.736 +Typically you would say that it's not like +two more themes, G and T, because they both + +0:17:45.736 --> 0:17:50.603 +describe the function, but together G and T +are marking the cough. + +0:17:53.733 --> 0:18:01.209 +For what are people using them you can use +them for inflection to describe something like + +0:18:01.209 --> 0:18:03.286 +tense count person case. + +0:18:04.604 --> 0:18:09.238 +That is yeah, if you know German, this is +commonly used in German. + +0:18:10.991 --> 0:18:16.749 +But of course there is a lot more complicated +things: I think in in some languages it also. + +0:18:16.749 --> 0:18:21.431 +I mean, in Germany it only depends counting +person on the subject. + +0:18:21.431 --> 0:18:27.650 +For the word, for example, in other languages +it can also determine the first and on the + +0:18:27.650 --> 0:18:28.698 +second object. + +0:18:28.908 --> 0:18:35.776 +So that it like if you buy an apple or an +house, that not only the, the, the. + +0:18:35.776 --> 0:18:43.435 +Kauft depends on on me like in German, but +it can also depend on whether it's an apple + +0:18:43.435 --> 0:18:44.492 +or a house. + +0:18:44.724 --> 0:18:48.305 +And then of course you have an exploding number +of web fronts. + +0:18:49.409 --> 0:19:04.731 +Furthermore, it can be used to do derivations +so you can make other types of words from it. + +0:19:05.165 --> 0:19:06.254 +And then yeah. 
+ +0:19:06.254 --> 0:19:12.645 +This is like creating new words by joining +them like rainbow waterproof but for example + +0:19:12.645 --> 0:19:19.254 +in German like Einköw's Wagen, Ice Cult and +so on where you can join where you can do that + +0:19:19.254 --> 0:19:22.014 +with nouns and German adjectives and. + +0:19:22.282 --> 0:19:29.077 +Then of course you might have additional challenges +like the Fugan where you have to add this one. + +0:19:32.452 --> 0:19:39.021 +Yeah, then there is a yeah of course additional +special things. + +0:19:39.639 --> 0:19:48.537 +You have to sometimes put extra stuff because +of phonology, so it's dig the plural, not plural. + +0:19:48.537 --> 0:19:56.508 +The third person singular, as in English, +is normally S, but by Goes, for example, is + +0:19:56.508 --> 0:19:57.249 +an E S. + +0:19:57.277 --> 0:20:04.321 +In German you can also have other things that +like Osmutta gets Mutter so you're changing + +0:20:04.321 --> 0:20:11.758 +the Umlaud in order to express the plural and +in other languages for example the vowel harmony + +0:20:11.758 --> 0:20:17.315 +where the vowels inside are changing depending +on which form you have. + +0:20:17.657 --> 0:20:23.793 +Which makes things more difficult than splitting +a word into its part doesn't really work anymore. + +0:20:23.793 --> 0:20:28.070 +So like for Muta and Muta, for example, that +is not really possible. + +0:20:28.348 --> 0:20:36.520 +The nice thing is, of course, more like a +general thing, but often irregular things are + +0:20:36.520 --> 0:20:39.492 +happening as words which occur. + +0:20:39.839 --> 0:20:52.177 +So that you can have enough examples, while +the regular things you can do by some type + +0:20:52.177 --> 0:20:53.595 +of rules. + +0:20:55.655 --> 0:20:57.326 +Yeah, This Can Be Done. + +0:20:57.557 --> 0:21:02.849 +So there are tasks on this: how to do automatic +inflection, how to analyze them. + +0:21:02.849 --> 0:21:04.548 +So you give it a word to. + +0:21:04.548 --> 0:21:10.427 +It's telling you what are the possible forms +of that, like how they are built, and so on. + +0:21:10.427 --> 0:21:15.654 +And for the at least Ah Iris shoes language, +there are a lot of tools for that. + +0:21:15.654 --> 0:21:18.463 +Of course, if you now want to do that for. + +0:21:18.558 --> 0:21:24.281 +Some language which is very low resourced +might be very difficult and there might be + +0:21:24.281 --> 0:21:25.492 +no tool for them. + +0:21:28.368 --> 0:21:37.652 +Good before we are going for the next part +about part of speech, are there any questions + +0:21:37.652 --> 0:21:38.382 +about? + +0:22:01.781 --> 0:22:03.187 +Yeah, we'll come to that a bit. + +0:22:03.483 --> 0:22:09.108 +So it's a very good question and difficult +and especially we'll see that later if you + +0:22:09.108 --> 0:22:14.994 +just put in words it would be very bad because +words are put into neural networks just as + +0:22:14.994 --> 0:22:15.844 +some digits. + +0:22:15.844 --> 0:22:21.534 +Each word is mapped into a jitter and you +put it in so it doesn't really know any more + +0:22:21.534 --> 0:22:22.908 +about the structure. + +0:22:23.543 --> 0:22:29.898 +What we will see therefore the most successful +approach which is mostly done is a subword + +0:22:29.898 --> 0:22:34.730 +unit where we split: But we will do this. + +0:22:34.730 --> 0:22:40.154 +Don't know if you have been in advanced. + +0:22:40.154 --> 0:22:44.256 +We'll cover this on a Tuesday. 
+ +0:22:44.364 --> 0:22:52.316 +So there is an algorithm called bite pairing +coding, which is about splitting words into + +0:22:52.316 --> 0:22:52.942 +parts. + +0:22:53.293 --> 0:23:00.078 +So it's doing the splitting of words but not +morphologically motivated but more based on + +0:23:00.078 --> 0:23:00.916 +frequency. + +0:23:00.940 --> 0:23:11.312 +However, it performs very good and that's +why it's used and there is a bit of correlation. + +0:23:11.312 --> 0:23:15.529 +Sometimes they agree on count based. + +0:23:15.695 --> 0:23:20.709 +So we're splitting words and we're splitting +especially words which are infrequent and that's + +0:23:20.709 --> 0:23:23.962 +maybe a good motivation why that's good for +neural networks. + +0:23:23.962 --> 0:23:28.709 +That means if you have seen a word very often +you don't need to split it and it's easier + +0:23:28.709 --> 0:23:30.043 +to just process it fast. + +0:23:30.690 --> 0:23:39.218 +While if you have seen the words infrequently, +it is good to split it into parts so it can + +0:23:39.218 --> 0:23:39.593 +do. + +0:23:39.779 --> 0:23:47.729 +So there is some way of doing it, but linguists +would say this is not a morphological analyst. + +0:23:47.729 --> 0:23:53.837 +That is true, but we are spitting words into +parts if they are not seen. + +0:23:59.699 --> 0:24:06.324 +Yes, so another important thing about words +are the paddle speech text. + +0:24:06.324 --> 0:24:14.881 +These are the common ones: noun, verb, adjective, +verb, determine, pronoun, proposition, and + +0:24:14.881 --> 0:24:16.077 +conjunction. + +0:24:16.077 --> 0:24:26.880 +There are some more: They are not the same +in all language, but for example there is this + +0:24:26.880 --> 0:24:38.104 +universal grammar which tries to do this type +of part of speech text for many languages. + +0:24:38.258 --> 0:24:42.018 +And then, of course, it's helping you for +generalization. + +0:24:42.018 --> 0:24:48.373 +There are some language deals with verbs and +nouns, especially if you look at sentence structure. + +0:24:48.688 --> 0:24:55.332 +And so if you know the part of speech tag +you can easily generalize and do get these + +0:24:55.332 --> 0:24:58.459 +rules or apply these rules as you know. + +0:24:58.459 --> 0:25:02.680 +The verb in English is always at the second +position. + +0:25:03.043 --> 0:25:10.084 +So you know how to deal with verbs independently +of which words you are now really looking at. + +0:25:12.272 --> 0:25:18.551 +And that again can be done is ambiguous. + +0:25:18.598 --> 0:25:27.171 +So there are some words which can have several +pot of speech text. + +0:25:27.171 --> 0:25:38.686 +Example are the word can, for example, which +can be the can of beans or can do something. + +0:25:38.959 --> 0:25:46.021 +Often is also in English related work. + +0:25:46.021 --> 0:25:55.256 +Access can be to excess or to access to something. + +0:25:56.836 --> 0:26:02.877 +Most words have only one single part of speech +tag, but they are some where it's a bit more + +0:26:02.877 --> 0:26:03.731 +challenging. + +0:26:03.731 --> 0:26:09.640 +The nice thing is the ones which are in big +are often more words, which occur more often, + +0:26:09.640 --> 0:26:12.858 +while for really ware words it's not that often. + +0:26:13.473 --> 0:26:23.159 +If you look at these classes you can distinguish +open classes where new words can happen so + +0:26:23.159 --> 0:26:25.790 +we can invent new nouns. 
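[Editor's note] The "bite pairing coding" mentioned in these cues is byte-pair encoding (BPE), the frequency-based subword splitting that the lecture contrasts with morphological analysis; a minimal sketch of the merge-learning loop in the spirit of Sennrich et al.'s original algorithm (the toy word-frequency dictionary is ours):

```python
import re
from collections import Counter

def learn_bpe(word_freqs, num_merges):
    """Minimal BPE sketch: represent words as character sequences and
    repeatedly merge the most frequent adjacent symbol pair."""
    vocab = {" ".join(w) + " </w>": c for w, c in word_freqs.items()}
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for word, count in vocab.items():
            symbols = word.split()
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += count
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)
        pattern = re.compile(r"(?<!\S)" + re.escape(" ".join(best)) + r"(?!\S)")
        vocab = {pattern.sub("".join(best), w): c for w, c in vocab.items()}
    return merges

# Frequent words end up as whole units, rare words stay split into pieces
print(learn_bpe({"low": 5, "lower": 2, "newest": 6, "widest": 3}, num_merges=8))
```

The merges are learned purely from co-occurrence counts, so the resulting splits only sometimes coincide with true morpheme boundaries, matching the caveat in the lecture that this is not a morphological analysis.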
+ +0:26:26.926 --> 0:26:31.461 +But then there are the close classes which +I think are determined or pronoun. + +0:26:31.461 --> 0:26:35.414 +For example, it's not that you can easily +develop your new pronoun. + +0:26:35.414 --> 0:26:38.901 +So there is a fixed list of pronouns and we +are using that. + +0:26:38.901 --> 0:26:44.075 +So it's not like that or tomorrow there is +something happening and then people are using + +0:26:44.075 --> 0:26:44.482 +a new. + +0:26:45.085 --> 0:26:52.426 +Pronoun or new conjectures, so it's like end, +because it's not that you normally invent a + +0:26:52.426 --> 0:26:52.834 +new. + +0:27:00.120 --> 0:27:03.391 +And additional to part of speech text. + +0:27:03.391 --> 0:27:09.012 +Then some of these part of speech texts have +different properties. + +0:27:09.389 --> 0:27:21.813 +So, for example, for nouns and adjectives +we can have a singular plural: In other languages, + +0:27:21.813 --> 0:27:29.351 +there is a duel so that a word is not only +like a single or in plural, but also like a + +0:27:29.351 --> 0:27:31.257 +duel if it's meaning. + +0:27:31.631 --> 0:27:36.246 +You have the gender and masculine feminine +neutre we know. + +0:27:36.246 --> 0:27:43.912 +In other language there is animated and inanimated +and you have the cases like in German you have + +0:27:43.912 --> 0:27:46.884 +no maternative guinetive acquisitive. + +0:27:47.467 --> 0:27:57.201 +So here and then in other languages you also +have Latin with the upper teeth. + +0:27:57.497 --> 0:28:03.729 +So there's like more, it's just like yeah, +and there you have no one to one correspondence, + +0:28:03.729 --> 0:28:09.961 +so it can be that there are some cases which +are only in the one language and do not happen + +0:28:09.961 --> 0:28:11.519 +in the other language. + +0:28:13.473 --> 0:28:20.373 +For whorps we have tenses of course like walk +is walking walked have walked head walked will + +0:28:20.373 --> 0:28:21.560 +walk and so on. + +0:28:21.560 --> 0:28:28.015 +Interestingly for example in Japanese this +can also happen for adjectives though there + +0:28:28.015 --> 0:28:32.987 +is a difference between something is white +or something was white. + +0:28:35.635 --> 0:28:41.496 +There is this continuous thing which should +not really have that commonly in German and + +0:28:41.496 --> 0:28:47.423 +I guess that's if you're German and learning +English that's something like she sings and + +0:28:47.423 --> 0:28:53.350 +she is singing and of course we can express +that but it's not commonly used and normally + +0:28:53.350 --> 0:28:55.281 +we're not doing this aspect. + +0:28:55.455 --> 0:28:57.240 +Also about tenses. + +0:28:57.240 --> 0:29:05.505 +If you use pasts in English you will also +use past tenses in German, so we have similar + +0:29:05.505 --> 0:29:09.263 +tenses, but the use might be different. + +0:29:14.214 --> 0:29:20.710 +There is uncertainty like the mood in there +indicative. + +0:29:20.710 --> 0:29:26.742 +If he were here, there's voices active and +passive. + +0:29:27.607 --> 0:29:34.024 +That you know, that is like both in German +and English there, but there is something in + +0:29:34.024 --> 0:29:35.628 +the Middle and Greek. + +0:29:35.628 --> 0:29:42.555 +I get myself taught, so there is other phenomens +than which might only happen in one language. 
+ +0:29:42.762 --> 0:29:50.101 +This is, like yeah, the different synthetic +structures that you can can have in the language, + +0:29:50.101 --> 0:29:57.361 +and where there's the two things, so it might +be that some only are in some language, others + +0:29:57.361 --> 0:29:58.376 +don't exist. + +0:29:58.358 --> 0:30:05.219 +And on the other hand there is also matching, +so it might be that in some situations you + +0:30:05.219 --> 0:30:07.224 +use different structures. + +0:30:10.730 --> 0:30:13.759 +The next would be then about semantics. + +0:30:13.759 --> 0:30:16.712 +Do you have any questions before that? + +0:30:19.819 --> 0:30:31.326 +I'll just continue, but if something is unclear +beside the structure, we typically have more + +0:30:31.326 --> 0:30:39.863 +ambiguities, so it can be that words itself +have different meanings. + +0:30:40.200 --> 0:30:48.115 +And we are typically talking about polysemy +and homonyme, where polysemy means that a word + +0:30:48.115 --> 0:30:50.637 +can have different meanings. + +0:30:50.690 --> 0:30:58.464 +So if you have the English word interest, +it can be that you are interested in something. + +0:30:58.598 --> 0:31:07.051 +Or it can be like the interest rate financial, +but it is somehow related because if you are + +0:31:07.051 --> 0:31:11.002 +getting some interest rates there is some. + +0:31:11.531 --> 0:31:18.158 +Are, but there is a homophemer where they +really are not related. + +0:31:18.458 --> 0:31:24.086 +So you can and can doesn't really have anything +in common, so it's really very different. + +0:31:24.324 --> 0:31:29.527 +And of course that's not completely clear +so there is not a clear definition so for example + +0:31:29.527 --> 0:31:34.730 +for the bank it can be that you say it's related +but it can also be other can argue that so + +0:31:34.730 --> 0:31:39.876 +there are some clear things which is interest +there are some which is vague and then there + +0:31:39.876 --> 0:31:43.439 +are some where it's very clear again that there +are different. + +0:31:45.065 --> 0:31:49.994 +And in order to translate them, of course, +we might need the context to disambiguate. + +0:31:49.994 --> 0:31:54.981 +That's typically where we can disambiguate, +and that's not only for lexical semantics, + +0:31:54.981 --> 0:32:00.198 +that's generally very often that if you want +to disambiguate, context can be very helpful. + +0:32:00.198 --> 0:32:03.981 +So in which sentence and which general knowledge +who is speaking? + +0:32:04.944 --> 0:32:09.867 +You can do that externally by some disinvigration +task. + +0:32:09.867 --> 0:32:14.702 +Machine translation system will also do it +internally. + +0:32:16.156 --> 0:32:21.485 +And sometimes you're lucky and you don't need +to do it because you just have the same ambiguity + +0:32:21.485 --> 0:32:23.651 +in the source and the target language. + +0:32:23.651 --> 0:32:26.815 +And then it doesn't matter if you think about +the mouse. + +0:32:26.815 --> 0:32:31.812 +As I said, you don't really need to know if +it's a computer mouse or the living mouse you + +0:32:31.812 --> 0:32:36.031 +translate from German to English because it +has exactly the same ambiguity. + +0:32:40.400 --> 0:32:46.764 +There's also relations between words like +synonyms, antonyms, hipponomes, like the is + +0:32:46.764 --> 0:32:50.019 +a relation and the part of like Dora House. + +0:32:50.019 --> 0:32:55.569 +Big small is an antonym and synonym is like +which needs something similar. 
+ +0:32:56.396 --> 0:33:03.252 +There are resources which try to express all +these linguistic information like word net + +0:33:03.252 --> 0:33:10.107 +or German net where you have a graph with words +and how they are related to each other. + +0:33:11.131 --> 0:33:12.602 +Which can be helpful. + +0:33:12.602 --> 0:33:18.690 +Typically these things were more used in tasks +where there is fewer data, so there's a lot + +0:33:18.690 --> 0:33:24.510 +of tasks in NLP where you have very limited +data because you really need to hand align + +0:33:24.510 --> 0:33:24.911 +that. + +0:33:25.125 --> 0:33:28.024 +Machine translation has a big advantage. + +0:33:28.024 --> 0:33:31.842 +There's naturally a lot of text translated +out there. + +0:33:32.212 --> 0:33:39.519 +Typically in machine translation we have compared +to other tasks significantly amount of data. + +0:33:39.519 --> 0:33:46.212 +People have looked into integrating wordnet +or things like that, but it is rarely used + +0:33:46.212 --> 0:33:49.366 +in like commercial systems or something. + +0:33:52.692 --> 0:33:55.626 +So this was based on the words. + +0:33:55.626 --> 0:34:03.877 +We have morphology, syntax, and semantics, +and then of course it makes sense to also look + +0:34:03.877 --> 0:34:06.169 +at the bigger structure. + +0:34:06.169 --> 0:34:08.920 +That means information about. + +0:34:08.948 --> 0:34:17.822 +Of course, we don't have a really morphology +there because morphology about the structure + +0:34:17.822 --> 0:34:26.104 +of words, but we have syntax on the sentence +level and the semantic representation. + +0:34:28.548 --> 0:34:35.637 +When we are thinking about the sentence structure, +then the sentence is, of course, first a sequence + +0:34:35.637 --> 0:34:37.742 +of words terminated by a dot. + +0:34:37.742 --> 0:34:42.515 +Jane bought the house and we can say something +about the structure. + +0:34:42.515 --> 0:34:47.077 +It's typically its subject work and then one +or several objects. + +0:34:47.367 --> 0:34:51.996 +And the number of objects, for example, is +then determined by the word. + +0:34:52.232 --> 0:34:54.317 +It's Called the Valency. + +0:34:54.354 --> 0:35:01.410 +So you have intransitive verbs which don't +get any object, it's just to sleep. + +0:35:02.622 --> 0:35:05.912 +For example, there is no object sleep beds. + +0:35:05.912 --> 0:35:14.857 +You cannot say that: And there are transitive +verbs where you have to put one or more objects, + +0:35:14.857 --> 0:35:16.221 +and you always. + +0:35:16.636 --> 0:35:19.248 +Sentence is not correct if you don't put the +object. + +0:35:19.599 --> 0:35:33.909 +So if you have to buy something you have to +say bought this or give someone something then. + +0:35:34.194 --> 0:35:40.683 +Here you see a bit that may be interesting +the relation between word order and morphology. + +0:35:40.683 --> 0:35:47.243 +Of course it's not that strong, but for example +in English you always have to first say who + +0:35:47.243 --> 0:35:49.453 +you gave it and what you gave. + +0:35:49.453 --> 0:35:53.304 +So the structure is very clear and cannot +be changed. + +0:35:54.154 --> 0:36:00.801 +German, for example, has a possibility of +determining what you gave and whom you gave + +0:36:00.801 --> 0:36:07.913 +it because there is a morphology and you can +do what you gave a different form than to whom + +0:36:07.913 --> 0:36:08.685 +you gave. 
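[Editor's note] The lexical relations (synonyms, antonyms, hypernyms such as the is-a relation) and the WordNet resource mentioned a few cues earlier can be queried directly; a small sketch using NLTK's WordNet interface (assuming nltk is installed and the wordnet corpus has been downloaded), reusing the lecture's polysemous word "interest":

```python
import nltk
nltk.download("wordnet", quiet=True)  # one-time corpus download
from nltk.corpus import wordnet as wn

# Polysemy: "interest" has several senses (curiosity, the financial sense, ...)
for synset in wn.synsets("interest")[:4]:
    print(synset.name(), "-", synset.definition())

# Hypernym ("is-a") relation for the first sense of "house"
house = wn.synsets("house")[0]
print([h.name() for h in house.hypernyms()])
```

As the lecture notes, such hand-built resources are rarely integrated into modern machine translation systems, since parallel text is comparatively plentiful.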
+ +0:36:11.691 --> 0:36:18.477 +And that is a general tendency that if you +have morphology then typically the word order + +0:36:18.477 --> 0:36:25.262 +is more free and possible, while in English +you cannot express these information through + +0:36:25.262 --> 0:36:26.482 +the morphology. + +0:36:26.706 --> 0:36:30.238 +You typically have to express them through +the word order. + +0:36:30.238 --> 0:36:32.872 +It's not as free, but it's more restricted. + +0:36:35.015 --> 0:36:40.060 +Yeah, the first part is typically the noun +phrase, the subject, and that can not only + +0:36:40.060 --> 0:36:43.521 +be a single noun, but of course it can be a +longer phrase. + +0:36:43.521 --> 0:36:48.860 +So if you have Jane the woman, it can be Jane, +it can be the woman, it can a woman, it can + +0:36:48.860 --> 0:36:52.791 +be the young woman or the young woman who lives +across the street. + +0:36:53.073 --> 0:36:56.890 +All of these are the subjects, so this can +be already very, very long. + +0:36:57.257 --> 0:36:58.921 +And they also put this. + +0:36:58.921 --> 0:37:05.092 +The verb is on the second position in a bit +more complicated way because if you have now + +0:37:05.092 --> 0:37:11.262 +the young woman who lives across the street +runs to somewhere or so then yeah runs is at + +0:37:11.262 --> 0:37:16.185 +the second position in this tree but the first +position is quite long. + +0:37:16.476 --> 0:37:19.277 +And so it's not just counting okay. + +0:37:19.277 --> 0:37:22.700 +The second word is always is always a word. + +0:37:26.306 --> 0:37:32.681 +Additional to these simple things, there's +more complex stuff. + +0:37:32.681 --> 0:37:43.104 +Jane bought the house from Jim without hesitation, +or Jane bought the house in the pushed neighborhood + +0:37:43.104 --> 0:37:44.925 +across the river. + +0:37:45.145 --> 0:37:51.694 +And these often lead to additional ambiguities +because it's not always completely clear to + +0:37:51.694 --> 0:37:53.565 +which this prepositional. + +0:37:54.054 --> 0:37:59.076 +So that we'll see and you have, of course, +subclasses and so on. + +0:38:01.061 --> 0:38:09.926 +And then there is a theory behind it which +was very important for rule based machine translation + +0:38:09.926 --> 0:38:14.314 +because that's exactly what you're doing there. + +0:38:14.314 --> 0:38:18.609 +You would take the sentence, do the syntactic. + +0:38:18.979 --> 0:38:28.432 +So that we can have this constituents which +like describe the basic parts of the language. + +0:38:28.468 --> 0:38:35.268 +And we can create the sentence structure as +a context free grammar, which you hopefully + +0:38:35.268 --> 0:38:42.223 +remember from basic computer science, which +is a pair of non terminals, terminal symbols, + +0:38:42.223 --> 0:38:44.001 +production rules, and. + +0:38:43.943 --> 0:38:50.218 +And the star symbol, and you can then describe +a sentence by this phrase structure grammar: + +0:38:51.751 --> 0:38:59.628 +So a simple example would be something like +that: you have a lexicon, Jane is a noun, Frays + +0:38:59.628 --> 0:39:02.367 +is a noun, Telescope is a noun. + +0:39:02.782 --> 0:39:10.318 +And then you have these production rules sentences: +a noun phrase in the web phrase. + +0:39:10.318 --> 0:39:18.918 +The noun phrase can either be a determinized +noun or it can be a noun phrase and a propositional + +0:39:18.918 --> 0:39:19.628 +phrase. + +0:39:19.919 --> 0:39:25.569 +Or a prepositional phrase and a prepositional +phrase is a preposition and a non phrase. 
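The lexicon and production rules just described (S → NP VP, NP → Det N | NP PP, PP → P NP, plus nouns such as Jane, house, telescope) can be written down directly as a context-free grammar. Here is a minimal runnable sketch with NLTK's chart parser; the exact rule set is an illustrative reconstruction, not the lecturer's grammar.

```python
# Minimal sketch of the phrase-structure grammar described above, using NLTK.
# Assumes: pip install nltk
import nltk

grammar = nltk.CFG.fromstring("""
S   -> NP VP
NP  -> 'Jane' | Det N | NP PP
VP  -> V NP | V
PP  -> P NP
Det -> 'the' | 'a'
N   -> 'house' | 'telescope' | 'woman'
V   -> 'bought' | 'sleeps'
P   -> 'with'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse("Jane bought the house".split()):
    tree.pretty_print()
```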
+ +0:39:26.426 --> 0:39:27.622 +We're looking at this. + +0:39:27.622 --> 0:39:30.482 +What is the valency of the word we're describing +here? + +0:39:33.513 --> 0:39:36.330 +How many objects would in this case the world +have? + +0:39:46.706 --> 0:39:48.810 +We're looking at the web phrase. + +0:39:48.810 --> 0:39:54.358 +The web phrase is a verb and a noun phrase, +so one object here, so this would be for a + +0:39:54.358 --> 0:39:55.378 +balance of one. + +0:39:55.378 --> 0:40:00.925 +If you have intransitive verbs, it would be +verb phrases, just a word, and if you have + +0:40:00.925 --> 0:40:03.667 +two, it would be noun phrase, noun phrase. + +0:40:08.088 --> 0:40:15.348 +And yeah, then the, the, the challenge or +what you have to do is like this: Given a natural + +0:40:15.348 --> 0:40:23.657 +language sentence, you want to parse it to +get this type of pastry from programming languages + +0:40:23.657 --> 0:40:30.198 +where you also need to parse the code in order +to get the representation. + +0:40:30.330 --> 0:40:39.356 +However, there is one challenge if you parse +natural language compared to computer language. + +0:40:43.823 --> 0:40:56.209 +So there are different ways of how you can +express things and there are different pastures + +0:40:56.209 --> 0:41:00.156 +belonging to the same input. + +0:41:00.740 --> 0:41:05.241 +So if you have Jane buys a horse, how's that +an easy example? + +0:41:05.241 --> 0:41:07.491 +So you do the lexicon look up. + +0:41:07.491 --> 0:41:13.806 +Jane can be a noun phrase, a bias is a verb, +a is a determiner, and a house is a noun. + +0:41:15.215 --> 0:41:18.098 +And then you can now use the grammar rules +of here. + +0:41:18.098 --> 0:41:19.594 +There is no rule for that. + +0:41:20.080 --> 0:41:23.564 +Here we have no rules, but here we have a +rule. + +0:41:23.564 --> 0:41:27.920 +A noun is a non-phrase, so we have mapped +that to the noun. + +0:41:28.268 --> 0:41:34.012 +Then we can map this to the web phrase. + +0:41:34.012 --> 0:41:47.510 +We have a verb noun phrase to web phrase and +then we can map this to a sentence representing: + +0:41:49.069 --> 0:41:53.042 +We can have that even more complex. + +0:41:53.042 --> 0:42:01.431 +The woman who won the lottery yesterday bought +the house across the street. + +0:42:01.431 --> 0:42:05.515 +The structure gets more complicated. + +0:42:05.685 --> 0:42:12.103 +You now see that the word phrase is at the +second position, but the noun phrase is quite. + +0:42:12.052 --> 0:42:18.655 +Quite big in here and the p p phrases, it's +sometimes difficult where to put them because + +0:42:18.655 --> 0:42:25.038 +they can be put to the noun phrase, but in +other sentences they can also be put to the + +0:42:25.038 --> 0:42:25.919 +web phrase. + +0:42:36.496 --> 0:42:38.250 +Yeah. + +0:42:43.883 --> 0:42:50.321 +Yes, so then either it can have two tags, +noun or noun phrase, or you can have the extra + +0:42:50.321 --> 0:42:50.755 +rule. + +0:42:50.755 --> 0:42:57.409 +The noun phrase can not only be a determiner +in the noun, but it can also be a noun phrase. + +0:42:57.717 --> 0:43:04.360 +Then of course either you introduce additional +rules when what is possible or the problem + +0:43:04.360 --> 0:43:11.446 +that if you do pastures which are not correct +and then you have to add some type of probability + +0:43:11.446 --> 0:43:13.587 +which type is more probable. + +0:43:16.876 --> 0:43:23.280 +But of course some things also can't really +model easily with this type of cheese. 
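To prefer one reading when several parses are possible, the transcript mentions attaching probabilities to the rules. A toy sketch of that idea as a PCFG with NLTK's Viterbi parser; the rule probabilities are made up purely for illustration.

```python
# Minimal sketch: a probabilistic CFG so the parser can rank competing parses.
# Probabilities are invented for illustration; rules per left-hand side sum to 1.
import nltk

pcfg = nltk.PCFG.fromstring("""
S   -> NP VP        [1.0]
VP  -> V NP [0.7] | VP PP [0.3]
NP  -> Det N [0.5] | NP PP [0.2] | 'Jane' [0.3]
PP  -> P NP         [1.0]
Det -> 'a'          [1.0]
N   -> 'house' [0.6] | 'telescope' [0.4]
V   -> 'buys'       [1.0]
P   -> 'with'       [1.0]
""")

parser = nltk.ViterbiParser(pcfg)
for tree in parser.parse("Jane buys a house with a telescope".split()):
    print(tree)   # the single most probable parse under these (made-up) weights
```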
+ +0:43:23.923 --> 0:43:32.095 +There, for example, the agreement is not straightforward +to do so that in subject and work you can check + +0:43:32.095 --> 0:43:38.866 +that the person, the agreement, the number +in person, the number agreement is correct, + +0:43:38.866 --> 0:43:41.279 +but if it's a singular object. + +0:43:41.561 --> 0:43:44.191 +A singular verb, it's also a singular. + +0:43:44.604 --> 0:43:49.242 +Non-subject, and if it's a plural subject, +it's a plural work. + +0:43:49.489 --> 0:43:56.519 +Things like that are yeah, the agreement in +determining action driven now, so they also + +0:43:56.519 --> 0:43:57.717 +have to agree. + +0:43:57.877 --> 0:44:05.549 +Things like that cannot be easily done with +this type of grammar or this subcategorization + +0:44:05.549 --> 0:44:13.221 +that you check whether the verb is transitive +or intransitive, and that Jane sleeps is OK, + +0:44:13.221 --> 0:44:16.340 +but Jane sleeps the house is not OK. + +0:44:16.436 --> 0:44:21.073 +And Jane Walterhouse is okay, but Jane Walterhouse +is not okay. + +0:44:23.183 --> 0:44:29.285 +Furthermore, this long range dependency might +be difficult and which word orders are allowed + +0:44:29.285 --> 0:44:31.056 +and which are not allowed. + +0:44:31.571 --> 0:44:40.011 +This is also not directly so you can say Maria +give de man das bourg, de man give Maria das + +0:44:40.011 --> 0:44:47.258 +bourg, das bourg give Maria, de man aber Maria, +de man give des bourg is some. + +0:44:47.227 --> 0:44:55.191 +One yeah, which one from this one is possible +and not is sometimes not possible to model, + +0:44:55.191 --> 0:44:56.164 +is simple. + +0:44:56.876 --> 0:45:05.842 +Therefore, people have done more complex stuff +like this unification grammar and tried to + +0:45:05.842 --> 0:45:09.328 +model both the categories of verb. + +0:45:09.529 --> 0:45:13.367 +The agreement has to be that it's person and +single. + +0:45:13.367 --> 0:45:20.028 +You're joining that so you're annotating this +thing with more information and then you have + +0:45:20.028 --> 0:45:25.097 +more complex synthetic structures in order +to model also these types. + +0:45:28.948 --> 0:45:33.137 +Yeah, why is this difficult? + +0:45:33.873 --> 0:45:39.783 +We have different ambiguities and that makes +it different, so words have different part + +0:45:39.783 --> 0:45:43.610 +of speech text and if you have time flies like +an error. + +0:45:43.583 --> 0:45:53.554 +It can mean that sometimes the animal L look +like an arrow and or it can mean that the time + +0:45:53.554 --> 0:45:59.948 +is flying very fast is going away very fast +like an error. + +0:46:00.220 --> 0:46:10.473 +And if you want to do a pastry, these two +meanings have a different part of speech text, + +0:46:10.473 --> 0:46:13.008 +so flies is the verb. + +0:46:13.373 --> 0:46:17.999 +And of course that is a different semantic, +and so that is very different. + +0:46:19.499 --> 0:46:23.361 +And otherwise a structural. + +0:46:23.243 --> 0:46:32.419 +Ambiguity so that like some part of the sentence +can have different rules, so the famous thing + +0:46:32.419 --> 0:46:34.350 +is this attachment. + +0:46:34.514 --> 0:46:39.724 +So the cops saw the Bulgara with a binoculars. + +0:46:39.724 --> 0:46:48.038 +Then with a binocular can be attached to saw +or it can be attached to the. + +0:46:48.448 --> 0:46:59.897 +And so in the first two it's more probable +that he saw the theft, and not that the theft + +0:46:59.897 --> 0:47:01.570 +has the one. 
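The "saw the burglar with the binoculars" example is the classic PP-attachment ambiguity: the same token sequence gets two parse trees, one attaching the prepositional phrase to the noun phrase and one to the verb phrase. The small sketch below shows both parses coming out of a grammar in which NP and VP may each take a PP; the grammar is again only an illustration.

```python
# Minimal sketch: PP-attachment ambiguity yields two parse trees for one sentence.
import nltk

grammar = nltk.CFG.fromstring("""
S   -> NP VP
VP  -> V NP | VP PP
NP  -> Det N | NP PP
PP  -> P NP
Det -> 'the'
N   -> 'cop' | 'burglar' | 'binoculars'
V   -> 'saw'
P   -> 'with'
""")

sentence = "the cop saw the burglar with the binoculars".split()
trees = list(nltk.ChartParser(grammar).parse(sentence))
print(len(trees), "parses")    # 2: PP attached to the NP or to the VP
for tree in trees:
    print(tree)
```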
+ +0:47:01.982 --> 0:47:13.356 +And this, of course, makes things difficult +while parsing and doing structure implicitly + +0:47:13.356 --> 0:47:16.424 +defining the semantics. + +0:47:20.120 --> 0:47:29.736 +Therefore, we would then go directly to semantics, +but maybe some questions about spintax and + +0:47:29.736 --> 0:47:31.373 +how that works. + +0:47:33.113 --> 0:47:46.647 +Then we'll do a bit more about semantics, +so now we only describe the structure of the + +0:47:46.647 --> 0:47:48.203 +sentence. + +0:47:48.408 --> 0:47:55.584 +And for the meaning of the sentence we typically +have the compositionality of meaning. + +0:47:55.584 --> 0:48:03.091 +The meaning of the full sentence is determined +by the meaning of the individual words, and + +0:48:03.091 --> 0:48:06.308 +they together form the meaning of the. + +0:48:06.686 --> 0:48:17.936 +For words that is partly true but not always +mean for things like rainbow, jointly rain + +0:48:17.936 --> 0:48:19.086 +and bow. + +0:48:19.319 --> 0:48:26.020 +But this is not always a case, while for sentences +typically that is happening because you can't + +0:48:26.020 --> 0:48:30.579 +directly determine the full meaning, but you +split it into parts. + +0:48:30.590 --> 0:48:36.164 +Sometimes only in some parts like kick the +bucket the expression. + +0:48:36.164 --> 0:48:43.596 +Of course you cannot get the meaning of kick +the bucket by looking at the individual or + +0:48:43.596 --> 0:48:46.130 +in German abyss in its grass. + +0:48:47.207 --> 0:48:53.763 +You cannot get that he died by looking at +the individual words of Bis ins grass, but + +0:48:53.763 --> 0:48:54.611 +they have. + +0:48:55.195 --> 0:49:10.264 +And there are different ways of describing +that some people have tried that more commonly + +0:49:10.264 --> 0:49:13.781 +used for some tasks. + +0:49:14.654 --> 0:49:20.073 +Will come to so the first thing would be something +like first order logic. + +0:49:20.073 --> 0:49:27.297 +If you have Peter loves Jane then you have +this meaning and you're having the end of representation + +0:49:27.297 --> 0:49:33.005 +that you have a love property between Peter +and Jane and you try to construct. + +0:49:32.953 --> 0:49:40.606 +That you will see this a lot more complex +than directly than only doing syntax but also + +0:49:40.606 --> 0:49:43.650 +doing this type of representation. + +0:49:44.164 --> 0:49:47.761 +The other thing is to try to do frame semantics. + +0:49:47.867 --> 0:49:55.094 +That means that you try to represent the knowledge +about the world and you have these ah frames. + +0:49:55.094 --> 0:49:58.372 +For example, you might have a frame to buy. + +0:49:58.418 --> 0:50:05.030 +And the meaning is that you have a commercial +transaction. + +0:50:05.030 --> 0:50:08.840 +You have a person who is selling. + +0:50:08.969 --> 0:50:10.725 +You Have a Person Who's Buying. + +0:50:11.411 --> 0:50:16.123 +You have something that is priced, you might +have a price, and so on. + +0:50:17.237 --> 0:50:22.698 +And then what you are doing in semantic parsing +with frame semantics you first try to determine. + +0:50:22.902 --> 0:50:30.494 +Which frames are happening in the sentence, +so if it's something with Bowie buying you + +0:50:30.494 --> 0:50:33.025 +would try to first identify. + +0:50:33.025 --> 0:50:40.704 +Oh, here we have to try Brain B, which does +not always have to be indicated by the verb + +0:50:40.704 --> 0:50:42.449 +cell or other ways. 
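Frame semantics, as sketched here and continued just below, represents a buying event as a commercial-transaction frame with roles such as buyer, seller, goods and price that get filled from the sentence. The following is a toy sketch of that data structure with a deliberately naive pattern-based filler; the role names and the regular expression are illustrative assumptions, not a real frame-semantic parser.

```python
# Minimal sketch: a commercial-transaction frame and a toy rule that fills it.
import re
from dataclasses import dataclass
from typing import Optional

@dataclass
class CommercialTransaction:
    buyer: Optional[str] = None
    seller: Optional[str] = None
    goods: Optional[str] = None
    price: Optional[str] = None

def parse_buy(sentence: str) -> Optional[CommercialTransaction]:
    # Toy pattern: "<buyer> bought <goods> [from <seller>] [for <price>]"
    m = re.match(r"(\w+) bought (.+?)(?: from (\w+))?(?: for (.+?))?\.?$", sentence)
    if not m:
        return None
    buyer, goods, seller, price = m.groups()
    return CommercialTransaction(buyer=buyer, seller=seller, goods=goods, price=price)

print(parse_buy("Jane bought the house from Jim for 300000 euros."))
print(parse_buy("Jane bought the house."))
```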
+ +0:50:42.582 --> 0:50:52.515 +And then you try to find out which elements +of these frame are in the sentence and try + +0:50:52.515 --> 0:50:54.228 +to align them. + +0:50:56.856 --> 0:51:01.121 +Yeah, you have, for example, to buy and sell. + +0:51:01.121 --> 0:51:07.239 +If you have a model that has frames, they +have the same elements. + +0:51:09.829 --> 0:51:15.018 +In addition over like sentence, then you have +also a phenomenon beyond sentence level. + +0:51:15.018 --> 0:51:20.088 +We're coming to this later because it's a +special challenge for machine translation. + +0:51:20.088 --> 0:51:22.295 +There is, for example, co reference. + +0:51:22.295 --> 0:51:27.186 +That means if you first mention it, it's like +the President of the United States. + +0:51:27.467 --> 0:51:30.107 +And later you would refer to him maybe as +he. + +0:51:30.510 --> 0:51:36.966 +And that is especially challenging in machine +translation because you're not always using + +0:51:36.966 --> 0:51:38.114 +the same thing. + +0:51:38.114 --> 0:51:44.355 +Of course, for the president, it's he and +air in German, but for other things it might + +0:51:44.355 --> 0:51:49.521 +be different depending on the gender in languages +that you refer to it. + +0:51:55.435 --> 0:52:03.866 +So much for the background and the next, we +want to look based on the knowledge we have + +0:52:03.866 --> 0:52:04.345 +now. + +0:52:04.345 --> 0:52:10.285 +Why is machine translation difficult before +we have any more? + +0:52:16.316 --> 0:52:22.471 +The first type of problem is what we refer +to as translation divers. + +0:52:22.471 --> 0:52:30.588 +That means that we have the same information +in source and target, but the problem is that + +0:52:30.588 --> 0:52:33.442 +they are expressed differently. + +0:52:33.713 --> 0:52:42.222 +So it is not the same way, and we have to +translate these things more easily by just + +0:52:42.222 --> 0:52:44.924 +having a bit more complex. + +0:52:45.325 --> 0:52:51.324 +So example is if it's only a structure in +English, the delicious. + +0:52:51.324 --> 0:52:59.141 +The adjective is before the noun, while in +Spanish you have to put it after the noun, + +0:52:59.141 --> 0:53:02.413 +and so you have to change the word. + +0:53:02.983 --> 0:53:10.281 +So there are different ways of divergence, +so there can be structural divergence, which + +0:53:10.281 --> 0:53:10.613 +is. + +0:53:10.550 --> 0:53:16.121 +The word orders so that the order is different, +so in German we have that especially in the + +0:53:16.121 --> 0:53:19.451 +in the sub clause, while in English in the +sub clause. + +0:53:19.451 --> 0:53:24.718 +The verb is also at the second position, in +German it's at the end, and so you have to + +0:53:24.718 --> 0:53:25.506 +move it all. + +0:53:25.465 --> 0:53:27.222 +Um All Over. + +0:53:27.487 --> 0:53:32.978 +It can be that that it's a complete different +grammatical role. + +0:53:33.253 --> 0:53:35.080 +So,. + +0:53:35.595 --> 0:53:37.458 +You Have You Like Her. + +0:53:38.238 --> 0:53:41.472 +And eh in in. + +0:53:41.261 --> 0:53:47.708 +English: In Spanish it's a la ti gusta which +means she so now she is no longer like object + +0:53:47.708 --> 0:53:54.509 +but she is subject here and you are now acquisitive +and then pleases or like yeah so you really + +0:53:54.509 --> 0:53:58.689 +use a different sentence structure and you +have to change. + +0:53:59.139 --> 0:54:03.624 +Can also be the head switch. + +0:54:03.624 --> 0:54:09.501 +In English you say the baby just ate. 
+ +0:54:09.501 --> 0:54:16.771 +In Spanish literary you say the baby finishes. + +0:54:16.997 --> 0:54:20.803 +So the is no longer the word, but the finishing +is the word. + +0:54:21.241 --> 0:54:30.859 +So you have to learn so you cannot always +have the same structures in your input and + +0:54:30.859 --> 0:54:31.764 +output. + +0:54:36.856 --> 0:54:42.318 +Lexical things like to swim across or to cross +swimming. + +0:54:43.243 --> 0:54:57.397 +You have categorical like an adjective gets +into a noun, so you have a little bread to + +0:54:57.397 --> 0:55:00.162 +make a decision. + +0:55:00.480 --> 0:55:15.427 +That is the one challenge and the even bigger +challenge is referred to as translation. + +0:55:17.017 --> 0:55:19.301 +That can be their lexical mismatch. + +0:55:19.301 --> 0:55:21.395 +That's the fish we talked about. + +0:55:21.395 --> 0:55:27.169 +If it's like the, the fish you eat or the +fish which is living is the two different worlds + +0:55:27.169 --> 0:55:27.931 +in Spanish. + +0:55:28.108 --> 0:55:34.334 +And then that's partly sometimes even not +known, so even the human might not be able + +0:55:34.334 --> 0:55:34.627 +to. + +0:55:34.774 --> 0:55:40.242 +Infer that you maybe need to see the context +you maybe need to have the sentences around, + +0:55:40.242 --> 0:55:45.770 +so one problem is that at least traditional +machine translation works on a sentence level, + +0:55:45.770 --> 0:55:51.663 +so we take each sentence and translate it independent +of everything else, but that's, of course, + +0:55:51.663 --> 0:55:52.453 +not correct. + +0:55:52.532 --> 0:55:59.901 +Will look into some ways of looking at and +doing document-based machine translation, but. + +0:56:00.380 --> 0:56:06.793 +There's gender information might be a problem, +so in English it's player and you don't know + +0:56:06.793 --> 0:56:10.139 +if it's Spieler Spielerin or if it's not known. + +0:56:10.330 --> 0:56:15.770 +But in the English, if you now generate German, +you should know is the reader. + +0:56:15.770 --> 0:56:21.830 +Does he know the gender or does he not know +the gender and then generate the right one? + +0:56:22.082 --> 0:56:38.333 +So just imagine a commentator if he's talking +about the player and you can see if it's male + +0:56:38.333 --> 0:56:40.276 +or female. + +0:56:40.540 --> 0:56:47.801 +So in generally the problem is that if you +have less information and you need more information + +0:56:47.801 --> 0:56:51.928 +in your target, this translation doesn't really +work. + +0:56:55.175 --> 0:56:59.180 +Another problem is we just talked about the +the. + +0:56:59.119 --> 0:57:01.429 +The co reference. + +0:57:01.641 --> 0:57:08.818 +So if you refer to an object and that can +be across sentence boundaries then you have + +0:57:08.818 --> 0:57:14.492 +to use the right pronoun and you cannot just +translate the pronoun. + +0:57:14.492 --> 0:57:18.581 +If the baby does not thrive on raw milk boil +it. + +0:57:19.079 --> 0:57:28.279 +And if you are now using it and just take +the typical translation, it will be: And That + +0:57:28.279 --> 0:57:31.065 +Will Be Ah Wrong. + +0:57:31.291 --> 0:57:35.784 +No, that will be even right because it is +dust baby. + +0:57:35.784 --> 0:57:42.650 +Yes, but I mean, you have to determine that +and it might be wrong at some point. + +0:57:42.650 --> 0:57:48.753 +So getting this this um yeah, it will be wrong +yes, that is right yeah. 
+ +0:57:48.908 --> 0:57:55.469 +Because in English both are baby and milk, +and baby are both referred to it, so if you + +0:57:55.469 --> 0:58:02.180 +do S it will be to the first one referred to, +so it's correct, but in Germany it will be + +0:58:02.180 --> 0:58:06.101 +S, and so if you translate it as S it will +be baby. + +0:58:06.546 --> 0:58:13.808 +But you have to do Z because milk is female, +although that is really very uncommon because + +0:58:13.808 --> 0:58:18.037 +maybe a model is an object and so it should +be more. + +0:58:18.358 --> 0:58:25.176 +Of course, I agree there might be a situation +which is a bit created and not a common thing, + +0:58:25.176 --> 0:58:29.062 +but you can see that these things are not that +easy. + +0:58:29.069 --> 0:58:31.779 +Another example is this: Dr. + +0:58:31.779 --> 0:58:37.855 +McLean often brings his dog champion to visit +with his patients. + +0:58:37.855 --> 0:58:41.594 +He loves to give big wets loppy kisses. + +0:58:42.122 --> 0:58:58.371 +And there, of course, it's also important +if he refers to the dog or to the doctor. + +0:58:59.779 --> 0:59:11.260 +Another example of challenging is that we +don't have a fixed language and that was referred + +0:59:11.260 --> 0:59:16.501 +to morphology and we can build new words. + +0:59:16.496 --> 0:59:23.787 +So we can in all languages build new words +by just concatinating part of it like braxits, + +0:59:23.787 --> 0:59:30.570 +some things like: And then, of course, also +words don't exist in languages, don't exist + +0:59:30.570 --> 0:59:31.578 +in isolations. + +0:59:32.012 --> 0:59:41.591 +In Germany you can now use the word download +somewhere and you can also use a morphological + +0:59:41.591 --> 0:59:43.570 +operation on that. + +0:59:43.570 --> 0:59:48.152 +I guess there is even not the correct word. + +0:59:48.508 --> 0:59:55.575 +But so you have to deal with these things, +and yeah, in social meters. + +0:59:55.996 --> 1:00:00.215 +This word is maybe most of you have forgotten +already. + +1:00:00.215 --> 1:00:02.517 +This was ten years ago or so. + +1:00:02.517 --> 1:00:08.885 +I don't know there was a volcano in Iceland +which stopped Europeans flying around. + +1:00:09.929 --> 1:00:14.706 +So there is always new words coming up and +you have to deal with. + +1:00:18.278 --> 1:00:24.041 +Yeah, one last thing, so some of these examples +we have seen are a bit artificial. + +1:00:24.041 --> 1:00:30.429 +So one example what is very common with machine +translation doesn't really work is this box + +1:00:30.429 --> 1:00:31.540 +was in the pen. + +1:00:32.192 --> 1:00:36.887 +And maybe you would be surprised, at least +when read it. + +1:00:36.887 --> 1:00:39.441 +How can a box be inside a pen? + +1:00:40.320 --> 1:00:44.175 +Does anybody have a solution for that while +the sentence is still correct? + +1:00:47.367 --> 1:00:51.692 +Maybe it's directly clear for you, maybe your +English was aside, yeah. + +1:00:54.654 --> 1:01:07.377 +Yes, like at a farm or for small children, +and that is also called a pen or a pen on a + +1:01:07.377 --> 1:01:08.254 +farm. + +1:01:08.368 --> 1:01:12.056 +And then this is, and so you can mean okay. + +1:01:12.056 --> 1:01:16.079 +To infer these two meanings is quite difficult. + +1:01:16.436 --> 1:01:23.620 +But at least when I saw it, I wasn't completely +convinced because it's maybe not the sentence + +1:01:23.620 --> 1:01:29.505 +you're using in your daily life, and some of +these constructions seem to be. 
+ +1:01:29.509 --> 1:01:35.155 +They are very good in showing where the problem +is, but the question is, does it really imply + +1:01:35.155 --> 1:01:35.995 +in real life? + +1:01:35.996 --> 1:01:42.349 +And therefore here some examples also that +we had here with a lecture translator that + +1:01:42.349 --> 1:01:43.605 +really occurred. + +1:01:43.605 --> 1:01:49.663 +They maybe looked simple, but you will see +that some of them still are happening. + +1:01:50.050 --> 1:01:53.948 +And they are partly about spitting words, +and then they are happening. + +1:01:54.294 --> 1:01:56.816 +So Um. + +1:01:56.596 --> 1:02:03.087 +We had a text about the numeral system in +German, the Silen system, which got splitted + +1:02:03.087 --> 1:02:07.041 +into sub parts because otherwise we can't translate. + +1:02:07.367 --> 1:02:14.927 +And then he did only a proximate match and +was talking about the binary payment system + +1:02:14.927 --> 1:02:23.270 +because the payment system was a lot more common +in the training data than the Thailand system. + +1:02:23.823 --> 1:02:29.900 +And so there you see like rare words, which +don't occur that often. + +1:02:29.900 --> 1:02:38.211 +They are very challenging to deal with because +we are good and inferring that sometimes, but + +1:02:38.211 --> 1:02:41.250 +for others that's very difficult. + +1:02:44.344 --> 1:02:49.605 +Another challenge is that, of course, the +context is very difficult. + +1:02:50.010 --> 1:02:56.448 +This is also an example a bit older from also +the lecture translators we were translating + +1:02:56.448 --> 1:03:01.813 +in mass lecture, and he was always talking +about the omens of the numbers. + +1:03:02.322 --> 1:03:11.063 +Which doesn't make any sense at all, but the +German word fortsizing can of course mean the + +1:03:11.063 --> 1:03:12.408 +sign and the. + +1:03:12.732 --> 1:03:22.703 +And if you not have the right to main knowledge +in there and encode it, it might use the main + +1:03:22.703 --> 1:03:23.869 +knowledge. + +1:03:25.705 --> 1:03:31.205 +A more recent version of that is like here +from a paper where it's about translating. + +1:03:31.205 --> 1:03:36.833 +We had this pivot based translation where +you translate maybe to English and to another + +1:03:36.833 --> 1:03:39.583 +because you have not enough training data. + +1:03:40.880 --> 1:03:48.051 +And we did that from Dutch to German guess +if you don't understand Dutch, if you speak + +1:03:48.051 --> 1:03:48.710 +German. + +1:03:48.908 --> 1:03:56.939 +So we have this raven forebuilt, which means +to geben in English. + +1:03:56.939 --> 1:04:05.417 +It's correctly in setting an example: However, +if we're then translate to German, he didn't + +1:04:05.417 --> 1:04:11.524 +get the full context, and in German you normally +don't set an example, but you give an example, + +1:04:11.524 --> 1:04:16.740 +and so yes, going through another language +you introduce their additional errors. + +1:04:19.919 --> 1:04:27.568 +Good so much for this are there more questions +about why this is difficult. + +1:04:30.730 --> 1:04:35.606 +Then we'll start with this one. + +1:04:35.606 --> 1:04:44.596 +I have to leave a bit early today in a quarter +of an hour. + +1:04:44.904 --> 1:04:58.403 +If you look about linguistic approaches to +machine translation, they are typically described + +1:04:58.403 --> 1:05:03.599 +by: So we can do a direct translation, so you +take the Suez language. 
+ +1:05:03.599 --> 1:05:09.452 +Do not apply a lot of the analysis we were +discussing today about syntax representation, + +1:05:09.452 --> 1:05:11.096 +semantic representation. + +1:05:11.551 --> 1:05:14.678 +But you directly translate to your target +text. + +1:05:14.678 --> 1:05:16.241 +That's here the direct. + +1:05:16.516 --> 1:05:19.285 +Then there is a transfer based approach. + +1:05:19.285 --> 1:05:23.811 +Then you transfer everything over and you +do the text translation. + +1:05:24.064 --> 1:05:28.354 +And you can do that at two levels, more at +the syntax level. + +1:05:28.354 --> 1:05:34.683 +That means you only do synthetic analysts +like you do a pasture or so, or at the semantic + +1:05:34.683 --> 1:05:37.848 +level where you do a semantic parsing frame. + +1:05:38.638 --> 1:05:51.489 +Then there is an interlingua based approach +where you don't do any transfer anymore, but + +1:05:51.489 --> 1:05:55.099 +you only do an analysis. + +1:05:57.437 --> 1:06:02.790 +So how does now the direct transfer, the direct +translation? + +1:06:03.043 --> 1:06:07.031 +Look like it's one of the earliest approaches. + +1:06:07.327 --> 1:06:18.485 +So you do maybe some morphological analysts, +but not a lot, and then you do this bilingual + +1:06:18.485 --> 1:06:20.202 +word mapping. + +1:06:20.540 --> 1:06:25.067 +You might do some here in generations. + +1:06:25.067 --> 1:06:32.148 +These two things are not really big, but you +are working on. + +1:06:32.672 --> 1:06:39.237 +And of course this might be a first easy solution +about all the challenges we have seen that + +1:06:39.237 --> 1:06:41.214 +the structure is different. + +1:06:41.214 --> 1:06:45.449 +That you have to reorder, look at the agreement, +then work. + +1:06:45.449 --> 1:06:47.638 +That's why the first approach. + +1:06:47.827 --> 1:06:54.618 +So if we have different word order, structural +shifts or idiomatic expressions that doesn't + +1:06:54.618 --> 1:06:55.208 +really. + +1:06:57.797 --> 1:07:05.034 +Then there are these rule based approaches +which were more commonly used. + +1:07:05.034 --> 1:07:15.249 +They might still be somewhere: Mean most commonly +they are now used by neural networks but wouldn't + +1:07:15.249 --> 1:07:19.254 +be sure there is no system out there but. + +1:07:19.719 --> 1:07:25.936 +And in this transfer based approach we have +these steps there nicely visualized in the. + +1:07:26.406 --> 1:07:32.397 +Triangle, so we have the analytic of the sur +sentence where we then get some type of abstract + +1:07:32.397 --> 1:07:33.416 +representation. + +1:07:33.693 --> 1:07:40.010 +Then we are doing the transfer of the representation +of the source sentence into the representation + +1:07:40.010 --> 1:07:40.263 +of. + +1:07:40.580 --> 1:07:46.754 +And then we have the generation where we take +this abstract representation and do then the + +1:07:46.754 --> 1:07:47.772 +surface forms. + +1:07:47.772 --> 1:07:54.217 +For example, it might be that there is no +morphological variants in the episode representation + +1:07:54.217 --> 1:07:56.524 +and we have to do this agreement. + +1:07:56.656 --> 1:08:00.077 +Which components do you they need? + +1:08:01.061 --> 1:08:08.854 +You need monolingual source and target lexicon +and the corresponding grammars in order to + +1:08:08.854 --> 1:08:12.318 +do both the analyst and the generation. 
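The direct approach described above does little more than a word-by-word bilingual dictionary lookup with some light morphological handling. A minimal sketch shows why that breaks down: the tiny German-English dictionary is an assumption, and the output keeps the German verb-second word order.

```python
# Minimal sketch of "direct" translation: word-by-word dictionary lookup,
# no syntactic analysis and no reordering. The dictionary is illustrative only.
dictionary = {
    "jane": "Jane", "kaufte": "bought", "das": "the",
    "haus": "house", "gestern": "yesterday",
}

def direct_translate(sentence: str) -> str:
    # Translate each word in place; unknown words are passed through.
    return " ".join(dictionary.get(word, word) for word in sentence.lower().split())

print(direct_translate("Gestern kaufte Jane das Haus"))
# -> "yesterday bought Jane the house": lexically fine, but the German
#    verb-second order is copied over, which is exactly why direct
#    translation struggles with structural divergences.
```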
+ +1:08:12.412 --> 1:08:18.584 +Then you need the bilingual dictionary in +order to do the lexical translation and the + +1:08:18.584 --> 1:08:25.116 +bilingual transfer rules in order to transfer +the grammar, for example in German, into the + +1:08:25.116 --> 1:08:28.920 +grammar in English, and that enables you to +do that. + +1:08:29.269 --> 1:08:32.579 +So an example is is something like this here. + +1:08:32.579 --> 1:08:38.193 +So if you're doing a syntactic transfer it +means you're starting with John E. + +1:08:38.193 --> 1:08:38.408 +Z. + +1:08:38.408 --> 1:08:43.014 +Apple you do the analyst then you have this +type of graph here. + +1:08:43.014 --> 1:08:48.340 +Therefore you need your monolingual lexicon +and your monolingual grammar. + +1:08:48.748 --> 1:08:59.113 +Then you're doing the transfer where you're +transferring this representation into this + +1:08:59.113 --> 1:09:01.020 +representation. + +1:09:01.681 --> 1:09:05.965 +So how could this type of translation then +look like? + +1:09:07.607 --> 1:09:08.276 +Style. + +1:09:08.276 --> 1:09:14.389 +We have the example of a delicious soup and +una soup deliciosa. + +1:09:14.894 --> 1:09:22.173 +This is your source language tree and this +is your target language tree and then the rules + +1:09:22.173 --> 1:09:26.092 +that you need are these ones to do the transfer. + +1:09:26.092 --> 1:09:31.211 +So if you have a noun phrase that also goes +to the noun phrase. + +1:09:31.691 --> 1:09:44.609 +You see here that the switch is happening, +so the second position is here at the first + +1:09:44.609 --> 1:09:46.094 +position. + +1:09:46.146 --> 1:09:52.669 +Then you have the translation of determiner +of the words, so the dictionary entries. + +1:09:53.053 --> 1:10:07.752 +And with these types of rules you can then +do these mappings and do the transfer between + +1:10:07.752 --> 1:10:11.056 +the representation. + +1:10:25.705 --> 1:10:32.505 +Think it more depends on the amount of expertise +you have in representing them. + +1:10:32.505 --> 1:10:35.480 +The rules will get more difficult. + +1:10:36.136 --> 1:10:42.445 +For example, these rule based were, so I think +it more depends on how difficult the structure + +1:10:42.445 --> 1:10:42.713 +is. + +1:10:42.713 --> 1:10:48.619 +So for German generating German they were +quite long, quite successful because modeling + +1:10:48.619 --> 1:10:52.579 +all the German phenomena which are in there +was difficult. + +1:10:52.953 --> 1:10:56.786 +And that can be done there, and it wasn't +easy to learn that just from data. + +1:10:59.019 --> 1:11:07.716 +Think even if you think about Chinese and +English or so, if you have the trees there + +1:11:07.716 --> 1:11:10.172 +is quite some rule and. + +1:11:15.775 --> 1:11:23.370 +Another thing is you can also try to do something +like that on the semantic, which means this + +1:11:23.370 --> 1:11:24.905 +gets more complex. + +1:11:25.645 --> 1:11:31.047 +This gets maybe a bit easier because this +representation, the semantic representation + +1:11:31.047 --> 1:11:36.198 +between languages, are more similar and therefore +this gets more difficult again. + +1:11:36.496 --> 1:11:45.869 +So typically if you go higher in your triangle +this is more work while this is less work. + +1:11:49.729 --> 1:11:56.023 +So it can be then, for example, like in Gusta, +we have again that the the the order changes. 
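The adjective-noun reordering for "a delicious soup" → "una sopa deliciosa" can be written as a small structural transfer rule over the source parse tree plus a bilingual dictionary for the leaves. Below is a toy sketch using nested-list trees; the tree encoding, the rule, and the lexicon are illustrative assumptions.

```python
# Minimal sketch of a syntactic transfer step: inside an NP, English DET ADJ N
# becomes Spanish DET N ADJ, and the leaves are translated with a dictionary.
lexicon = {"a": "una", "delicious": "deliciosa", "soup": "sopa"}

def transfer(tree):
    label, children = tree[0], tree[1:]
    if all(isinstance(c, str) for c in children):            # preterminal: translate the word
        return [label] + [lexicon.get(c, c) for c in children]
    children = [transfer(c) for c in children]
    if label == "NP" and [c[0] for c in children] == ["DET", "ADJ", "N"]:
        det, adj, n = children
        children = [det, n, adj]                              # structural transfer: ADJ N -> N ADJ
    return [label] + children

def yield_words(tree):
    return [tree[1]] if isinstance(tree[1], str) else [w for c in tree[1:] for w in yield_words(c)]

source = ["NP", ["DET", "a"], ["ADJ", "delicious"], ["N", "soup"]]
print(" ".join(yield_words(transfer(source))))                # una sopa deliciosa
```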
+ +1:11:56.023 --> 1:12:02.182 +So you see the transfer rule for like is that +the first argument is here and the second is + +1:12:02.182 --> 1:12:06.514 +there, while on the on the Gusta side here +the second argument. + +1:12:06.466 --> 1:12:11.232 +It is in the first position and the first +argument is in the second position. + +1:12:11.511 --> 1:12:14.061 +So that you do yeah, and also there you're +ordering,. + +1:12:14.354 --> 1:12:20.767 +From the principle it is more like you have +a different type of formalism of representing + +1:12:20.767 --> 1:12:27.038 +your sentence and therefore you need to do +more on one side and less on the other side. + +1:12:32.852 --> 1:12:42.365 +Then so in general transfer based approaches +are you have to first select how to represent + +1:12:42.365 --> 1:12:44.769 +a synthetic structure. + +1:12:45.165 --> 1:12:55.147 +There's like these variable abstraction levels +and then you have the three components: The + +1:12:55.147 --> 1:13:04.652 +disadvantage is that on the one hand you need +normally a lot of experts monolingual experts + +1:13:04.652 --> 1:13:08.371 +who analyze how to do the transfer. + +1:13:08.868 --> 1:13:18.860 +And if you're doing a new language, you have +to do analyst transfer in generation and the + +1:13:18.860 --> 1:13:19.970 +transfer. + +1:13:20.400 --> 1:13:27.074 +So if you need one language, add one language +in existing systems, of course you have to + +1:13:27.074 --> 1:13:29.624 +do transfer to all the languages. + +1:13:32.752 --> 1:13:39.297 +Therefore, the other idea which people were +interested in is the interlingua based machine + +1:13:39.297 --> 1:13:40.232 +translation. + +1:13:40.560 --> 1:13:47.321 +Where the idea is that we have this intermediate +language with this abstract language independent + +1:13:47.321 --> 1:13:53.530 +representation and so the important thing is +it's language independent so it's really the + +1:13:53.530 --> 1:13:59.188 +same for all language and it's a pure meaning +and there is no ambiguity in there. + +1:14:00.100 --> 1:14:05.833 +That allows this nice translation without +transfer, so you just do an analysis into your + +1:14:05.833 --> 1:14:11.695 +representation, and there afterwards you do +the generation into the other target language. + +1:14:13.293 --> 1:14:16.953 +And that of course makes especially multilingual. + +1:14:16.953 --> 1:14:19.150 +It's like somehow is a dream. + +1:14:19.150 --> 1:14:25.519 +If you want to add a language you just need +to add one analyst tool and one generation + +1:14:25.519 --> 1:14:25.959 +tool. + +1:14:29.249 --> 1:14:32.279 +Which is not the case in the other scenario. + +1:14:33.193 --> 1:14:40.547 +However, the big challenge is in this case +the interlingua based representation because + +1:14:40.547 --> 1:14:47.651 +you need to represent all different types of +knowledge in there in order to do that. + +1:14:47.807 --> 1:14:54.371 +And also like world knowledge, so something +like an apple is a fruit and property is a + +1:14:54.371 --> 1:14:57.993 +fruit, so they are eatable and stuff like that. + +1:14:58.578 --> 1:15:06.286 +So that is why this is typically always only +done for small amounts of data. + +1:15:06.326 --> 1:15:13.106 +So what people have done for special applications +like hotel reservation people have looked into + +1:15:13.106 --> 1:15:18.348 +that, but they have typically not done it for +any possibility of doing it. 
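One way to make the "adding a language" argument concrete is to count components: with one transfer module per ordered language pair, a transfer-based setup grows quadratically in the number of languages, while an interlingua needs only one analysis and one generation module per language. A back-of-the-envelope sketch follows; the counting convention is an assumption, not from the lecture.

```python
# Rough component count for n mutually translated languages.
def transfer_components(n: int) -> int:
    # n analysis modules + one transfer module per ordered pair + n generators
    return n + n * (n - 1) + n

def interlingua_components(n: int) -> int:
    # n analyses into the interlingua + n generations out of it
    return 2 * n

for n in (2, 5, 20):
    print(n, "languages:", transfer_components(n), "transfer-based vs.",
          interlingua_components(n), "interlingua components")
# e.g. 20 languages: 420 transfer-based components vs. 40 interlingua components
```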
+ +1:15:18.718 --> 1:15:31.640 +So the advantage is you need to represent +all the world knowledge in your interlingua. + +1:15:32.092 --> 1:15:40.198 +And that is not possible at the moment or +never was possible so far. + +1:15:40.198 --> 1:15:47.364 +Typically they were for small domains for +hotel reservation. + +1:15:51.431 --> 1:15:57.926 +But of course this idea of doing that and +that's why some people are interested in is + +1:15:57.926 --> 1:16:04.950 +like if you now do a neural system where you +learn the representation in your neural network + +1:16:04.950 --> 1:16:07.442 +is that some type of artificial. + +1:16:08.848 --> 1:16:09.620 +Interlingua. + +1:16:09.620 --> 1:16:15.025 +However, what we at least found out until +now is that there's often very language specific + +1:16:15.025 --> 1:16:15.975 +information in. + +1:16:16.196 --> 1:16:19.648 +And they might be important and essential. + +1:16:19.648 --> 1:16:26.552 +You don't have all the information in your +input, so you typically can't do resolving + +1:16:26.552 --> 1:16:32.412 +all ambiguities inside there because you might +not have all information. + +1:16:32.652 --> 1:16:37.870 +So in English you don't know if it's a living +fish or the fish which you're eating, and if + +1:16:37.870 --> 1:16:43.087 +you're translating to Germany you also don't +have to resolve this problem because you have + +1:16:43.087 --> 1:16:45.610 +the same ambiguity in your target language. + +1:16:45.610 --> 1:16:50.828 +So why would you put in our effort in finding +out if it's a dish or the other fish if it's + +1:16:50.828 --> 1:16:52.089 +not necessary at all? + +1:16:54.774 --> 1:16:59.509 +Yeah Yeah. + +1:17:05.585 --> 1:17:15.019 +The semantic transfer is not the same for +both languages, so you still represent the + +1:17:15.019 --> 1:17:17.127 +semantic language. + +1:17:17.377 --> 1:17:23.685 +So you have the like semantic representation +in the Gusta, but that's not the same as semantic + +1:17:23.685 --> 1:17:28.134 +representation for both languages, and that's +the main difference. + +1:17:35.515 --> 1:17:44.707 +Okay, then these are the most important things +for today: what is language and how our rule + +1:17:44.707 --> 1:17:46.205 +based systems. + +1:17:46.926 --> 1:17:59.337 +And if there is no more questions thank you +for joining, we have today a bit of a shorter + +1:17:59.337 --> 1:18:00.578 +lecture. + diff --git a/demo_data/lectures/Lecture-02-20.04.2023/video.mp4 b/demo_data/lectures/Lecture-02-20.04.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f27e979b6e22a4e159b48064334c19d812d6107b --- /dev/null +++ b/demo_data/lectures/Lecture-02-20.04.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ac15772e9e528ff3f7fb957401be410fcdf4a4ad54542e96916fe654443eb3 +size 111655016 diff --git a/demo_data/lectures/Lecture-03-25.04.2023/English.vtt b/demo_data/lectures/Lecture-03-25.04.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..2b281c6ecf19b9762d01057d29b4083df778ec64 --- /dev/null +++ b/demo_data/lectures/Lecture-03-25.04.2023/English.vtt @@ -0,0 +1,3102 @@ +WEBVTT + +0:00:02.822 --> 0:00:07.880 +We look into more linguistic approaches. + +0:00:07.880 --> 0:00:14.912 +We can do machine translation in a more traditional +way. + +0:00:14.912 --> 0:00:21.224 +It should be: Translation should be generated +this way. 
+ +0:00:21.224 --> 0:00:27.933 +We can analyze versus a sewer sentence what +is the meaning or the syntax. + +0:00:27.933 --> 0:00:35.185 +Then we transfer this information to the target +side and then we then generate. + +0:00:36.556 --> 0:00:42.341 +And this was the strong and common used approach +for yeah several years. + +0:00:44.024 --> 0:00:50.839 +However, we saw already at the beginning there +some challenges with that: Language is very + +0:00:50.839 --> 0:00:57.232 +ambigue and it's often very difficult to really +get high coated rules. + +0:00:57.232 --> 0:01:05.336 +What are the different meanings and we have +to do that also with a living language so new + +0:01:05.336 --> 0:01:06.596 +things occur. + +0:01:07.007 --> 0:01:09.308 +And that's why people look into. + +0:01:09.308 --> 0:01:13.282 +Can we maybe do it differently and use machine +learning? + +0:01:13.333 --> 0:01:24.849 +So we are no longer giving rules of how to +do it, but we just give examples and the system. + +0:01:25.045 --> 0:01:34.836 +And one important thing then is these examples: +how can we learn how to translate one sentence? + +0:01:35.635 --> 0:01:42.516 +And therefore these yeah, the data is now +really a very important issue. + +0:01:42.582 --> 0:01:50.021 +And that is what we want to look into today. + +0:01:50.021 --> 0:01:58.783 +What type of data do we use for machine translation? + +0:01:59.019 --> 0:02:08.674 +So the idea in preprocessing is always: Can +we make the task somehow a bit easier so that + +0:02:08.674 --> 0:02:13.180 +the empty system will be in a way better? + +0:02:13.493 --> 0:02:28.309 +So one example could be if it has problems +dealing with numbers because they are occurring. + +0:02:28.648 --> 0:02:35.479 +Or think about so one problem which still +might be is there in some systems think about + +0:02:35.479 --> 0:02:36.333 +different. + +0:02:36.656 --> 0:02:44.897 +So a system might learn that of course if +there's a German over in English there should. + +0:02:45.365 --> 0:02:52.270 +However, if it's in pearl text, it will see +that in Germany there is often km, and in English + +0:02:52.270 --> 0:02:54.107 +typically various miles. + +0:02:54.594 --> 0:03:00.607 +Might just translate three hundred and fifty +five miles into three hundred and fiftY five + +0:03:00.607 --> 0:03:04.348 +kilometers, which of course is not right, and +so forth. + +0:03:04.348 --> 0:03:06.953 +It might make things to look into the. + +0:03:07.067 --> 0:03:13.072 +Therefore, first step when you build your +machine translation system is normally to look + +0:03:13.072 --> 0:03:19.077 +at the data, to check it, to see if there is +anything happening which you should address + +0:03:19.077 --> 0:03:19.887 +beforehand. + +0:03:20.360 --> 0:03:29.152 +And then the second part is how do you represent +no works machine learning normally? + +0:03:29.109 --> 0:03:35.404 +So the question is how do we get out from +the words into numbers and I've seen some of + +0:03:35.404 --> 0:03:35.766 +you? + +0:03:35.766 --> 0:03:42.568 +For example, in advance there we have introduced +to an algorithm which we also shortly repeat + +0:03:42.568 --> 0:03:43.075 +today. + +0:03:43.303 --> 0:03:53.842 +The subword unit approach which was first +introduced in machine translation and now used + +0:03:53.842 --> 0:04:05.271 +for an in order to represent: Now you've learned +about morphology, so you know that maybe in + +0:04:05.271 --> 0:04:09.270 +English it's not that important. 
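As a concrete example of the preprocessing step mentioned above, rare surface variation such as literal numbers can be normalised and punctuation split off before training. A minimal sketch; the specific rules (a `<num>` placeholder, a small punctuation set) are illustrative, not a fixed recipe.

```python
# Minimal sketch of simple preprocessing: lowercase, map numbers to a
# placeholder so rare literal values share one token, split off punctuation.
import re

def preprocess(sentence: str) -> list[str]:
    sentence = sentence.lower()
    sentence = re.sub(r"\d+(?:[.,]\d+)*", "<num>", sentence)   # numbers -> one placeholder
    sentence = re.sub(r"([.,!?;:()])", r" \1 ", sentence)      # split punctuation off words
    return sentence.split()

print(preprocess("The route is 355 miles (571.4 km) long."))
# ['the', 'route', 'is', '<num>', 'miles', '(', '<num>', 'km', ')', 'long', '.']
```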
+ +0:04:09.429 --> 0:04:22.485 +In German you have all these different word +poems and to learn independent representation. + +0:04:24.024 --> 0:04:26.031 +And then, of course, they are more extreme. + +0:04:27.807 --> 0:04:34.387 +So how are we doing? + +0:04:34.975 --> 0:04:37.099 +Machine translation. + +0:04:37.099 --> 0:04:46.202 +So hopefully you remember we had these approaches +to machine translation, the rule based. + +0:04:46.202 --> 0:04:52.473 +We had a big block of corpus space machine +translation which. + +0:04:52.492 --> 0:05:00.443 +Will on Thursday have an overview on statistical +models and then afterwards concentrate on the. + +0:05:00.680 --> 0:05:08.828 +Both of them are corpus based machine translation +and therefore it's really essential, and while + +0:05:08.828 --> 0:05:16.640 +we are typically training a machine translation +system is what we refer to as parallel data. + +0:05:16.957 --> 0:05:22.395 +Talk a lot about pearl corpus or pearl data, +and what I mean there is something which you + +0:05:22.395 --> 0:05:28.257 +might know from was that a stone or something +like that, so it's typically you have one sentence + +0:05:28.257 --> 0:05:33.273 +in the one language, and then you have aligned +to it one sentence in the charcote. + +0:05:33.833 --> 0:05:38.261 +And this is how we train all our alignments. + +0:05:38.261 --> 0:05:43.181 +We'll see today that of course we might not +have. + +0:05:43.723 --> 0:05:51.279 +However, this is relatively easy to create, +at least for iquality data. + +0:05:51.279 --> 0:06:00.933 +We look into data trawling so that means how +we can automatically create this parallel data + +0:06:00.933 --> 0:06:02.927 +from the Internet. + +0:06:04.144 --> 0:06:13.850 +It's not so difficult to learn these alignments +if we have some type of dictionary, so which + +0:06:13.850 --> 0:06:16.981 +sentence is aligned to which. + +0:06:18.718 --> 0:06:25.069 +What it would, of course, be a lot more difficult +is really to word alignment, and that's also + +0:06:25.069 --> 0:06:27.476 +often no longer that good possible. + +0:06:27.476 --> 0:06:33.360 +We do that automatically in some yes for symbols, +but it's definitely more challenging. + +0:06:33.733 --> 0:06:40.691 +For sentence alignment, of course, it's still +not always perfect, so there might be that + +0:06:40.691 --> 0:06:46.085 +there is two German sentences and one English +sentence or the other. + +0:06:46.085 --> 0:06:53.511 +So there's not always perfect alignment, but +if you look at text, it's still bigly relatively. + +0:06:54.014 --> 0:07:03.862 +If we have that then we can build a machine +learning model which tries to map ignition + +0:07:03.862 --> 0:07:06.239 +sentences somewhere. + +0:07:06.626 --> 0:07:15.932 +So this is the idea of behind statistical +machine translation and machine translation. + +0:07:15.932 --> 0:07:27.098 +The difference is: Statistical machine translation +is typically a whole box of different models + +0:07:27.098 --> 0:07:30.205 +which try to evaluate the. + +0:07:30.510 --> 0:07:42.798 +In neural machine translation, it's all one +large neural network where we use the one-sur-sentence + +0:07:42.798 --> 0:07:43.667 +input. + +0:07:44.584 --> 0:07:50.971 +And then we can train it by having exactly +this mapping port or parallel data. + +0:07:54.214 --> 0:08:02.964 +So what we want today to look at today is +we want to first look at general text data. + +0:08:03.083 --> 0:08:06.250 +So what is text data? 
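Parallel data of the kind described here is commonly stored as two line-aligned plain-text files, with sentence i of the source file translated by sentence i of the target file. A minimal sketch of reading such a corpus; the file names are hypothetical.

```python
# Minimal sketch: reading a sentence-aligned parallel corpus stored as two
# line-aligned plain-text files (hypothetical file names).
from pathlib import Path

def read_parallel(src_path: str, tgt_path: str) -> list[tuple[str, str]]:
    src_lines = Path(src_path).read_text(encoding="utf-8").splitlines()
    tgt_lines = Path(tgt_path).read_text(encoding="utf-8").splitlines()
    assert len(src_lines) == len(tgt_lines), "files must be line-aligned"
    return list(zip(src_lines, tgt_lines))

# corpus = read_parallel("train.de", "train.en")
# for de, en in corpus[:3]:
#     print(de, "|||", en)
```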
+ +0:08:06.250 --> 0:08:09.850 +What text data is there? + +0:08:09.850 --> 0:08:18.202 +Why is it challenging so that we have large +vocabularies? + +0:08:18.378 --> 0:08:22.003 +It's so that you always have words which you +haven't seen. + +0:08:22.142 --> 0:08:29.053 +If you increase your corporate science normally +you will also increase your vocabulary so you + +0:08:29.053 --> 0:08:30.744 +always find new words. + +0:08:31.811 --> 0:08:39.738 +Then based on that we'll look into pre-processing. + +0:08:39.738 --> 0:08:45.333 +So how can we pre-process our data? + +0:08:45.333 --> 0:08:46.421 +Maybe. + +0:08:46.526 --> 0:08:54.788 +This is a lot about tokenization, for example, +which we heard is not so challenging in European + +0:08:54.788 --> 0:09:02.534 +languages but still important, but might be +really difficult in Asian languages where you + +0:09:02.534 --> 0:09:05.030 +don't have space separation. + +0:09:05.986 --> 0:09:12.161 +And this preprocessing typically tries to +deal with the extreme cases where you have + +0:09:12.161 --> 0:09:13.105 +seen things. + +0:09:13.353 --> 0:09:25.091 +If you have seen your words three one hundred +times, it doesn't really matter if you have + +0:09:25.091 --> 0:09:31.221 +seen them with them without punctuation or +so. + +0:09:31.651 --> 0:09:38.578 +And then we look into word representation, +so what is the best way to represent a word? + +0:09:38.578 --> 0:09:45.584 +And finally, we look into the other type of +data we really need for machine translation. + +0:09:45.725 --> 0:09:56.842 +So in first we can use for many tasks, and +later we can also use purely monolingual data + +0:09:56.842 --> 0:10:00.465 +to make machine translation. + +0:10:00.660 --> 0:10:03.187 +So then the traditional approach was that +it was easier. + +0:10:03.483 --> 0:10:08.697 +We have this type of language model which +we can train only on the target data to make + +0:10:08.697 --> 0:10:12.173 +the text more fluent in neural machine translation +model. + +0:10:12.173 --> 0:10:18.106 +It's partly a bit more complicated to integrate +this data but still it's very important especially + +0:10:18.106 --> 0:10:22.362 +if you think about lower issue languages where +you have very few data. + +0:10:23.603 --> 0:10:26.999 +It's harder to get parallel data than you +get monolingual data. + +0:10:27.347 --> 0:10:33.821 +Because monolingual data you just have out +there not huge amounts for some languages, + +0:10:33.821 --> 0:10:38.113 +but definitely the amount of data is always +significant. + +0:10:40.940 --> 0:10:50.454 +When we talk about data, it's also of course +important how we use it for machine learning. + +0:10:50.530 --> 0:11:05.867 +And that you hopefully learn in some prior +class, so typically we separate our data into + +0:11:05.867 --> 0:11:17.848 +three chunks: So this is really by far the +largest, and this grows with the data we get. + +0:11:17.848 --> 0:11:21.387 +Today we get here millions. + +0:11:22.222 --> 0:11:27.320 +Then we have our validation data and that +is to train some type of parameters. + +0:11:27.320 --> 0:11:33.129 +So not only you have some things to configure +and you don't know what is the right value, + +0:11:33.129 --> 0:11:39.067 +so what you can do is train a model and change +these a bit and try to find the best ones on + +0:11:39.067 --> 0:11:40.164 +your validation. 
+ +0:11:40.700 --> 0:11:48.531 +For a statistical model, for example data +in what you want to use if you have several + +0:11:48.531 --> 0:11:54.664 +models: You know how to combine it, so how +much focus should you put on the different + +0:11:54.664 --> 0:11:55.186 +models? + +0:11:55.186 --> 0:11:59.301 +And if it's like twenty models, so it's only +twenty per meter. + +0:11:59.301 --> 0:12:02.828 +It's not that much, so that is still bigly +estimated. + +0:12:03.183 --> 0:12:18.964 +In your model there's often a question how +long should train the model before you have + +0:12:18.964 --> 0:12:21.322 +overfitting. + +0:12:22.902 --> 0:12:28.679 +And then you have your test data, which is +finally where you report on your test. + +0:12:29.009 --> 0:12:33.663 +And therefore it's also important that from +time to time you get new test data because + +0:12:33.663 --> 0:12:38.423 +if you're always through your experiments you +test on it and then you do new experiments + +0:12:38.423 --> 0:12:43.452 +and tests again at some point you have tested +so many on it that you do some type of training + +0:12:43.452 --> 0:12:48.373 +on your test data again because you just select +the things which is at the end best on your + +0:12:48.373 --> 0:12:48.962 +test data. + +0:12:49.009 --> 0:12:54.755 +It's important to get a new test data from +time to time, for example in important evaluation + +0:12:54.755 --> 0:12:58.340 +campaigns for machine translation and speech +translation. + +0:12:58.618 --> 0:13:07.459 +There is like every year there should do tests +that create it so we can see if the model really + +0:13:07.459 --> 0:13:09.761 +gets better on new data. + +0:13:10.951 --> 0:13:19.629 +And of course it is important that this is +a representative of the use case you are interested. + +0:13:19.879 --> 0:13:36.511 +So if you're building a system for translating +websites, this should be on websites. + +0:13:36.816 --> 0:13:39.356 +So normally a system is good on some tasks. + +0:13:40.780 --> 0:13:48.596 +I would solve everything and then your test +data should be out of everything because if + +0:13:48.596 --> 0:13:54.102 +you only have a very small subset you know +it's good on this. + +0:13:54.394 --> 0:14:02.714 +Therefore, the selection of your test data +is really important in order to ensure that + +0:14:02.714 --> 0:14:05.200 +the MP system in the end. + +0:14:05.525 --> 0:14:12.646 +Is the greatest system ever you have evaluated +on translating Bible. + +0:14:12.646 --> 0:14:21.830 +The use case is to translate some Twitter +data and you can imagine the performance might + +0:14:21.830 --> 0:14:22.965 +be really. + +0:14:23.803 --> 0:14:25.471 +And privately. + +0:14:25.471 --> 0:14:35.478 +Of course, in honor to have this and realistic +evaluation, it's important that there's no + +0:14:35.478 --> 0:14:39.370 +overlap between this data because. + +0:14:39.799 --> 0:14:51.615 +Because the danger might be is learning by +heart how to translate the sentences from your + +0:14:51.615 --> 0:14:53.584 +training data. + +0:14:54.194 --> 0:15:04.430 +That the test data is really different from +your training data. + +0:15:04.430 --> 0:15:16.811 +Therefore, it's important to: So what type +of data we have? + +0:15:16.811 --> 0:15:24.966 +There's a lot of different text data and the +nice thing is with digitalization. 
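Summarising the train/validation/test discussion above before moving on to the kinds of text data: the corpus is split into three disjoint parts, with training by far the largest, validation used for tuning, and a held-out test set that must not overlap with training. A minimal sketch of such a split; a purely random split is the simplest case, and the lecture's point about keeping the test set representative and refreshing it over time still applies.

```python
# Minimal sketch: disjoint train / validation / test split of a parallel corpus.
# Sizes are illustrative.
import random

def split_corpus(pairs, valid_size=2000, test_size=2000, seed=42):
    pairs = list(pairs)
    random.Random(seed).shuffle(pairs)          # avoid ordering artefacts
    test = pairs[:test_size]
    valid = pairs[test_size:test_size + valid_size]
    train = pairs[test_size + valid_size:]      # by far the largest part
    return train, valid, test

# train, valid, test = split_corpus(read_parallel("corpus.de", "corpus.en"))
```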
+ +0:15:25.345 --> 0:15:31.785 +You might think there's a large amount with +books, but to be honest books and printed things + +0:15:31.785 --> 0:15:35.524 +that's by now a minor percentage of the data +we have. + +0:15:35.815 --> 0:15:39.947 +There's like so much data created every day +on the Internet. + +0:15:39.980 --> 0:15:46.223 +With social media and all the other types. + +0:15:46.223 --> 0:15:56.821 +This of course is a largest amount of data, +more of colloquial language. + +0:15:56.856 --> 0:16:02.609 +It might be more noisy and harder to process, +so there is a whole area on how to deal with + +0:16:02.609 --> 0:16:04.948 +more social media and outdoor stuff. + +0:16:07.347 --> 0:16:20.702 +What type of data is there if you think about +parallel data news type of data official sites? + +0:16:20.900 --> 0:16:26.629 +So the first Power Corpora were like things +like the European Parliament or like some news + +0:16:26.629 --> 0:16:27.069 +sites. + +0:16:27.227 --> 0:16:32.888 +Nowadays there's quite a large amount of data +crawled from the Internet, but of course if + +0:16:32.888 --> 0:16:38.613 +you crawl parallel data from the Internet, +a lot of the data is also like company websites + +0:16:38.613 --> 0:16:41.884 +or so which gets translated into several languages. + +0:16:45.365 --> 0:17:00.613 +Then, of course, there is different levels +of text and we have to look at what level we + +0:17:00.613 --> 0:17:05.118 +want to process our data. + +0:17:05.885 --> 0:17:16.140 +It one normally doesn't make sense to work +on full sentences because a lot of sentences + +0:17:16.140 --> 0:17:22.899 +have never been seen and you always create +new sentences. + +0:17:23.283 --> 0:17:37.421 +So typically what we take is our basic words, +something between words and letters, and that + +0:17:37.421 --> 0:17:40.033 +is an essential. + +0:17:40.400 --> 0:17:47.873 +So we need some of these atomic blocks or +basic blocks on which we can't make smaller. + +0:17:48.128 --> 0:17:55.987 +So if we're building a sentence, for example, +you can build it out of something and you can + +0:17:55.987 --> 0:17:57.268 +either decide. + +0:17:57.268 --> 0:18:01.967 +For example, you take words and you spit them +further. + +0:18:03.683 --> 0:18:10.178 +Then, of course, the nice thing is not too +small and therefore building larger things + +0:18:10.178 --> 0:18:11.386 +like sentences. + +0:18:11.831 --> 0:18:16.690 +So you only have to take your vocabulary and +put it somewhere together to get your full + +0:18:16.690 --> 0:18:17.132 +center. + +0:18:19.659 --> 0:18:27.670 +However, if it's too large, these blocks don't +occur often enough, and you have more blocks + +0:18:27.670 --> 0:18:28.715 +that occur. + +0:18:29.249 --> 0:18:34.400 +And that's why yeah we can work with blocks +for smaller like software blocks. + +0:18:34.714 --> 0:18:38.183 +Work with neural models. + +0:18:38.183 --> 0:18:50.533 +Then you can work on letters so you have a +system which tries to understand the sentence + +0:18:50.533 --> 0:18:53.031 +letter by letter. + +0:18:53.313 --> 0:18:57.608 +But that is a design decision which you have +to take at some point. + +0:18:57.608 --> 0:19:03.292 +On which level do you want to split your text +and that of the evasive blocks that you are + +0:19:03.292 --> 0:19:04.176 +working with? + +0:19:04.176 --> 0:19:06.955 +And that's something we'll look into today. + +0:19:06.955 --> 0:19:08.471 +What possibilities are? + +0:19:12.572 --> 0:19:14.189 +Any question. 
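For the subword level mentioned earlier as a middle ground between words and letters, byte-pair encoding is the standard construction: start from characters and repeatedly merge the most frequent adjacent symbol pair. A toy sketch of learning the merges follows; real implementations (e.g. subword-nmt, SentencePiece) differ in details such as tie-breaking and end-of-word handling.

```python
# Toy sketch of BPE-style subword learning over a word-frequency table.
from collections import Counter

def merge_word(symbols, pair, merged):
    out, i = [], 0
    while i < len(symbols):
        if i < len(symbols) - 1 and (symbols[i], symbols[i + 1]) == pair:
            out.append(merged)
            i += 2
        else:
            out.append(symbols[i])
            i += 1
    return tuple(out)

def learn_bpe(word_freqs, num_merges):
    # Each word starts as a sequence of characters plus an end-of-word marker.
    vocab = {tuple(w) + ("</w>",): f for w, f in word_freqs.items()}
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for symbols, freq in vocab.items():
            for pair in zip(symbols, symbols[1:]):
                pairs[pair] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)        # most frequent adjacent pair
        merges.append(best)
        vocab = {merge_word(s, best, "".join(best)): f for s, f in vocab.items()}
    return merges

print(learn_bpe({"low": 5, "lower": 2, "newest": 6, "widest": 3}, 5))
```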
+ +0:19:17.998 --> 0:19:24.456 +Then let's look a bit on what type of data +there is in how much data there is to person. + +0:19:24.824 --> 0:19:34.006 +Is that nowadays, at least for pure text, +it's no longer for some language. + +0:19:34.006 --> 0:19:38.959 +There is so much data we cannot process. + +0:19:39.479 --> 0:19:49.384 +That is only true for some languages, but +there is also interest in other languages and + +0:19:49.384 --> 0:19:50.622 +important. + +0:19:50.810 --> 0:20:01.483 +So if you want to build a system for Sweden +or for some dialect in other countries, then + +0:20:01.483 --> 0:20:02.802 +of course. + +0:20:03.103 --> 0:20:06.888 +Otherwise you have this huge amount of hair. + +0:20:06.888 --> 0:20:11.515 +We are often no longer taking about gigabytes +or more. + +0:20:11.891 --> 0:20:35.788 +The general information that is produced every +year is: And this is like all the information + +0:20:35.788 --> 0:20:40.661 +that are available in the, so there are really. + +0:20:41.001 --> 0:20:44.129 +We look at machine translation. + +0:20:44.129 --> 0:20:53.027 +We can see these numbers are really like more +than ten years old, but we see this increase + +0:20:53.027 --> 0:20:58.796 +in one billion works we had at that time for +English data. + +0:20:59.019 --> 0:21:01.955 +Then I wore like new shuffle on Google Maps +and stuff. + +0:21:02.382 --> 0:21:05.003 +For this one you could train your system on. + +0:21:05.805 --> 0:21:20.457 +And the interesting thing is this one billion +words is more than any human typically speaks. + +0:21:21.001 --> 0:21:25.892 +So these systems they see by now like a magnitude +of more data. + +0:21:25.892 --> 0:21:32.465 +We know I think are a magnitude higher of +more data than a human has ever seen in his + +0:21:32.465 --> 0:21:33.229 +lifetime. + +0:21:35.175 --> 0:21:41.808 +And that is maybe the interesting thing why +it still doesn't work on it because you see + +0:21:41.808 --> 0:21:42.637 +they seem. + +0:21:43.103 --> 0:21:48.745 +So we are seeing a really impressive result, +but in most cases it's not that they're really + +0:21:48.745 --> 0:21:49.911 +better than human. + +0:21:50.170 --> 0:21:56.852 +However, they really have seen more data than +any human ever has seen in this lifetime. + +0:21:57.197 --> 0:22:01.468 +They can just process so much data, so. + +0:22:01.501 --> 0:22:08.425 +The question is, can we make them more efficient +so that they can learn similarly good without + +0:22:08.425 --> 0:22:09.592 +that much data? + +0:22:09.592 --> 0:22:16.443 +And that is essential if we now go to Lawrence's +languages where we might never get that much + +0:22:16.443 --> 0:22:21.254 +data, and we should be also able to achieve +a reasonable perform. + +0:22:23.303 --> 0:22:32.399 +On the other hand, this of course links also +to one topic which we will cover later: If + +0:22:32.399 --> 0:22:37.965 +you think about this, it's really important +that your algorithms are also very efficient + +0:22:37.965 --> 0:22:41.280 +in order to process that much data both in +training. + +0:22:41.280 --> 0:22:46.408 +If you have more data, you want to process +more data so you can make use of that. + +0:22:46.466 --> 0:22:54.499 +On the other hand, if more and more data is +processed, more and more people will use machine + +0:22:54.499 --> 0:23:06.816 +translation to generate translations, and it +will be important to: And there is yeah, there + +0:23:06.816 --> 0:23:07.257 +is. + +0:23:07.607 --> 0:23:10.610 +More. 
+ +0:23:10.170 --> 0:23:17.262 +More data generated every day, we hear just +some general numbers on how much data there + +0:23:17.262 --> 0:23:17.584 +is. + +0:23:17.584 --> 0:23:24.595 +It says that a lot of the data we produce +at least at the moment is text rich, so text + +0:23:24.595 --> 0:23:26.046 +that is produced. + +0:23:26.026 --> 0:23:29.748 +That is very important to either wise. + +0:23:29.748 --> 0:23:33.949 +We can use it as training data in some way. + +0:23:33.873 --> 0:23:40.836 +That we want to translate some of that because +it might not be published in all the languages, + +0:23:40.836 --> 0:23:46.039 +and step with the need for machine translation +is even more important. + +0:23:47.907 --> 0:23:51.547 +So what are the challenges with this? + +0:23:51.831 --> 0:24:01.360 +So first of all that seems to be very good +news, so there is more and more data, so we + +0:24:01.360 --> 0:24:10.780 +can just wait for three years and have more +data, and then our system will be better. + +0:24:11.011 --> 0:24:22.629 +If you see in competitions, the system performance +increases. + +0:24:24.004 --> 0:24:27.190 +See that here are three different systems. + +0:24:27.190 --> 0:24:34.008 +Blue score is metric to measure how good an +empty system is and we'll talk about evaluation + +0:24:34.008 --> 0:24:40.974 +and the next week so you'll have to evaluate +machine validation and also a practical session. + +0:24:41.581 --> 0:24:45.219 +And so. + +0:24:44.784 --> 0:24:50.960 +This shows you that this is like how much +data of the training data you have five percent. + +0:24:50.960 --> 0:24:56.117 +You're significantly worse than if you're +forty percent and eighty percent. + +0:24:56.117 --> 0:25:02.021 +You're getting better and you're seeing two +between this curve, which maybe not really + +0:25:02.021 --> 0:25:02.971 +flattens out. + +0:25:02.971 --> 0:25:03.311 +But. + +0:25:03.263 --> 0:25:07.525 +Of course, the gains you get are normally +smaller and smaller. + +0:25:07.525 --> 0:25:09.216 +The more data you have,. + +0:25:09.549 --> 0:25:21.432 +If your improvements are unnormally better, +if you add the same thing or even double your + +0:25:21.432 --> 0:25:25.657 +data late, of course more data. + +0:25:26.526 --> 0:25:34.955 +However, you see the clear tendency if you +need to improve your system. + +0:25:34.955 --> 0:25:38.935 +This is possible by just getting. + +0:25:39.039 --> 0:25:41.110 +But it's not all about data. + +0:25:41.110 --> 0:25:45.396 +It can also be the domain of the day that +there's building. + +0:25:45.865 --> 0:25:55.668 +So this was a test on machine translation +system on translating genome data. + +0:25:55.668 --> 0:26:02.669 +We have the like SAI said he's working on +translating. + +0:26:02.862 --> 0:26:06.868 +Here you see the performance began with GreenScore. + +0:26:06.868 --> 0:26:12.569 +You see one system which only was trained +on genome data and it only has. + +0:26:12.812 --> 0:26:17.742 +That's very, very few for machine translation. + +0:26:18.438 --> 0:26:23.927 +And to compare that to a system which was +generally trained on used translation data. + +0:26:24.104 --> 0:26:34.177 +With four point five million sentences so +roughly one hundred times as much data you + +0:26:34.177 --> 0:26:40.458 +still see that this system doesn't really work +well. + +0:26:40.820 --> 0:26:50.575 +So you see it's not only about data, it's +also that the data has to somewhat fit to the + +0:26:50.575 --> 0:26:51.462 +domain. 
+ +0:26:51.831 --> 0:26:58.069 +The more general data you get that you have +covered up all domains. + +0:26:58.418 --> 0:27:07.906 +But that's very difficult and especially for +more specific domains. + +0:27:07.906 --> 0:27:16.696 +It can be really important to get data which +fits your domain. + +0:27:16.716 --> 0:27:18.520 +Maybe if you can do some very much broccoli +or something like that, maybe if you. + +0:27:18.598 --> 0:27:22.341 +To say okay, concentrate this as you like +for being at better. + +0:27:24.564 --> 0:27:28.201 +It's not that easy to prompt it. + +0:27:28.201 --> 0:27:35.807 +You can do the prompting in the more traditional +way of fine tuning. + +0:27:35.807 --> 0:27:44.514 +Then, of course, if you select UIV later combine +this one, you can get better. + +0:27:44.904 --> 0:27:52.675 +But it will always be that this type of similar +data is much more important than the general. + +0:27:52.912 --> 0:28:00.705 +So of course it can make the lower system +a lot better if you search for similar data + +0:28:00.705 --> 0:28:01.612 +and find. + +0:28:02.122 --> 0:28:08.190 +Will have a lecture on domain adaptation where +it's exactly the idea how you can make systems + +0:28:08.190 --> 0:28:13.935 +in these situations better so you can adapt +it to this data but then you still need this + +0:28:13.935 --> 0:28:14.839 +type of data. + +0:28:15.335 --> 0:28:21.590 +And in prompting it might work if you have +seen it in your data so it can make the system + +0:28:21.590 --> 0:28:25.134 +aware and tell it focus more in this type of +data. + +0:28:25.465 --> 0:28:30.684 +But if you haven't had enough of the really +specific good matching data, I think it will + +0:28:30.684 --> 0:28:31.681 +always not work. + +0:28:31.681 --> 0:28:37.077 +So you need to have this type of data and +therefore it's important not only to have general + +0:28:37.077 --> 0:28:42.120 +data but also data, at least in your overall +system, which really fits to the domain. + +0:28:45.966 --> 0:28:53.298 +And then the second thing, of course, is you +need to have data that has good quality. + +0:28:53.693 --> 0:29:00.170 +In the early stages it might be good to have +all the data but later it's especially important + +0:29:00.170 --> 0:29:06.577 +that you have somehow good quality and so that +you're learning what you really want to learn + +0:29:06.577 --> 0:29:09.057 +and not learning some great things. + +0:29:10.370 --> 0:29:21.551 +We talked about this with the kilometers and +miles, so if you just take in some type of + +0:29:21.551 --> 0:29:26.253 +data and don't look at the quality,. + +0:29:26.766 --> 0:29:30.875 +But of course, the question here is what is +good quality data? + +0:29:31.331 --> 0:29:35.054 +It is not yet that easy to define what is +a good quality data. + +0:29:36.096 --> 0:29:43.961 +That doesn't mean it has to what people generally +assume as high quality text or so, like written + +0:29:43.961 --> 0:29:47.814 +by a Nobel Prize winner or something like that. + +0:29:47.814 --> 0:29:54.074 +This is not what we mean by this quality, +but again the most important again. + +0:29:54.354 --> 0:30:09.181 +So if you have Twitter data, high quality +data doesn't mean you have now some novels. + +0:30:09.309 --> 0:30:12.875 +Test data, but it should also be represented +similarly. + +0:30:12.875 --> 0:30:18.480 +Don't have, for example, quality definitely +as it should be really translating yourself + +0:30:18.480 --> 0:30:18.862 +into. 
+ +0:30:19.199 --> 0:30:25.556 +So especially if you corral data you would +often have that it's not a direct translation. + +0:30:25.805 --> 0:30:28.436 +So then, of course, this is not high quality +teaching. + +0:30:29.449 --> 0:30:39.974 +But in generally that's a very difficult thing +to, and it's very difficult to design what + +0:30:39.974 --> 0:30:41.378 +is reading. + +0:30:41.982 --> 0:30:48.333 +And of course a biometric is always the quality +of your data is good if your machine translation. + +0:30:48.648 --> 0:30:50.719 +So that is like the indirect. + +0:30:50.991 --> 0:30:52.447 +Well, what can we motive? + +0:30:52.447 --> 0:30:57.210 +Of course, it's difficult to always try a +lot of things and evaluate either of them, + +0:30:57.210 --> 0:30:59.396 +build a full MP system and then check. + +0:30:59.396 --> 0:31:00.852 +Oh, was this a good idea? + +0:31:00.852 --> 0:31:01.357 +I mean,. + +0:31:01.581 --> 0:31:19.055 +You have two tokenizers who like split sentences +and the words you really want to apply. + +0:31:19.179 --> 0:31:21.652 +Now you could maybe argue or your idea could +be. + +0:31:21.841 --> 0:31:30.186 +Just take it there very fast and then get +the result, but the problem is there is not + +0:31:30.186 --> 0:31:31.448 +always this. + +0:31:31.531 --> 0:31:36.269 +One thing that works very well for small data. + +0:31:36.269 --> 0:31:43.123 +It's not for sure that the same effect will +happen in large stages. + +0:31:43.223 --> 0:31:50.395 +This idea really improves on very low resource +data if only train on hundred words. + +0:31:51.271 --> 0:31:58.357 +But if you use it for a large data set, it +doesn't really matter and all your ideas not. + +0:31:58.598 --> 0:32:01.172 +So that is also a typical thing. + +0:32:01.172 --> 0:32:05.383 +This quality issue is more and more important +if you. + +0:32:06.026 --> 0:32:16.459 +By one motivation which generally you should +have, you want to represent your data in having + +0:32:16.459 --> 0:32:17.469 +as many. + +0:32:17.677 --> 0:32:21.805 +Why is this the case any idea? + +0:32:21.805 --> 0:32:33.389 +Why this could be a motivation that we try +to represent the data in a way that we have + +0:32:33.389 --> 0:32:34.587 +as many. + +0:32:38.338 --> 0:32:50.501 +We also want to learn about the fun text because +maybe sometimes some grows in the fun text. + +0:32:52.612 --> 0:32:54.020 +The context is here. + +0:32:54.020 --> 0:32:56.432 +It's more about the learning first. + +0:32:56.432 --> 0:33:00.990 +You can generally learn better if you've seen +something more often. + +0:33:00.990 --> 0:33:06.553 +So if you have seen an event only once, it's +really hard to learn about the event. + +0:33:07.107 --> 0:33:15.057 +If you have seen an event a hundred times +your bearing estimating which and maybe that + +0:33:15.057 --> 0:33:18.529 +is the context, then you can use the. + +0:33:18.778 --> 0:33:21.331 +So, for example, if you here have the word +towels. + +0:33:21.761 --> 0:33:28.440 +If you would just take the data normally you +would directly process the data. + +0:33:28.440 --> 0:33:32.893 +In the upper case you would the house with +the dog. + +0:33:32.893 --> 0:33:40.085 +That's a different word than the house this +way and then the house with the common. + +0:33:40.520 --> 0:33:48.365 +So you want to learn how this translates into +house, but you translate an upper case. + +0:33:48.365 --> 0:33:50.281 +How this translates. 
+ +0:33:50.610 --> 0:33:59.445 +You were learning how to translate into house +and house, so you have to learn four different + +0:33:59.445 --> 0:34:00.205 +things. + +0:34:00.205 --> 0:34:06.000 +Instead, we really want to learn that house +gets into house. + +0:34:06.366 --> 0:34:18.796 +And then imagine if it would be even a beak, +it might be like here a house would be into. + +0:34:18.678 --> 0:34:22.089 +Good-bye Then. + +0:34:22.202 --> 0:34:29.512 +If it's an upper case then I always have to +translate it into a boiler while it's a lower + +0:34:29.512 --> 0:34:34.955 +case that is translated into house and that's +of course not right. + +0:34:34.955 --> 0:34:39.260 +We have to use the context to decide what +is better. + +0:34:39.679 --> 0:34:47.086 +If you have seen an event several times then +you are better able to learn your model and + +0:34:47.086 --> 0:34:51.414 +that doesn't matter what type of learning you +have. + +0:34:52.392 --> 0:34:58.981 +I shouldn't say all but for most of these +models it's always better to have like seen + +0:34:58.981 --> 0:35:00.897 +an event war more often. + +0:35:00.920 --> 0:35:11.483 +Therefore, if you preprocessive data, you +should ask the question how can represent data + +0:35:11.483 --> 0:35:14.212 +in order to have seen. + +0:35:14.514 --> 0:35:17.885 +Of course you should not remove that information. + +0:35:18.078 --> 0:35:25.519 +So you could now, of course, just lowercase +everything. + +0:35:25.519 --> 0:35:30.303 +Then you've seen things more often. + +0:35:30.710 --> 0:35:38.443 +And that might be an issue because in the +final application you want to have real text + +0:35:38.443 --> 0:35:38.887 +and. + +0:35:40.440 --> 0:35:44.003 +And finally, even it's more important than +it's consistent. + +0:35:44.965 --> 0:35:52.630 +So this is a problem where, for example, aren't +consistent. + +0:35:52.630 --> 0:35:58.762 +So I am, I'm together written in training +data. + +0:35:58.762 --> 0:36:04.512 +And if you're not in test data, have a high. + +0:36:04.824 --> 0:36:14.612 +Therefore, most important is to generate preprocessing +and represent your data that is most consistent + +0:36:14.612 --> 0:36:18.413 +because it's easier to map how similar. + +0:36:18.758 --> 0:36:26.588 +If your text is represented very, very differently +then your data will be badly be translated. + +0:36:26.666 --> 0:36:30.664 +So we once had the case. + +0:36:30.664 --> 0:36:40.420 +For example, there is some data who wrote +it, but in German. + +0:36:40.900 --> 0:36:44.187 +And if you read it as a human you see it. + +0:36:44.187 --> 0:36:49.507 +It's even hard to get the difference because +it looks very similar. + +0:36:50.130 --> 0:37:02.997 +If you use it for a machine translation system, +it would not be able to translate anything + +0:37:02.997 --> 0:37:08.229 +of it because it's a different word. + +0:37:09.990 --> 0:37:17.736 +And especially on the other hand you should +of course not rechange significant training + +0:37:17.736 --> 0:37:18.968 +data thereby. + +0:37:18.968 --> 0:37:27.155 +For example, removing case information because +if your task is to generate case information. + +0:37:31.191 --> 0:37:41.081 +One thing which is a bit point to look into +it in order to see the difficulty of your data + +0:37:41.081 --> 0:37:42.711 +is to compare. + +0:37:43.103 --> 0:37:45.583 +There are types. + +0:37:45.583 --> 0:37:57.983 +We mean the number of unique words in the +corpus, so your vocabulary and the tokens. 
+ +0:37:58.298 --> 0:38:08.628 +And then you can look at the type token ratio +that means a number of types per token. + +0:38:15.815 --> 0:38:22.381 +Have less types than tokens because every +word appears at least in the corpus, but most + +0:38:22.381 --> 0:38:27.081 +of them will occur more often until this number +is bigger, so. + +0:38:27.667 --> 0:38:30.548 +And of course this changes if you have more +date. + +0:38:31.191 --> 0:38:38.103 +Here is an example from an English Wikipedia. + +0:38:38.103 --> 0:38:45.015 +That means each word in average occurs times. + +0:38:45.425 --> 0:38:47.058 +Of course there's a big difference. + +0:38:47.058 --> 0:38:51.323 +There will be some words which occur one hundred +times, but therefore most of the words occur + +0:38:51.323 --> 0:38:51.777 +only one. + +0:38:52.252 --> 0:38:55.165 +However, you see this ratio goes down. + +0:38:55.165 --> 0:39:01.812 +That's a good thing, so you have seen each +word more often and therefore your model gets + +0:39:01.812 --> 0:39:03.156 +typically better. + +0:39:03.156 --> 0:39:08.683 +However, the problem is we always have a lot +of words which we have seen. + +0:39:09.749 --> 0:39:15.111 +Even here there will be a bound of words which +you have only seen once. + +0:39:15.111 --> 0:39:20.472 +However, this can give you an indication about +the quality of the data. + +0:39:20.472 --> 0:39:27.323 +So you should always, of course, try to achieve +data where you have a very low type to talk + +0:39:27.323 --> 0:39:28.142 +and ratio. + +0:39:28.808 --> 0:39:39.108 +For example, if you compare, simplify and +not only Wikipedia, what would be your expectation? + +0:39:41.861 --> 0:39:49.842 +Yes, that's exactly, but however it's surprisingly +only a little bit lower, but you see that it's + +0:39:49.842 --> 0:39:57.579 +lower, so we are using less words to express +the same thing, and therefore the task to produce + +0:39:57.579 --> 0:39:59.941 +this text is also a gesture. + +0:40:01.221 --> 0:40:07.702 +However, as how many words are there, there +is no clear definition. + +0:40:07.787 --> 0:40:19.915 +So there will be always more words, especially +depending on your dataset, how many different + +0:40:19.915 --> 0:40:22.132 +words there are. + +0:40:22.482 --> 0:40:30.027 +So if you have million tweets where around +fifty million tokens and you have six hundred + +0:40:30.027 --> 0:40:30.875 +thousand. + +0:40:31.251 --> 0:40:40.299 +If you have times this money teen tweeds you +also have significantly more tokens but also. + +0:40:40.660 --> 0:40:58.590 +So especially in things like the social media, +of course, there's always different types of + +0:40:58.590 --> 0:40:59.954 +words. + +0:41:00.040 --> 0:41:04.028 +Another example from not social media is here. + +0:41:04.264 --> 0:41:18.360 +So yeah, there is a small liter sandwich like +phone conversations, two million tokens, and + +0:41:18.360 --> 0:41:22.697 +only twenty thousand words. + +0:41:23.883 --> 0:41:37.221 +If you think about Shakespeare, it has even +less token, significantly less than a million, + +0:41:37.221 --> 0:41:40.006 +but the number of. + +0:41:40.060 --> 0:41:48.781 +On the other hand, there is this Google Engron +corpus which has tokens and there is always + +0:41:48.781 --> 0:41:50.506 +new words coming. + +0:41:50.991 --> 0:41:52.841 +Is English. + +0:41:52.841 --> 0:42:08.103 +The nice thing about English is that the vocabulary +is relatively small, too small, but relatively + +0:42:08.103 --> 0:42:09.183 +small. 
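A small sketch of the type/token statistics discussed above, on an invented toy corpus. It also shows how inconsistent casing alone inflates the number of types, which connects back to the earlier point about representing the data consistently:

```python
# Sketch: types = unique words, tokens = running words. The type/token ratio
# drops when the same word is written consistently (here: lowercased).
from collections import Counter

corpus = "The house is old . I saw the house near the station .".split()

def stats(tokens):
    types = Counter(tokens)
    return len(types), len(tokens), len(types) / len(tokens)

print("raw:        %d types / %d tokens = %.2f" % stats(corpus))
print("lowercased: %d types / %d tokens = %.2f" % stats([t.lower() for t in corpus]))
# On this toy corpus the raw text has 10 types, the lowercased text only 9,
# because "The" and "the" are no longer counted as different words.
```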
+ +0:42:09.409 --> 0:42:14.224 +So here you see the Ted Corpus here. + +0:42:15.555 --> 0:42:18.144 +All know Ted's lectures. + +0:42:18.144 --> 0:42:26.429 +They are transcribed, translated, not a source +for us, especially small crocus. + +0:42:26.846 --> 0:42:32.702 +You can do a lot of experiments with that +and you see that the corpus site is relatively + +0:42:32.702 --> 0:42:36.782 +similar so we have around four million tokens +in this corpus. + +0:42:36.957 --> 0:42:44.464 +However, if you look at the vocabulary, English +has half as many words in their different words + +0:42:44.464 --> 0:42:47.045 +as German and Dutch and Italian. + +0:42:47.527 --> 0:42:56.260 +So this is one influence from positional works +like which are more frequent in German, the + +0:42:56.260 --> 0:43:02.978 +more important since we have all these different +morphological forms. + +0:43:03.263 --> 0:43:08.170 +There all leads to new words and they need +to be somewhat expressed in there. + +0:43:11.531 --> 0:43:20.278 +So to deal with this, the question is how +can we normalize the text in order to make + +0:43:20.278 --> 0:43:22.028 +the text easier? + +0:43:22.028 --> 0:43:25.424 +Can we simplify the task easier? + +0:43:25.424 --> 0:43:29.231 +But we need to keep all information. + +0:43:29.409 --> 0:43:32.239 +So an example where not all information skipped. + +0:43:32.239 --> 0:43:35.012 +Of course you make the task easier if you +just. + +0:43:35.275 --> 0:43:41.141 +You don't have to deal with different cases. + +0:43:41.141 --> 0:43:42.836 +It's easier. + +0:43:42.836 --> 0:43:52.482 +However, information gets lost and you might +need to generate the target. + +0:43:52.832 --> 0:44:00.153 +So the question is always: How can we on the +one hand simplify the task but keep all the + +0:44:00.153 --> 0:44:01.223 +information? + +0:44:01.441 --> 0:44:06.639 +Say necessary because it depends on the task. + +0:44:06.639 --> 0:44:11.724 +For some tasks you might find to remove the. + +0:44:14.194 --> 0:44:23.463 +So the steps they were typically doing are +that you can the segment and words in a running + +0:44:23.463 --> 0:44:30.696 +text, so you can normalize word forms and segmentation +into sentences. + +0:44:30.696 --> 0:44:33.955 +Also, if you have not a single. + +0:44:33.933 --> 0:44:38.739 +If this is not a redundancy point to segments, +the text is also into segments. + +0:44:39.779 --> 0:44:52.609 +So what are we doing there for European language +segmentation into words? + +0:44:52.609 --> 0:44:57.290 +It's not that complicated. + +0:44:57.277 --> 0:45:06.001 +You have to somehow handle the joint words +and by handling joint words the most important. + +0:45:06.526 --> 0:45:11.331 +So in most systems it really doesn't matter +much. + +0:45:11.331 --> 0:45:16.712 +If you write, I'm together as one word or +as two words. + +0:45:17.197 --> 0:45:23.511 +The nice thing about iron is maybe this is +so often that it doesn't matter if you both + +0:45:23.511 --> 0:45:26.560 +and if they're both accrued often enough. + +0:45:26.560 --> 0:45:32.802 +But you'll have some of these cases where +they don't occur there often, so you should + +0:45:32.802 --> 0:45:35.487 +have more as consistent as possible. + +0:45:36.796 --> 0:45:41.662 +But of course things can get more complicated. + +0:45:41.662 --> 0:45:48.598 +If you have Finland capital, do you want to +split the ends or not? + +0:45:48.598 --> 0:45:53.256 +Isn't you split or do you even write it out? 
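As a small illustration of the word segmentation decisions mentioned above (separating punctuation, and choosing whether a clitic like "I'm" stays one token or becomes two), here is a toy regex tokenizer; real tokenizers handle many more cases:

```python
# Sketch: a tiny regex word tokenizer that splits off punctuation and exposes
# the design decision for clitics such as "I'm" (one token or two).
import re

def tokenize(text, split_clitics=False):
    if split_clitics:
        text = re.sub(r"(\w)'(\w)", r"\1 '\2", text)   # I'm -> I 'm
    text = re.sub(r"([.,!?;:])", r" \1 ", text)        # separate punctuation
    return text.split()

print(tokenize("I'm sure this works, mostly."))
# ["I'm", 'sure', 'this', 'works', ',', 'mostly', '.']
print(tokenize("I'm sure this works, mostly.", split_clitics=True))
# ['I', "'m", 'sure', 'this', 'works', ',', 'mostly', '.']
```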
+ +0:45:53.433 --> 0:46:00.468 +And what about like things with hyphens in +the middle and so on? + +0:46:00.540 --> 0:46:07.729 +So there is not everything is very easy, but +is generally possible to somewhat keep as. + +0:46:11.791 --> 0:46:25.725 +Sometimes the most challenging and traditional +systems were compounds, or how to deal with + +0:46:25.725 --> 0:46:28.481 +things like this. + +0:46:28.668 --> 0:46:32.154 +The nice thing is, as said, will come to the +later. + +0:46:32.154 --> 0:46:34.501 +Nowadays we typically use subword. + +0:46:35.255 --> 0:46:42.261 +Unit, so we don't have to deal with this in +the preprocessing directly, but in the subword + +0:46:42.261 --> 0:46:47.804 +splitting we're doing it, and then we can learn +how to best spit these. + +0:46:52.392 --> 0:46:56.974 +Things Get More Complicated. + +0:46:56.977 --> 0:46:59.934 +About non European languages. + +0:46:59.934 --> 0:47:08.707 +Because in non European languages, not all +of them, there is no space between the words. + +0:47:09.029 --> 0:47:18.752 +Nowadays you can also download word segmentation +models where you put in the full sentence and + +0:47:18.752 --> 0:47:22.744 +then it's getting splitted into parts. + +0:47:22.963 --> 0:47:31.814 +And then, of course, it's even that you have +different writing systems, sometimes in Japanese. + +0:47:31.814 --> 0:47:40.385 +For example, they have these katakana, hiragana +and kanji symbols in there, and you have to + +0:47:40.385 --> 0:47:42.435 +some idea with these. + +0:47:49.669 --> 0:47:54.560 +To the, the next thing is can reduce some +normalization. + +0:47:54.874 --> 0:48:00.376 +So the idea is that you map several words +onto the same. + +0:48:00.460 --> 0:48:07.877 +And that is test dependent, and the idea is +to define something like acronym classes so + +0:48:07.877 --> 0:48:15.546 +that words, which have the same meaning where +it's not in order to have the difference, to + +0:48:15.546 --> 0:48:19.423 +map onto the same thing in order to make the. + +0:48:19.679 --> 0:48:27.023 +The most important thing is there about tasing, +and then there is something like sometimes + +0:48:27.023 --> 0:48:27.508 +word. + +0:48:28.048 --> 0:48:37.063 +For casing you can do two things and then +depend on the task. + +0:48:37.063 --> 0:48:44.769 +You can lowercase everything, maybe some exceptions. + +0:48:45.045 --> 0:48:47.831 +For the target side, it should normally it's +normally not done. + +0:48:48.188 --> 0:48:51.020 +Why is it not done? + +0:48:51.020 --> 0:48:56.542 +Why should you only do it for suicide? + +0:48:56.542 --> 0:49:07.729 +Yes, so you have to generate correct text +instead of lower case and uppercase. + +0:49:08.848 --> 0:49:16.370 +Nowadays to be always do true casing on both +sides, also on the sewer side, that means you + +0:49:16.370 --> 0:49:17.610 +keep the case. + +0:49:17.610 --> 0:49:24.966 +The only thing where people try to work on +or sometimes do that is that at the beginning + +0:49:24.966 --> 0:49:25.628 +of the. + +0:49:25.825 --> 0:49:31.115 +For words like this, this is not that important +because you will have seen otherwise a lot + +0:49:31.115 --> 0:49:31.696 +of times. + +0:49:31.696 --> 0:49:36.928 +But if you know have rare words, which you +only have seen maybe three times, and you have + +0:49:36.928 --> 0:49:42.334 +only seen in the middle of the sentence, and +now it occurs at the beginning of the sentence, + +0:49:42.334 --> 0:49:45.763 +which is upper case, then you don't know how +to deal with. 
+ +0:49:46.146 --> 0:49:50.983 +So then it might be good to do a true casing. + +0:49:50.983 --> 0:49:56.241 +That means you recase each word on the beginning. + +0:49:56.576 --> 0:49:59.830 +The only question, of course, is how do you +recase it? + +0:49:59.830 --> 0:50:01.961 +So what case would you always know? + +0:50:02.162 --> 0:50:18.918 +Word of the senders, or do you have a better +solution, especially not English, maybe German. + +0:50:18.918 --> 0:50:20.000 +It's. + +0:50:25.966 --> 0:50:36.648 +The fancy solution would be to count hope +and decide based on this, the unfancy running + +0:50:36.648 --> 0:50:43.147 +would: Think it's not really good because most +of the cane boards are lower paced. + +0:50:43.683 --> 0:50:53.657 +That is one idea to count and definitely better +because as a word more often occurs upper case. + +0:50:53.653 --> 0:50:57.934 +Otherwise you only have a lower case at the +beginning where you have again. + +0:50:58.338 --> 0:51:03.269 +Haven't gained anything, you can make it even +a bit better when counting. + +0:51:03.269 --> 0:51:09.134 +You're ignoring the first position so that +you don't count the word beginning and yeah, + +0:51:09.134 --> 0:51:12.999 +that's typically how it's done to do this type +of casing. + +0:51:13.273 --> 0:51:23.907 +And that's the easy thing you can't even use +like then bygram teachers who work pairs. + +0:51:23.907 --> 0:51:29.651 +There's very few words which occur more often. + +0:51:29.970 --> 0:51:33.163 +It's OK to have them boast because you can +otherwise learn it. + +0:51:36.376 --> 0:51:52.305 +Another thing about these classes is to use +word classes that were partly done, for example, + +0:51:52.305 --> 0:51:55.046 +and more often. + +0:51:55.375 --> 0:51:57.214 +Ten Thousand One Hundred Books. + +0:51:57.597 --> 0:52:07.397 +And then for an system that might not be important +you can do something at number books. + +0:52:07.847 --> 0:52:16.450 +However, you see here already that it's not +that easy because if you have one book you + +0:52:16.450 --> 0:52:19.318 +don't have to do with a pro. + +0:52:20.020 --> 0:52:21.669 +Always be careful. + +0:52:21.669 --> 0:52:28.094 +It's very fast to ignore some exceptions and +make more things worse than. + +0:52:28.488 --> 0:52:37.879 +So it's always difficult to decide when to +do this and when to better not do it and keep + +0:52:37.879 --> 0:52:38.724 +things. + +0:52:43.483 --> 0:52:56.202 +Then the next step is sentence segmentation, +so we are typically working on sentences. + +0:52:56.476 --> 0:53:11.633 +However, dots things are a bit more complicated, +so you can do a bit more. + +0:53:11.731 --> 0:53:20.111 +You can even have some type of classifier +with features by then generally. + +0:53:20.500 --> 0:53:30.731 +Is not too complicated, so you can have different +types of classifiers to do that, but in generally. + +0:53:30.650 --> 0:53:32.537 +I Didn't Know It. + +0:53:33.393 --> 0:53:35.583 +It's not a super complicated task. + +0:53:35.583 --> 0:53:39.461 +There are nowadays also a lot of libraries +which you can use. + +0:53:39.699 --> 0:53:45.714 +To do that normally if you're doing the normalization +beforehand that can be done there so you only + +0:53:45.714 --> 0:53:51.126 +split up the dot if it's like the sentence +boundary and otherwise you keep it to the word + +0:53:51.126 --> 0:53:54.194 +so you can do that a bit jointly with the segment. 
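The truecasing heuristic described above (count how each word is usually cased while ignoring the sentence-initial position, then re-case the first word of a sentence accordingly) can be sketched like this; toy corpus, deliberately simplified:

```python
# Sketch of the counting heuristic for truecasing: learn the most frequent
# casing of each word from non-initial positions, then re-case the first word
# of a sentence. Toy data, no handling of rare or unseen words beyond a fallback.
from collections import Counter, defaultdict

corpus = [
    "The house is near the station .",
    "We met Peter at the station .",
    "Peter likes the old house .",
]

case_counts = defaultdict(Counter)
for sentence in corpus:
    words = sentence.split()
    for word in words[1:]:                 # ignore the sentence-initial word
        case_counts[word.lower()][word] += 1

def truecase_first_word(sentence):
    words = sentence.split()
    counts = case_counts.get(words[0].lower())
    if counts:                             # pick the casing seen most often
        words[0] = counts.most_common(1)[0][0]
    return " ".join(words)

print(truecase_first_word("Peter was late ."))      # stays 'Peter'
print(truecase_first_word("The train was late .")) # becomes 'the'
```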
+ +0:53:54.634 --> 0:54:06.017 +It's something to think about to care because +it's where arrows happen. + +0:54:06.017 --> 0:54:14.712 +However, on the one end you can still do it +very well. + +0:54:14.834 --> 0:54:19.740 +You will never get data which is perfectly +clean and where everything is great. + +0:54:20.340 --> 0:54:31.020 +There's just too much data and it will never +happen, so therefore it's important to be aware + +0:54:31.020 --> 0:54:35.269 +of that during the full development. + +0:54:37.237 --> 0:54:42.369 +And one last thing about the preprocessing, +we'll get into the representation. + +0:54:42.369 --> 0:54:47.046 +If you're working on that, you'll get a friend +with regular expression. + +0:54:47.046 --> 0:54:50.034 +That's not only how you do all this matching. + +0:54:50.430 --> 0:55:03.811 +And if you look into the scripts of how to +deal with pancreation marks and stuff like + +0:55:03.811 --> 0:55:04.900 +that,. + +0:55:11.011 --> 0:55:19.025 +So if we have now the data of our next step +to build, the system is to represent our words. + +0:55:19.639 --> 0:55:27.650 +Before we start with this, any more questions +about preprocessing. + +0:55:27.650 --> 0:55:32.672 +While we work on the pure text, I'm sure. + +0:55:33.453 --> 0:55:40.852 +The idea is again to make things more simple +because if you think about the production mark + +0:55:40.852 --> 0:55:48.252 +at the beginning of a sentence, it might be +that you haven't seen the word or, for example, + +0:55:48.252 --> 0:55:49.619 +think of titles. + +0:55:49.619 --> 0:55:56.153 +In newspaper articles there's: So you then +have seen the word now in the title before, + +0:55:56.153 --> 0:55:58.425 +and the text you have never seen. + +0:55:58.898 --> 0:56:03.147 +But there is always the decision. + +0:56:03.123 --> 0:56:09.097 +Do I gain more because I've seen things more +often or do I lose because now I remove information + +0:56:09.097 --> 0:56:11.252 +which helps me to the same degree? + +0:56:11.571 --> 0:56:21.771 +Because if we, for example, do that in German +and remove the case, this might be an important + +0:56:21.771 --> 0:56:22.531 +issue. + +0:56:22.842 --> 0:56:30.648 +So there is not the perfect solution, but +generally you can get some arrows to make things + +0:56:30.648 --> 0:56:32.277 +look more similar. + +0:56:35.295 --> 0:56:43.275 +What you can do about products like the state +of the area or the trends that are more or + +0:56:43.275 --> 0:56:43.813 +less. + +0:56:44.944 --> 0:56:50.193 +It starts even less because models get more +powerful, so it's not that important, but be + +0:56:50.193 --> 0:56:51.136 +careful partly. + +0:56:51.136 --> 0:56:56.326 +It's also the evaluation thing because these +things which are problematic are happening + +0:56:56.326 --> 0:56:57.092 +very rarely. + +0:56:57.092 --> 0:57:00.159 +If you take average performance, it doesn't +matter. + +0:57:00.340 --> 0:57:06.715 +However, in between it's doing the stupid +mistakes that don't count on average, but they + +0:57:06.715 --> 0:57:08.219 +are not really good. + +0:57:09.089 --> 0:57:15.118 +Done you do some type of tokenization? + +0:57:15.118 --> 0:57:19.911 +You can do true casing or not. + +0:57:19.911 --> 0:57:28.723 +Some people nowadays don't do it, but that's +still done. + +0:57:28.948 --> 0:57:34.441 +Then it depends on who is a bit on the type +of domain. + +0:57:34.441 --> 0:57:37.437 +Again we have so translation. 
+ +0:57:37.717 --> 0:57:46.031 +So in the text sometimes there is mark in +the menu, later the shortcut. + +0:57:46.031 --> 0:57:49.957 +This letter is used for shortcut. + +0:57:49.957 --> 0:57:57.232 +You cannot mistake the word because it's no +longer a file but. + +0:57:58.018 --> 0:58:09.037 +Then you cannot deal with it, so then it might +make sense to remove this. + +0:58:12.032 --> 0:58:17.437 +Now the next step is how to match words into +numbers. + +0:58:17.437 --> 0:58:22.142 +Machine learning models deal with some digits. + +0:58:22.342 --> 0:58:27.091 +The first idea is to use words as our basic +components. + +0:58:27.247 --> 0:58:40.695 +And then you have a large vocabulary where +each word gets referenced to an indigenous. + +0:58:40.900 --> 0:58:49.059 +So your sentence go home is now and that is +your set. + +0:58:52.052 --> 0:59:00.811 +So the nice thing is you have very short sequences +so that you can deal with them. + +0:59:00.811 --> 0:59:01.867 +However,. + +0:59:01.982 --> 0:59:11.086 +So you have not really understood how words +are processed. + +0:59:11.086 --> 0:59:16.951 +Why is this or can that be a problem? + +0:59:17.497 --> 0:59:20.741 +And there is an easy solution to deal with +unknown words. + +0:59:20.741 --> 0:59:22.698 +You just have one token, which is. + +0:59:23.123 --> 0:59:25.906 +Worrying in maybe some railroads in your training +day, do you deal? + +0:59:26.206 --> 0:59:34.938 +That's working a bit for some province, but +in general it's not good because you know nothing + +0:59:34.938 --> 0:59:35.588 +about. + +0:59:35.895 --> 0:59:38.770 +Can at least deal with this and maybe map +it. + +0:59:38.770 --> 0:59:44.269 +So an easy solution in machine translation +is always if it's an unknown word or we just + +0:59:44.269 --> 0:59:49.642 +copy it to the target side because unknown +words are often named entities and in many + +0:59:49.642 --> 0:59:52.454 +languages the good solution is just to keep. + +0:59:53.013 --> 1:00:01.203 +So that is somehow a trick, trick, but yeah, +that's of course not a good thing. + +1:00:01.821 --> 1:00:08.959 +It's also a problem if you deal with full +words is that you have very few examples for + +1:00:08.959 --> 1:00:09.451 +some. + +1:00:09.949 --> 1:00:17.696 +And of course if you've seen a word once you +can, someone may be translated, but we will + +1:00:17.696 --> 1:00:24.050 +learn that in your networks you represent words +with continuous vectors. + +1:00:24.264 --> 1:00:26.591 +You have seen them two, three or four times. + +1:00:26.591 --> 1:00:31.246 +It is not really well learned, and you are +typically doing most Arabs and words with your + +1:00:31.246 --> 1:00:31.763 +crow rap. + +1:00:33.053 --> 1:00:40.543 +And yeah, you cannot deal with things which +are inside the world. + +1:00:40.543 --> 1:00:50.303 +So if you know that houses set one hundred +and twelve and you see no houses, you have + +1:00:50.303 --> 1:00:51.324 +no idea. + +1:00:51.931 --> 1:00:55.533 +Of course, not really convenient, so humans +are better. + +1:00:55.533 --> 1:00:58.042 +They can use the internal information. + +1:00:58.498 --> 1:01:04.080 +So if we have houses you'll know that it's +like the bluer form of house. + +1:01:05.285 --> 1:01:16.829 +And for the ones who weren't in advance, ay, +you have this night worth here and guess. + +1:01:16.716 --> 1:01:20.454 +Don't know the meaning of these words. + +1:01:20.454 --> 1:01:25.821 +However, all of you will know is the fear +of something. 
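A minimal sketch of the word-to-index mapping just described, with the single unknown-word token used for anything outside the vocabulary (toy data):

```python
# Sketch: map words to integer ids; anything unseen maps to a single <unk> id.
UNK = "<unk>"

train_tokens = ["i", "go", "home", "he", "goes", "home"]
vocab = {UNK: 0}
for tok in train_tokens:
    vocab.setdefault(tok, len(vocab))

def encode(sentence):
    return [vocab.get(tok, vocab[UNK]) for tok in sentence.split()]

print(vocab)                   # {'<unk>': 0, 'i': 1, 'go': 2, 'home': 3, 'he': 4, 'goes': 5}
print(encode("i go home"))     # [1, 2, 3]
print(encode("she goes home")) # 'she' was never seen -> [0, 5, 3]
```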
+ +1:01:26.686 --> 1:01:39.437 +From the ending, the phobia phobia is always +the fear of something, but you don't know how. + +1:01:39.879 --> 1:01:46.618 +So we can split words into some parts that +is helpful to deal with. + +1:01:46.618 --> 1:01:49.888 +This, for example, is a fear of. + +1:01:50.450 --> 1:02:04.022 +It's not very important, it's not how to happen +very often, but yeah, it's also not important + +1:02:04.022 --> 1:02:10.374 +for understanding that you know everything. + +1:02:15.115 --> 1:02:18.791 +So what can we do instead? + +1:02:18.791 --> 1:02:29.685 +One thing which we could do instead is to +represent words by the other extreme. + +1:02:29.949 --> 1:02:42.900 +So you really do like if you have a person's +eye and a and age, then you need a space symbol. + +1:02:43.203 --> 1:02:55.875 +So you have now a representation for each +character that enables you to implicitly learn + +1:02:55.875 --> 1:03:01.143 +morphology because words which have. + +1:03:01.541 --> 1:03:05.517 +And you can then deal with unknown words. + +1:03:05.517 --> 1:03:10.344 +There's still not everything you can process, +but. + +1:03:11.851 --> 1:03:16.953 +So if you would go on charity level what might +still be a problem? + +1:03:18.598 --> 1:03:24.007 +So all characters which you haven't seen, +but that's nowadays a little bit more often + +1:03:24.007 --> 1:03:25.140 +with new emoties. + +1:03:25.140 --> 1:03:26.020 +You couldn't. + +1:03:26.020 --> 1:03:31.366 +It could also be that you have translated +from Germany and German, and then there is + +1:03:31.366 --> 1:03:35.077 +a Japanese character or Chinese that you cannot +translate. + +1:03:35.435 --> 1:03:43.938 +But most of the time all directions occur +have been seen so that someone works very good. + +1:03:44.464 --> 1:03:58.681 +This is first a nice thing, so you have a +very small vocabulary size, so one big part + +1:03:58.681 --> 1:04:01.987 +of the calculation. + +1:04:02.222 --> 1:04:11.960 +Neural networks is the calculation of the +vocabulary size, so if you are efficient there + +1:04:11.960 --> 1:04:13.382 +it's better. + +1:04:14.914 --> 1:04:26.998 +On the other hand, the problem is you have +no very long sequences, so if you think about + +1:04:26.998 --> 1:04:29.985 +this before you have. + +1:04:30.410 --> 1:04:43.535 +Your computation often depends on your input +size and not only linear but quadratic going + +1:04:43.535 --> 1:04:44.410 +more. + +1:04:44.504 --> 1:04:49.832 +And of course it might also be that you just +generally make things more complicated than + +1:04:49.832 --> 1:04:50.910 +they were before. + +1:04:50.951 --> 1:04:58.679 +We said before make things easy, but now if +we really have to analyze each director independently, + +1:04:58.679 --> 1:05:05.003 +we cannot directly learn that university is +the same, but we have to learn that. + +1:05:05.185 --> 1:05:12.179 +Is beginning and then there is an I and then +there is an E and then all this together means + +1:05:12.179 --> 1:05:17.273 +university but another combination of these +letters is a complete. + +1:05:17.677 --> 1:05:24.135 +So of course you make everything here a lot +more complicated than you have on word basis. + +1:05:24.744 --> 1:05:32.543 +Character based models work very well in conditions +with few data because you have seen the words + +1:05:32.543 --> 1:05:33.578 +very rarely. + +1:05:33.578 --> 1:05:38.751 +It's not good to learn but you have seen all +letters more often. 
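The trade-off described above, a small and stable symbol inventory versus much longer sequences at character level, is easy to quantify. A toy comparison follows; on real corpora the character set stays at a few hundred symbols while word vocabularies keep growing into the hundreds of thousands:

```python
# Sketch: vocabulary size and average sequence length for word-level versus
# character-level units on a toy corpus (the character set here includes the space).
corpus = ["the house is old", "the houses are old", "universities are old"]

word_vocab = {w for s in corpus for w in s.split()}
char_vocab = {c for s in corpus for c in s}

avg_word_len = sum(len(s.split()) for s in corpus) / len(corpus)
avg_char_len = sum(len(s) for s in corpus) / len(corpus)

# The numbers on three sentences are tiny; the point is that the character
# inventory barely grows with more data, while the word vocabulary keeps growing
# and the sequences become several times longer at character level.
print(f"word level: {len(word_vocab)} symbols, avg sequence length {avg_word_len:.1f}")
print(f"char level: {len(char_vocab)} symbols, avg sequence length {avg_char_len:.1f}")
```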
+ +1:05:38.751 --> 1:05:44.083 +So if you have scenarios with very few data +this is like one good. + +1:05:46.446 --> 1:05:59.668 +The other idea is to split now not doing the +extreme, so either taking forwards or taking + +1:05:59.668 --> 1:06:06.573 +only directives by doing something in between. + +1:06:07.327 --> 1:06:12.909 +And one of these ideas has been done for a +long time. + +1:06:12.909 --> 1:06:17.560 +It's called compound splitting, but we only. + +1:06:17.477 --> 1:06:18.424 +Bounce them. + +1:06:18.424 --> 1:06:24.831 +You see that Baum and Stumbo accrue very often, +then maybe more often than Bounce them. + +1:06:24.831 --> 1:06:28.180 +Then you split Baum and Stumb and you use +it. + +1:06:29.509 --> 1:06:44.165 +But it's even not so easy it will learn wrong +splits so we did that in all the systems and + +1:06:44.165 --> 1:06:47.708 +there is a word Asia. + +1:06:48.288 --> 1:06:56.137 +And the business, of course, is not a really +good way of dealing it because it is non-semantic. + +1:06:56.676 --> 1:07:05.869 +The good thing is we didn't really care that +much about it because the system wasn't learned + +1:07:05.869 --> 1:07:09.428 +if you have Asia and Tish together. + +1:07:09.729 --> 1:07:17.452 +So you can of course learn all that the compound +spirit doesn't really help you to get a deeper + +1:07:17.452 --> 1:07:18.658 +understanding. + +1:07:21.661 --> 1:07:23.364 +The Thing of Course. + +1:07:23.943 --> 1:07:30.475 +Yeah, there was one paper where this doesn't +work like they report, but it's called Burning + +1:07:30.475 --> 1:07:30.972 +Ducks. + +1:07:30.972 --> 1:07:37.503 +I think because it was like if you had German +NS Branter, you could split it in NS Branter, + +1:07:37.503 --> 1:07:43.254 +and sometimes you have to add an E to make +the compounds that was Enter Branter. + +1:07:43.583 --> 1:07:48.515 +So he translated Esperanto into burning dark. + +1:07:48.888 --> 1:07:56.127 +So of course you can introduce there some +type of additional arrows, but in generally + +1:07:56.127 --> 1:07:57.221 +it's a good. + +1:07:57.617 --> 1:08:03.306 +Of course there is a trade off between vocabulary +size so you want to have a lower vocabulary + +1:08:03.306 --> 1:08:08.812 +size so you've seen everything more often but +the length of the sequence should not be too + +1:08:08.812 --> 1:08:13.654 +long because if you split more often you get +less different types but you have. + +1:08:16.896 --> 1:08:25.281 +The motivation of the advantage of compared +to Character based models is that you can directly + +1:08:25.281 --> 1:08:33.489 +learn the representation for works that occur +very often while still being able to represent + +1:08:33.489 --> 1:08:35.783 +works that are rare into. + +1:08:36.176 --> 1:08:42.973 +And while first this was only done for compounds, +nowadays there's an algorithm which really + +1:08:42.973 --> 1:08:49.405 +tries to do it on everything and there are +different ways to be honest compound fitting + +1:08:49.405 --> 1:08:50.209 +and so on. + +1:08:50.209 --> 1:08:56.129 +But the most successful one which is commonly +used is based on data compression. + +1:08:56.476 --> 1:08:59.246 +And there the idea is okay. + +1:08:59.246 --> 1:09:06.765 +Can we find an encoding so that parts are +compressed in the most efficient? + +1:09:07.027 --> 1:09:22.917 +And the compression algorithm is called the +bipear encoding, and this is also then used + +1:09:22.917 --> 1:09:25.625 +for splitting. 
+

1:09:26.346 --> 1:09:39.164
And the idea is that we recursively replace
the most frequent pair of bytes by a new byte.

1:09:39.819 --> 1:09:51.926
For language it is now: you first split all
your words into letters, and then you look at

1:09:51.926 --> 1:09:59.593
what is the most frequent bigram, so which
two letters occur together most often.

1:10:00.040 --> 1:10:04.896
And then you replace it, and you repeat until
you have a fixed vocabulary.

1:10:04.985 --> 1:10:08.031
So that's a nice thing.

1:10:08.031 --> 1:10:16.663
Now you can predefine your vocabulary, as in
how I want to represent my text.

1:10:16.936 --> 1:10:28.486
Beforehand, and then you can represent any text
with these symbols, and of course the shorter

1:10:28.486 --> 1:10:30.517
your text will be.

1:10:32.772 --> 1:10:36.543
So the original idea was something like that.

1:10:36.543 --> 1:10:39.411
We have the sequence A, B, A, B, C.

1:10:39.411 --> 1:10:45.149
For example, a common bigram is A, B, so
you can replace A, B by D.

1:10:45.149 --> 1:10:46.788
Then the text gets shorter.

1:10:48.108 --> 1:10:53.615
Then you can merge again, and so on, so this
is then your compressed text.

1:10:54.514 --> 1:11:00.691
Similarly, we can do it now for tokenization.

1:11:01.761 --> 1:11:05.436
Let's assume you have these sentences.

1:11:05.436 --> 1:11:11.185
I go, he goes, she goes, so your vocabulary
is go, goes, he, I and she.

1:11:11.851 --> 1:11:30.849
And the first thing you're doing is to split
your corpus into single characters.

1:11:30.810 --> 1:11:34.692
So thereby you can split it into words again,
like you split sentences into words.

1:11:34.692 --> 1:11:38.980
Because now you only have characters, you
don't know the word boundaries anymore.

1:11:38.980 --> 1:11:44.194
You introduce the word boundaries by having
a special symbol at the end of each word, and

1:11:44.194 --> 1:11:46.222
then you know: if this symbol happens,

1:11:46.222 --> 1:11:48.366
I can split it and a new word starts.

1:11:48.708 --> 1:11:55.245
So you have the corpus I go, he goes, and
she goes, and then you have now here the sequences

1:11:55.245 --> 1:11:56.229
of characters.

1:11:56.229 --> 1:12:02.625
So this is the character-based representation,
and now you calculate the bigram statistics.

1:12:02.625 --> 1:12:08.458
So I and the end-of-word symbol occur one time,
g and o occur three times, and so on.

1:12:09.189 --> 1:12:18.732
And these are all the others, and now you
look which pair is the most common one.

1:12:19.119 --> 1:12:26.046
So then you have now the first merge rule.

1:12:26.046 --> 1:12:39.235
If g and o occur together, you merge them into
a new symbol: go is no longer two symbols, but it's

1:12:39.235 --> 1:12:41.738
one single symbol, because you join them.

1:12:42.402 --> 1:12:51.175
And then you have here now the new bigram
counts with this merged symbol, and so on.

1:12:52.092 --> 1:13:01.753
In such a small example you now have a lot of
rules which occur the same number of times.

1:13:01.753 --> 1:13:09.561
In reality that is happening sometimes, but
not that often.

1:13:10.370 --> 1:13:21.240
You add the end-of-word symbol to them, and so
this way you go on until you have your vocabulary.

1:13:21.601 --> 1:13:38.242
And your vocabulary is in these rules, so people
often speak about the vocabulary in terms of these merge rules.

1:13:38.658 --> 1:13:43.637
And these are the rules, and if you now get a
different sentence, something like "they tell".
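Before looking at how such a new sentence comes out of the rules, the merge loop just walked through can be written down in a few lines. This is a minimal educational sketch of byte-pair-encoding-style learning on the same "I go, he goes, she goes" example, with "_" assumed as the end-of-word marker; it is not the optimized subword tooling used in practice:

```python
# Minimal BPE-style learning sketch: split words into characters plus an
# end-of-word marker, then repeatedly merge the most frequent adjacent pair.
from collections import Counter

def learn_bpe(words, num_merges):
    # each word becomes a tuple of symbols, ending with the marker "_"
    corpus = Counter(tuple(w) + ("_",) for w in words)
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for symbols, freq in corpus.items():
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)          # most frequent adjacent pair
        merges.append(best)
        new_corpus = Counter()
        for symbols, freq in corpus.items():
            merged, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    merged.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            new_corpus[tuple(merged)] += freq
        corpus = new_corpus
    return merges, corpus

merges, corpus = learn_bpe(["i", "go", "he", "goes", "she", "goes"], num_merges=4)
print(merges)        # the first merge is ('g', 'o'), the most frequent pair;
                     # later merges depend on how ties are broken
print(list(corpus))  # how each word is segmented after the merges
```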
+ +1:13:44.184 --> 1:13:53.600 +Then your final output looks like something +like that. + +1:13:53.600 --> 1:13:59.250 +These two words represent by by. + +1:14:00.940 --> 1:14:06.398 +And that is your algorithm. + +1:14:06.398 --> 1:14:18.873 +Now you can represent any type of text with +a fixed vocabulary. + +1:14:20.400 --> 1:14:23.593 +So think that's defined in the beginning. + +1:14:23.593 --> 1:14:27.243 +Fill how many egos have won and that has spent. + +1:14:28.408 --> 1:14:35.253 +It's nearly correct that it writes a number +of characters. + +1:14:35.253 --> 1:14:38.734 +It can be that in additional. + +1:14:38.878 --> 1:14:49.162 +So on the one end all three of the right side +of the rules can occur, and then additionally + +1:14:49.162 --> 1:14:49.721 +all. + +1:14:49.809 --> 1:14:55.851 +In reality it can even happen that there is +less your vocabulary smaller because it might + +1:14:55.851 --> 1:15:01.960 +happen that like for example go never occurs +singular at the end but you always like merge + +1:15:01.960 --> 1:15:06.793 +all occurrences so there are not all right +sides really happen because. + +1:15:06.746 --> 1:15:11.269 +This rule is never only applied, but afterwards +another rule is also applied. + +1:15:11.531 --> 1:15:15.621 +So it's a summary approbounce of your vocabulary +than static. + +1:15:20.480 --> 1:15:29.014 +Then we come to the last part, which is about +parallel data, but we have some questions beforehand. + +1:15:36.436 --> 1:15:38.824 +So what is parallel data? + +1:15:38.824 --> 1:15:47.368 +So if we set machine translations really, +really important that we are dealing with parallel + +1:15:47.368 --> 1:15:52.054 +data, that means we have a lined input and +output. + +1:15:52.054 --> 1:15:54.626 +You have this type of data. + +1:15:55.015 --> 1:16:01.773 +However, in machine translation we have one +very big advantage that is somewhat naturally + +1:16:01.773 --> 1:16:07.255 +occurring, so you have a lot of parallel data +which you can summar gaps. + +1:16:07.255 --> 1:16:13.788 +In many P tests you need to manually annotate +your data and generate the aligned data. + +1:16:14.414 --> 1:16:22.540 +We have to manually create translations, and +of course that is very expensive, but it's + +1:16:22.540 --> 1:16:29.281 +really expensive to pay for like one million +sentences to be translated. + +1:16:29.889 --> 1:16:36.952 +The nice thing is that in there is data normally +available because other people have done machine + +1:16:36.952 --> 1:16:37.889 +translation. + +1:16:40.120 --> 1:16:44.672 +So there is this data and of course process +it. + +1:16:44.672 --> 1:16:51.406 +We'll have a full lecture on how to deal with +more complex situations. + +1:16:52.032 --> 1:16:56.645 +The idea is really you don't do really much +human work. + +1:16:56.645 --> 1:17:02.825 +You really just start the caller with some +initials, start pages and then. + +1:17:03.203 --> 1:17:07.953 +But a lot of iquality parallel data is really +targeted on some scenarios. + +1:17:07.953 --> 1:17:13.987 +So, for example, think of the European Parliament +as one website where you can easily extract + +1:17:13.987 --> 1:17:17.581 +these information from and there you have a +large data. + +1:17:17.937 --> 1:17:22.500 +Or like we have the TED data, which is also +you can get from the TED website. + +1:17:23.783 --> 1:17:33.555 +So in generally parallel corpus is a collection +of texts with translations into one of several. 
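To close the loop on the example above before turning to parallel data: applying the learned merge rules to new text can be sketched as follows. It reuses the `learn_bpe` sketch from above; both are toy code, not the production subword tools:

```python
# Sketch: segment a new word with the learned rules by replaying the merges
# in the order they were learned ("_" is again the end-of-word marker).
def apply_bpe(word, merges):
    symbols = list(word) + ["_"]
    for a, b in merges:                      # replay merges in learned order
        i, merged = 0, []
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == (a, b):
                merged.append(a + b)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        symbols = merged
    return symbols

# With merges learned from the "i go he goes she goes" toy corpus, an unseen
# word such as "goes" or "they" is still segmented into known pieces instead
# of becoming a single <unk> token.
print(apply_bpe("they", []))   # with no merges it falls back to characters
```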
+ +1:17:34.134 --> 1:17:42.269 +And this data is important because there is +no general empty normally, but you work secured. + +1:17:42.222 --> 1:17:46.732 +It works especially good if your training +and test conditions are similar. + +1:17:46.732 --> 1:17:50.460 +So if the topic is similar, the style of modality +is similar. + +1:17:50.460 --> 1:17:55.391 +So if you want to translate speech, it's often +better to train all to own speech. + +1:17:55.391 --> 1:17:58.818 +If you want to translate text, it's better +to translate. + +1:17:59.379 --> 1:18:08.457 +And there is a lot of these data available +nowadays for common languages. + +1:18:08.457 --> 1:18:12.014 +You normally can start with. + +1:18:12.252 --> 1:18:15.298 +It's really available. + +1:18:15.298 --> 1:18:27.350 +For example, Opus is a big website collecting +different types of parallel corpus where you + +1:18:27.350 --> 1:18:29.601 +can select them. + +1:18:29.529 --> 1:18:33.276 +You have this document alignment will come +to that layout. + +1:18:33.553 --> 1:18:39.248 +There is things like comparable data where +you have not full sentences but only some parts + +1:18:39.248 --> 1:18:40.062 +of parallel. + +1:18:40.220 --> 1:18:48.700 +But now first let's assume we have easy tasks +like European Parliament when we have the speech + +1:18:48.700 --> 1:18:55.485 +in German and the speech in English and you +need to generate parallel data. + +1:18:55.485 --> 1:18:59.949 +That means you have to align the sewer sentences. + +1:19:00.000 --> 1:19:01.573 +And doing this right. + +1:19:05.905 --> 1:19:08.435 +How can we do that? + +1:19:08.435 --> 1:19:19.315 +And that is what people refer to sentence +alignment, so we have parallel documents in + +1:19:19.315 --> 1:19:20.707 +languages. + +1:19:22.602 --> 1:19:32.076 +This is so you cannot normally do that word +by word because there is no direct correlation + +1:19:32.076 --> 1:19:34.158 +between, but it is. + +1:19:34.074 --> 1:19:39.837 +Relatively possible to do it on sentence level, +it will not be perfect, so you sometimes have + +1:19:39.837 --> 1:19:42.535 +two sentences in English and one in German. + +1:19:42.535 --> 1:19:47.992 +German like to have these long sentences with +sub clauses and so on, so there you can do + +1:19:47.992 --> 1:19:51.733 +it, but with long sentences it might not be +really possible. + +1:19:55.015 --> 1:19:59.454 +And for some we saw that sentence Marcus Andre +there, so it's more complicated. + +1:19:59.819 --> 1:20:10.090 +So how can we formalize this sentence alignment +problem? + +1:20:10.090 --> 1:20:16.756 +So we have a set of sewer sentences. + +1:20:17.377 --> 1:20:22.167 +And machine translation relatively often. + +1:20:22.167 --> 1:20:32.317 +Sometimes source sentences nowadays are and, +but traditionally it was and because people + +1:20:32.317 --> 1:20:34.027 +started using. + +1:20:34.594 --> 1:20:45.625 +And then the idea is to find this alignment +where we have alignment. + +1:20:46.306 --> 1:20:50.421 +And of course you want these sequences to +be shown as possible. + +1:20:50.421 --> 1:20:56.400 +Of course an easy solution is here all my +screen sentences and here all my target sentences. + +1:20:56.756 --> 1:21:07.558 +So want to have short sequences there, typically +one sentence or maximum two or three sentences, + +1:21:07.558 --> 1:21:09.340 +so that really. 
+ +1:21:13.913 --> 1:21:21.479 +Then there is different ways of restriction +to this type of alignment, so first of all + +1:21:21.479 --> 1:21:29.131 +it should be a monotone alignment, so that +means that each segment on the source should + +1:21:29.131 --> 1:21:31.218 +start after each other. + +1:21:31.431 --> 1:21:36.428 +So we assume that in document there's really +a monotone and it's going the same way in source. + +1:21:36.957 --> 1:21:41.965 +Course for a very free translation that might +not be valid anymore. + +1:21:41.965 --> 1:21:49.331 +But this algorithm, the first one in the church +and gay algorithm, is more than really translations + +1:21:49.331 --> 1:21:51.025 +which are very direct. + +1:21:51.025 --> 1:21:54.708 +So each segment should be like coming after +each. + +1:21:55.115 --> 1:22:04.117 +Then we want to translate the full sequence, +and of course each segment should start before + +1:22:04.117 --> 1:22:04.802 +it is. + +1:22:05.525 --> 1:22:22.654 +And then you want to have something like that, +but you have to alignments or alignments. + +1:22:25.525 --> 1:22:41.851 +The alignment types are: You then, of course, +sometimes insertions and Venetians where there + +1:22:41.851 --> 1:22:43.858 +is some information added. + +1:22:44.224 --> 1:22:50.412 +Hand be, for example, explanation, so it can +be that some term is known in the one language + +1:22:50.412 --> 1:22:51.018 +but not. + +1:22:51.111 --> 1:22:53.724 +Think of things like Deutschland ticket. + +1:22:53.724 --> 1:22:58.187 +In Germany everybody will by now know what +the Deutschland ticket is. + +1:22:58.187 --> 1:23:03.797 +But if you translate it to English it might +be important to explain it and other things + +1:23:03.797 --> 1:23:04.116 +are. + +1:23:04.116 --> 1:23:09.853 +So sometimes you have to explain things and +then you have more sentences with insertions. + +1:23:10.410 --> 1:23:15.956 +Then you have two to one and one to two alignment, +and that is, for example, in Germany you have + +1:23:15.956 --> 1:23:19.616 +a lot of sub-classes and bipes that are expressed +by two cents. + +1:23:20.580 --> 1:23:37.725 +Of course, it might be more complex, but typically +to make it simple and only allow for this type + +1:23:37.725 --> 1:23:40.174 +of alignment. + +1:23:41.301 --> 1:23:56.588 +Then it is about finding the alignment and +that is, we try to score where we just take + +1:23:56.588 --> 1:23:59.575 +a general score. + +1:24:00.000 --> 1:24:04.011 +That is true like gala algorithms and the +matching of one segment. + +1:24:04.011 --> 1:24:09.279 +If you have one segment now so this is one +of the global things so the global alignment + +1:24:09.279 --> 1:24:13.828 +is as good as the product of all single steps +and then you have two scores. + +1:24:13.828 --> 1:24:18.558 +First of all you say one to one alignments +are much better than all the hours. + +1:24:19.059 --> 1:24:26.884 +And then you have a lexical similarity, which +is, for example, based on an initial dictionary + +1:24:26.884 --> 1:24:30.713 +which counts how many dictionary entries are. + +1:24:31.091 --> 1:24:35.407 +So this is a very simple algorithm. + +1:24:35.407 --> 1:24:41.881 +Typically violates like your first step and +you want. + +1:24:43.303 --> 1:24:54.454 +And that is like with this one you can get +an initial one you can have better parallel + +1:24:54.454 --> 1:24:55.223 +data. 
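The scoring idea described above, prefer one-to-one links but also allow one-to-two, two-to-one, insertions and deletions, and score how well the linked segments match, is classically solved with dynamic programming in the Gale and Church aligner. What follows is a very reduced sketch that uses only a length-based similarity with hand-picked penalties instead of the full statistical model and dictionary:

```python
# Very simplified length-based sentence alignment sketch (Gale & Church style):
# dynamic programming over a few allowed link types, scored by a length ratio
# minus a penalty that favours 1-1 links. Not the full statistical model.

def link_score(src_chunk, tgt_chunk, penalty):
    src_len = sum(len(s) for s in src_chunk)
    tgt_len = sum(len(t) for t in tgt_chunk)
    ratio = min(src_len, tgt_len) / max(src_len, tgt_len, 1)
    return ratio - penalty

# (source sentences used, target sentences used, penalty); penalties are made up
LINK_TYPES = [(1, 1, 0.0), (1, 2, 0.2), (2, 1, 0.2), (1, 0, 0.6), (0, 1, 0.6)]

def align(src, tgt):
    best = {(0, 0): (0.0, None)}
    for i in range(len(src) + 1):
        for j in range(len(tgt) + 1):
            if (i, j) not in best:
                continue
            score, _ = best[(i, j)]
            for di, dj, penalty in LINK_TYPES:
                ni, nj = i + di, j + dj
                if ni > len(src) or nj > len(tgt):
                    continue
                s = score + link_score(src[i:ni], tgt[j:nj], penalty)
                if (ni, nj) not in best or s > best[(ni, nj)][0]:
                    best[(ni, nj)] = (s, (i, j, di, dj))
    links, state = [], (len(src), len(tgt))
    while state != (0, 0):                   # backtrace to recover the links
        _, back = best[state]
        i, j, di, dj = back
        links.append((list(range(i, i + di)), list(range(j, j + dj))))
        state = (i, j)
    return list(reversed(links))

src = ["Das ist ein Haus.", "Es ist alt, aber es ist schön."]
tgt = ["This is a house.", "It is old.", "But it is beautiful."]
print(align(src, tgt))   # expect roughly [([0], [0]), ([1], [1, 2])]
```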
+ +1:24:55.675 --> 1:25:02.369 +No, it is an optimization problem and you +are now based on the scores you can calculate + +1:25:02.369 --> 1:25:07.541 +for each possible alignment and score and then +select the best one. + +1:25:07.541 --> 1:25:14.386 +Of course, you won't try all possibilities +out but you can do a good search and then find + +1:25:14.386 --> 1:25:15.451 +the best one. + +1:25:15.815 --> 1:25:18.726 +Can typically be automatically. + +1:25:18.726 --> 1:25:25.456 +Of course, you should do some checks like +aligning sentences as possible. + +1:25:26.766 --> 1:25:32.043 +A bill like typically for training data is +done this way. + +1:25:32.043 --> 1:25:35.045 +Maybe if you have test data you. + +1:25:40.000 --> 1:25:47.323 +Sorry, I'm a bit late because originally wanted +to do a quiz at the end. + +1:25:47.323 --> 1:25:49.129 +Can we go a quiz? + +1:25:49.429 --> 1:25:51.833 +We'll do it somewhere else. + +1:25:51.833 --> 1:25:56.813 +We had a bachelor project about making quiz +for lectures. + +1:25:56.813 --> 1:25:59.217 +And I still want to try it. + +1:25:59.217 --> 1:26:04.197 +So let's see I hope in some other lecture +we can do that. + +1:26:04.197 --> 1:26:09.435 +Then we can at the island of the lecture do +some quiz about. + +1:26:09.609 --> 1:26:13.081 +All We Can Do Is Is the Practical Thing Let's +See. + +1:26:13.533 --> 1:26:24.719 +And: Today, so what you should remember is +what is parallel data and how we can. + +1:26:25.045 --> 1:26:29.553 +Create parallel data like how to generally +process data. + +1:26:29.553 --> 1:26:36.435 +What you think about data is really important +if you build systems and different ways. + +1:26:36.696 --> 1:26:46.857 +The three main options like forwards is directly +on director level or using subword things. + +1:26:47.687 --> 1:26:49.634 +Is there any question? + +1:26:52.192 --> 1:26:57.768 +Yes, this is the alignment thing in Cadillac +band in Tyne walking with people. + +1:27:00.000 --> 1:27:05.761 +It's not directly using than every time walking, +but the idea is similar and you can use all + +1:27:05.761 --> 1:27:11.771 +this type of similar algorithms, which is the +main thing which is the question of the difficulty + +1:27:11.771 --> 1:27:14.807 +is to define me at your your loss function +here. + +1:27:14.807 --> 1:27:16.418 +What is a good alignment? + +1:27:16.736 --> 1:27:24.115 +But as you do not have a time walk on, you +have a monotone alignment in there, and you + +1:27:24.115 --> 1:27:26.150 +cannot have rehonoring. + +1:27:30.770 --> 1:27:40.121 +There then thanks a lot and on first day we +will then start with or discuss. + diff --git a/demo_data/lectures/Lecture-03-25.04.2023/video.mp4 b/demo_data/lectures/Lecture-03-25.04.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..78055935800d9ad9b8f5b709f0408fe57ebe4c56 --- /dev/null +++ b/demo_data/lectures/Lecture-03-25.04.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b241226dacb56a88fcbccaecb2639c3b5765fbea6f60e4758715c6941fbc512 +size 117644511 diff --git a/demo_data/lectures/Lecture-04-27.04.2023/English.vtt b/demo_data/lectures/Lecture-04-27.04.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..aa2e1d2f4f9d76d22655031c5aa0762e1bd822e4 --- /dev/null +++ b/demo_data/lectures/Lecture-04-27.04.2023/English.vtt @@ -0,0 +1,2919 @@ +WEBVTT + +0:00:03.663 --> 0:00:07.970 +Okay, then I should switch back to English, +sorry,. 
+ +0:00:08.528 --> 0:00:18.970 +So welcome to today's lecture in the cross +machine translation and today we're planning + +0:00:18.970 --> 0:00:20.038 +to talk. + +0:00:20.880 --> 0:00:31.845 +Which will be without our summary of power +translation was done from around till. + +0:00:32.872 --> 0:00:38.471 +Fourteen, so this was an approach which was +quite long. + +0:00:38.471 --> 0:00:47.070 +It was the first approach where at the end +the quality was really so good that it was + +0:00:47.070 --> 0:00:49.969 +used as a commercial system. + +0:00:49.990 --> 0:00:56.482 +Or something like that, so the first systems +there was using the statistical machine translation. + +0:00:57.937 --> 0:01:02.706 +So when I came into the field this was the +main part of the lecture, so there would be + +0:01:02.706 --> 0:01:07.912 +not be one lecture, but in more detail than +half of the full course would be about statistical + +0:01:07.912 --> 0:01:09.063 +machine translation. + +0:01:09.369 --> 0:01:23.381 +So what we try to do today is like get the +most important things, which think our part + +0:01:23.381 --> 0:01:27.408 +is still very important. + +0:01:27.267 --> 0:01:31.196 +Four State of the Art Box. + +0:01:31.952 --> 0:01:45.240 +Then we'll have the presentation about how +to evaluate the other part of the machine translation. + +0:01:45.505 --> 0:01:58.396 +The other important thing is the language +modeling part will explain later how they combine. + +0:01:59.539 --> 0:02:04.563 +Shortly mentioned this one already. + +0:02:04.824 --> 0:02:06.025 +On Tuesday. + +0:02:06.246 --> 0:02:21.849 +So in a lot of these explanations, how we +model translation process, it might be surprising: + +0:02:22.082 --> 0:02:27.905 +Later some people say it's for four eight words +traditionally came because the first models + +0:02:27.905 --> 0:02:32.715 +which you'll discuss here also when they are +referred to as the IVM models. + +0:02:32.832 --> 0:02:40.043 +They were trained on French to English translation +directions and that's why they started using + +0:02:40.043 --> 0:02:44.399 +F and E and then this was done for the next +twenty years. + +0:02:44.664 --> 0:02:52.316 +So while we are trying to wait, the source +words is: We have a big eye, typically the + +0:02:52.316 --> 0:03:02.701 +lengths of the sewer sentence in small eye, +the position, and similarly in the target and + +0:03:02.701 --> 0:03:05.240 +the lengths of small. + +0:03:05.485 --> 0:03:13.248 +Things will get a bit complicated in this +way because it is not always clear what is + +0:03:13.248 --> 0:03:13.704 +the. + +0:03:14.014 --> 0:03:21.962 +See that there is this noisy channel model +which switches the direction in your model, + +0:03:21.962 --> 0:03:25.616 +but in the application it's the target. + +0:03:26.006 --> 0:03:37.077 +So that is why if you especially read these +papers, it might sometimes be a bit disturbing. + +0:03:37.437 --> 0:03:40.209 +Try to keep it here always. + +0:03:40.209 --> 0:03:48.427 +The source is, and even if we use a model +where it's inverse, we'll keep this way. + +0:03:48.468 --> 0:03:55.138 +Don't get disturbed by that, and I think it's +possible to understand all that without this + +0:03:55.138 --> 0:03:55.944 +confusion. + +0:03:55.944 --> 0:04:01.734 +But in some of the papers you might get confused +because they switched to the. + +0:04:04.944 --> 0:04:17.138 +In general, in statistics and machine translation, +the goal is how we do translation. 
+ +0:04:17.377 --> 0:04:25.562 +But first we are seeing all our possible target +sentences as possible translations. + +0:04:26.726 --> 0:04:37.495 +And we are assigning some probability to the +combination, so we are modeling. + +0:04:39.359 --> 0:04:49.746 +And then we are doing a search over all possible +things or at least theoretically, and we are + +0:04:49.746 --> 0:04:56.486 +trying to find the translation with the highest +probability. + +0:04:56.936 --> 0:05:05.116 +And this general idea is also true for neuromachine +translation. + +0:05:05.116 --> 0:05:07.633 +They differ in how. + +0:05:08.088 --> 0:05:10.801 +So these were then of course the two big challenges. + +0:05:11.171 --> 0:05:17.414 +On the one hand, how can we estimate this +probability? + +0:05:17.414 --> 0:05:21.615 +How is the translation of the other? + +0:05:22.262 --> 0:05:32.412 +The other challenge is the search, so we cannot, +of course, say we want to find the most probable + +0:05:32.412 --> 0:05:33.759 +translation. + +0:05:33.759 --> 0:05:42.045 +We cannot go over all possible English sentences +and calculate the probability. + +0:05:43.103 --> 0:05:45.004 +So,. + +0:05:45.165 --> 0:05:53.423 +What we have to do there is some are doing +intelligent search and look for the ones and + +0:05:53.423 --> 0:05:54.268 +compare. + +0:05:54.734 --> 0:05:57.384 +That will be done. + +0:05:57.384 --> 0:06:07.006 +This process of finding them is called the +decoding process because. + +0:06:07.247 --> 0:06:09.015 +They will be covered well later. + +0:06:09.015 --> 0:06:11.104 +Today we will concentrate on the mile. + +0:06:11.451 --> 0:06:23.566 +The model is trained using data, so in the +first step we're having data, we're somehow + +0:06:23.566 --> 0:06:30.529 +having a definition of what the model looks +like. + +0:06:34.034 --> 0:06:42.913 +And in statistical machine translation the +common model is behind. + +0:06:42.913 --> 0:06:46.358 +That is what is referred. + +0:06:46.786 --> 0:06:55.475 +And this is motivated by the initial idea +from Shannon. + +0:06:55.475 --> 0:07:02.457 +We have this that you can think of decoding. + +0:07:02.722 --> 0:07:10.472 +So think of it as we have this text in maybe +German. + +0:07:10.472 --> 0:07:21.147 +Originally it was an English text, but somebody +used some nice decoding. + +0:07:21.021 --> 0:07:28.579 +Task is to decipher it again, this crazy cyborg +expressing things in German, and to decipher + +0:07:28.579 --> 0:07:31.993 +the meaning again and doing that between. + +0:07:32.452 --> 0:07:35.735 +And that is the idea about this noisy channel +when it. + +0:07:36.236 --> 0:07:47.209 +It goes through some type of channel which +adds noise to the source and then you receive + +0:07:47.209 --> 0:07:48.811 +the message. + +0:07:49.429 --> 0:08:00.190 +And then the idea is, can we now construct +the original message out of these messages + +0:08:00.190 --> 0:08:05.070 +by modeling some of the channels here? + +0:08:06.726 --> 0:08:15.797 +There you know to see a bit the surface of +the source message with English. + +0:08:15.797 --> 0:08:22.361 +It went through some channel and received +the message. + +0:08:22.682 --> 0:08:31.381 +If you're not looking at machine translation, +your source language is English. + +0:08:31.671 --> 0:08:44.388 +Here you see now a bit of this where the confusion +starts while English as a target language is + +0:08:44.388 --> 0:08:47.700 +also the source message. + +0:08:47.927 --> 0:08:48.674 +You can see. 
+ +0:08:48.674 --> 0:08:51.488 +There is also a mathematics of how we model +the. + +0:08:52.592 --> 0:08:56.888 +It's a noisy channel model from a mathematic +point of view. + +0:08:56.997 --> 0:09:00.245 +So this is again our general formula. + +0:09:00.245 --> 0:09:08.623 +We are looking for the most probable translation +and that is the translation that has the highest + +0:09:08.623 --> 0:09:09.735 +probability. + +0:09:09.809 --> 0:09:19.467 +We are not interested in the probability itself, +but we are interesting in this target sentence + +0:09:19.467 --> 0:09:22.082 +E where this probability. + +0:09:23.483 --> 0:09:33.479 +And: Therefore, we can use them twice definition +of conditional probability and using the base + +0:09:33.479 --> 0:09:42.712 +rules, so this probability equals the probability +of f giving any kind of probability of e divided + +0:09:42.712 --> 0:09:44.858 +by the probability of. + +0:09:45.525 --> 0:09:48.218 +Now see mathematically this confusion. + +0:09:48.218 --> 0:09:54.983 +Originally we are interested in the probability +of the target sentence given the search sentence. + +0:09:55.295 --> 0:10:00.742 +And if we are modeling things now, we are +looking here at the inverse direction, so the + +0:10:00.742 --> 0:10:06.499 +probability of F given E to the probability +of the source sentence given the target sentence + +0:10:06.499 --> 0:10:10.832 +is the probability of the target sentence divided +by the probability. + +0:10:13.033 --> 0:10:15.353 +Why are we doing this? + +0:10:15.353 --> 0:10:24.333 +Maybe I mean, of course, once it's motivated +by our model, that we were saying this type + +0:10:24.333 --> 0:10:27.058 +of how we are modeling it. + +0:10:27.058 --> 0:10:30.791 +The other interesting thing is that. + +0:10:31.231 --> 0:10:40.019 +So we are looking at this probability up there, +which we had before we formulate that we can + +0:10:40.019 --> 0:10:40.775 +remove. + +0:10:41.181 --> 0:10:46.164 +If we are searching for the highest translation, +this is fixed. + +0:10:46.164 --> 0:10:47.800 +This doesn't change. + +0:10:47.800 --> 0:10:52.550 +We have an input, the source sentence, and +we cannot change. + +0:10:52.812 --> 0:11:02.780 +Is always the same, so we can ignore it in +the ACMAX because the lower one is exactly + +0:11:02.780 --> 0:11:03.939 +the same. + +0:11:04.344 --> 0:11:06.683 +And then we have p o f. + +0:11:06.606 --> 0:11:13.177 +E times P of E and that is so we are modeling +the translation process on the one hand with + +0:11:13.177 --> 0:11:19.748 +the translation model which models how probable +is the sentence F given E and on the other + +0:11:19.748 --> 0:11:25.958 +hand with the language model which models only +how probable is this English sentence. + +0:11:26.586 --> 0:11:39.366 +That somebody wrote this language or translation +point of view, this is about fluency. + +0:11:40.200 --> 0:11:44.416 +You should have in German, for example, agreement. + +0:11:44.416 --> 0:11:50.863 +If the agreement is not right, that's properly +not said by anybody in German. + +0:11:50.863 --> 0:11:58.220 +Nobody would say that's Schönest's house because +it's not according to the German rules. + +0:11:58.598 --> 0:12:02.302 +So this can be modeled by the language model. + +0:12:02.542 --> 0:12:09.855 +And you have the translation model which models +housings get translated between the. 
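Written out, the derivation in this passage is the standard noisy-channel decomposition; P(f) is constant for a fixed input sentence, so it drops out of the arg max:

```latex
\hat{e} \;=\; \arg\max_{e} P(e \mid f)
       \;=\; \arg\max_{e} \frac{P(f \mid e)\, P(e)}{P(f)}
       \;=\; \arg\max_{e} \underbrace{P(f \mid e)}_{\text{translation model}} \cdot \underbrace{P(e)}_{\text{language model}}
```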
+ +0:12:10.910 --> 0:12:18.775 +And here you see again our confusion again, +and now here put the translation model: Wage + +0:12:18.775 --> 0:12:24.360 +is a big income counterintuitive because the +probability of a sewer sentence giving the + +0:12:24.360 --> 0:12:24.868 +target. + +0:12:26.306 --> 0:12:35.094 +Have to do that for the bass farmer, but in +the following slides I'll talk again about. + +0:12:35.535 --> 0:12:45.414 +Because yeah, that's more intuitive that you +model the translation of the target sentence + +0:12:45.414 --> 0:12:48.377 +given the source sentence. + +0:12:50.930 --> 0:12:55.668 +And this is what we want to talk about today. + +0:12:55.668 --> 0:13:01.023 +We later talk about language models how to +do that. + +0:13:00.940 --> 0:13:04.493 +And maybe also how to combine them. + +0:13:04.493 --> 0:13:13.080 +But the focus on today would be how can we +model this probability to how to generate a + +0:13:13.080 --> 0:13:16.535 +translation from source to target? + +0:13:19.960 --> 0:13:24.263 +How can we do that and the easiest thing? + +0:13:24.263 --> 0:13:33.588 +Maybe if you think about statistics, you count +how many examples you have, how many target + +0:13:33.588 --> 0:13:39.121 +sentences go occur, and that gives you an estimation. + +0:13:40.160 --> 0:13:51.632 +However, like in another model that is not +possible because most sentences you will never + +0:13:51.632 --> 0:13:52.780 +see, so. + +0:13:53.333 --> 0:14:06.924 +So what we have to do is break up the translation +process into smaller models and model each + +0:14:06.924 --> 0:14:09.555 +of the decisions. + +0:14:09.970 --> 0:14:26.300 +So this simple solution with how you throw +a dice is like you have a and that gives you + +0:14:26.300 --> 0:14:29.454 +the probability. + +0:14:29.449 --> 0:14:40.439 +But here's the principle because each event +is so rare that most of them never have helped. + +0:14:43.063 --> 0:14:48.164 +Although it might be that in all your training +data you have never seen this title of set. + +0:14:49.589 --> 0:14:52.388 +How can we do that? + +0:14:52.388 --> 0:15:04.845 +We look in statistical machine translation +into two different models, a generative model + +0:15:04.845 --> 0:15:05.825 +where. + +0:15:06.166 --> 0:15:11.736 +So the idea was to really model model like +each individual translation between words. + +0:15:12.052 --> 0:15:22.598 +So you break down the translation of a full +sentence into the translation of each individual's + +0:15:22.598 --> 0:15:23.264 +word. + +0:15:23.264 --> 0:15:31.922 +So you say if you have the black cat, if you +translate it, the full sentence. + +0:15:32.932 --> 0:15:38.797 +Of course, this has some challenges, any ideas +where this type of model could be very challenging. + +0:15:40.240 --> 0:15:47.396 +Vocabularies and videos: Yes, we're going +to be able to play in the very color. + +0:15:47.867 --> 0:15:51.592 +Yes, but you could at least use a bit of the +context around it. + +0:15:51.592 --> 0:15:55.491 +It will not only depend on the word, but it's +already challenging. + +0:15:55.491 --> 0:15:59.157 +You make things very hard, so that's definitely +one challenge. + +0:16:00.500 --> 0:16:07.085 +One other, what did you talk about that we +just don't want to say? + +0:16:08.348 --> 0:16:11.483 +Yes, they are challenging. + +0:16:11.483 --> 0:16:21.817 +You have to do something like words, but the +problem is that you might introduce errors. + +0:16:21.841 --> 0:16:23.298 +Later and makes things very comfortable. 
+ +0:16:25.265 --> 0:16:28.153 +Wrong splitting is the worst things that are +very complicated. + +0:16:32.032 --> 0:16:35.580 +Saints, for example, and also maybe Japanese +medicine. + +0:16:35.735 --> 0:16:41.203 +In German, yes, especially like these are +all right. + +0:16:41.203 --> 0:16:46.981 +The first thing is maybe the one which is +most obvious. + +0:16:46.981 --> 0:16:49.972 +It is raining cats and dogs. + +0:16:51.631 --> 0:17:01.837 +To German, the cat doesn't translate this +whole chunk into something because there is + +0:17:01.837 --> 0:17:03.261 +not really. + +0:17:03.403 --> 0:17:08.610 +Mean, of course, in generally there is this +type of alignment, so there is a correspondence + +0:17:08.610 --> 0:17:11.439 +between words in English and the words in German. + +0:17:11.439 --> 0:17:16.363 +However, that's not true for all sentences, +so in some sentences you cannot really say + +0:17:16.363 --> 0:17:18.174 +this word translates into that. + +0:17:18.498 --> 0:17:21.583 +But you can only let more locate this whole +phrase. + +0:17:21.583 --> 0:17:23.482 +This model into something else. + +0:17:23.563 --> 0:17:30.970 +If you think about the don't in English, the +do is not really clearly where should that + +0:17:30.970 --> 0:17:31.895 +be allied. + +0:17:32.712 --> 0:17:39.079 +Then for a long time the most successful approach +was this phrase based translation model where + +0:17:39.079 --> 0:17:45.511 +the idea is your block is not a single word +but a longer phrase if you try to build translations + +0:17:45.511 --> 0:17:46.572 +based on these. + +0:17:48.768 --> 0:17:54.105 +But let's start with a word based and what +you need. + +0:17:54.105 --> 0:18:03.470 +There is two main knowledge sources, so on +the one hand we have a lexicon where we translate + +0:18:03.470 --> 0:18:05.786 +possible translations. + +0:18:06.166 --> 0:18:16.084 +The main difference between the lexicon and +statistical machine translation and lexicon + +0:18:16.084 --> 0:18:17.550 +as you know. + +0:18:17.837 --> 0:18:23.590 +Traditional lexicon: You know how word is +translated and mainly it's giving you two or + +0:18:23.590 --> 0:18:26.367 +three examples with any example sentence. + +0:18:26.367 --> 0:18:30.136 +So in this context it gets translated like +that henceon. + +0:18:30.570 --> 0:18:38.822 +In order to model that and work with probabilities +what we need in a machine translation is these: + +0:18:39.099 --> 0:18:47.962 +So if we have the German word bargain, it sends +me out with a probability of zero point five. + +0:18:47.962 --> 0:18:51.545 +Maybe it's translated into a vehicle. + +0:18:52.792 --> 0:18:58.876 +And of course this is not easy to be created +by a shoveman. + +0:18:58.876 --> 0:19:07.960 +If ask you and give probabilities for how +probable this vehicle is, there might: So how + +0:19:07.960 --> 0:19:12.848 +we are doing is again that the lexicon is automatically +will be created from a corpus. + +0:19:13.333 --> 0:19:18.754 +And we're just counting here, so we count +how often does it work, how often does it co + +0:19:18.754 --> 0:19:24.425 +occur with vehicle, and then we're taking the +ratio and saying in the house of time on the + +0:19:24.425 --> 0:19:26.481 +English side there was vehicles. + +0:19:26.481 --> 0:19:31.840 +There was a probability of vehicles given +back, and there's something like zero point + +0:19:31.840 --> 0:19:32.214 +five. 
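A small sketch of the counting recipe just described, assuming we already have word-aligned (source word, target word) pairs from a parallel corpus; the toy pairs and the resulting numbers are invented for illustration:

```python
# Relative-frequency lexicon: p(e | f) = count(f, e) / count(f).
from collections import Counter, defaultdict

aligned_pairs = [("Wagen", "vehicle"), ("Wagen", "car"), ("Wagen", "vehicle"),
                 ("Wagen", "wagon"), ("Haus", "house")]

cooc = Counter(aligned_pairs)                       # count(f, e)
src_count = Counter(f for f, _ in aligned_pairs)    # count(f)

lexicon = defaultdict(dict)
for (f, e), c in cooc.items():
    lexicon[f][e] = c / src_count[f]

print(lexicon["Wagen"])   # e.g. {'vehicle': 0.5, 'car': 0.25, 'wagon': 0.25}
```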
+ +0:19:33.793 --> 0:19:46.669 +That we need another concept, and that is +this concept of alignment, and now you can + +0:19:46.669 --> 0:19:47.578 +have. + +0:19:47.667 --> 0:19:53.113 +Since this is quite complicated, the alignment +in general can be complex. + +0:19:53.113 --> 0:19:55.689 +It can be that it's not only like. + +0:19:55.895 --> 0:20:04.283 +It can be that two words of a surrender target +sign and it's also imbiguous. + +0:20:04.283 --> 0:20:13.761 +It can be that you say all these two words +only are aligned together and our words are + +0:20:13.761 --> 0:20:15.504 +aligned or not. + +0:20:15.875 --> 0:20:21.581 +Is should the do be aligned to the knot in +German? + +0:20:21.581 --> 0:20:29.301 +It's only there because in German it's not, +so it should be aligned. + +0:20:30.510 --> 0:20:39.736 +However, typically it's formalized and it's +formalized by a function from the target language. + +0:20:40.180 --> 0:20:44.051 +And that is to make these models get easier +and clearer. + +0:20:44.304 --> 0:20:49.860 +That means what means does it mean that you +have a fence that means that each. + +0:20:49.809 --> 0:20:58.700 +A sewer's word gives target word and the alliance +to only one source word because the function + +0:20:58.700 --> 0:21:00.384 +is also directly. + +0:21:00.384 --> 0:21:05.999 +However, a source word can be hit or like +by signal target. + +0:21:06.286 --> 0:21:11.332 +So you are allowing for one to many alignments, +but not for many to one alignment. + +0:21:11.831 --> 0:21:17.848 +That is a bit of a challenge because you assume +a lightning should be symmetrical. + +0:21:17.848 --> 0:21:24.372 +So if you look at a parallel sentence, it +should not matter if you look at it from German + +0:21:24.372 --> 0:21:26.764 +to English or English to German. + +0:21:26.764 --> 0:21:34.352 +So however, it makes these models: Yea possible +and we'll like to see yea for the phrase bass + +0:21:34.352 --> 0:21:36.545 +until we need these alignments. + +0:21:36.836 --> 0:21:41.423 +So this alignment was the most important of +the world based models. + +0:21:41.423 --> 0:21:47.763 +For the next twenty years you need the world +based models to generate this type of alignment, + +0:21:47.763 --> 0:21:50.798 +which is then the first step for the phrase. + +0:21:51.931 --> 0:21:59.642 +Approach, and there you can then combine them +again like both directions into one we'll see. + +0:22:00.280 --> 0:22:06.850 +This alignment is very important and allows +us to do this type of separation. + +0:22:08.308 --> 0:22:15.786 +And yet the most commonly used word based +models are these models referred to as IBM + +0:22:15.786 --> 0:22:25.422 +models, and there is a sequence of them with +great names: And they were like yeah very commonly + +0:22:25.422 --> 0:22:26.050 +used. + +0:22:26.246 --> 0:22:31.719 +We'll mainly focus on the simple one here +and look how this works and then not do all + +0:22:31.719 --> 0:22:34.138 +the details about the further models. + +0:22:34.138 --> 0:22:38.084 +The interesting thing is also that all of +them are important. + +0:22:38.084 --> 0:22:43.366 +So if you want to train this alignment what +you normally do is train an IVM model. + +0:22:43.743 --> 0:22:50.940 +Then you take that as your initialization +to then train the IBM model too and so on. + +0:22:50.940 --> 0:22:53.734 +The motivation for that is yeah. 
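The "alignment as a function" idea can be made concrete in a few lines of Python; the NULL entry at position 0 anticipates the empty word used later in the IBM models and is my addition here, purely for illustration:

```python
# One source position per target position: a[j] = i means e_{j+1} aligns to f_i.
src = ["NULL", "ich", "besuche", "einen", "Freund"]   # f_0 .. f_4
tgt = ["i", "visit", "a", "friend"]                   # e_1 .. e_4

a = [1, 2, 3, 4]
for j, i in enumerate(a):
    print(f"{tgt[j]} <- {src[i]}")

# One source word may serve several target words (one-to-many) ...
a_one_to_many = [1, 2, 2, 4]    # "visit" and "a" both aligned to "besuche"
# ... but two source words can never share one target position in this encoding.
```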
+ +0:22:53.734 --> 0:23:00.462 +The first model gives you: Is so simple that +you can even find a global optimum, so it gives + +0:23:00.462 --> 0:23:06.403 +you a good starting point for the next one +where the optimization in finding the right + +0:23:06.403 --> 0:23:12.344 +model is more difficult and therefore like +the defore technique was to make your model + +0:23:12.344 --> 0:23:13.641 +step by step more. + +0:23:15.195 --> 0:23:27.333 +In these models we are breaking down the probability +into smaller steps and then we can define: + +0:23:27.367 --> 0:23:38.981 +You see it's not a bit different, so it's not +the curability and one specific alignment given. + +0:23:39.299 --> 0:23:42.729 +We'll let us learn how we can then go from +one alignment to the full set. + +0:23:43.203 --> 0:23:52.889 +The probability of target sentences and one +alignment between the source and target sentences + +0:23:52.889 --> 0:23:56.599 +alignment is this type of function. + +0:23:57.057 --> 0:24:14.347 +That every word is aligned in order to ensure +that every word is aligned. + +0:24:15.835 --> 0:24:28.148 +So first of all you do some epsilon, the epsilon +is just a normalization factor that everything + +0:24:28.148 --> 0:24:31.739 +is somehow to inferability. + +0:24:31.631 --> 0:24:37.539 +Of source sentences plus one to the power +of the length of the targets. + +0:24:37.937 --> 0:24:50.987 +And this is somehow the probability of this +alignment. + +0:24:51.131 --> 0:24:53.224 +So is this alignment probable or not? + +0:24:53.224 --> 0:24:55.373 +Of course you can have some intuition. + +0:24:55.373 --> 0:24:58.403 +So if there's a lot of crossing, it may be +not a good. + +0:24:58.403 --> 0:25:03.196 +If all of the words align to the same one +might be not a good alignment, but generally + +0:25:03.196 --> 0:25:06.501 +it's difficult to really describe what is a +good alignment. + +0:25:07.067 --> 0:25:11.482 +Say for the first model that's the most simple +thing. + +0:25:11.482 --> 0:25:18.760 +What can be the most simple thing if you think +about giving a probability to some event? + +0:25:21.401 --> 0:25:25.973 +Yes exactly, so just take the uniform distribution. + +0:25:25.973 --> 0:25:33.534 +If we don't really know the best thing of +modeling is all equally probable, of course + +0:25:33.534 --> 0:25:38.105 +that is not true, but it's giving you a good +study. + +0:25:38.618 --> 0:25:44.519 +And so this one is just a number of all possible +alignments for this sentence. + +0:25:44.644 --> 0:25:53.096 +So how many alignments are possible, so the +first target word can be allied to all sources + +0:25:53.096 --> 0:25:53.746 +worth. + +0:25:54.234 --> 0:26:09.743 +The second one can also be aligned to all +source work, and the third one also to source. + +0:26:10.850 --> 0:26:13.678 +This is the number of alignments. + +0:26:13.678 --> 0:26:19.002 +The second part is to model the probability +of the translation. + +0:26:19.439 --> 0:26:31.596 +And there it's not nice to have this function, +so now we are making the product over all target. + +0:26:31.911 --> 0:26:40.068 +And we are making a very strong independent +assumption because in these models we normally + +0:26:40.068 --> 0:26:45.715 +assume the translation probability of one word +is independent. + +0:26:46.126 --> 0:26:49.800 +So how you translate and visit it is independent +of all the other parts. + +0:26:50.290 --> 0:26:52.907 +That is very strong and very bad. + +0:26:52.907 --> 0:26:55.294 +Yeah, you should do it better. 
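Putting the pieces of this passage together, IBM Model 1 is usually written as follows, with f the source sentence of length I, e the target sentence of length J, and a the alignment function with a_j in {0, ..., I} (position 0 being the empty word):

```latex
P(e, a \mid f) \;=\; \frac{\epsilon}{(I+1)^{J}} \prod_{j=1}^{J} t\!\left(e_j \mid f_{a_j}\right)
```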
+ +0:26:55.294 --> 0:27:00.452 +We know that it's wrong because how you translate +this depends on. + +0:27:00.452 --> 0:27:05.302 +However, it's a first easy solution and again +a good starting. + +0:27:05.966 --> 0:27:14.237 +So what you do is that you take a product +of all words and take a translation probability + +0:27:14.237 --> 0:27:15.707 +on this target. + +0:27:16.076 --> 0:27:23.901 +And because we know that there is always one +source word allied to that, so it. + +0:27:24.344 --> 0:27:37.409 +If the probability of visits in the zoo doesn't +really work, the good here I'm again. + +0:27:38.098 --> 0:27:51.943 +So most only we have it here, so the probability +is an absolute divided pipe to the power. + +0:27:53.913 --> 0:27:58.401 +And then there is somewhere in the last one. + +0:27:58.401 --> 0:28:04.484 +There is an arrow and switch, so it is the +other way around. + +0:28:04.985 --> 0:28:07.511 +Then you have your translation model. + +0:28:07.511 --> 0:28:12.498 +Hopefully let's assume you have your water +train so that's only a signing. + +0:28:12.953 --> 0:28:25.466 +And then this sentence has the probability +of generating I visit a friend given that you + +0:28:25.466 --> 0:28:31.371 +have the source sentence if Bezukhov I'm. + +0:28:32.012 --> 0:28:34.498 +Time stand to the power of minus five. + +0:28:35.155 --> 0:28:36.098 +So this is your model. + +0:28:36.098 --> 0:28:37.738 +This is how you're applying your model. + +0:28:39.479 --> 0:28:44.220 +As you said, it's the most simple bottle you +assume that all word translations are. + +0:28:44.204 --> 0:28:46.540 +Independent of each other. + +0:28:46.540 --> 0:28:54.069 +You assume that all alignments are equally +important, and then the only thing you need + +0:28:54.069 --> 0:29:00.126 +for this type of model is to have this lexicon +in order to calculate. + +0:29:00.940 --> 0:29:04.560 +And that is, of course, now the training process. + +0:29:04.560 --> 0:29:08.180 +The question is how do we get this type of +lexic? + +0:29:09.609 --> 0:29:15.461 +But before we look into the training, do you +have any questions about the model itself? + +0:29:21.101 --> 0:29:26.816 +The problem in training is that we have incomplete +data. + +0:29:26.816 --> 0:29:32.432 +So if you want to count, I mean said you want +to count. + +0:29:33.073 --> 0:29:39.348 +However, if you don't have the alignment, +on the other hand, if you would have a lexicon + +0:29:39.348 --> 0:29:44.495 +you could maybe generate the alignment, which +is the most probable word. + +0:29:45.225 --> 0:29:55.667 +And this is the very common problem that you +have this type of incomplete data where you + +0:29:55.667 --> 0:29:59.656 +have not one type of information. + +0:30:00.120 --> 0:30:08.767 +And you can model this by considering the +alignment as your hidden variable and then + +0:30:08.767 --> 0:30:17.619 +you can use the expectation maximization algorithm +in order to generate the alignment. + +0:30:17.577 --> 0:30:26.801 +So the nice thing is that you only need your +parallel data, which is aligned on sentence + +0:30:26.801 --> 0:30:29.392 +level, but you normally. + +0:30:29.389 --> 0:30:33.720 +Is just a lot of work we saw last time. + +0:30:33.720 --> 0:30:39.567 +Typically what you have is this type of corpus +where. + +0:30:41.561 --> 0:30:50.364 +And yeah, the ERM algorithm sounds very fancy. + +0:30:50.364 --> 0:30:58.605 +However, again look at a little high level. 
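A minimal sketch of scoring one sentence pair and alignment with the Model-1 formula above; the lexicon values are made up (the slide uses its own numbers), and the NULL word is left out for brevity:

```python
EPSILON = 1.0

t = {("ich", "i"): 0.8, ("besuche", "visit"): 0.6,
     ("einen", "a"): 0.5, ("freund", "friend"): 0.7}

def model1_score(src, tgt, alignment):
    """P(e, a | f) = epsilon / (I+1)^J * prod_j t(e_j | f_{a_j})."""
    I, J = len(src), len(tgt)
    p = EPSILON / (I + 1) ** J
    for j, i in enumerate(alignment):
        p *= t.get((src[i], tgt[j]), 1e-9)   # tiny floor for unseen pairs
    return p

src = ["ich", "besuche", "einen", "freund"]
tgt = ["i", "visit", "a", "friend"]
print(model1_score(src, tgt, [0, 1, 2, 3]))  # monotone one-to-one alignment
```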
+ +0:30:58.838 --> 0:31:05.841 +So you're initializing a model by uniform +distribution. + +0:31:05.841 --> 0:31:14.719 +You're just saying if have lexicon, if all +words are equally possible. + +0:31:15.215 --> 0:31:23.872 +And then you apply your model to the data, +and that is your expectation step. + +0:31:23.872 --> 0:31:30.421 +So given this initial lexicon, we are now +calculating the. + +0:31:30.951 --> 0:31:36.043 +So we can now take all our parallel sentences, +and of course ought to check what is the most + +0:31:36.043 --> 0:31:36.591 +probable. + +0:31:38.338 --> 0:31:49.851 +And then, of course, at the beginning maybe +houses most often in line. + +0:31:50.350 --> 0:31:58.105 +Once we have done this expectation step, we +can next do the maximization step and based + +0:31:58.105 --> 0:32:06.036 +on this guest alignment, which we have, we +can now learn better translation probabilities + +0:32:06.036 --> 0:32:09.297 +by just counting how often do words. + +0:32:09.829 --> 0:32:22.289 +And then it's rated these steps: We can make +this whole process even more stable, only taking + +0:32:22.289 --> 0:32:26.366 +the most probable alignment. + +0:32:26.346 --> 0:32:36.839 +Second step, but in contrast we calculate +for all possible alignments the alignment probability + +0:32:36.839 --> 0:32:40.009 +and weigh the correcurrence. + +0:32:40.000 --> 0:32:41.593 +Then Things Are Most. + +0:32:42.942 --> 0:32:49.249 +Why could that be very challenging if we do +it in general and really calculate all probabilities + +0:32:49.249 --> 0:32:49.834 +for all? + +0:32:53.673 --> 0:32:55.905 +How many alignments are there for a Simpson? + +0:32:58.498 --> 0:33:03.344 +Yes there, we just saw that in the formula +if you remember. + +0:33:03.984 --> 0:33:12.336 +This was the formula so it's exponential in +the lengths of the target sentence. + +0:33:12.336 --> 0:33:15.259 +It would calculate all the. + +0:33:15.415 --> 0:33:18.500 +Be very inefficient and really possible. + +0:33:18.500 --> 0:33:25.424 +The nice thing is we can again use some type +of dynamic programming, so then we can do this + +0:33:25.424 --> 0:33:27.983 +without really calculating audit. + +0:33:28.948 --> 0:33:40.791 +We have the next pipe slides or so with the +most equations in the whole lecture, so don't + +0:33:40.791 --> 0:33:41.713 +worry. + +0:33:42.902 --> 0:34:01.427 +So we said we have first explanation where +it is about calculating the alignment. + +0:34:02.022 --> 0:34:20.253 +And we can do this with our initial definition +of because this formula. + +0:34:20.160 --> 0:34:25.392 +So we can define this as and and divided by +and. + +0:34:25.905 --> 0:34:30.562 +This is just the normal definition of a conditional +probability. + +0:34:31.231 --> 0:34:37.937 +And what we then need to assume a meter calculate +is P of E given. + +0:34:37.937 --> 0:34:41.441 +P of E given is still again quiet. + +0:34:41.982 --> 0:34:56.554 +Simple: The probability of the sewer sentence +given the target sentence is quite intuitive. + +0:34:57.637 --> 0:35:15.047 +So let's just calculate how to calculate the +probability of a event. + +0:35:15.215 --> 0:35:21.258 +So in here we can then put in our original +form in our soils. + +0:35:21.201 --> 0:35:28.023 +There are some of the possible alignments +of the first word, and so until the sum of + +0:35:28.023 --> 0:35:30.030 +all possible alignments. + +0:35:29.990 --> 0:35:41.590 +And then we have the probability here of the +alignment type, this product of translation. 
+ +0:35:42.562 --> 0:35:58.857 +Now this one is independent of the alignment, +so we can put it to the front here. + +0:35:58.959 --> 0:36:03.537 +And now this is where dynamic programming +works in. + +0:36:03.537 --> 0:36:08.556 +We can change that and make thereby things +a lot easier. + +0:36:08.668 --> 0:36:21.783 +Can reform it like this just as a product +over all target positions, and then it's the + +0:36:21.783 --> 0:36:26.456 +sum over all source positions. + +0:36:27.127 --> 0:36:36.454 +Maybe at least the intuition why this is equal +is a lot easier if you look into it as graphic. + +0:36:36.816 --> 0:36:39.041 +So what we have here is the table. + +0:36:39.041 --> 0:36:42.345 +We have the target position and the Swiss +position. + +0:36:42.862 --> 0:37:03.643 +And we have to sum up all possible passes +through that: The nice thing is that each of + +0:37:03.643 --> 0:37:07.127 +these passes these probabilities are independent +of each. + +0:37:07.607 --> 0:37:19.678 +In order to get the sum of all passes through +this table you can use dynamic programming + +0:37:19.678 --> 0:37:27.002 +and then say oh this probability is exactly +the same. + +0:37:26.886 --> 0:37:34.618 +Times the sun of this column finds the sum +of this column, and times the sun of this colun. + +0:37:35.255 --> 0:37:41.823 +That is the same as if you go through all +possible passes here and multiply always the + +0:37:41.823 --> 0:37:42.577 +elements. + +0:37:43.923 --> 0:37:54.227 +And that is a simplification because now we +only have quadratic numbers and we don't have + +0:37:54.227 --> 0:37:55.029 +to go. + +0:37:55.355 --> 0:38:12.315 +Similar to guess you may be seen the same +type of algorithm for what is it? + +0:38:14.314 --> 0:38:19.926 +Yeah, well yeah, so that is the saying. + +0:38:19.926 --> 0:38:31.431 +But yeah, I think graphically this is seeable +if you don't know exactly the mass. + +0:38:32.472 --> 0:38:49.786 +Now put these both together, so if you really +want to take a piece of and put these two formulas + +0:38:49.786 --> 0:38:51.750 +together,. + +0:38:51.611 --> 0:38:56.661 +Eliminated and Then You Get Your Final Formula. + +0:38:56.716 --> 0:39:01.148 +And that somehow really makes now really intuitively +again sense. + +0:39:01.401 --> 0:39:08.301 +So the probability of an alignment is the +product of all target sentences, and then it's + +0:39:08.301 --> 0:39:15.124 +the probability of to translate a word into +the word that is aligned to divided by some + +0:39:15.124 --> 0:39:17.915 +of the other words in the sentence. + +0:39:18.678 --> 0:39:31.773 +If you look at this again, it makes real descent. + +0:39:31.891 --> 0:39:43.872 +So you're looking at how probable it is to +translate compared to all the other words. + +0:39:43.872 --> 0:39:45.404 +So you're. + +0:39:45.865 --> 0:39:48.543 +So and that gives you the alignment probability. + +0:39:48.768 --> 0:39:54.949 +Somehow it's not only that it's mathematically +correct if you look at it this way, it's somehow + +0:39:54.949 --> 0:39:55.785 +intuitively. + +0:39:55.785 --> 0:39:58.682 +So if you would say how good is it to align? + +0:39:58.638 --> 0:40:04.562 +We had to zoo him to visit, or yet it should +depend on how good this is the translation + +0:40:04.562 --> 0:40:10.620 +probability compared to how good are the other +words in the sentence, and how probable is + +0:40:10.620 --> 0:40:12.639 +it that I align them to them. 
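The two rearrangements described here, written out: the first is the dynamic-programming trick (each target position chooses its source position independently, so the sum over all (I+1)^J alignments factorises), the second is the resulting alignment posterior:

```latex
P(e \mid f) \;=\; \sum_{a} P(e, a \mid f)
           \;=\; \frac{\epsilon}{(I+1)^{J}} \prod_{j=1}^{J} \sum_{i=0}^{I} t\!\left(e_j \mid f_i\right)

P(a \mid e, f) \;=\; \frac{P(e, a \mid f)}{P(e \mid f)}
             \;=\; \prod_{j=1}^{J} \frac{t\!\left(e_j \mid f_{a_j}\right)}{\sum_{i=0}^{I} t\!\left(e_j \mid f_i\right)}
```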
+ +0:40:15.655 --> 0:40:26.131 +Then you have the expectations that the next +thing is now the maximization step, so we have + +0:40:26.131 --> 0:40:30.344 +now the probability of an alignment. + +0:40:31.451 --> 0:40:37.099 +Intuitively, that means how often are words +aligned to each other giving this alignment + +0:40:37.099 --> 0:40:39.281 +or more in a perverse definition? + +0:40:39.281 --> 0:40:43.581 +What is the expectation value that they are +aligned to each other? + +0:40:43.581 --> 0:40:49.613 +So if there's a lot of alignments with hyperability +that they're aligned to each other, then. + +0:40:50.050 --> 0:41:07.501 +So the count of E and given F given our caravan +data is a sum of all possible alignments. + +0:41:07.968 --> 0:41:14.262 +That is, this count, and you don't do just +count with absolute numbers, but you count + +0:41:14.262 --> 0:41:14.847 +always. + +0:41:15.815 --> 0:41:26.519 +And to make that translation probability is +that you have to normalize it, of course, through: + +0:41:27.487 --> 0:41:30.584 +And that's then the whole model. + +0:41:31.111 --> 0:41:39.512 +It looks now maybe a bit mathematically complex. + +0:41:39.512 --> 0:41:47.398 +The whole training process is described here. + +0:41:47.627 --> 0:41:53.809 +So you really, really just have to collect +these counts and later normalize that. + +0:41:54.134 --> 0:42:03.812 +So repeating that until convergence we have +said the ear migration is always done again. + +0:42:04.204 --> 0:42:15.152 +Equally, then you go over all sentence pairs +and all of words and calculate the translation. + +0:42:15.355 --> 0:42:17.983 +And then you go once again over. + +0:42:17.983 --> 0:42:22.522 +It counted this count, count given, and totally +e-given. + +0:42:22.702 --> 0:42:35.316 +Initially how probable is the E translated +to something else, and you normalize your translation + +0:42:35.316 --> 0:42:37.267 +probabilities. + +0:42:38.538 --> 0:42:45.761 +So this is an old training process for this +type of. + +0:42:46.166 --> 0:43:00.575 +How that then works is shown here a bit, so +we have a very simple corpus. + +0:43:01.221 --> 0:43:12.522 +And as we said, you initialize your translation +with yes or possible translations, so dusk + +0:43:12.522 --> 0:43:16.620 +can be aligned to the bookhouse. + +0:43:16.997 --> 0:43:25.867 +And the other ones are missing because only +a curse with and book, and then the others + +0:43:25.867 --> 0:43:26.988 +will soon. + +0:43:27.127 --> 0:43:34.316 +In the initial way your vocabulary is for +works, so the initial probabilities are all: + +0:43:34.794 --> 0:43:50.947 +And then if you iterate you see that the things +which occur often and then get alignments get + +0:43:50.947 --> 0:43:53.525 +more and more. + +0:43:55.615 --> 0:44:01.506 +In reality, of course, you won't get like +zero alignments, but you would normally get + +0:44:01.506 --> 0:44:02.671 +there sometimes. + +0:44:03.203 --> 0:44:05.534 +But as the probability increases. + +0:44:05.785 --> 0:44:17.181 +The training process is also guaranteed that +the probability of your training data is always + +0:44:17.181 --> 0:44:20.122 +increased in iteration. + +0:44:21.421 --> 0:44:27.958 +You see that the model tries to model your +training data and give you at least good models. + +0:44:30.130 --> 0:44:37.765 +Okay, are there any more questions to the +training of these type of word-based models? 
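The whole training loop just described fits in a few lines. This is a minimal IBM-Model-1 EM sketch: NULL word and epsilon are omitted, and the toy corpus is only modelled on the house/book example, so the exact numbers are not the ones from the slides:

```python
from collections import defaultdict

corpus = [("das haus".split(), "the house".split()),
          ("das buch".split(), "the book".split()),
          ("ein buch".split(), "a book".split())]

# Uniform initialisation: every target word equally likely for every source word.
tgt_vocab = {e for _, tgt in corpus for e in tgt}
t = defaultdict(lambda: 1.0 / len(tgt_vocab))

for iteration in range(10):
    count = defaultdict(float)       # expected count(f, e)
    total = defaultdict(float)       # expected count(f)
    for src, tgt in corpus:          # E-step: collect fractional counts
        for e in tgt:
            norm = sum(t[(f, e)] for f in src)
            for f in src:
                c = t[(f, e)] / norm         # posterior that e is aligned to f
                count[(f, e)] += c
                total[f] += c
    for (f, e), c in count.items():  # M-step: renormalise
        t[(f, e)] = c / total[f]

print(round(t[("haus", "house")], 3), round(t[("das", "the")], 3))
```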
+ +0:44:38.838 --> 0:44:54.790 +Initially there is like forwards in the source +site, so it's just one force to do equal distribution. + +0:44:55.215 --> 0:45:01.888 +So each target word, the probability of the +target word, is at four target words, so the + +0:45:01.888 --> 0:45:03.538 +uniform distribution. + +0:45:07.807 --> 0:45:14.430 +However, there is problems with this initial +order and we have this already mentioned at + +0:45:14.430 --> 0:45:15.547 +the beginning. + +0:45:15.547 --> 0:45:21.872 +There is for example things that yeah you +want to allow for reordering but there are + +0:45:21.872 --> 0:45:27.081 +definitely some alignments which should be +more probable than others. + +0:45:27.347 --> 0:45:42.333 +So a friend visit should have a lower probability +than visit a friend. + +0:45:42.302 --> 0:45:50.233 +It's not always monitoring, there is some +reordering happening, but if you just mix it + +0:45:50.233 --> 0:45:51.782 +crazy, it's not. + +0:45:52.252 --> 0:46:11.014 +You have slings like one too many alignments +and they are not really models. + +0:46:11.491 --> 0:46:17.066 +But it shouldn't be that you align one word +to all the others, and that is, you don't want + +0:46:17.066 --> 0:46:18.659 +this type of probability. + +0:46:19.199 --> 0:46:27.879 +You don't want to align to null, so there's +nothing about that and how to deal with other + +0:46:27.879 --> 0:46:30.386 +words on the source side. + +0:46:32.272 --> 0:46:45.074 +And therefore this was only like the initial +model in there. + +0:46:45.325 --> 0:46:47.639 +Models, which we saw. + +0:46:47.639 --> 0:46:57.001 +They only model the translation probability, +so how probable is it to translate one word + +0:46:57.001 --> 0:46:58.263 +to another? + +0:46:58.678 --> 0:47:05.915 +What you could then add is the absolute position. + +0:47:05.915 --> 0:47:16.481 +Yeah, the second word should more probable +align to the second position. + +0:47:17.557 --> 0:47:22.767 +We add a fertility model that means one word +is mostly translated into one word. + +0:47:23.523 --> 0:47:29.257 +For example, we saw it there that should be +translated into two words, but most words should + +0:47:29.257 --> 0:47:32.463 +be one to one, and it's even modeled for each +word. + +0:47:32.463 --> 0:47:37.889 +So for each source word, how probable is it +that it is translated to one, two, three or + +0:47:37.889 --> 0:47:38.259 +more? + +0:47:40.620 --> 0:47:50.291 +Then either one of four acts relative positions, +so it's asks: Maybe instead of modeling, how + +0:47:50.291 --> 0:47:55.433 +probable is it that you translate from position +five to position twenty five? + +0:47:55.433 --> 0:48:01.367 +It's not a very good way, but in a relative +position instead of what you try to model it. + +0:48:01.321 --> 0:48:06.472 +How probable is that you are jumping Swiss +steps forward or Swiss steps back? + +0:48:07.287 --> 0:48:15.285 +However, this makes sense more complex because +what is a jump forward and a jump backward + +0:48:15.285 --> 0:48:16.885 +is not that easy. + +0:48:18.318 --> 0:48:30.423 +You want to have a model that describes reality, +so every sentence that is not possible should + +0:48:30.423 --> 0:48:37.304 +have the probability zero because that cannot +happen. + +0:48:37.837 --> 0:48:48.037 +However, with this type of IBM model four +this has a positive probability, so it makes + +0:48:48.037 --> 0:48:54.251 +a sentence more complex and you can easily +check it. 
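The lecture only sketches these extensions; for concreteness, the "absolute position" variant (IBM Model 2) is usually written by replacing Model 1's uniform alignment prior with a learned distribution a(i | j, J, I):

```latex
P(e, a \mid f) \;=\; \epsilon \prod_{j=1}^{J} t\!\left(e_j \mid f_{a_j}\right)\, a\!\left(a_j \mid j, J, I\right)
```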
+ +0:48:57.457 --> 0:49:09.547 +So these models were the first models which +tried to directly model and where they are + +0:49:09.547 --> 0:49:14.132 +the first to do the translation. + +0:49:14.414 --> 0:49:19.605 +So in all of these models, the probability +of a word translating into another word is + +0:49:19.605 --> 0:49:25.339 +always independent of all the other translations, +and that is a challenge because we know that + +0:49:25.339 --> 0:49:26.486 +this is not right. + +0:49:26.967 --> 0:49:32.342 +And therefore we will come now to then the +phrase-based translation models. + +0:49:35.215 --> 0:49:42.057 +However, this word alignment is the very important +concept which was used in phrase based. + +0:49:42.162 --> 0:49:50.559 +Even when people use phrase based, they first +would always train a word based model not to + +0:49:50.559 --> 0:49:56.188 +get the really model but only to get this type +of alignment. + +0:49:57.497 --> 0:50:01.343 +What was the main idea of a phrase based machine +translation? + +0:50:03.223 --> 0:50:08.898 +It's not only that things got mathematically +a lot more simple here because you don't try + +0:50:08.898 --> 0:50:13.628 +to express the whole translation process, but +it's a discriminative model. + +0:50:13.628 --> 0:50:19.871 +So what you only try to model is this translation +probability or is this translation more probable + +0:50:19.871 --> 0:50:20.943 +than some other. + +0:50:24.664 --> 0:50:28.542 +The main idea is that the basic units are +are the phrases. + +0:50:28.542 --> 0:50:31.500 +That's why it's called phrase phrase phrase. + +0:50:31.500 --> 0:50:35.444 +You have to be aware that these are not linguistic +phrases. + +0:50:35.444 --> 0:50:39.124 +I guess you have some intuition about what +is a phrase. + +0:50:39.399 --> 0:50:45.547 +You would express as a phrase. + +0:50:45.547 --> 0:50:58.836 +However, you wouldn't say that is a very good +phrase because it's. + +0:50:59.339 --> 0:51:06.529 +However, in this machine learning-based motivated +thing, phrases are just indicative. + +0:51:07.127 --> 0:51:08.832 +So it can be any split. + +0:51:08.832 --> 0:51:12.455 +We don't consider linguistically motivated +or not. + +0:51:12.455 --> 0:51:15.226 +It can be any sequence of consecutive. + +0:51:15.335 --> 0:51:16.842 +That's the Only Important Thing. + +0:51:16.977 --> 0:51:25.955 +The phrase is always a thing of consecutive +words, and the motivation behind that is getting + +0:51:25.955 --> 0:51:27.403 +computational. + +0:51:27.387 --> 0:51:35.912 +People have looked into how you can also discontinuous +phrases, which might be very helpful if you + +0:51:35.912 --> 0:51:38.237 +think about German harbor. + +0:51:38.237 --> 0:51:40.046 +Has this one phrase? + +0:51:40.000 --> 0:51:47.068 +There's two phrases, although there's many +things in between, but in order to make things + +0:51:47.068 --> 0:51:52.330 +still possible and runner will, it's always +like consecutive work. + +0:51:53.313 --> 0:52:05.450 +The nice thing is that on the one hand you +don't need this word to word correspondence + +0:52:05.450 --> 0:52:06.706 +anymore. + +0:52:06.906 --> 0:52:17.088 +You now need to invent some type of alignment +that in this case doesn't really make sense. + +0:52:17.417 --> 0:52:21.710 +So you can just learn okay, you have this +phrase and this phrase and their translation. + +0:52:22.862 --> 0:52:25.989 +Secondly, we can add a bit of context into +that. 
+ +0:52:26.946 --> 0:52:43.782 +You're saying, for example, of Ultimate Customs +and of My Shift. + +0:52:44.404 --> 0:52:51.443 +And this was difficult to model and work based +models because they always model the translation. + +0:52:52.232 --> 0:52:57.877 +Here you can have phrases where you have more +context and just jointly translate the phrases, + +0:52:57.877 --> 0:53:03.703 +and if you then have seen all by the question +as a phrase you can directly use that to generate. + +0:53:08.468 --> 0:53:19.781 +Okay, before we go into how to do that, then +we start, so the start is when we start with + +0:53:19.781 --> 0:53:21.667 +the alignment. + +0:53:22.022 --> 0:53:35.846 +So that is what we get from the work based +model and we are assuming to get the. + +0:53:36.356 --> 0:53:40.786 +So that is your starting point. + +0:53:40.786 --> 0:53:47.846 +You have a certain sentence and one most probable. + +0:53:48.989 --> 0:54:11.419 +The challenge you now have is that these alignments +are: On the one hand, a source word like hit + +0:54:11.419 --> 0:54:19.977 +several times with one source word can be aligned +to several: So in this case you see that for + +0:54:19.977 --> 0:54:29.594 +example Bisher is aligned to three words, so +this can be the alignment from English to German, + +0:54:29.594 --> 0:54:32.833 +but it cannot be the alignment. + +0:54:33.273 --> 0:54:41.024 +In order to address for this inconsistency +and being able to do that, what you typically + +0:54:41.024 --> 0:54:49.221 +then do is: If you have this inconsistency +and you get different things in both directions,. + +0:54:54.774 --> 0:55:01.418 +In machine translation to do that you just +do it in both directions and somehow combine + +0:55:01.418 --> 0:55:08.363 +them because both will do arrows and the hope +is yeah if you know both things you minimize. + +0:55:08.648 --> 0:55:20.060 +So you would also do it in the other direction +and get a different type of lineup, for example + +0:55:20.060 --> 0:55:22.822 +that you now have saw. + +0:55:23.323 --> 0:55:37.135 +So in this way you are having two alignments +and the question is now how do get one alignment + +0:55:37.135 --> 0:55:38.605 +and what? + +0:55:38.638 --> 0:55:45.828 +There were a lot of different types of heuristics. + +0:55:45.828 --> 0:55:55.556 +They normally start with intersection because +you should trust them. + +0:55:55.996 --> 0:55:59.661 +And your maximum will could take this, the +union thought,. + +0:55:59.980 --> 0:56:04.679 +If one of the systems says they are not aligned +then maybe you should not align them. + +0:56:05.986 --> 0:56:12.240 +The only question they are different is what +should I do about things where they don't agree? + +0:56:12.240 --> 0:56:18.096 +So where only one of them enlines and then +you have heuristics depending on other words + +0:56:18.096 --> 0:56:22.288 +around it, you can decide should I align them +or should I not. + +0:56:24.804 --> 0:56:34.728 +So that is your first step and then the second +step in your model. + +0:56:34.728 --> 0:56:41.689 +So now you have one alignment for the process. + +0:56:42.042 --> 0:56:47.918 +And the idea is that we will now extract all +phrase pairs to combinations of source and + +0:56:47.918 --> 0:56:51.858 +target phrases where they are consistent within +alignment. + +0:56:52.152 --> 0:56:57.980 +The idea is a consistence with an alignment +that should be a good example and that we can + +0:56:57.980 --> 0:56:58.563 +extract. 
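A small sketch of the symmetrisation heuristic described above: run the word alignment in both directions, trust the intersection, and optionally grow towards the union. The growing rule here is a simplified neighbour check, not the exact grow-diag-final heuristic:

```python
# Alignments are sets of (source index, target index) links.
src_to_tgt = {(0, 0), (1, 1), (1, 2), (3, 3)}   # e.g. from the German->English model
tgt_to_src = {(0, 0), (1, 1), (2, 2), (3, 3)}   # e.g. from the English->German model

intersection = src_to_tgt & tgt_to_src          # high precision, start here
union = src_to_tgt | tgt_to_src                 # high recall, upper bound

NEIGHBOURS = [(-1, 0), (1, 0), (0, -1), (0, 1)]

def grow(intersection, union):
    """Add union-only links that touch an already accepted link."""
    alignment = set(intersection)
    added = True
    while added:
        added = False
        for (i, j) in sorted(union - alignment):
            if any((i + di, j + dj) in alignment for di, dj in NEIGHBOURS):
                alignment.add((i, j))
                added = True
    return alignment

print(sorted(grow(intersection, union)))
```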
+ +0:56:59.459 --> 0:57:14.533 +And there are three conditions where we say +an alignment has to be consistent. + +0:57:14.533 --> 0:57:17.968 +The first one is. + +0:57:18.318 --> 0:57:24.774 +So if you add bisher, then it's in your phrase. + +0:57:24.774 --> 0:57:32.306 +All the three words up till and now should +be in there. + +0:57:32.492 --> 0:57:42.328 +So Bisheret Till would not be a valid phrase +pair in this case, but for example Bisheret + +0:57:42.328 --> 0:57:43.433 +Till now. + +0:57:45.525 --> 0:58:04.090 +Does anybody now have already an idea about +the second rule that should be there? + +0:58:05.325 --> 0:58:10.529 +Yes, that is exactly the other thing. + +0:58:10.529 --> 0:58:22.642 +If a target verse is in the phrase pair, there +are also: Then there is one very obvious one. + +0:58:22.642 --> 0:58:28.401 +If you strike a phrase pair, at least one +word in the phrase. + +0:58:29.069 --> 0:58:32.686 +And this is a knife with working. + +0:58:32.686 --> 0:58:40.026 +However, in reality a captain will select +some part of the sentence. + +0:58:40.380 --> 0:58:47.416 +You can take any possible combination of sewers +and target words for this part, and that of + +0:58:47.416 --> 0:58:54.222 +course is not very helpful because you just +have no idea, and therefore it says at least + +0:58:54.222 --> 0:58:58.735 +one sewer should be aligned to one target word +to prevent. + +0:58:59.399 --> 0:59:09.615 +But still, it means that if you have normally +analyzed words, the more analyzed words you + +0:59:09.615 --> 0:59:10.183 +can. + +0:59:10.630 --> 0:59:13.088 +That's not true for the very extreme case. + +0:59:13.088 --> 0:59:17.603 +If no word is a line you can extract nothing +because you can never fulfill it. + +0:59:17.603 --> 0:59:23.376 +However, if only for example one word is aligned +then you can align a lot of different possibilities + +0:59:23.376 --> 0:59:28.977 +because you can start with this word and then +add source words or target words or any combination + +0:59:28.977 --> 0:59:29.606 +of source. + +0:59:30.410 --> 0:59:37.585 +So there was typically a problem that if you +have too few works in light you can really + +0:59:37.585 --> 0:59:38.319 +extract. + +0:59:38.558 --> 0:59:45.787 +If you think about this already here you can +extract very, very many phrase pairs from: + +0:59:45.845 --> 0:59:55.476 +So what you can extract is, for example, what +we saw up and so on. + +0:59:55.476 --> 1:00:00.363 +So all of them will be extracted. + +1:00:00.400 --> 1:00:08.379 +In order to limit this you typically have +a length limit so you can only extract phrases + +1:00:08.379 --> 1:00:08.738 +up. + +1:00:09.049 --> 1:00:18.328 +But still there these phrases where you have +all these phrases extracted. + +1:00:18.328 --> 1:00:22.968 +You have to think about how to deal. + +1:00:26.366 --> 1:00:34.966 +Now we have the phrases, so the other question +is what is a good phrase pair and not so good. + +1:00:35.255 --> 1:00:39.933 +You might be that you sometimes extract one +which is explaining this sentence but is not + +1:00:39.933 --> 1:00:44.769 +really a good one because there is something +ever in there or something special so it might + +1:00:44.769 --> 1:00:47.239 +not be a good phase pair in another situation. + +1:00:49.629 --> 1:00:59.752 +And therefore the easiest thing is again just +count, and if a phrase pair occurs very often + +1:00:59.752 --> 1:01:03.273 +seems to be a good phrase pair. + +1:01:03.743 --> 1:01:05.185 +So if we have this one. 
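The three consistency conditions translate directly into the standard phrase-extraction loop. A simplified sketch follows; the alignment points for the "bisher / up till now" example are my guess at what the slide shows:

```python
# Extract all phrase pairs whose alignment box contains at least one link and
# from which no link escapes, up to a maximum phrase length.
MAX_LEN = 4

def extract_phrases(src, tgt, alignment):
    """alignment: set of (i, j) links between src position i and tgt position j."""
    phrases = []
    for i1 in range(len(src)):
        for i2 in range(i1, min(i1 + MAX_LEN, len(src))):
            for j1 in range(len(tgt)):
                for j2 in range(j1, min(j1 + MAX_LEN, len(tgt))):
                    inside = [(i, j) for (i, j) in alignment
                              if i1 <= i <= i2 and j1 <= j <= j2]
                    if not inside:                              # condition 3
                        continue
                    consistent = all(i1 <= i <= i2 and j1 <= j <= j2
                                     for (i, j) in alignment
                                     if i1 <= i <= i2 or j1 <= j <= j2)
                    if consistent:                              # conditions 1 and 2
                        phrases.append((" ".join(src[i1:i2 + 1]),
                                        " ".join(tgt[j1:j2 + 1])))
    return phrases

src = ["bisher"]
tgt = ["up", "till", "now"]
alignment = {(0, 0), (0, 1), (0, 2)}     # bisher aligned to all three target words
for pair in extract_phrases(src, tgt, alignment):
    print(pair)                          # only ('bisher', 'up till now') survives
```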
+ +1:01:05.665 --> 1:01:09.179 +And if you have the exam up till now,. + +1:01:09.469 --> 1:01:20.759 +Then you look how often does up till now to +this hair occur? + +1:01:20.759 --> 1:01:28.533 +How often does up until now to this hair? + +1:01:30.090 --> 1:01:36.426 +So this is one way of yeah describing the +quality of the phrase book. + +1:01:37.257 --> 1:01:47.456 +So one difference is now, and that is the +advantage of these primitive models. + +1:01:47.867 --> 1:01:55.442 +But instead we are trying to have a lot of +features describing how good a phrase parent + +1:01:55.442 --> 1:01:55.786 +is. + +1:01:55.786 --> 1:02:04.211 +One of these features is this one describing: +But in this model we'll later see how to combine + +1:02:04.211 --> 1:02:04.515 +it. + +1:02:04.515 --> 1:02:10.987 +The nice thing is we can invent any other +type of features and add that and normally + +1:02:10.987 --> 1:02:14.870 +if you have two or three metrics to describe +then. + +1:02:15.435 --> 1:02:18.393 +And therefore the spray spray sprays. + +1:02:18.393 --> 1:02:23.220 +They were not only like evaluated by one type +but by several. + +1:02:23.763 --> 1:02:36.580 +So this could, for example, have a problem +because your target phrase here occurs only + +1:02:36.580 --> 1:02:37.464 +once. + +1:02:38.398 --> 1:02:46.026 +It will of course only occur with one other +source trait, and that probability will be + +1:02:46.026 --> 1:02:53.040 +one which might not be a very good estimation +because you've only seen it once. + +1:02:53.533 --> 1:02:58.856 +Therefore, we use additional ones to better +deal with that, and the first thing is we're + +1:02:58.856 --> 1:02:59.634 +doing again. + +1:02:59.634 --> 1:03:01.129 +Yeah, we know it by now. + +1:03:01.129 --> 1:03:06.692 +If you look at it in the one direction, it's +helpful to us to look into the other direction. + +1:03:06.692 --> 1:03:11.297 +So you take also the inverse probability, +so you not only take in peer of E. + +1:03:11.297 --> 1:03:11.477 +G. + +1:03:11.477 --> 1:03:11.656 +M. + +1:03:11.656 --> 1:03:12.972 +F., but also peer of. + +1:03:13.693 --> 1:03:19.933 +And then in addition you say maybe for the +especially prolonged phrases they occur rarely, + +1:03:19.933 --> 1:03:25.898 +and then you have very high probabilities, +and that might not be always the right one. + +1:03:25.898 --> 1:03:32.138 +So maybe it's good to also look at the word +based probabilities to represent how good they + +1:03:32.138 --> 1:03:32.480 +are. + +1:03:32.692 --> 1:03:44.202 +So in addition you take the work based probabilities +of this phrase pair as an additional model. + +1:03:44.704 --> 1:03:52.828 +So then you would have in total four different +values describing how good the phrase is. + +1:03:52.828 --> 1:04:00.952 +It would be the relatively frequencies in +both directions and the lexical probabilities. + +1:04:01.361 --> 1:04:08.515 +So four values in describing how probable +a phrase translation is. + +1:04:11.871 --> 1:04:20.419 +Then the next challenge is how can we combine +these different types of probabilities into + +1:04:20.419 --> 1:04:23.458 +a global score saying how good? + +1:04:24.424 --> 1:04:36.259 +Model, but before we are doing that give any +questions to this phrase extraction and phrase + +1:04:36.259 --> 1:04:37.546 +creation. + +1:04:40.260 --> 1:04:44.961 +And the motivation for that this was our initial +moral. 
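A sketch of the two relative-frequency scores just described, computed from extracted phrase-pair counts (the counts below are toy values); the two lexical-weight features would be added in the same way from the word-level lexicon:

```python
from collections import Counter

phrase_pairs = [("bisher", "up till now"), ("bisher", "until now"),
                ("bisher", "up till now"), ("jetzt", "now")]

pair_count = Counter(phrase_pairs)
src_count = Counter(f for f, _ in phrase_pairs)
tgt_count = Counter(e for _, e in phrase_pairs)

for (f, e), c in pair_count.items():
    phi_e_given_f = c / src_count[f]     # phi(e | f)
    phi_f_given_e = c / tgt_count[e]     # phi(f | e)
    print(f"{f!r} -> {e!r}: phi(e|f)={phi_e_given_f:.2f}, phi(f|e)={phi_f_given_e:.2f}")
```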
+ +1:04:44.961 --> 1:04:52.937 +If you remember from the beginning of a lecture +we had the probability of like PFO three times + +1:04:52.937 --> 1:04:53.357 +PFO. + +1:04:55.155 --> 1:04:57.051 +Now the problem is here. + +1:04:57.051 --> 1:04:59.100 +That is, of course, right. + +1:04:59.100 --> 1:05:06.231 +However, we have done a lot of simplification +that the translation probability is independent + +1:05:06.231 --> 1:05:08.204 +of the other translation. + +1:05:08.628 --> 1:05:14.609 +So therefore our estimations of pH give me +and pH might not be right, and therefore the + +1:05:14.609 --> 1:05:16.784 +combination might not be right. + +1:05:17.317 --> 1:05:22.499 +So it can be that, for example, at the edge +you have a fluid but not accurate translation. + +1:05:22.782 --> 1:05:25.909 +And Then There's Could Be an Easy Way Around +It. + +1:05:26.126 --> 1:05:32.019 +If our effluent but not accurate, it might +be that we put too much effort on the language + +1:05:32.019 --> 1:05:36.341 +model and we are putting too few effort on +the translation model. + +1:05:36.936 --> 1:05:43.016 +There we can wait a minute so we can do this +a bit stronger. + +1:05:43.016 --> 1:05:46.305 +This one is more important than. + +1:05:48.528 --> 1:05:53.511 +And based on that we can extend this idea +to the lacteria mole. + +1:05:53.893 --> 1:06:02.164 +The log linear model now says all the translation +probabilities is just we have. + +1:06:02.082 --> 1:06:09.230 +Describing how good this translation process +is, these are the speeches H which depend on + +1:06:09.230 --> 1:06:09.468 +E. + +1:06:09.468 --> 1:06:09.706 +F. + +1:06:09.706 --> 1:06:13.280 +Only one of them, but generally depend on +E. + +1:06:13.280 --> 1:06:13.518 +E. + +1:06:13.518 --> 1:06:13.757 +E. + +1:06:13.757 --> 1:06:13.995 +N. + +1:06:13.995 --> 1:06:14.233 +F. + +1:06:14.474 --> 1:06:22.393 +Each of these pictures has a weight saying +yeah how good does it model it so that if you're + +1:06:22.393 --> 1:06:29.968 +asking a lot of people about some opinion it +might also be waiting some opinion more so + +1:06:29.968 --> 1:06:34.100 +I put more effort on that and he may not be +so. + +1:06:34.314 --> 1:06:39.239 +If you're saying that it's maybe a good indication, +yeah, would trust that much. + +1:06:39.559 --> 1:06:41.380 +And exactly you can do that for you too. + +1:06:41.380 --> 1:06:42.446 +You can't add no below. + +1:06:43.423 --> 1:07:01.965 +It's like depending on how many you want to +have and each of the features gives you value. + +1:07:02.102 --> 1:07:12.655 +The nice thing is that we can normally ignore +because we are not interested in the probability + +1:07:12.655 --> 1:07:13.544 +itself. + +1:07:13.733 --> 1:07:18.640 +And again, if that's not normalized, that's +fine. + +1:07:18.640 --> 1:07:23.841 +So if this value is the highest, that's the +highest. + +1:07:26.987 --> 1:07:29.302 +Can we do that? + +1:07:29.302 --> 1:07:34.510 +Let's start with two simple things. + +1:07:34.510 --> 1:07:39.864 +Then you have one translation model. + +1:07:40.000 --> 1:07:43.102 +Which gives you the peer of eagerness. + +1:07:43.383 --> 1:07:49.203 +It can be typically as a feature it would +take the liberalism of this ability, so mine + +1:07:49.203 --> 1:07:51.478 +is nine hundred and fourty seven. + +1:07:51.451 --> 1:07:57.846 +And the language model which says you how +clue in the English side is how you can calculate + +1:07:57.846 --> 1:07:59.028 +the probability. 
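The weighted combination of features described above can be written as a single unnormalised score. A minimal sketch; the feature and weight names are illustrative.

```python
def loglinear_score(features, weights):
    """Unnormalised log-linear score: sum_i lambda_i * h_i(e, f).

    features : dict mapping feature name -> feature value h_i(e, f),
               e.g. log translation probability, log LM probability
    weights  : dict mapping the same names -> lambda_i
    """
    return sum(weights[name] * value for name, value in features.items())
```

Because the normalisation term would be the same for every candidate translation of one source sentence, dropping it does not change which candidate scores highest, which is exactly the point made above about ignoring the probability itself.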
+ +1:07:58.979 --> 1:08:03.129 +In some future lectures we'll give you all +superbology. + +1:08:03.129 --> 1:08:10.465 +You can feature again the luck of the purbology, +then you have minus seven and then give different + +1:08:10.465 --> 1:08:11.725 +weights to them. + +1:08:12.292 --> 1:08:19.243 +And that means that your probability is one +divided by said to the power of this. + +1:08:20.840 --> 1:08:38.853 +You're not really interested in the probability, +so you just calculate on the score to the exponendum. + +1:08:40.000 --> 1:08:41.668 +Maximal Maximal I Think. + +1:08:42.122 --> 1:08:57.445 +You can, for example, try different translations, +calculate all their scores and take in the + +1:08:57.445 --> 1:09:00.905 +end the translation. + +1:09:03.423 --> 1:09:04.661 +Why to do that. + +1:09:05.986 --> 1:09:10.698 +We've done that now for two, but of course +you cannot only do it with two. + +1:09:10.698 --> 1:09:16.352 +You can do it now with any fixed number, so +of course you have to decide in the beginning + +1:09:16.352 --> 1:09:21.944 +I want to have ten features or something like +that, but you can take all these features. + +1:09:22.002 --> 1:09:29.378 +And yeah, based on them, they calculate your +model probability or the model score. + +1:09:31.031 --> 1:09:40.849 +A big advantage over the initial. + +1:09:40.580 --> 1:09:45.506 +A model because now we can add a lot of features +and there was diamond machine translation, + +1:09:45.506 --> 1:09:47.380 +a statistical machine translation. + +1:09:47.647 --> 1:09:57.063 +So how can develop new features, new ways +of evaluating them so that can hopefully better + +1:09:57.063 --> 1:10:00.725 +describe what is good translation? + +1:10:01.001 --> 1:10:16.916 +If you have a new great feature you can calculate +these features and then how much better do + +1:10:16.916 --> 1:10:18.969 +they model? + +1:10:21.741 --> 1:10:27.903 +There is one challenge which haven't touched +upon yet. + +1:10:27.903 --> 1:10:33.505 +So could you easily build your model if you +have. + +1:10:38.999 --> 1:10:43.016 +Assumed here something which just gazed, but +which might not be that easy. + +1:10:49.990 --> 1:10:56.333 +The weight for the translation model is and +the weight for the language model is. + +1:10:56.716 --> 1:11:08.030 +That's a bit arbitrary, so why should you +use this one and guess normally you won't be + +1:11:08.030 --> 1:11:11.801 +able to select that by hand? + +1:11:11.992 --> 1:11:19.123 +So typically we didn't have like or features +in there, but features is very common. + +1:11:19.779 --> 1:11:21.711 +So how do you select them? + +1:11:21.711 --> 1:11:24.645 +There was a second part of the training. + +1:11:24.645 --> 1:11:27.507 +These models were trained in two steps. + +1:11:27.507 --> 1:11:32.302 +On the one hand, we had the training of the +individual components. + +1:11:32.302 --> 1:11:38.169 +We saw that now how to build the phrase based +system, how to extract the phrases. + +1:11:38.738 --> 1:11:46.223 +But then if you have these different components +you need a second training to learn the optimal. + +1:11:46.926 --> 1:11:51.158 +And typically this is referred to as the tuning +of the system. + +1:11:51.431 --> 1:12:07.030 +So now if you have different types of models +describing what a good translation is you need + +1:12:07.030 --> 1:12:10.760 +to find good weights. + +1:12:12.312 --> 1:12:14.315 +So how can you do it? 
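To make the preceding two-feature example concrete before turning to how the weights are found: ranking candidate translations then amounts to computing the weighted sum for each and taking the maximum. The numeric feature values and weights below are placeholders, since the exact figures in the recording are not legible.

```python
def pick_best(candidates, weights):
    """candidates: list of (translation, feature_dict); return the best string."""
    def score(features):
        return sum(weights[name] * value for name, value in features.items())
    return max(candidates, key=lambda c: score(c[1]))[0]

# illustrative values only, not the lecture's exact numbers
weights = {"tm": 1.0, "lm": 0.8}
candidates = [
    ("what we saw up till now",     {"tm": -9.5,  "lm": -7.0}),
    ("what we have seen until now", {"tm": -11.2, "lm": -6.1}),
]
print(pick_best(candidates, weights))   # prints the first candidate
```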
+ +1:12:14.315 --> 1:12:20.871 +The easiest thing is, of course, you can just +try different things out. + +1:12:21.121 --> 1:12:27.496 +You can then always select the best hyper +scissors. + +1:12:27.496 --> 1:12:38.089 +You can evaluate it with some metrics saying: +You can score all your outputs, always select + +1:12:38.089 --> 1:12:42.543 +the best one and then get this translation. + +1:12:42.983 --> 1:12:45.930 +And you can do that for a lot of different +possible combinations. + +1:12:47.067 --> 1:12:59.179 +However, the challenge is the complexity, +so if you have only parameters and each of + +1:12:59.179 --> 1:13:04.166 +them has values you try for, then. + +1:13:04.804 --> 1:13:16.895 +We won't be able to try all of these possible +combinations, so what we have to do is some + +1:13:16.895 --> 1:13:19.313 +more intelligent. + +1:13:20.540 --> 1:13:34.027 +And what has been done there in machine translation +is referred to as a minimum error rate training. + +1:13:34.534 --> 1:13:41.743 +Whole surge is a very intuitive one, so have +all these different parameters, so how do. + +1:13:42.522 --> 1:13:44.358 +And the idea is okay. + +1:13:44.358 --> 1:13:52.121 +I start with an initial guess and then I optimize +one single parameter that's always easier. + +1:13:52.121 --> 1:13:54.041 +That's some or linear. + +1:13:54.041 --> 1:13:58.882 +So you're searching the best value for the +one parameter. + +1:13:59.759 --> 1:14:04.130 +Often visualized with a San Francisco map. + +1:14:04.130 --> 1:14:13.786 +Just imagine if you want to go to the highest +spot in San Francisco, you're standing somewhere + +1:14:13.786 --> 1:14:14.395 +here. + +1:14:14.574 --> 1:14:21.220 +You are switching your dimensions so you are +going in this direction again finding. + +1:14:21.661 --> 1:14:33.804 +Now you're on a different street and this +one is not a different one so you go in here + +1:14:33.804 --> 1:14:36.736 +so you can interact. + +1:14:36.977 --> 1:14:56.368 +The one thing of course is find a local optimum, +especially if you start in two different positions. + +1:14:56.536 --> 1:15:10.030 +So yeah, there is a heuristic in there, so +typically it's done again if you land in different + +1:15:10.030 --> 1:15:16.059 +positions with different starting points. + +1:15:16.516 --> 1:15:29.585 +What is different or what is like the addition +of arrow rate training compared to the standard? + +1:15:29.729 --> 1:15:37.806 +So the question is, like we said, you can +now evaluate different values for one parameter. + +1:15:38.918 --> 1:15:42.857 +And the question is: Which values should you +try out for one parameters? + +1:15:42.857 --> 1:15:47.281 +Should you just do zero point one, zero point +two, zero point three, or anything? + +1:15:49.029 --> 1:16:03.880 +If you change only one parameter then you +can define the score of translation as a linear + +1:16:03.880 --> 1:16:05.530 +function. + +1:16:05.945 --> 1:16:17.258 +That this is the one that possesses, and yet +if you change the parameter, the score of this. + +1:16:17.397 --> 1:16:26.506 +It may depend so your score is there because +the rest you don't change your feature value. + +1:16:26.826 --> 1:16:30.100 +And the feature value is there for the steepness +of their purse. + +1:16:30.750 --> 1:16:38.887 +And now look at different possible translations. + +1:16:38.887 --> 1:16:46.692 +Therefore, how they go up here is differently. + +1:16:47.247 --> 1:16:59.289 +So in this case if you look at the minimum +score so there should be as minimum. 
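The "optimise one weight at a time, restart from several starting points" procedure described above can be sketched as follows. `error_of`, the grid of candidate values and all other names are placeholders; the plain grid is used only for readability, and it is exactly what the smarter choice of probe points discussed next replaces.

```python
import random

def coordinate_search(feature_names, error_of, n_restarts=3, n_passes=5, grid=None):
    """Tune log-linear weights one dimension at a time, with random restarts.

    error_of(weights) -> error of the dev-set translations selected with these
    weights (e.g. 1 - BLEU), which we try to minimise.
    """
    grid = grid or [x / 10.0 for x in range(-10, 11)]
    best_w, best_err = None, float("inf")
    for _ in range(n_restarts):                          # random restarts
        w = {f: random.uniform(-1.0, 1.0) for f in feature_names}
        for _ in range(n_passes):
            for f in feature_names:                      # one weight at a time
                w[f] = min(grid, key=lambda v: error_of(dict(w, **{f: v})))
        err = error_of(w)
        if err < best_err:
            best_w, best_err = w, err
    return best_w
```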
+ +1:17:00.300 --> 1:17:10.642 +So it's enough to check once a year and check +once here because if you check here and here. + +1:17:11.111 --> 1:17:24.941 +And that is the idea in minimum air rate training +when you select different hypotheses. + +1:17:29.309 --> 1:17:34.378 +So in yeah, the minimum air raid training +is a power search. + +1:17:34.378 --> 1:17:37.453 +Then we do an intelligent step size. + +1:17:37.453 --> 1:17:39.364 +We do random restarts. + +1:17:39.364 --> 1:17:46.428 +Then things are still too slow because it +might say we would have to decode a lot of + +1:17:46.428 --> 1:17:47.009 +times. + +1:17:46.987 --> 1:17:54.460 +So what we can do to make things even faster +is we are decoding once with the current parameters, + +1:17:54.460 --> 1:18:01.248 +but then we are not generating only the most +probable translation, but we are generating + +1:18:01.248 --> 1:18:05.061 +the most probable ten hundred translations +or so. + +1:18:06.006 --> 1:18:18.338 +And then we are optimizing our weights by +only looking at this one hundred translation + +1:18:18.338 --> 1:18:23.725 +and finding the optimal values there. + +1:18:24.564 --> 1:18:39.284 +Of course, it might be a problem that at some +point you have now good ways to find good translations + +1:18:39.284 --> 1:18:42.928 +inside your ambest list. + +1:18:43.143 --> 1:18:52.357 +You have to iterate that sometime, but the +important thing is you don't have to decode + +1:18:52.357 --> 1:18:56.382 +every time you need weights, but you. + +1:18:57.397 --> 1:19:11.325 +There is mainly a speed up process in order +to make things more, make things even faster. + +1:19:15.515 --> 1:19:20.160 +Good Then We'll Finish With. + +1:19:20.440 --> 1:19:25.289 +Looking at how do you really calculate the +scores and everything? + +1:19:25.289 --> 1:19:32.121 +Because what we did look into was a translation +of a full sentence doesn't really consist of + +1:19:32.121 --> 1:19:37.190 +only one single phrase, but of course you have +to combine different. + +1:19:37.637 --> 1:19:40.855 +So how does that now really look and how do +we have to do? + +1:19:41.361 --> 1:19:48.252 +Just think again of the translation we have +done before. + +1:19:48.252 --> 1:19:59.708 +The sentence must be: What is the probability +of translating this one into what we saw after + +1:19:59.708 --> 1:20:00.301 +now? + +1:20:00.301 --> 1:20:03.501 +We're doing this by using. + +1:20:03.883 --> 1:20:07.157 +So we're having the phrase pair. + +1:20:07.157 --> 1:20:12.911 +Vasvia is the phrase pair up to now and gazine +harm into. + +1:20:13.233 --> 1:20:18.970 +In addition, that is important because translation +is not monotone. + +1:20:18.970 --> 1:20:26.311 +We are not putting phrase pairs in the same +order as we are doing it on the source and + +1:20:26.311 --> 1:20:31.796 +on the target, but in order to generate the +correct translation. + +1:20:31.771 --> 1:20:34.030 +So we have to shuffle the phrase pears. + +1:20:34.294 --> 1:20:39.747 +And the blue wand is in front on the search +side but not on the back of the tag. + +1:20:40.200 --> 1:20:49.709 +This reordering makes a statistic of the machine +translation really complicated because if you + +1:20:49.709 --> 1:20:53.313 +would just monotonely do this then. + +1:20:53.593 --> 1:21:05.288 +The problem is if you would analyze all possible +combinations of reshuffling them, then again. 
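Stepping back to the tuning procedure for a moment before the reordering discussion continues: the observation that each hypothesis' score is a line in the weight being changed can be turned into an exact one-dimensional search over an n-best list. This is a per-sentence sketch with invented names (`rest_score`, `feat`, `error_of`); the real procedure pools the crossing points of all development sentences, optimises a corpus-level error, and iterates decoding to extend the n-best lists, as described above.

```python
def exact_line_search(nbest, rest_score, feat, error_of):
    """Optimise a single weight lam over one sentence's n-best list.

    score_h(lam) = rest_score[h] + lam * feat[h] is linear in lam, so the
    chosen hypothesis can only change where two of these lines cross; it is
    enough to probe one lam value inside every interval between crossings.

    rest_score[h] : weighted sum of all other features of hypothesis h
    feat[h]       : value of the feature whose weight is being tuned
    error_of(h)   : error incurred if h is picked (e.g. 1 - sentence BLEU)
    """
    crossings = []
    for i, h1 in enumerate(nbest):
        for h2 in nbest[i + 1:]:
            if feat[h1] != feat[h2]:
                crossings.append((rest_score[h2] - rest_score[h1])
                                 / (feat[h1] - feat[h2]))
    crossings = sorted(crossings) or [0.0]
    probes = ([crossings[0] - 1.0]
              + [(a + b) / 2 for a, b in zip(crossings, crossings[1:])]
              + [crossings[-1] + 1.0])

    def pick(lam):
        return max(nbest, key=lambda h: rest_score[h] + lam * feat[h])

    return min(probes, key=lambda lam: error_of(pick(lam)))
```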
+ +1:21:05.565 --> 1:21:11.508 +So you again have to use some type of heuristics +which shuffle you allow and which you don't + +1:21:11.508 --> 1:21:11.955 +allow. + +1:21:12.472 --> 1:21:27.889 +That was relatively challenging since, for +example, if you think of Germany you would + +1:21:27.889 --> 1:21:32.371 +have to allow very long. + +1:21:33.033 --> 1:21:52.218 +But if we have now this, how do we calculate +the translation score so the translation score? + +1:21:52.432 --> 1:21:55.792 +That's why we sum up the scores at the end. + +1:21:56.036 --> 1:22:08.524 +So you said our first feature is the probability +of the full sentence. + +1:22:08.588 --> 1:22:13.932 +So we say, the translation of each phrase +pair is independent of each other, and then + +1:22:13.932 --> 1:22:19.959 +we can hear the probability of the full sentences, +fear of what we give, but fear of times, fear + +1:22:19.959 --> 1:22:24.246 +of sobbing because they have time to feel up +till now is impossible. + +1:22:24.664 --> 1:22:29.379 +Now we can use the loss of logarithmal calculation. + +1:22:29.609 --> 1:22:36.563 +That's logarithm of the first perability. + +1:22:36.563 --> 1:22:48.153 +We'll get our first score, which says the +translation model is minus. + +1:22:49.970 --> 1:22:56.586 +And that we're not doing only once, but we're +exactly doing it with all our translation model. + +1:22:56.957 --> 1:23:03.705 +So we said we also have the relative frequency +and the inverse directions of the. + +1:23:03.843 --> 1:23:06.226 +So in the end you'll have four scores. + +1:23:06.226 --> 1:23:09.097 +Here how you combine them is exactly the same. + +1:23:09.097 --> 1:23:12.824 +The only thing is how you look them up for +each phrase pair. + +1:23:12.824 --> 1:23:18.139 +We have said in the beginning we are storing +four scores describing how good they are. + +1:23:19.119 --> 1:23:25.415 +And these are then of force points describing +how probable the sense. + +1:23:27.427 --> 1:23:31.579 +Then we can have more sports. + +1:23:31.579 --> 1:23:37.806 +For example, we can have a distortion model. + +1:23:37.806 --> 1:23:41.820 +How much reordering is done? + +1:23:41.841 --> 1:23:47.322 +There were different types of ones who won't +go into detail, but just imagine you have no + +1:23:47.322 --> 1:23:47.748 +score. + +1:23:48.548 --> 1:23:56.651 +Then you have a language model which is the +sequence of what we saw until now. + +1:23:56.651 --> 1:24:06.580 +How we generate this language model for ability +will cover: And there weren't even more probabilities. + +1:24:06.580 --> 1:24:11.841 +So one, for example, was a phrase count scarf, +which just counts how many. + +1:24:12.072 --> 1:24:19.555 +In order to learn is it better to have more +short phrases or should bias on having fewer + +1:24:19.555 --> 1:24:20.564 +and longer. + +1:24:20.940 --> 1:24:28.885 +Easily add this but just counting so the value +will be here and like putting in a count like + +1:24:28.885 --> 1:24:32.217 +typically how good is it to translate. + +1:24:32.932 --> 1:24:44.887 +For language model, the probability normally +gets shorter the longer the sequences in order + +1:24:44.887 --> 1:24:46.836 +to counteract. + +1:24:47.827 --> 1:24:59.717 +And then you get your final score by multi-climbing +each of the scores we had before. 
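Before the weighted combination is completed below, here is how the per-phrase scores just described might be accumulated into one hypothesis score: the four translation scores are summed (in log space) over the phrase pairs, and the distortion, phrase-count, word-count and language-model features are added on top. A minimal sketch; the feature names, the simple distance-based distortion penalty and the layout of `phrase_table` are assumptions for illustration.

```python
def hypothesis_score(phrase_pairs, phrase_table, lm_logprob, weights):
    """Log-linear score of one phrase-segmented hypothesis.

    phrase_pairs : [(src_phrase, tgt_phrase, jump), ...] in target order,
                   where jump is the reordering distance to the previous phrase
    phrase_table : maps (src, tgt) -> dict with the four stored log scores
                   ("phi_tgt_src", "phi_src_tgt", "lex_tgt_src", "lex_src_tgt")
    lm_logprob   : callable returning the log LM probability of a target string
    weights      : dict with one weight per feature name used below
    """
    feats = {name: 0.0 for name in
             ("phi_tgt_src", "phi_src_tgt", "lex_tgt_src", "lex_src_tgt",
              "distortion", "phrase_count")}
    target_words = []
    for src, tgt, jump in phrase_pairs:
        for name, value in phrase_table[(src, tgt)].items():
            feats[name] += value                 # log scores add up over phrases
        feats["distortion"] -= abs(jump)         # simple distance-based penalty
        feats["phrase_count"] += 1
        target_words.extend(tgt.split())
    feats["word_count"] = len(target_words)      # counteracts the LM length bias
    feats["lm"] = lm_logprob(" ".join(target_words))
    return sum(weights[name] * value for name, value in feats.items())
```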
+ +1:24:59.619 --> 1:25:07.339 +Optimization and that gives you a final score +maybe of twenty three point seven eight five + +1:25:07.339 --> 1:25:13.278 +and then you can do that with several possible +translation tests and. + +1:25:14.114 --> 1:25:23.949 +One may be important point here is so the +score not only depends on the target side but + +1:25:23.949 --> 1:25:32.444 +it also depends on which phrases you have used +so you could have generated. + +1:25:32.772 --> 1:25:38.076 +So you would have the same translation, but +you would have a different split into phrase. + +1:25:38.979 --> 1:25:45.636 +And this was normally ignored so you would +just look at all of them and then select the + +1:25:45.636 --> 1:25:52.672 +one which has the highest probability and ignore +that this translation could be generated by + +1:25:52.672 --> 1:25:54.790 +several splits into phrase. + +1:25:57.497 --> 1:26:06.097 +So to summarize what we look into today and +what you should hopefully remember is: Statistical + +1:26:06.097 --> 1:26:11.440 +models in how to generate machine translation +output that were the word based statistical + +1:26:11.440 --> 1:26:11.915 +models. + +1:26:11.915 --> 1:26:16.962 +There was IBM models at the beginning and +then we have the phrase based entity where + +1:26:16.962 --> 1:26:22.601 +it's about building the translation by putting +together these blocks of phrases and combining. + +1:26:23.283 --> 1:26:34.771 +If you have a water which has several features +you can't do that with millions but with features. + +1:26:34.834 --> 1:26:42.007 +Then you can combine them with your local +model, which allows you to have your variable + +1:26:42.007 --> 1:26:45.186 +number of features and easily combine. + +1:26:45.365 --> 1:26:47.920 +Yeah, how much can you trust each of these +more? + +1:26:51.091 --> 1:26:54.584 +Do you have any further questions for this +topic? + +1:26:58.378 --> 1:27:08.715 +And there will be on Tuesday a lecture by +Tuan about evaluation, and then next Thursday + +1:27:08.715 --> 1:27:12.710 +there will be the practical part. + +1:27:12.993 --> 1:27:21.461 +So please bring the practical pot here, but +you can do something yourself if you are not + +1:27:21.461 --> 1:27:22.317 +able to. + +1:27:23.503 --> 1:27:26.848 +So then please tell us and we'll have to see +how we find the difference in this. + diff --git a/demo_data/lectures/Lecture-04-27.04.2023/video.mp4 b/demo_data/lectures/Lecture-04-27.04.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6208ee8db0a3eea21fd4e8b94f28d27772f73e1a --- /dev/null +++ b/demo_data/lectures/Lecture-04-27.04.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8786f0bc34cf397879e95757fe367887c5f5d01d0f388aa98f768203cccc5269 +size 116390723 diff --git a/demo_data/lectures/Lecture-05-02.05.2023/English.vtt b/demo_data/lectures/Lecture-05-02.05.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..c247be22d754261cb8f8a5794bd70441933eb904 --- /dev/null +++ b/demo_data/lectures/Lecture-05-02.05.2023/English.vtt @@ -0,0 +1,1124 @@ +WEBVTT + +0:00:56.957 --> 0:01:10.166 +In today you are going to talk about evaluation +like how you can tell how well your translation. + +0:01:11.251 --> 0:01:23.175 +Today we're going to talk about first some +introduction about the difficulties and also + +0:01:23.175 --> 0:01:27.783 +the dimensions of the evaluation. 
+ +0:01:28.248 --> 0:01:32.315 +And the second one is on automatic evaluation. + +0:01:32.315 --> 0:01:33.960 +The second one is. + +0:01:33.893 --> 0:01:40.952 +Would be less human effort costly, but it +probably is not really as perfect. + +0:01:42.702 --> 0:02:01.262 +So on machine translation evaluation, so the +goal is to measure the quality of translation. + +0:02:03.003 --> 0:02:06.949 +We need machine translation evaluation. + +0:02:06.949 --> 0:02:14.152 +The first thing is for application scenarios +and whether it is reliable. + +0:02:14.674 --> 0:02:22.911 +Second thing is to guide our research because +given symmetrics we will be able to find out + +0:02:22.911 --> 0:02:30.875 +which improvement direction is valuable for +our machine translation system and the last + +0:02:30.875 --> 0:02:34.224 +thing is for our system development. + +0:02:36.116 --> 0:02:42.926 +So now we will come to some difficulties on +evaluation. + +0:02:42.926 --> 0:02:50.952 +The first thing is ambiguity because usually +for one sentence it. + +0:02:51.431 --> 0:03:04.031 +Here you can see that, for example, we have +the correct reference. + +0:03:05.325 --> 0:03:19.124 +The second difficulty is that small changes +can be very important. + +0:03:20.060 --> 0:03:22.531 +The first difficulty is subjective. + +0:03:23.123 --> 0:03:39.266 +So it depends on each person's opinion whether +translation is correct. + +0:03:41.041 --> 0:03:49.393 +The last is that evaluation sometimes is application +dependent. + +0:03:49.393 --> 0:03:54.745 +We're not sure how good it's getting up. + +0:03:57.437 --> 0:04:04.502 +The first dimension is human versus automatic +evaluation, which I definitely talked about + +0:04:04.502 --> 0:04:06.151 +in the introduction. + +0:04:06.151 --> 0:04:13.373 +The second thing is on granulity, so evaluation +could be on sentence level, document level, + +0:04:13.373 --> 0:04:14.472 +or task base. + +0:04:15.375 --> 0:04:28.622 +The last thing is whether the translation +is correct in order to capture the meaning. + +0:04:30.630 --> 0:04:33.769 +So on the first dimensions, human verses are +automatic. + +0:04:34.334 --> 0:04:45.069 +So human evaluation education is the goal +standard because in the end we give our machine + +0:04:45.069 --> 0:04:48.647 +translation system to people. + +0:04:49.329 --> 0:04:55.040 +And is also expensive and time consuming for +people to manually evaluate some systems. + +0:04:57.057 --> 0:05:05.575 +For automatic evaluation, it is of course +tupper and faster, and it would use human reference. + +0:05:08.168 --> 0:05:16.971 +The next dimension is on granulity. + +0:05:16.971 --> 0:05:25.529 +The first level is sentence based. + +0:05:25.885 --> 0:05:33.003 +But this is difficult because if you translate +a single sentence, it will be difficult to + +0:05:33.003 --> 0:05:35.454 +tell whether this translation. + +0:05:37.537 --> 0:05:40.633 +The second level is document based. + +0:05:40.633 --> 0:05:46.051 +This should be the most commonly used in automatic +evaluation. + +0:05:46.286 --> 0:06:00.750 +This should be like the final bowl of our +machine translation. + +0:06:01.061 --> 0:06:02.315 +And slow in general. + +0:06:02.315 --> 0:06:07.753 +We are not sure whether the arrows come from +the machine translation system itself or some + +0:06:07.753 --> 0:06:08.828 +other components. + +0:06:11.431 --> 0:06:21.300 +The next dimension is on adigocy because it's +fluency, so adigocy is meaning translated correctly. 
+ +0:06:22.642 --> 0:06:25.384 +Can see the example here. + +0:06:25.384 --> 0:06:32.237 +In hypothesis different is everything now, +so basically it just. + +0:06:32.852 --> 0:06:36.520 +But then you can see it's not fluent. + +0:06:36.520 --> 0:06:38.933 +It sounds kind of weird. + +0:06:38.933 --> 0:06:41.442 +Nothing is different now. + +0:06:41.442 --> 0:06:43.179 +It sounds fluent. + +0:06:46.006 --> 0:06:50.650 +Next we come to error analysis. + +0:06:50.650 --> 0:07:02.407 +When we value the system and give a score +we want to have interpretable results. + +0:07:03.083 --> 0:07:07.930 +So usually there would be some tetsus first +in order to detect these errors. + +0:07:08.448 --> 0:07:21.077 +And usually they would be like quite specific +to some specific type of arrow, for example + +0:07:21.077 --> 0:07:23.743 +wrong translation. + +0:07:24.344 --> 0:07:32.127 +All morphological agreements in whether the +world form is correct. + +0:07:32.127 --> 0:07:35.031 +If you have the article. + +0:07:37.577 --> 0:07:45.904 +So now we come to human evaluation, which +is the final goal of machine translation. + +0:07:47.287 --> 0:07:50.287 +So why do we perform human evaluation? + +0:07:51.011 --> 0:08:00.115 +The first thing is that automatic machine +translation magic is not sufficient. + +0:08:00.480 --> 0:08:06.725 +Existing automated metrics and are sometimes +biased. + +0:08:06.725 --> 0:08:16.033 +For example, the blue spar, but the blue scar +will usually try to look at the. + +0:08:16.496 --> 0:08:24.018 +So it doesn't take into account some deeper +meaning like cares about word-to-word matching + +0:08:24.018 --> 0:08:26.829 +instead of rephrasing or synonym. + +0:08:27.587 --> 0:08:34.881 +And bias, as in that metrics like that would +usually depend a lot on the goal standard reference + +0:08:34.881 --> 0:08:41.948 +given from some human, and that person could +have some specific type or language preferences, + +0:08:41.948 --> 0:08:43.979 +and then the metric would. + +0:08:47.147 --> 0:08:55.422 +The next thing is that automatic metrics don't +provide sufficient insights for error analysis. + +0:08:57.317 --> 0:09:04.096 +Different types of errors would have different +implications depending on the underlying task. + +0:09:04.644 --> 0:09:09.895 +So, for example, if you use machine translation +for information with you both,. + +0:09:10.470 --> 0:09:20.202 +Then if it makes some error omitting some +words in translation then it would be very + +0:09:20.202 --> 0:09:20.775 +bad. + +0:09:21.321 --> 0:09:30.305 +Another example is if you use machine translation +in chat pop then fluency would be very important + +0:09:30.305 --> 0:09:50.253 +because: And we also need human measure in +order to develop and assess automatic translation + +0:09:50.253 --> 0:09:52.324 +evaluation. + +0:09:55.455 --> 0:10:01.872 +Okay, so now we will come to the quality measures +of human evaluation. + +0:10:02.402 --> 0:10:05.165 +The first thing is inter allotator agreement. + +0:10:05.825 --> 0:10:25.985 +This is agreement between different annotators. + +0:10:26.126 --> 0:10:31.496 +So as you can see here, this would measure +the reliability of the other features. + +0:10:32.252 --> 0:10:49.440 +And here we have an example of where the pace +car here is. + +0:10:49.849 --> 0:10:57.700 +And this is in contrast to intra-annuator +agreement, so this is agreement within an annotator. 
+ +0:10:58.118 --> 0:11:03.950 +So instead of measuring reliability, here +it measures consistency of a single animator. + +0:11:04.884 --> 0:11:07.027 +And yep. + +0:11:07.027 --> 0:11:22.260 +We also have an example here of the which +is so which is quite. + +0:11:23.263 --> 0:11:42.120 +So now we will come to the main types of human +assessment: The first thing is direct assessment. + +0:11:42.842 --> 0:11:53.826 +The second thing is human ranking of the translation +at sentence level. + +0:11:56.176 --> 0:12:11.087 +So direct assessment given the source and +translation, and possibly the reference translation. + +0:12:12.612 --> 0:12:18.023 +The goal here is to give the scores to evaluate +performance,adequacy and fluency. + +0:12:18.598 --> 0:12:23.619 +The problem here is that we need normalization +across different judges, different human. + +0:12:24.604 --> 0:12:27.043 +And here we have an example. + +0:12:27.043 --> 0:12:33.517 +She was treated at the site by an emergency +doctor and taken to hospital by. + +0:12:34.334 --> 0:12:48.444 +The hypothesis here is that she was treated +on site and emergency medical rescue workers + +0:12:48.444 --> 0:12:52.090 +brought to a hospital. + +0:12:52.472 --> 0:12:56.267 +Lesson five is best in one sport. + +0:13:00.060 --> 0:13:04.716 +I don't think it's hard because I think there +should be broad threat to a hospital right. + +0:13:05.905 --> 0:13:09.553 +Yes, that is like a crucial error. + +0:13:09.553 --> 0:13:19.558 +Yeah, I think I would agree because this sentence +somehow gives us the idea of what the meaning + +0:13:19.558 --> 0:13:21.642 +of the sentence is. + +0:13:21.642 --> 0:13:24.768 +But then it lost towards her. + +0:13:27.027 --> 0:13:29.298 +The next time of human evaluation is ranking. + +0:13:30.810 --> 0:13:38.893 +Which is a great different system according +to performance like which one is better. + +0:13:40.981 --> 0:13:43.914 +So here now we have a second hypothesis. + +0:13:43.914 --> 0:13:49.280 +She was hospitalized on the spot and taken +to hospital by ambulance crews. + +0:13:50.630 --> 0:14:01.608 +As you can see here, the second hypothesis +seems to be more fluent, more smooth. + +0:14:01.608 --> 0:14:09.096 +The meaning capture seems to be: So yeah, +it's difficult to compare different errors + +0:14:09.096 --> 0:14:11.143 +in whether which error is more severe. + +0:14:13.373 --> 0:14:16.068 +The next type of human evaluation is post +editing. + +0:14:17.817 --> 0:14:29.483 +So we want to measure how much time and effort +human needs to spend in order to turn it into + +0:14:29.483 --> 0:14:32.117 +correct translation. + +0:14:32.993 --> 0:14:47.905 +So this area can be measured by time or key +shop. + +0:14:49.649 --> 0:14:52.889 +And the last one is task based evaluation. + +0:14:52.889 --> 0:14:56.806 +Here we would want to evaluate the complete +system. + +0:14:56.806 --> 0:15:03.436 +But if you are using the lecture translator +and you see my lecture in German, the final + +0:15:03.436 --> 0:15:05.772 +evaluation here would be like. + +0:15:05.772 --> 0:15:08.183 +In the end, can you understand? + +0:15:09.769 --> 0:15:15.301 +Their friendship here that we get the overall +performance, which is our final goal. + +0:15:16.816 --> 0:15:25.850 +But the disadvantage here that it could be +complex and again if the spur is low it might + +0:15:25.850 --> 0:15:31.432 +be other problems than the machine translation +itself. 
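The agreement figures discussed above are normally reported as chance-corrected statistics. The recording does not make the exact formula legible, so the sketch below assumes Cohen's kappa, the most common choice for two annotators; function and variable names are illustrative.

```python
from collections import Counter

def cohens_kappa(labels_a, labels_b):
    """Chance-corrected agreement between two annotators' label sequences
    (undefined if expected agreement is exactly 1)."""
    assert len(labels_a) == len(labels_b)
    n = len(labels_a)
    observed = sum(a == b for a, b in zip(labels_a, labels_b)) / n
    count_a, count_b = Counter(labels_a), Counter(labels_b)
    expected = sum((count_a[c] / n) * (count_b[c] / n)
                   for c in set(labels_a) | set(labels_b))
    return (observed - expected) / (1 - expected)
```

The same computation applied to two annotation rounds of a single judge gives the intra-annotator (consistency) figure mentioned above.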
+ +0:15:33.613 --> 0:15:42.941 +So guess that was about the human evaluation +part any question so far. + +0:15:42.941 --> 0:15:44.255 +Yes, and. + +0:16:00.000 --> 0:16:15.655 +Then we will come to our magic matrix here +to access the quality of the machine translation + +0:16:15.655 --> 0:16:26.179 +system by comparing: So the premise here is +that the more similar translation is to reference, + +0:16:26.179 --> 0:16:31.437 +the better and we want some algorithms that +can approximate. + +0:16:34.114 --> 0:16:47.735 +So the most famous measure could be the blow +spark and the bilingual evaluation. + +0:16:50.930 --> 0:16:56.358 +So if we are given the goal that the more +similar translation is to the reference, the + +0:16:56.358 --> 0:17:01.785 +better I think the most naive way would be +count the number of people sentenced to the + +0:17:01.785 --> 0:17:02.472 +reference. + +0:17:02.472 --> 0:17:08.211 +But as you can see, this would be very difficult +because sentence being exactly the same to + +0:17:08.211 --> 0:17:10.332 +the reference would be very rare. + +0:17:11.831 --> 0:17:24.222 +You can see the example here in the reference +and machine translation output. + +0:17:24.764 --> 0:17:31.930 +So the idea here is that instead of comparing +the two whole sentences up, we consider the. + +0:17:35.255 --> 0:17:43.333 +Now we can look at an example, so for the +blow score we consider one to three four grams. + +0:17:44.844 --> 0:17:52.611 +The one ramp of a lap we would have back to +the future, not at premieres thirty years ago, + +0:17:52.611 --> 0:17:59.524 +so it should be like one, two, three, four, +five, six, seven, eight, so like it. + +0:17:59.459 --> 0:18:01.476 +One ram is overlap to the reverence. + +0:18:01.921 --> 0:18:03.366 +So you should be over. + +0:18:06.666 --> 0:18:08.994 +Is kind of the same. + +0:18:08.994 --> 0:18:18.529 +Instead of considering only the word back +for three, one is to be back to the future. + +0:18:19.439 --> 0:18:31.360 +So that is basically the idea of the blue +score, and in the end we calculate the geometric. + +0:18:32.812 --> 0:18:39.745 +So as you can see here, when we look at the +A brand overlap you can only look at the machine + +0:18:39.745 --> 0:18:40.715 +translation. + +0:18:41.041 --> 0:18:55.181 +We only care about how many words in the machine +translation output appear. + +0:18:55.455 --> 0:19:02.370 +So this metric is kind of like a precision +based and not really recall based. + +0:19:04.224 --> 0:19:08.112 +So this would lead to a problem like the example +here. + +0:19:08.112 --> 0:19:14.828 +The reference is back to the future of Premier +30 years ago and the machine translation output + +0:19:14.828 --> 0:19:16.807 +is only back to the future. + +0:19:17.557 --> 0:19:28.722 +The one grab overlap will be formed because +you can see back to the future is overlap entirely + +0:19:28.722 --> 0:19:30.367 +in reference. + +0:19:31.231 --> 0:19:38.314 +Is not right because one is the perfect score, +but this is obviously not a good translation. + +0:19:40.120 --> 0:19:47.160 +So in order to tackle this they use something +called pre gravity velocity. + +0:19:47.988 --> 0:19:59.910 +So it should be a factor that is multiplied +to the geometric nymph. + +0:19:59.910 --> 0:20:04.820 +This form is the length of. + +0:20:05.525 --> 0:20:19.901 +So the penalty over or overseas to the power +of the length of this river over. 
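The n-gram overlap counting walked through above looks like this in code; splitting on whitespace and the function names are simplifications for illustration.

```python
def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def ngram_precision(hypothesis, reference, n):
    """Fraction of hypothesis n-grams that also occur in the reference."""
    hyp_ngrams = ngrams(hypothesis.split(), n)
    ref_ngrams = set(ngrams(reference.split(), n))
    if not hyp_ngrams:
        return 0.0
    return sum(g in ref_ngrams for g in hyp_ngrams) / len(hyp_ngrams)
```

With the short hypothesis "back to the future" against the full reference, every precision from unigrams to 4-grams is 1.0 even though most of the reference is missing, which is exactly the problem the brevity penalty addresses; note also that a repeated hypothesis word is counted every time it appears, which is what the clipping described next fixes.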
+ +0:20:21.321 --> 0:20:32.298 +Which is lower than, and if we apply this +to the example, the blowscorn is going to be + +0:20:32.298 --> 0:20:36.462 +which is not a good translation. + +0:20:38.999 --> 0:20:42.152 +Yep so any question of this place. + +0:20:44.064 --> 0:21:00.947 +Yes exactly that should be a problem as well, +and it will be mentioned later on. + +0:21:00.947 --> 0:21:01.990 +But. + +0:21:03.203 --> 0:21:08.239 +Is very sensitive to zero score like that, +so that is why we usually don't use the blue + +0:21:08.239 --> 0:21:13.103 +score sentence level because sentence can be +short and then there can be no overlap. + +0:21:13.103 --> 0:21:16.709 +That is why we usually use it on documents +as you can imagine. + +0:21:16.709 --> 0:21:20.657 +Documents are very long and very little chance +to have zero overlap. + +0:21:23.363 --> 0:21:28.531 +Yeah okay, so the next thing on the blow's +floor is slipping. + +0:21:29.809 --> 0:21:42.925 +So you can see here we have two references, +the new movie and the new film, and we have + +0:21:42.925 --> 0:21:47.396 +a machine translation output. + +0:21:47.807 --> 0:21:54.735 +Because the here is also in the reference, +so yeah two or two books is one, which is: + +0:21:56.236 --> 0:22:02.085 +So but then this is not what we want because +this is just repeating something that appears. + +0:22:02.702 --> 0:22:06.058 +So that's why we use clipping. + +0:22:06.058 --> 0:22:15.368 +Clipping here is that we consider the mask +counts in any reference, so as you can see + +0:22:15.368 --> 0:22:17.425 +here in reference. + +0:22:18.098 --> 0:22:28.833 +So here when we do clipping we will just use +the maximum opponents in the references. + +0:22:29.809 --> 0:22:38.717 +Yeah, just to avoid avoid overlapping repetitive +words in the translation. + +0:22:41.641 --> 0:23:00.599 +It could happen that there is no overlap between +the machine translation output and reference. + +0:23:00.500 --> 0:23:01.917 +Then Everything Is Going To Go To Zero. + +0:23:02.402 --> 0:23:07.876 +So that's why for blow score we usually use +Japanese level score where we arrogate the + +0:23:07.876 --> 0:23:08.631 +statistics. + +0:23:12.092 --> 0:23:18.589 +Some summary about the brewer as you can see +it mash exact words. + +0:23:18.589 --> 0:23:31.751 +It can take several references: It measured +a depotency by the word precision and if measured + +0:23:31.751 --> 0:23:36.656 +the fluency by the gram precision. + +0:23:37.437 --> 0:23:47.254 +And as mentioned, it doesn't consider how +much meaning that is captured in the machine + +0:23:47.254 --> 0:23:48.721 +translation. + +0:23:49.589 --> 0:23:53.538 +So here they use reality penalty to prevent +short sentences. + +0:23:54.654 --> 0:24:04.395 +Will get the spot over the last test set to +avoid the zero issues. + +0:24:04.395 --> 0:24:07.012 +As we mentioned,. + +0:24:09.829 --> 0:24:22.387 +Yes, that's mentioned with multiple reference +translation simultaneously, and it's a precision + +0:24:22.387 --> 0:24:24.238 +based matrix. + +0:24:24.238 --> 0:24:27.939 +So we are not sure if this. + +0:24:29.689 --> 0:24:37.423 +The second thing is that blows calls common +safe for recall by routine penalty, and we + +0:24:37.423 --> 0:24:38.667 +are not sure. + +0:24:39.659 --> 0:24:50.902 +Matches, so can still improve the similarity +measure and improve the correlation score to + +0:24:50.902 --> 0:24:51.776 +human. + +0:24:52.832 --> 0:25:01.673 +The next is that all work will have the same +importance. 
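Putting the pieces together, clipped counts, the geometric mean over n-gram orders and the brevity penalty, gives a toy version of the score. This is a readable sketch rather than a reference implementation; real BLEU pools the counts over the whole test set, exactly to avoid the zero-precision issue mentioned above.

```python
import math
from collections import Counter

def toy_bleu(hypothesis, references, max_n=4):
    """Sentence-level BLEU sketch: clipped n-gram precisions, geometric mean,
    brevity penalty."""
    hyp = hypothesis.split()
    refs = [r.split() for r in references]
    precisions = []
    for n in range(1, max_n + 1):
        hyp_counts = Counter(tuple(hyp[i:i + n]) for i in range(len(hyp) - n + 1))
        ref_counts = [Counter(tuple(r[i:i + n]) for i in range(len(r) - n + 1))
                      for r in refs]
        # clip each n-gram count to the most generous reference
        clipped = sum(min(count, max(rc[gram] for rc in ref_counts))
                      for gram, count in hyp_counts.items())
        precisions.append(clipped / max(sum(hyp_counts.values()), 1))
    if min(precisions) == 0.0:
        return 0.0                       # why BLEU is used at corpus level
    closest = min(refs, key=lambda r: abs(len(r) - len(hyp)))
    bp = min(1.0, math.exp(1 - len(closest) / max(len(hyp), 1)))
    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)
```

On the example above, the truncated hypothesis still has perfect clipped precisions, but the brevity penalty pulls the overall score well below 1, matching the behaviour described in the lecture.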
+ +0:25:01.673 --> 0:25:07.101 +What if a scheme for wedding work? + +0:25:11.571 --> 0:25:26.862 +And the last witness is that blows for high +grade order engrams that can confluency dramatically. + +0:25:27.547 --> 0:25:32.101 +So the pressure is that can be accounted for +fluency, and grammatically there's some other. + +0:25:35.956 --> 0:25:47.257 +We have some further issues and not created +equally so we can use stemming or knowledge + +0:25:47.257 --> 0:25:48.156 +space. + +0:25:50.730 --> 0:26:00.576 +The next way we incorporate information is +within the metrics. + +0:26:01.101 --> 0:26:07.101 +And can be used like a stop list to like somehow +ignore the non-important words. + +0:26:08.688 --> 0:26:12.687 +Text normalization spelling conjugation lower +case and mix case. + +0:26:12.687 --> 0:26:18.592 +The next thing is that for some language like +Chinese there can be different world segmentation + +0:26:18.592 --> 0:26:23.944 +so exact word matching might no longer be a +good idea so maybe it's ready to cover the + +0:26:23.944 --> 0:26:27.388 +score as the character level instead of the +word level. + +0:26:29.209 --> 0:26:33.794 +And the last thing is speech translation. + +0:26:33.794 --> 0:26:38.707 +Usually input from speech translation would. + +0:26:38.979 --> 0:26:51.399 +And there should be some way to segment into +sentences so that we can calculate the score + +0:26:51.399 --> 0:26:52.090 +and. + +0:26:52.953 --> 0:27:01.326 +And the way to soften is to use some tools +like enware segmentation to align the output + +0:27:01.326 --> 0:27:01.896 +with. + +0:27:06.306 --> 0:27:10.274 +Yes, so guess that was all about the blow +score any question. + +0:27:14.274 --> 0:27:28.292 +Again on automatic metrics we'll talk about +probably good metrics, strange automatic metrics, + +0:27:28.292 --> 0:27:32.021 +use cases on evaluation. + +0:27:34.374 --> 0:27:44.763 +How to measure the performance of the matrix, +so a good matrix would be a. + +0:27:49.949 --> 0:28:04.905 +We would want the matrix to be interpretable +if this is the ranking from a human that somehow + +0:28:04.905 --> 0:28:08.247 +can rank the system. + +0:28:12.132 --> 0:28:15.819 +We would also want the evaluation metric to +be sensitive. + +0:28:15.819 --> 0:28:21.732 +Like small differences in the machine translation +can be distinguished, we would not need to + +0:28:21.732 --> 0:28:22.686 +be consistent. + +0:28:22.686 --> 0:28:28.472 +Like if the same machine translation system +is used on a similar text, it should reproduce + +0:28:28.472 --> 0:28:29.553 +a similar score. + +0:28:31.972 --> 0:28:40.050 +Next, we would want the machine translation +system to be reliable. + +0:28:40.050 --> 0:28:42.583 +Machine translation. + +0:28:43.223 --> 0:28:52.143 +We want the matrix to be easy to run in general +and can be applied to multiple different machine. + +0:28:55.035 --> 0:29:11.148 +The difficulty of evaluating the metric itself +is kind of similar to when you evaluate the + +0:29:11.148 --> 0:29:13.450 +translation. + +0:29:18.638 --> 0:29:23.813 +And here is some components of the automatic +machine translation matrix. + +0:29:23.813 --> 0:29:28.420 +So for the matching matrix the component would +be the precision. + +0:29:28.420 --> 0:29:30.689 +Recall our Levinstein distance. + +0:29:30.689 --> 0:29:35.225 +So for the blow sparks you have seen it cares +mostly about the. + +0:29:36.396 --> 0:29:45.613 +And on the features it would be about how +to measure the matches or character based. 
+ +0:29:48.588 --> 0:30:01.304 +Now we will talk about more matrix because +the blue score is the most common. + +0:30:02.082 --> 0:30:10.863 +So it compared the reference and hypothesis +using edit operations. + +0:30:10.863 --> 0:30:14.925 +They count how many insertion. + +0:30:23.143 --> 0:30:31.968 +We already talked about it beyond what matching +would care about character based mathematization + +0:30:31.968 --> 0:30:34.425 +or linguistic information. + +0:30:36.636 --> 0:30:41.502 +The next metric is the meteor metric. + +0:30:41.502 --> 0:30:50.978 +This is strong called metric for evaluation +of translation with explicit. + +0:30:51.331 --> 0:31:03.236 +So merely their new idea is that they reintroduce +repose and combine with precision as small + +0:31:03.236 --> 0:31:04.772 +components. + +0:31:05.986 --> 0:31:16.700 +The language translation output with each +reference individually and takes part of the + +0:31:16.700 --> 0:31:18.301 +best parent. + +0:31:20.940 --> 0:31:27.330 +The next thing is that matching takes into +counterfection variation by stepping, so it's + +0:31:27.330 --> 0:31:28.119 +no longer. + +0:31:30.230 --> 0:31:40.165 +When they address fluency, they're a direct +penalty instead of ink arms so they would care + +0:31:40.165 --> 0:31:40.929 +about. + +0:31:45.925 --> 0:31:56.287 +The next thing is on two noble metrics, so +for this metric we want to extract some features. + +0:31:56.936 --> 0:32:04.450 +So for example here the nice house is on the +right and the building is on the right side + +0:32:04.450 --> 0:32:12.216 +so we will have to extract some pictures like +for example here the reference and hypothesis + +0:32:12.216 --> 0:32:14.158 +have hypers in common. + +0:32:14.714 --> 0:32:19.163 +They have one insertion, two deletions, and +they have the same verb. + +0:32:21.141 --> 0:32:31.530 +So the idea is to use machine translation +techniques to combine features and this machine + +0:32:31.530 --> 0:32:37.532 +translation model will be trained on human +ranking. + +0:32:39.819 --> 0:32:44.788 +Any common framework for this is comet. + +0:32:44.684 --> 0:32:48.094 +Which is a narrow model that is used with +X for. + +0:32:48.094 --> 0:32:54.149 +The feature would be created using some prejutant +model like X, L, M, U, R, A, BO, DA. + +0:32:54.149 --> 0:33:00.622 +Here the input would be the source, the reference +and the hypothesis and then they would try + +0:33:00.622 --> 0:33:02.431 +to produce an assessment. + +0:33:03.583 --> 0:33:05.428 +Yeah, it's strange to predict human sport. + +0:33:06.346 --> 0:33:19.131 +And they also have some additional versions, +as we train this model in order to tell whether + +0:33:19.131 --> 0:33:20.918 +translation. + +0:33:21.221 --> 0:33:29.724 +So instead of checking the source and the +hypothesis as input, they could take only the + +0:33:29.724 --> 0:33:38.034 +source and the hypotheses as input and try +to predict the quality of the translation. + +0:33:42.562 --> 0:33:49.836 +So assumptions before machine translation +systems are often used in larger systems. + +0:33:50.430 --> 0:33:57.713 +So the question is how to evaluate the performance +of the machine translation system in this larger + +0:33:57.713 --> 0:34:04.997 +scenario, and an example would be speech translation +system when you try to translate English audio + +0:34:04.997 --> 0:34:05.798 +to German. 
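Before the cascaded speech-translation setup is discussed further: the edit-operation counting mentioned at the start of this part (insertions, deletions, substitutions) is the classic word-level Levenshtein computation, sketched below. Metrics in the TER family additionally allow block shifts, which are omitted here; names are illustrative.

```python
def edit_operations(hypothesis, reference):
    """Minimum number of insertions, deletions and substitutions needed to
    turn the hypothesis into the reference (word-level Levenshtein distance)."""
    hyp, ref = hypothesis.split(), reference.split()
    # dp[i][j] = cost of transforming the first i hyp words into the first j ref words
    dp = [[0] * (len(ref) + 1) for _ in range(len(hyp) + 1)]
    for i in range(len(hyp) + 1):
        dp[i][0] = i                     # delete all remaining hypothesis words
    for j in range(len(ref) + 1):
        dp[0][j] = j                     # insert all remaining reference words
    for i in range(1, len(hyp) + 1):
        for j in range(1, len(ref) + 1):
            substitution = 0 if hyp[i - 1] == ref[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,            # deletion
                           dp[i][j - 1] + 1,            # insertion
                           dp[i - 1][j - 1] + substitution)
    return dp[len(hyp)][len(ref)]
```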
+ +0:34:06.506 --> 0:34:13.605 +Then it would usually have two opponents, +ASR and MT, where ASR is like speech recognition + +0:34:13.605 --> 0:34:20.626 +that can describe English audio to English +text, and then we have the machine translation + +0:34:20.626 --> 0:34:24.682 +system that translates English text to German +text. + +0:34:26.967 --> 0:34:33.339 +So in order to have these overall performances +in this bigger scenario, they are so willing + +0:34:33.339 --> 0:34:34.447 +to evaluate it. + +0:34:34.447 --> 0:34:41.236 +So the first one is to evaluate the individual +components like how good is the speech recognizer, + +0:34:41.236 --> 0:34:46.916 +how good is the analyzed and generalization +engines, how good is the synthesizer. + +0:34:47.727 --> 0:34:56.905 +The second way is to evaluate translation +quality from speech input to text output. + +0:34:56.905 --> 0:35:00.729 +How good is the final translation? + +0:35:02.102 --> 0:35:10.042 +The next thing is to measure the to evaluate +the architecture effectiveness like: How is + +0:35:10.042 --> 0:35:12.325 +the level effects in general? + +0:35:12.325 --> 0:35:19.252 +The next one is task based evaluation or use +a study like we just simply ask the user what + +0:35:19.252 --> 0:35:24.960 +is their experience like whether the system +works well and how well it is. + +0:35:27.267 --> 0:35:32.646 +So here we have an example of the ITF shale +test result. + +0:35:33.153 --> 0:35:38.911 +So the first block would be the human evaluation +like I think they are asked to give a spawl + +0:35:38.911 --> 0:35:44.917 +from one to five again where a fight is best +and one is worst and the lower one is the blowscore + +0:35:44.917 --> 0:35:50.490 +and they find out that the human evaluation +is far actually correlated with the blowsfall + +0:35:50.490 --> 0:35:51.233 +quite well. + +0:35:53.193 --> 0:36:02.743 +Here you can also see that the systems from +our university are actually on top many sub-tasts. + +0:36:05.605 --> 0:36:07.429 +So Yeah. + +0:36:08.868 --> 0:36:14.401 +For this lecture is that machine translation +evaluation is difficult. + +0:36:14.401 --> 0:36:21.671 +We talk about human versus automatic evaluation +that human would be costly, but then is the + +0:36:21.671 --> 0:36:27.046 +goal standard automatic evaluation would be +a fast and cheaper way. + +0:36:27.547 --> 0:36:36.441 +We talk about granulity on sentence level, +document level or task level evaluation machine + +0:36:36.441 --> 0:36:38.395 +translation system. + +0:36:39.679 --> 0:36:51.977 +And we talked about human evaluation versus +automatic metrics in details. + +0:36:54.034 --> 0:36:59.840 +So we introduced a lot of metric metrics. + +0:36:59.840 --> 0:37:10.348 +How do they compare from the quadrating of +human assessment so it's better? + +0:37:12.052 --> 0:37:16.294 +I don't have the exact score and reference +in my head. + +0:37:16.294 --> 0:37:22.928 +I would assume that mediators should have +a better correlation because here they also + +0:37:22.928 --> 0:37:30.025 +consider other aspects like the recall whether +the information in the reference is captured + +0:37:30.025 --> 0:37:31.568 +in the translation. + +0:37:32.872 --> 0:37:41.875 +Like synonyms, so I would assume that mid +air is better, but again don't have the reference + +0:37:41.875 --> 0:37:43.441 +in my hair, so. 
+ +0:37:43.903 --> 0:37:49.771 +But guess the reason people are still using +BlueScore is that in most literature, a machine + +0:37:49.771 --> 0:38:00.823 +translation system, they report: So now you +create a new machine translation system. + +0:38:00.823 --> 0:38:07.990 +It might be better to also report the blow. + +0:38:08.228 --> 0:38:11.472 +Exactly just slice good, just spread white, +and then we're going to go ahead. + +0:38:12.332 --> 0:38:14.745 +And don't know what you're doing. + +0:38:17.457 --> 0:38:18.907 +I Want to Talk Quickly About. + +0:38:19.059 --> 0:38:32.902 +So it is like a language model, so it's kind +of the same uses as. + +0:38:33.053 --> 0:38:39.343 +So the idea is that we have this layer in +order to embed the sauce and the reference + +0:38:39.343 --> 0:38:39.713 +and. + +0:38:40.000 --> 0:38:54.199 +Into some feature vectors that we can later +on use to predict the human sport in the. + +0:38:58.618 --> 0:39:00.051 +It If There's Nothing Else. + diff --git a/demo_data/lectures/Lecture-05-02.05.2023/video.mp4 b/demo_data/lectures/Lecture-05-02.05.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e510c1064eadae86a257998be310627c6c458f1f --- /dev/null +++ b/demo_data/lectures/Lecture-05-02.05.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5014f3570b8db38818ab44ed117dc6d67206c5163b6b87b45df4a2aa426b8222 +size 314238982 diff --git a/demo_data/lectures/Lecture-06-09.05.2023/English.vtt b/demo_data/lectures/Lecture-06-09.05.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..54df8fa3fd239551a0302b2e7467bf6bb13d5f93 --- /dev/null +++ b/demo_data/lectures/Lecture-06-09.05.2023/English.vtt @@ -0,0 +1,2970 @@ +WEBVTT + +0:00:01.721 --> 0:00:08.584 +Hey, then welcome to today's lecture on language +modeling. + +0:00:09.409 --> 0:00:21.608 +We had not a different view on machine translation, +which was the evaluation path it's important + +0:00:21.608 --> 0:00:24.249 +to evaluate and see. + +0:00:24.664 --> 0:00:33.186 +We want to continue with building the MT system +and this will be the last part before we are + +0:00:33.186 --> 0:00:36.668 +going into a neural step on Thursday. + +0:00:37.017 --> 0:00:45.478 +So we had the the broader view on statistical +machine translation and the. + +0:00:45.385 --> 0:00:52.977 +Thursday: A week ago we talked about the statistical +machine translation and mainly the translation + +0:00:52.977 --> 0:00:59.355 +model, so how we model how probable is it that +one word is translated into another. + +0:01:00.800 --> 0:01:15.583 +However, there is another component when doing +generation tasks in general and machine translation. + +0:01:16.016 --> 0:01:23.797 +There are several characteristics which you +only need to model on the target side in the + +0:01:23.797 --> 0:01:31.754 +traditional approach where we talked about +the generation from more semantic or synthectic + +0:01:31.754 --> 0:01:34.902 +representation into the real world. + +0:01:35.555 --> 0:01:51.013 +And the challenge is that there's some constructs +which are only there in the target language. + +0:01:52.132 --> 0:01:57.908 +You cannot really get that translation, but +it's more something that needs to model on + +0:01:57.908 --> 0:01:58.704 +the target. + +0:01:59.359 --> 0:02:05.742 +And this is done typically by a language model +and this concept of language model. + +0:02:06.326 --> 0:02:11.057 +Guess you can assume nowadays very important. 
+ +0:02:11.057 --> 0:02:20.416 +You've read a lot about large language models +recently and they are all somehow trained or + +0:02:20.416 --> 0:02:22.164 +the idea behind. + +0:02:25.986 --> 0:02:41.802 +What we'll look today at if get the next night +and look what a language model is and today's + +0:02:41.802 --> 0:02:42.992 +focus. + +0:02:43.363 --> 0:02:49.188 +This was the common approach to the language +model for twenty or thirty years, so a lot + +0:02:49.188 --> 0:02:52.101 +of time it was really the state of the art. + +0:02:52.101 --> 0:02:58.124 +And people have used that in many applications +in machine translation and automatic speech + +0:02:58.124 --> 0:02:58.985 +recognition. + +0:02:59.879 --> 0:03:11.607 +Again you are measuring the performance, but +this is purely the performance of the language + +0:03:11.607 --> 0:03:12.499 +model. + +0:03:13.033 --> 0:03:23.137 +And then we will see that the traditional +language will have a major drawback in how + +0:03:23.137 --> 0:03:24.683 +we can deal. + +0:03:24.944 --> 0:03:32.422 +So if you model language you will see that +in most of the sentences and you have not really + +0:03:32.422 --> 0:03:39.981 +seen and you're still able to assess if this +is good language or if this is native language. + +0:03:40.620 --> 0:03:45.092 +And this is challenging if you do just like +parameter estimation. + +0:03:45.605 --> 0:03:59.277 +We are using two different techniques to do: +interpolation, and these are essentially in + +0:03:59.277 --> 0:04:01.735 +order to build. + +0:04:01.881 --> 0:04:11.941 +It also motivates why things might be easier +if we are going into neural morals as we will. + +0:04:12.312 --> 0:04:18.203 +And at the end we'll talk a bit about some +additional type of language models which are + +0:04:18.203 --> 0:04:18.605 +also. + +0:04:20.440 --> 0:04:29.459 +So where our language was used, or how are +they used in the machine translations? + +0:04:30.010 --> 0:04:38.513 +So the idea of a language model is that we +are modeling what is the fluency of language. + +0:04:38.898 --> 0:04:49.381 +So if you have, for example, sentence will, +then you can estimate that there are some words: + +0:04:49.669 --> 0:05:08.929 +For example, the next word is valid, but will +card's words not? + +0:05:09.069 --> 0:05:13.673 +And we can do that. + +0:05:13.673 --> 0:05:22.192 +We have seen that the noise channel. + +0:05:22.322 --> 0:05:33.991 +That we have seen someone two weeks ago, and +today we will look into how can we model P + +0:05:33.991 --> 0:05:36.909 +of Y or how possible. + +0:05:37.177 --> 0:05:44.192 +Now this is completely independent of the +translation process. + +0:05:44.192 --> 0:05:49.761 +How fluent is a sentence and how you can express? + +0:05:51.591 --> 0:06:01.699 +And this language model task has one really +big advantage and assume that is even the big + +0:06:01.699 --> 0:06:02.935 +advantage. + +0:06:03.663 --> 0:06:16.345 +The big advantage is the data we need to train +that so normally we are doing supervised learning. + +0:06:16.876 --> 0:06:20.206 +So machine translation will talk about. + +0:06:20.206 --> 0:06:24.867 +That means we have the source center and target +center. + +0:06:25.005 --> 0:06:27.620 +They need to be aligned. + +0:06:27.620 --> 0:06:31.386 +We look into how we can model them. 
+ +0:06:31.386 --> 0:06:39.270 +Generally, the problem with this is that: +Machine translation: You still have the advantage + +0:06:39.270 --> 0:06:45.697 +that there's quite huge amounts of this data +for many languages, not all but many, but other + +0:06:45.697 --> 0:06:47.701 +classes even more difficult. + +0:06:47.701 --> 0:06:50.879 +There's very few data where you have summary. + +0:06:51.871 --> 0:07:02.185 +So the big advantage of language model is +we're only modeling the centers, so we only + +0:07:02.185 --> 0:07:04.103 +need pure text. + +0:07:04.584 --> 0:07:11.286 +And pure text, especially since we have the +Internet face melting large amounts of text. + +0:07:11.331 --> 0:07:17.886 +Of course, it's still, it's still maybe only +for some domains, some type. + +0:07:18.198 --> 0:07:23.466 +Want to have data for speech about machine +translation. + +0:07:23.466 --> 0:07:27.040 +Maybe there's only limited data that. + +0:07:27.027 --> 0:07:40.030 +There's always and also you go to some more +exotic languages and then you will have less + +0:07:40.030 --> 0:07:40.906 +data. + +0:07:41.181 --> 0:07:46.803 +And in language once we can now look, how +can we make use of these data? + +0:07:47.187 --> 0:07:54.326 +And: Nowadays this is often also framed as +self supervised learning because on the one + +0:07:54.326 --> 0:08:00.900 +hand here we'll see it's a time of classification +cast or supervised learning but we create some + +0:08:00.900 --> 0:08:02.730 +other data science itself. + +0:08:02.742 --> 0:08:13.922 +So it's not that we have this pair of data +text and labels, but we have only the text. + +0:08:15.515 --> 0:08:21.367 +So the question is how can we use this modeling +data and how can we train our language? + +0:08:22.302 --> 0:08:35.086 +The main goal is to produce fluent English, +so we want to somehow model that something + +0:08:35.086 --> 0:08:38.024 +is a sentence of a. + +0:08:38.298 --> 0:08:44.897 +So there is no clear separation about semantics +and syntax, but in this case it is not about + +0:08:44.897 --> 0:08:46.317 +a clear separation. + +0:08:46.746 --> 0:08:50.751 +So we will monitor them somehow in there. + +0:08:50.751 --> 0:08:56.091 +There will be some notion of semantics, some +notion of. + +0:08:56.076 --> 0:09:08.748 +Because you say you want to water how fluid +or probable is that the native speaker is producing + +0:09:08.748 --> 0:09:12.444 +that because of the one in. + +0:09:12.512 --> 0:09:17.711 +We are rarely talking like things that are +semantically wrong, and therefore there is + +0:09:17.711 --> 0:09:18.679 +also some type. + +0:09:19.399 --> 0:09:24.048 +So, for example, the house is small. + +0:09:24.048 --> 0:09:30.455 +It should be a higher stability than the house +is. + +0:09:31.251 --> 0:09:38.112 +Because home and house are both meaning German, +they are used differently. + +0:09:38.112 --> 0:09:43.234 +For example, it should be more probable that +the plane. + +0:09:44.444 --> 0:09:51.408 +So this is both synthetically correct, but +cementically not. + +0:09:51.408 --> 0:09:58.372 +But still you will see much more often the +probability that. + +0:10:03.883 --> 0:10:14.315 +So more formally, it's about like the language +should be some type of function, and it gives + +0:10:14.315 --> 0:10:18.690 +us the probability that this sentence. + +0:10:19.519 --> 0:10:27.312 +Indicating that this is good English or more +generally English, of course you can do that. 
+ +0:10:28.448 --> 0:10:37.609 +And earlier times people have even done try +to do that deterministic that was especially + +0:10:37.609 --> 0:10:40.903 +used for more dialogue systems. + +0:10:40.840 --> 0:10:50.660 +You have a very strict syntax so you can only +use like turn off the, turn off the radio. + +0:10:50.690 --> 0:10:56.928 +Something else, but you have a very strict +deterministic finance state grammar like which + +0:10:56.928 --> 0:10:58.107 +type of phrases. + +0:10:58.218 --> 0:11:04.791 +The problem of course if we're dealing with +language is that language is variable, we're + +0:11:04.791 --> 0:11:10.183 +not always talking correct sentences, and so +this type of deterministic. + +0:11:10.650 --> 0:11:22.121 +That's why for already many, many years people +look into statistical language models and try + +0:11:22.121 --> 0:11:24.587 +to model something. + +0:11:24.924 --> 0:11:35.096 +So something like what is the probability +of the sequences of to, and that is what. + +0:11:35.495 --> 0:11:43.076 +The advantage of doing it statistically is +that we can train large text databases so we + +0:11:43.076 --> 0:11:44.454 +can train them. + +0:11:44.454 --> 0:11:52.380 +We don't have to define it and most of these +cases we don't want to have the hard decision. + +0:11:52.380 --> 0:11:55.481 +This is a sentence of the language. + +0:11:55.815 --> 0:11:57.914 +Why we want to have some type of probability? + +0:11:57.914 --> 0:11:59.785 +How probable is this part of the center? + +0:12:00.560 --> 0:12:04.175 +Because yeah, even for a few minutes, it's +not always clear. + +0:12:04.175 --> 0:12:06.782 +Is this a sentence that you can use or not? + +0:12:06.782 --> 0:12:12.174 +I mean, I just in this presentation gave several +sentences, which are not correct English. + +0:12:12.174 --> 0:12:17.744 +So it might still happen that people speak +sentences or write sentences that I'm not correct, + +0:12:17.744 --> 0:12:19.758 +and you want to deal with all of. + +0:12:20.020 --> 0:12:25.064 +So that is then, of course, a big advantage +if you use your more statistical models. + +0:12:25.705 --> 0:12:35.810 +The disadvantage is that you need a subtitle +of large text databases which might exist from + +0:12:35.810 --> 0:12:37.567 +many languages. + +0:12:37.857 --> 0:12:46.511 +Nowadays you see that there is of course issues +that you need large computational resources + +0:12:46.511 --> 0:12:47.827 +to deal with. + +0:12:47.827 --> 0:12:56.198 +You need to collect all these crawlers on +the internet which can create enormous amounts + +0:12:56.198 --> 0:12:57.891 +of training data. + +0:12:58.999 --> 0:13:08.224 +So if we want to build this then the question +is of course how can we estimate the probability? + +0:13:08.448 --> 0:13:10.986 +So how probable is the sentence good morning? + +0:13:11.871 --> 0:13:15.450 +And you all know basic statistics. + +0:13:15.450 --> 0:13:21.483 +So if you see this you have a large database +of sentences. + +0:13:21.901 --> 0:13:28.003 +Made this a real example, so this was from +the TED talks. + +0:13:28.003 --> 0:13:37.050 +I guess most of you have heard about them, +and if you account for all many sentences, + +0:13:37.050 --> 0:13:38.523 +good morning. + +0:13:38.718 --> 0:13:49.513 +It happens so the probability of good morning +is sweet point times to the power minus. + +0:13:50.030 --> 0:13:53.755 +Okay, so this is a very easy thing. + +0:13:53.755 --> 0:13:58.101 +We can directly model the language model. 
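A minimal sketch of the counting approach just described: estimate the probability of a whole sentence as its relative frequency in a corpus of sentences. The corpus below is invented toy data, not the TED counts from the lecture.

```python
from collections import Counter

def sentence_probability(corpus_sentences, sentence):
    """Naive 'whole sentence' language model: relative frequency of the
    exact sentence among all sentences in the corpus."""
    counts = Counter(corpus_sentences)
    return counts[sentence] / len(corpus_sentences)

corpus = ["good morning", "good morning", "thank you", "hello everyone"]
print(sentence_probability(corpus, "good morning"))  # 0.5
print(sentence_probability(corpus, "good evening"))  # 0.0 -> any unseen sentence gets zero
```

The last line already shows the sparsity problem raised in the following discussion: every sentence that never occurred verbatim in the training data gets probability zero.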
+ +0:13:58.959 --> 0:14:03.489 +Does anybody see a problem why this might +not be the final solution? + +0:14:06.326 --> 0:14:14.962 +Think we would need a folder of more sentences +to make anything useful of this. + +0:14:15.315 --> 0:14:29.340 +Because the probability of the talk starting +with good morning, good morning is much higher + +0:14:29.340 --> 0:14:32.084 +than ten minutes. + +0:14:33.553 --> 0:14:41.700 +In all the probability presented in this face, +not how we usually think about it. + +0:14:42.942 --> 0:14:55.038 +The probability is even OK, but you're going +into the right direction about the large data. + +0:14:55.038 --> 0:14:59.771 +Yes, you can't form a new sentence. + +0:15:00.160 --> 0:15:04.763 +It's about a large data, so you said it's +hard to get enough data. + +0:15:04.763 --> 0:15:05.931 +It's impossible. + +0:15:05.931 --> 0:15:11.839 +I would say we are always saying sentences +which have never been said and we are able + +0:15:11.839 --> 0:15:12.801 +to deal with. + +0:15:13.133 --> 0:15:25.485 +The problem with the sparsity of the data +will have a lot of perfect English sentences. + +0:15:26.226 --> 0:15:31.338 +And this is, of course, not what we want to +deal with. + +0:15:31.338 --> 0:15:39.332 +If we want to model that, we need to have +a model which can really estimate how good. + +0:15:39.599 --> 0:15:47.970 +And if we are just like counting this way, +most of it will get a zero probability, which + +0:15:47.970 --> 0:15:48.722 +is not. + +0:15:49.029 --> 0:15:56.572 +So we need to make things a bit different. + +0:15:56.572 --> 0:16:06.221 +For the models we had already some idea of +doing that. + +0:16:06.486 --> 0:16:08.058 +And that we can do here again. + +0:16:08.528 --> 0:16:12.866 +So we can especially use the gel gel. + +0:16:12.772 --> 0:16:19.651 +The chain rule and the definition of conditional +probability solve the conditional probability. + +0:16:19.599 --> 0:16:26.369 +Of an event B given in an event A is the probability +of A and B divided to the probability of A. + +0:16:26.369 --> 0:16:32.720 +Yes, I recently had a exam on a manic speech +recognition and Mister Rival said this is not + +0:16:32.720 --> 0:16:39.629 +called a chain of wood because I use this terminology +and he said it's just applying base another. + +0:16:40.500 --> 0:16:56.684 +But this is definitely the definition of the +condition of probability. + +0:16:57.137 --> 0:17:08.630 +The probability is defined as P of A and P +of supposed to be divided by the one. + +0:17:08.888 --> 0:17:16.392 +And that can be easily rewritten into and +times given. + +0:17:16.816 --> 0:17:35.279 +And the nice thing is, we can easily extend +it, of course, into more variables so we can + +0:17:35.279 --> 0:17:38.383 +have: And so on. + +0:17:38.383 --> 0:17:49.823 +So more generally you can do that for now +any length of sequence. + +0:17:50.650 --> 0:18:04.802 +So if we are now going back to words, we can +model that as the probability of the sequence + +0:18:04.802 --> 0:18:08.223 +is given its history. + +0:18:08.908 --> 0:18:23.717 +Maybe it's more clear if we're looking at +real works, so if we have pee-off, it's water + +0:18:23.717 --> 0:18:26.914 +is so transparent. + +0:18:26.906 --> 0:18:39.136 +So this way we are able to model the ability +of the whole sentence given the sequence by + +0:18:39.136 --> 0:18:42.159 +looking at each word. + +0:18:42.762 --> 0:18:49.206 +And of course the big advantage is that each +word occurs less often than the full sect. 
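The chain-rule decomposition sketched above, written out in standard notation (my reconstruction, not copied from the slides):

```latex
P(w_1, w_2, \dots, w_n)
  \;=\; P(w_1)\, P(w_2 \mid w_1)\, P(w_3 \mid w_1, w_2) \cdots
  \;=\; \prod_{i=1}^{n} P(w_i \mid w_1, \dots, w_{i-1})
```

For the example in the lecture this reads P(its water is so transparent) = P(its) · P(water | its) · P(is | its water) · P(so | its water is) · P(transparent | its water is so).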
+ +0:18:49.206 --> 0:18:54.991 +So hopefully we see that still, of course, +the problem the word doesn't occur. + +0:18:54.991 --> 0:19:01.435 +Then this doesn't work, but let's recover +most of the lectures today about dealing with + +0:19:01.435 --> 0:19:01.874 +this. + +0:19:02.382 --> 0:19:08.727 +So by first of all, we generally is at least +easier as the thing we have before. + +0:19:13.133 --> 0:19:23.531 +That we really make sense easier, no, because +those jumps get utterly long and we have central. + +0:19:23.943 --> 0:19:29.628 +Yes exactly, so when we look at the last probability +here, we still have to have seen the full. + +0:19:30.170 --> 0:19:38.146 +So if we want a molecule of transparent, if +water is so we have to see the food sequence. + +0:19:38.578 --> 0:19:48.061 +So in first step we didn't really have to +have seen the full sentence. + +0:19:48.969 --> 0:19:52.090 +However, a little bit of a step nearer. + +0:19:52.512 --> 0:19:59.673 +So this is still a problem and we will never +have seen it for all the time. + +0:20:00.020 --> 0:20:08.223 +So you can look at this if you have a vocabulary +of words. + +0:20:08.223 --> 0:20:17.956 +Now, for example, if the average sentence +is, you would leave to the. + +0:20:18.298 --> 0:20:22.394 +And we are quite sure we have never seen that +much date. + +0:20:22.902 --> 0:20:26.246 +So this is, we cannot really compute this +probability. + +0:20:26.786 --> 0:20:37.794 +However, there's a trick how we can do that +and that's the idea between most of the language. + +0:20:38.458 --> 0:20:44.446 +So instead of saying how often does this work +happen to exactly this history, we are trying + +0:20:44.446 --> 0:20:50.433 +to do some kind of clustering and cluster a +lot of different histories into the same class, + +0:20:50.433 --> 0:20:55.900 +and then we are modeling the probability of +the word given this class of histories. + +0:20:56.776 --> 0:21:06.245 +And then, of course, the big design decision +is how to be modeled like how to cluster history. + +0:21:06.666 --> 0:21:17.330 +So how do we put all these histories together +so that we have seen each of one off enough + +0:21:17.330 --> 0:21:18.396 +so that. + +0:21:20.320 --> 0:21:25.623 +So there is quite different types of things +people can do. + +0:21:25.623 --> 0:21:33.533 +You can add some speech texts, you can do +semantic words, you can model the similarity, + +0:21:33.533 --> 0:21:46.113 +you can model grammatical content, and things +like: However, like quite often in these statistical + +0:21:46.113 --> 0:21:53.091 +models, if you have a very simple solution. + +0:21:53.433 --> 0:21:58.455 +And this is what most statistical models do. + +0:21:58.455 --> 0:22:09.616 +They are based on the so called mark of assumption, +and that means we are assuming all this history + +0:22:09.616 --> 0:22:12.183 +is not that important. + +0:22:12.792 --> 0:22:25.895 +So we are modeling the probability of zirkins +is so transparent that or we have maybe two + +0:22:25.895 --> 0:22:29.534 +words by having a fixed. + +0:22:29.729 --> 0:22:38.761 +So the class of all our history from word +to word minus one is just the last two words. + +0:22:39.679 --> 0:22:45.229 +And by doing this classification, which of +course does need any additional knowledge. + +0:22:45.545 --> 0:22:51.176 +It's very easy to calculate we have no limited +our our histories. + +0:22:51.291 --> 0:23:00.906 +So instead of an arbitrary long one here, +we have here only like. 
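The Markov assumption described here, as a formula (standard n-gram notation, my write-up): the full history is approximated by the last n-1 words only.

```latex
P(w_i \mid w_1, \dots, w_{i-1}) \;\approx\; P(w_i \mid w_{i-n+1}, \dots, w_{i-1})
```

So a bigram model (n = 2) conditions only on the previous word, and a trigram model (n = 3) only on the two previous words.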
+ +0:23:00.906 --> 0:23:10.375 +For example, if we have two grams, a lot of +them will not occur. + +0:23:10.930 --> 0:23:20.079 +So it's a very simple trick to make all these +classes into a few classes and motivated by, + +0:23:20.079 --> 0:23:24.905 +of course, the language the nearest things +are. + +0:23:24.944 --> 0:23:33.043 +Like a lot of sequences, they mainly depend +on the previous one, and things which are far + +0:23:33.043 --> 0:23:33.583 +away. + +0:23:38.118 --> 0:23:47.361 +In our product here everything is just modeled +not by the whole history but by the last and + +0:23:47.361 --> 0:23:48.969 +minus one word. + +0:23:50.470 --> 0:23:54.322 +So and this is typically expressed by people. + +0:23:54.322 --> 0:24:01.776 +They're therefore also talking by an N gram +language model because we are always looking + +0:24:01.776 --> 0:24:06.550 +at these chimes of N words and modeling the +probability. + +0:24:07.527 --> 0:24:10.485 +So again start with the most simple case. + +0:24:10.485 --> 0:24:15.485 +Even extreme is the unigram case, so we're +ignoring the whole history. + +0:24:15.835 --> 0:24:24.825 +The probability of a sequence of words is +just the probability of each of the words in + +0:24:24.825 --> 0:24:25.548 +there. + +0:24:26.046 --> 0:24:32.129 +And therefore we are removing the whole context. + +0:24:32.129 --> 0:24:40.944 +The most probable sequence would be something +like one of them is the. + +0:24:42.162 --> 0:24:44.694 +Most probable wordsuit by itself. + +0:24:44.694 --> 0:24:49.684 +It might not make sense, but it, of course, +can give you a bit of. + +0:24:49.629 --> 0:24:52.682 +Intuition like which types of words should +be more frequent. + +0:24:53.393 --> 0:25:00.012 +And if you what you can do is train such a +button and you can just automatically generate. + +0:25:00.140 --> 0:25:09.496 +And this sequence is generated by sampling, +so we will later come in the lecture too. + +0:25:09.496 --> 0:25:16.024 +The sampling is that you randomly pick a word +but based on. + +0:25:16.096 --> 0:25:22.711 +So if the probability of one word is zero +point two then you'll put it on and if another + +0:25:22.711 --> 0:25:23.157 +word. + +0:25:23.483 --> 0:25:36.996 +And if you see that you'll see here now, for +example, it seems that these are two occurring + +0:25:36.996 --> 0:25:38.024 +posts. + +0:25:38.138 --> 0:25:53.467 +But you see there's not really any continuing +type of structure because each word is modeled + +0:25:53.467 --> 0:25:55.940 +independently. + +0:25:57.597 --> 0:26:03.037 +This you can do better even though going to +a biograph, so then we're having a bit of context. + +0:26:03.037 --> 0:26:08.650 +Of course, it's still very small, so the probability +of your word of the actual word only depends + +0:26:08.650 --> 0:26:12.429 +on the previous word and all the context before +there is ignored. + +0:26:13.133 --> 0:26:18.951 +This of course will come to that wrong, but +it models a regular language significantly + +0:26:18.951 --> 0:26:19.486 +better. + +0:26:19.779 --> 0:26:28.094 +Seeing some things here still doesn't really +make a lot of sense, but you're seeing some + +0:26:28.094 --> 0:26:29.682 +typical phrases. + +0:26:29.949 --> 0:26:39.619 +In this hope doesn't make sense, but in this +issue is also frequent. + +0:26:39.619 --> 0:26:51.335 +Issue is also: Very nice is this year new +car parking lot after, so if you have the word + +0:26:51.335 --> 0:26:53.634 +new then the word. 
+ +0:26:53.893 --> 0:27:01.428 +Is also quite common, but new car they wouldn't +put parking. + +0:27:01.428 --> 0:27:06.369 +Often the continuation is packing lots. + +0:27:06.967 --> 0:27:12.417 +And now it's very interesting because here +we see the two cementic meanings of lot: You + +0:27:12.417 --> 0:27:25.889 +have a parking lot, but in general if you just +think about the history, the most common use + +0:27:25.889 --> 0:27:27.353 +is a lot. + +0:27:27.527 --> 0:27:33.392 +So you see that he's really not using the +context before, but he's only using the current + +0:27:33.392 --> 0:27:33.979 +context. + +0:27:38.338 --> 0:27:41.371 +So in general we can of course do that longer. + +0:27:41.371 --> 0:27:43.888 +We can do unigrams, bigrams, trigrams. + +0:27:45.845 --> 0:27:52.061 +People typically went up to four or five grams, +and then it's getting difficult because. + +0:27:52.792 --> 0:27:56.671 +There are so many five grams that it's getting +complicated. + +0:27:56.671 --> 0:28:02.425 +Storing all of them and storing these models +get so big that it's no longer working, and + +0:28:02.425 --> 0:28:08.050 +of course at some point the calculation of +the probabilities again gets too difficult, + +0:28:08.050 --> 0:28:09.213 +and each of them. + +0:28:09.429 --> 0:28:14.777 +If you have a small corpus, of course you +will use a smaller ingram length. + +0:28:14.777 --> 0:28:16.466 +You will take a larger. + +0:28:18.638 --> 0:28:24.976 +What is important to keep in mind is that, +of course, this is wrong. + +0:28:25.285 --> 0:28:36.608 +So we have long range dependencies, and if +we really want to model everything in language + +0:28:36.608 --> 0:28:37.363 +then. + +0:28:37.337 --> 0:28:46.965 +So here is like one of these extreme cases, +the computer, which has just put into the machine + +0:28:46.965 --> 0:28:49.423 +room in the slow crash. + +0:28:49.423 --> 0:28:55.978 +Like somehow, there is a dependency between +computer and crash. + +0:28:57.978 --> 0:29:10.646 +However, in most situations these are typically +rare and normally most important things happen + +0:29:10.646 --> 0:29:13.446 +in the near context. + +0:29:15.495 --> 0:29:28.408 +But of course it's important to keep that +in mind that you can't model the thing so you + +0:29:28.408 --> 0:29:29.876 +can't do. + +0:29:33.433 --> 0:29:50.200 +The next question is again how can we train +so we have to estimate these probabilities. + +0:29:51.071 --> 0:30:00.131 +And the question is how we do that, and again +the most simple thing. + +0:30:00.440 --> 0:30:03.168 +The thing is exactly what's maximum legal +destination. + +0:30:03.168 --> 0:30:12.641 +What gives you the right answer is: So how +probable is that the word is following minus + +0:30:12.641 --> 0:30:13.370 +one? + +0:30:13.370 --> 0:30:20.946 +You just count how often does this sequence +happen? + +0:30:21.301 --> 0:30:28.165 +So guess this is what most of you would have +intuitively done, and this also works best. + +0:30:28.568 --> 0:30:39.012 +So it's not a complicated train, so you once +have to go over your corpus, you have to count + +0:30:39.012 --> 0:30:48.662 +our diagrams and unigrams, and then you can +directly train the basic language model. + +0:30:49.189 --> 0:30:50.651 +Who is it difficult? + +0:30:50.651 --> 0:30:58.855 +There are two difficulties: The basic language +well doesn't work that well because of zero + +0:30:58.855 --> 0:31:03.154 +counts and how we address that and the second. 
+ +0:31:03.163 --> 0:31:13.716 +Because we saw that especially if you go for +larger you have to store all these engrams + +0:31:13.716 --> 0:31:15.275 +efficiently. + +0:31:17.697 --> 0:31:21.220 +So how we can do that? + +0:31:21.220 --> 0:31:24.590 +Here's some examples. + +0:31:24.590 --> 0:31:33.626 +For example, if you have the sequence your +training curve. + +0:31:33.713 --> 0:31:41.372 +You see that the word happens, ascends the +star and the sequence happens two times. + +0:31:42.182 --> 0:31:45.651 +We have three times. + +0:31:45.651 --> 0:31:58.043 +The same starts as the probability is to thirds +and the other probability. + +0:31:58.858 --> 0:32:09.204 +Here we have what is following so you have +twice and once do so again two thirds and one. + +0:32:09.809 --> 0:32:20.627 +And this is all that you need to know here +about it, so you can do this calculation. + +0:32:23.723 --> 0:32:35.506 +So the question then, of course, is what do +we really learn in these types of models? + +0:32:35.506 --> 0:32:45.549 +Here are examples from the Europycopterus: +The green, the red, and the blue, and here + +0:32:45.549 --> 0:32:48.594 +you have the probabilities which is the next. + +0:32:48.989 --> 0:33:01.897 +That there is a lot more than just like the +syntax because the initial phrase is all the + +0:33:01.897 --> 0:33:02.767 +same. + +0:33:03.163 --> 0:33:10.132 +For example, you see the green paper in the +green group. + +0:33:10.132 --> 0:33:16.979 +It's more European palaman, the red cross, +which is by. + +0:33:17.197 --> 0:33:21.777 +What you also see that it's like sometimes +Indian, sometimes it's more difficult. + +0:33:22.302 --> 0:33:28.345 +So, for example, following the rats, in one +hundred cases it was a red cross. + +0:33:28.668 --> 0:33:48.472 +So it seems to be easier to guess the next +word. + +0:33:48.528 --> 0:33:55.152 +So there is different types of information +coded in that you also know that I guess sometimes + +0:33:55.152 --> 0:33:58.675 +you directly know all the speakers will continue. + +0:33:58.675 --> 0:34:04.946 +It's not a lot of new information in the next +word, but in other cases like blue there's + +0:34:04.946 --> 0:34:06.496 +a lot of information. + +0:34:11.291 --> 0:34:14.849 +Another example is this Berkeley restaurant +sentences. + +0:34:14.849 --> 0:34:21.059 +It's collected at Berkeley and you have sentences +like can you tell me about any good spaghetti + +0:34:21.059 --> 0:34:21.835 +restaurant. + +0:34:21.835 --> 0:34:27.463 +Big price title is what I'm looking for so +it's more like a dialogue system and people + +0:34:27.463 --> 0:34:31.215 +have collected this data and of course you +can also look. + +0:34:31.551 --> 0:34:46.878 +Into this and get the counts, so you count +the vibrants in the top, so the color is the. + +0:34:49.409 --> 0:34:52.912 +This is a bigram which is the first word of +West. + +0:34:52.912 --> 0:34:54.524 +This one fuzzy is one. + +0:34:56.576 --> 0:35:12.160 +One because want to hyperability, but want +a lot less, and there where you see it, for + +0:35:12.160 --> 0:35:17.004 +example: So here you see after I want. + +0:35:17.004 --> 0:35:23.064 +It's very often for I eat, but an island which +is not just. + +0:35:27.347 --> 0:35:39.267 +The absolute counts of how often each road +occurs, and then you can see here the probabilities + +0:35:39.267 --> 0:35:40.145 +again. 
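A small sketch tying together the last few passages: maximum-likelihood bigram estimation from counts, scoring a sentence by multiplying its bigram probabilities, and generating text by sampling word by word. The toy corpus and all names are my own illustration, not the Berkeley restaurant data or the counts from the slides.

```python
import random
from collections import Counter

def train_bigram_mle(sentences):
    """P(w | prev) = count(prev, w) / count(prev), with <s>/</s> sentence markers."""
    unigrams, bigrams = Counter(), Counter()
    for sent in sentences:
        tokens = ["<s>"] + sent.split() + ["</s>"]
        unigrams.update(tokens[:-1])
        bigrams.update(zip(tokens, tokens[1:]))
    return unigrams, bigrams

def prob(unigrams, bigrams, word, prev):
    return bigrams[(prev, word)] / unigrams[prev] if unigrams[prev] else 0.0

def sentence_probability(unigrams, bigrams, sentence):
    """Multiply the bigram probabilities along the sentence."""
    tokens = ["<s>"] + sentence.split() + ["</s>"]
    p = 1.0
    for prev, word in zip(tokens, tokens[1:]):
        p *= prob(unigrams, bigrams, word, prev)
    return p

def generate(unigrams, bigrams, max_len=10):
    """Sample word by word from P(w | prev), the bigram generation idea from the lecture."""
    prev, out = "<s>", []
    for _ in range(max_len):
        candidates = [(w, c) for (p, w), c in bigrams.items() if p == prev]
        words, weights = zip(*candidates)
        word = random.choices(words, weights=weights, k=1)[0]
        if word == "</s>":
            break
        out.append(word)
        prev = word
    return " ".join(out)

uni, bi = train_bigram_mle(["I want Dutch food", "I want Chinese food", "I want to eat"])
print(prob(uni, bi, "want", "I"))                          # 3/3 = 1.0
print(sentence_probability(uni, bi, "I want Dutch food"))  # 1 * 1 * 1/3 * 1 * 1
print(generate(uni, bi))                                   # e.g. "I want Chinese food"
```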
+ +0:35:42.422 --> 0:35:54.519 +Then do that if you want to do iwan Dutch +food you get the sequence you have to multiply + +0:35:54.519 --> 0:35:55.471 +olive. + +0:35:55.635 --> 0:36:00.281 +And then you of course get a bit of interesting +experience on that. + +0:36:00.281 --> 0:36:04.726 +For example: Information is there. + +0:36:04.726 --> 0:36:15.876 +So, for example, if you compare I want Dutch +or I want Chinese, it seems that. + +0:36:16.176 --> 0:36:22.910 +That the sentence often starts with eye. + +0:36:22.910 --> 0:36:31.615 +You have it after two is possible, but after +one it. + +0:36:31.731 --> 0:36:39.724 +And you cannot say want, but you have to say +want to spend, so there's grammical information. + +0:36:40.000 --> 0:36:51.032 +To main information and source: Here before +we're going into measuring quality, is there + +0:36:51.032 --> 0:36:58.297 +any questions about language model and the +idea of modeling? + +0:37:02.702 --> 0:37:13.501 +Hope that doesn't mean everybody sleeping, +and so when we're doing the training these + +0:37:13.501 --> 0:37:15.761 +language models,. + +0:37:16.356 --> 0:37:26.429 +You need to model what is the engrum length +should we use a trigram or a forkrum. + +0:37:27.007 --> 0:37:34.040 +So in order to decide how can you now decide +which of the two models are better? + +0:37:34.914 --> 0:37:40.702 +And if you would have to do that, how would +you decide taking language model or taking + +0:37:40.702 --> 0:37:41.367 +language? + +0:37:43.263 --> 0:37:53.484 +I take some test text and see which model +assigns a higher probability to me. + +0:37:54.354 --> 0:38:03.978 +It's very good, so that's even the second +thing, so the first thing maybe would have + +0:38:03.978 --> 0:38:04.657 +been. + +0:38:05.925 --> 0:38:12.300 +The problem is the and then you take the language +language language and machine translation. + +0:38:13.193 --> 0:38:18.773 +Problems: First of all you have to build a +whole system which is very time consuming and + +0:38:18.773 --> 0:38:21.407 +it might not only depend on the language. + +0:38:21.407 --> 0:38:24.730 +On the other hand, that's of course what the +end is. + +0:38:24.730 --> 0:38:30.373 +The end want and the pressure will model each +component individually or do you want to do + +0:38:30.373 --> 0:38:31.313 +an end to end. + +0:38:31.771 --> 0:38:35.463 +What can also happen is you'll see your metric +model. + +0:38:35.463 --> 0:38:41.412 +This is a very good language model, but it +somewhat doesn't really work well with your + +0:38:41.412 --> 0:38:42.711 +translation model. + +0:38:43.803 --> 0:38:49.523 +But of course it's very good to also have +this type of intrinsic evaluation where the + +0:38:49.523 --> 0:38:52.116 +assumption should be as a pointed out. + +0:38:52.116 --> 0:38:57.503 +If we have Good English it shouldn't be a +high probability and it's bad English. + +0:38:58.318 --> 0:39:07.594 +And this is measured by the take a held out +data set, so some data which you don't train + +0:39:07.594 --> 0:39:12.596 +on then calculate the probability of this data. + +0:39:12.912 --> 0:39:26.374 +Then you're just looking at the language model +and you take the language model. + +0:39:27.727 --> 0:39:33.595 +You're not directly using the probability, +but you're taking the perplexity. 
+ +0:39:33.595 --> 0:39:40.454 +The perplexity is due to the power of the +cross entropy, and you see in the cross entropy + +0:39:40.454 --> 0:39:46.322 +you're doing something like an average probability +of always coming to this. + +0:39:46.846 --> 0:39:54.721 +Not so how exactly is that define perplexity +is typically what people refer to all across. + +0:39:54.894 --> 0:40:02.328 +The cross edge is negative and average, and +then you have the lock of the probability of + +0:40:02.328 --> 0:40:03.246 +the whole. + +0:40:04.584 --> 0:40:10.609 +We are modeling this probability as the product +of each of the words. + +0:40:10.609 --> 0:40:18.613 +That's how the end gram was defined and now +you hopefully can remember the rules of logarism + +0:40:18.613 --> 0:40:23.089 +so you can get the probability within the logarism. + +0:40:23.063 --> 0:40:31.036 +The sum here so the cross entry is minus one +by two by n, and the sum of all your words + +0:40:31.036 --> 0:40:35.566 +and the lowerism of the probability of each +word. + +0:40:36.176 --> 0:40:39.418 +And then the perplexity is just like two to +the power. + +0:40:41.201 --> 0:40:44.706 +Why can this be interpreted as a branching +factor? + +0:40:44.706 --> 0:40:50.479 +So it gives you a bit like the average thing, +like how many possibilities you have. + +0:40:51.071 --> 0:41:02.249 +You have a digit task and you have no idea, +but the probability of the next digit is like + +0:41:02.249 --> 0:41:03.367 +one ten. + +0:41:03.783 --> 0:41:09.354 +And if you then take a later perplexity, it +will be exactly ten. + +0:41:09.849 --> 0:41:24.191 +And that is like this perplexity gives you +a million interpretations, so how much randomness + +0:41:24.191 --> 0:41:27.121 +is still in there? + +0:41:27.307 --> 0:41:32.433 +Of course, now it's good to have a lower perplexity. + +0:41:32.433 --> 0:41:36.012 +We have less ambiguity in there and. + +0:41:35.976 --> 0:41:48.127 +If you have a hundred words and you only have +to uniformly compare it to ten different, so + +0:41:48.127 --> 0:41:49.462 +you have. + +0:41:49.609 --> 0:41:53.255 +Yes, think so it should be. + +0:41:53.255 --> 0:42:03.673 +You had here logarism and then to the power +and that should then be eliminated. + +0:42:03.743 --> 0:42:22.155 +So which logarism you use is not that important +because it's a constant factor to reformulate. + +0:42:23.403 --> 0:42:28.462 +Yes and Yeah So the Best. + +0:42:31.931 --> 0:42:50.263 +The best model is always like you want to +have a high probability. + +0:42:51.811 --> 0:43:04.549 +Time you see here, so here the probabilities +would like to commend the rapporteur on his + +0:43:04.549 --> 0:43:05.408 +work. + +0:43:05.285 --> 0:43:14.116 +You have then locked two probabilities and +then the average, so this is not the perplexity + +0:43:14.116 --> 0:43:18.095 +but the cross entropy as mentioned here. + +0:43:18.318 --> 0:43:26.651 +And then due to the power of that we'll give +you the perplexity of the center. + +0:43:29.329 --> 0:43:40.967 +And these metrics of perplexity are essential +in modeling that and we'll also see nowadays. + +0:43:41.121 --> 0:43:47.898 +You also measure like equality often in perplexity +or cross entropy, which gives you how good + +0:43:47.898 --> 0:43:50.062 +is it in estimating the same. + +0:43:50.010 --> 0:43:53.647 +The better the model is, the more information +you have about this. + +0:43:55.795 --> 0:44:03.106 +Talked about isomic ability or quit sentences, +but don't most have to any much because. 
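The definitions discussed in this passage, written out cleanly (standard formulas; the base-2 logarithm matches the "two to the power" phrasing in the lecture):

```latex
H \;=\; -\frac{1}{n} \sum_{i=1}^{n} \log_2 P(w_i \mid w_1, \dots, w_{i-1}),
\qquad
\mathrm{PPL} \;=\; 2^{H}
```

For the digit example above: if every next digit has probability 1/10, then H = log2(10) and the perplexity is 2^(log2 10) = 10, which is exactly the branching-factor interpretation. Changing the logarithm base only rescales H by a constant, and as long as the same base is used in the exponent the perplexity itself is unchanged.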
+ +0:44:03.463 --> 0:44:12.512 +You are doing that in this way implicitly +because of the correct word. + +0:44:12.512 --> 0:44:19.266 +If you are modeling this one, the sun over +all next. + +0:44:20.020 --> 0:44:29.409 +Therefore, you have that implicitly in there +because in each position you're modeling the + +0:44:29.409 --> 0:44:32.957 +probability of this witch behind. + +0:44:35.515 --> 0:44:43.811 +You have a very large number of negative examples +because all the possible extensions which are + +0:44:43.811 --> 0:44:49.515 +not there are incorrect, which of course might +also be a problem. + +0:44:52.312 --> 0:45:00.256 +And the biggest challenge of these types of +models is how to model unseen events. + +0:45:00.840 --> 0:45:04.973 +So that can be unknown words or it can be +unknown vibrants. + +0:45:05.245 --> 0:45:10.096 +So that's important also like you've seen +all the words. + +0:45:10.096 --> 0:45:17.756 +But if you have a bigram language model, if +you haven't seen the bigram, you'll still get + +0:45:17.756 --> 0:45:23.628 +a zero probability because we know that the +bigram's divided by the. + +0:45:24.644 --> 0:45:35.299 +If you have unknown words, the problem gets +even bigger because one word typically causes + +0:45:35.299 --> 0:45:37.075 +a lot of zero. + +0:45:37.217 --> 0:45:41.038 +So if you, for example, if your vocabulary +is go to and care it,. + +0:45:41.341 --> 0:45:43.467 +And you have not a sentence. + +0:45:43.467 --> 0:45:47.941 +I want to pay a T, so you have one word, which +is here 'an'. + +0:45:47.887 --> 0:45:54.354 +It is unknow then you have the proper. + +0:45:54.354 --> 0:46:02.147 +It is I get a sentence star and sentence star. + +0:46:02.582 --> 0:46:09.850 +To model this probability you always have +to take the account from these sequences divided + +0:46:09.850 --> 0:46:19.145 +by: Since when does it occur, all of these +angrams can also occur because of the word + +0:46:19.145 --> 0:46:19.961 +middle. + +0:46:20.260 --> 0:46:27.800 +So all of these probabilities are directly +zero. + +0:46:27.800 --> 0:46:33.647 +You see that just by having a single. + +0:46:34.254 --> 0:46:47.968 +Tells you it might not always be better to +have larger grams because if you have a gram + +0:46:47.968 --> 0:46:50.306 +language more. + +0:46:50.730 --> 0:46:57.870 +So sometimes it's better to have a smaller +angram counter because the chances that you're + +0:46:57.870 --> 0:47:00.170 +seeing the angram is higher. + +0:47:00.170 --> 0:47:07.310 +On the other hand, you want to have a larger +account because the larger the count is, the + +0:47:07.310 --> 0:47:09.849 +longer the context is modeling. + +0:47:10.670 --> 0:47:17.565 +So how can we address this type of problem? + +0:47:17.565 --> 0:47:28.064 +We address this type of problem by somehow +adjusting our accounts. + +0:47:29.749 --> 0:47:40.482 +We have often, but most of your entries in +the table are zero, and if one of these engrams + +0:47:40.482 --> 0:47:45.082 +occurs you'll have a zero probability. + +0:47:46.806 --> 0:48:06.999 +So therefore we need to find some of our ways +in order to estimate this type of event because: + +0:48:07.427 --> 0:48:11.619 +So there are different ways of how to model +it and how to adjust it. + +0:48:11.619 --> 0:48:15.326 +The one I hear is to do smoocing and that's +the first thing. 
+ +0:48:15.326 --> 0:48:20.734 +So in smoocing you're saying okay, we take +a bit of the probability we have to our scene + +0:48:20.734 --> 0:48:23.893 +events and distribute this thing we're taking +away. + +0:48:23.893 --> 0:48:26.567 +We're distributing to all the other events. + +0:48:26.946 --> 0:48:33.927 +The nice thing is in this case oh now each +event has a non zero probability and that is + +0:48:33.927 --> 0:48:39.718 +of course very helpful because we don't have +zero probabilities anymore. + +0:48:40.180 --> 0:48:48.422 +It smoothed out, but at least you have some +kind of probability everywhere, so you take + +0:48:48.422 --> 0:48:50.764 +some of the probability. + +0:48:53.053 --> 0:49:05.465 +You can also do that more here when you have +the endgram, for example, and this is your + +0:49:05.465 --> 0:49:08.709 +original distribution. + +0:49:08.648 --> 0:49:15.463 +Then you are taking some mass away from here +and distributing this mass to all the other + +0:49:15.463 --> 0:49:17.453 +words that you have seen. + +0:49:18.638 --> 0:49:26.797 +And thereby you are now making sure that it's +yeah, that it's now possible to model that. + +0:49:28.828 --> 0:49:36.163 +The other idea we're coming into more detail +on how we can do this type of smoking, but + +0:49:36.163 --> 0:49:41.164 +one other idea you can do is to do some type +of clustering. + +0:49:41.501 --> 0:49:48.486 +And that means if we are can't model go Kit's, +for example because we haven't seen that. + +0:49:49.349 --> 0:49:56.128 +Then we're just looking at the full thing +and we're just going to live directly how probable. + +0:49:56.156 --> 0:49:58.162 +Go two ways or so. + +0:49:58.162 --> 0:50:09.040 +Then we are modeling just only the word interpolation +where you're interpolating all the probabilities + +0:50:09.040 --> 0:50:10.836 +and thereby can. + +0:50:11.111 --> 0:50:16.355 +These are the two things which are helpful +in order to better calculate all these types. + +0:50:19.499 --> 0:50:28.404 +Let's start with what counts news so the idea +is okay. + +0:50:28.404 --> 0:50:38.119 +We have not seen an event and then the probability +is zero. + +0:50:38.618 --> 0:50:50.902 +It's not that high, but you should always +be aware that there might be new things happening + +0:50:50.902 --> 0:50:55.308 +and somehow be able to estimate. + +0:50:56.276 --> 0:50:59.914 +So the idea is okay. + +0:50:59.914 --> 0:51:09.442 +We can also assign a positive probability +to a higher. + +0:51:10.590 --> 0:51:23.233 +We are changing so currently we worked on +imperial accounts so how often we have seen + +0:51:23.233 --> 0:51:25.292 +the accounts. + +0:51:25.745 --> 0:51:37.174 +And now we are going on to expect account +how often this would occur in an unseen. + +0:51:37.517 --> 0:51:39.282 +So we are directly trying to model that. + +0:51:39.859 --> 0:51:45.836 +Of course, the empirical accounts are a good +starting point, so if you've seen the world + +0:51:45.836 --> 0:51:51.880 +very often in your training data, it's a good +estimation of how often you would see it in + +0:51:51.880 --> 0:51:52.685 +the future. + +0:51:52.685 --> 0:51:58.125 +However, it might make sense to think about +it only because you haven't seen it. + +0:51:58.578 --> 0:52:10.742 +So does anybody have a very simple idea how +you start with smoothing it? + +0:52:10.742 --> 0:52:15.241 +What count would you give? 
+ +0:52:21.281 --> 0:52:32.279 +Now you have the probability to calculation +how often have you seen the biogram with zero + +0:52:32.279 --> 0:52:33.135 +count. + +0:52:33.193 --> 0:52:39.209 +So what count would you give in order to still +do this calculation? + +0:52:39.209 --> 0:52:41.509 +We have to smooth, so we. + +0:52:44.884 --> 0:52:52.151 +We could clump together all the rare words, +for example everywhere we have only seen ones. + +0:52:52.652 --> 0:52:56.904 +And then just we can do the massive moment +of those and don't. + +0:52:56.936 --> 0:53:00.085 +So remove the real ones. + +0:53:00.085 --> 0:53:06.130 +Yes, and then every unseen word is one of +them. + +0:53:06.130 --> 0:53:13.939 +Yeah, but it's not only about unseen words, +it's even unseen. + +0:53:14.874 --> 0:53:20.180 +You can even start easier and that's what +people do at the first thing. + +0:53:20.180 --> 0:53:22.243 +That's at one smooth thing. + +0:53:22.243 --> 0:53:28.580 +You'll see it's not working good but the variation +works fine and we're just as here. + +0:53:28.580 --> 0:53:30.644 +We've seen everything once. + +0:53:31.771 --> 0:53:39.896 +That's similar to this because you're clustering +the one and the zero together and you just + +0:53:39.896 --> 0:53:45.814 +say you've seen everything once or have seen +them twice and so on. + +0:53:46.386 --> 0:53:53.249 +And if you've done that wow, there's no probability +because each event has happened once. + +0:53:55.795 --> 0:54:02.395 +If you otherwise have seen the bigram five +times, you would not now do five times but + +0:54:02.395 --> 0:54:03.239 +six times. + +0:54:03.363 --> 0:54:09.117 +So the nice thing is to have seen everything. + +0:54:09.117 --> 0:54:19.124 +Once the probability of the engrap is now +out, you have seen it divided by the. + +0:54:20.780 --> 0:54:23.763 +How long ago there's one big big problem with +it? + +0:54:24.064 --> 0:54:38.509 +Just imagine that you have a vocabulary of +words, and you have a corpus of thirty million + +0:54:38.509 --> 0:54:39.954 +bigrams. + +0:54:39.954 --> 0:54:42.843 +So if you have a. + +0:54:43.543 --> 0:54:46.580 +Simple Things So You've Seen Them Thirty Million +Times. + +0:54:47.247 --> 0:54:49.818 +That is your count, your distributing. + +0:54:49.818 --> 0:54:55.225 +According to your gain, the problem is yet +how many possible bigrams do you have? + +0:54:55.225 --> 0:55:00.895 +You have seven point five billion possible +bigrams, and each of them you are counting + +0:55:00.895 --> 0:55:04.785 +now as give up your ability, like you give +account of one. + +0:55:04.785 --> 0:55:07.092 +So each of them is saying a curse. + +0:55:07.627 --> 0:55:16.697 +Then this number of possible vigrams is many +times larger than the number you really see. + +0:55:17.537 --> 0:55:21.151 +You're mainly doing equal distribution. + +0:55:21.151 --> 0:55:26.753 +Everything gets the same because this is much +more important. + +0:55:26.753 --> 0:55:31.541 +Most of your probability mass is used for +smoothing. + +0:55:32.412 --> 0:55:37.493 +Because most of the probability miles have +to be distributed that you at least give every + +0:55:37.493 --> 0:55:42.687 +biogram at least a count of one, and the other +counts are only the thirty million, so seven + +0:55:42.687 --> 0:55:48.219 +point five billion counts go to like a distribute +around all the engrons, and only thirty million + +0:55:48.219 --> 0:55:50.026 +are according to your frequent. 
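Add-one smoothing as just described, written as a formula for the bigram case (standard notation; V is the vocabulary size and c(·) are training counts):

```latex
P_{\text{add-1}}(w_i \mid w_{i-1}) \;=\; \frac{c(w_{i-1}, w_i) + 1}{c(w_{i-1}) + V}
```

The V in the denominator is where the problem in this passage comes from: every one of the V possible continuations receives a pseudo-count of one, so when the number of possible bigrams is far larger than the number of observed ones, most of the probability mass goes to the smoothing rather than to the actual counts.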
+ +0:55:50.210 --> 0:56:02.406 +So you put a lot too much mass on your smoothing +and you're doing some kind of extreme smoothing. + +0:56:02.742 --> 0:56:08.986 +So that of course is a bit bad then and will +give you not the best performance. + +0:56:10.130 --> 0:56:16.160 +However, there's a nice thing and that means +to do probability calculations. + +0:56:16.160 --> 0:56:21.800 +We are doing it based on counts, but to do +this division we don't need. + +0:56:22.302 --> 0:56:32.112 +So we can also do that with floating point +values and there is still a valid type of calculation. + +0:56:32.392 --> 0:56:39.380 +So we can have less probability mass to unseen +events. + +0:56:39.380 --> 0:56:45.352 +We don't have to give one because if we count. + +0:56:45.785 --> 0:56:50.976 +But to do our calculation we can also give +zero point zero to something like that, so + +0:56:50.976 --> 0:56:56.167 +very small value, and thereby we have less +value on the smooth thing, and we are more + +0:56:56.167 --> 0:56:58.038 +focusing on the actual corpus. + +0:56:58.758 --> 0:57:03.045 +And that is what people refer to as Alpha +Smoozing. + +0:57:03.223 --> 0:57:12.032 +You see that we are now adding not one to +it but only alpha, and then we are giving less + +0:57:12.032 --> 0:57:19.258 +probability to the unseen event and more probability +to the really seen. + +0:57:20.780 --> 0:57:24.713 +Questions: Of course, how do you find see +also? + +0:57:24.713 --> 0:57:29.711 +I'm here to either use some help out data +and optimize them. + +0:57:30.951 --> 0:57:35.153 +So what what does it now really mean? + +0:57:35.153 --> 0:57:40.130 +This gives you a bit of an idea behind that. + +0:57:40.700 --> 0:57:57.751 +So here you have the grams which occur one +time, for example all grams which occur one. + +0:57:57.978 --> 0:58:10.890 +So, for example, that means that if you have +engrams which occur one time, then. + +0:58:11.371 --> 0:58:22.896 +If you look at all the engrams which occur +two times, then they occur. + +0:58:22.896 --> 0:58:31.013 +If you look at the engrams that occur zero, +then. + +0:58:32.832 --> 0:58:46.511 +So if you are now doing the smoothing you +can look what is the probability estimating + +0:58:46.511 --> 0:58:47.466 +them. + +0:58:47.847 --> 0:59:00.963 +You see that for all the endbreaks you heavily +underestimate how often they occur in the test + +0:59:00.963 --> 0:59:01.801 +card. + +0:59:02.002 --> 0:59:10.067 +So what you want is very good to estimate +this distribution, so for each Enron estimate + +0:59:10.067 --> 0:59:12.083 +quite well how often. + +0:59:12.632 --> 0:59:16.029 +You're quite bad at that for all of them. + +0:59:16.029 --> 0:59:22.500 +You're apparently underestimating only for +the top ones which you haven't seen. + +0:59:22.500 --> 0:59:24.845 +You'll heavily overestimate. + +0:59:25.645 --> 0:59:30.887 +If you're doing alpha smoothing and optimize +that to fit on the zero count because that's + +0:59:30.887 --> 0:59:36.361 +not completely fair because this alpha is now +optimizes the test counter, you see that you're + +0:59:36.361 --> 0:59:37.526 +doing a lot better. + +0:59:37.526 --> 0:59:42.360 +It's not perfect, but you're a lot better +in estimating how often they will occur. + +0:59:45.545 --> 0:59:49.316 +So this is one idea of doing it. + +0:59:49.316 --> 0:59:57.771 +Of course there's other ways and this is like +a large research direction. + +0:59:58.318 --> 1:00:03.287 +So there is this needed estimation. 
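The add-alpha variant described at the start of this passage, as a formula (standard form; alpha is tuned on held-out data as mentioned):

```latex
P_{\alpha}(w_i \mid w_{i-1}) \;=\; \frac{c(w_{i-1}, w_i) + \alpha}{c(w_{i-1}) + \alpha V},
\qquad 0 < \alpha \le 1
```

With a small alpha, far less mass is reserved for unseen events than with add-one, which is why the adjusted counts fit the test counts much better in the comparison above.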
+ +1:00:03.287 --> 1:00:11.569 +What you are doing is filling your trading +data into parts. + +1:00:11.972 --> 1:00:19.547 +Looking at how many engrams occur exactly +are types, which engrams occur are times in + +1:00:19.547 --> 1:00:20.868 +your training. + +1:00:21.281 --> 1:00:27.716 +And then you look for these ones. + +1:00:27.716 --> 1:00:36.611 +How often do they occur in your training data? + +1:00:36.611 --> 1:00:37.746 +It's. + +1:00:38.118 --> 1:00:45.214 +And then you say oh this engram, the expector +counts how often will see. + +1:00:45.214 --> 1:00:56.020 +It is divided by: Some type of clustering +you're putting all the engrams which occur + +1:00:56.020 --> 1:01:04.341 +are at times in your data together and in order +to estimate how often. + +1:01:05.185 --> 1:01:12.489 +And if you do half your data related to your +final estimation by just using those statistics,. + +1:01:14.014 --> 1:01:25.210 +So this is called added estimation, and thereby +you are not able to estimate better how often + +1:01:25.210 --> 1:01:25.924 +does. + +1:01:28.368 --> 1:01:34.559 +And again we can do the same look and compare +it to the expected counts. + +1:01:34.559 --> 1:01:37.782 +Again we have exactly the same table. + +1:01:38.398 --> 1:01:47.611 +So then we're having to hear how many engrams +that does exist. + +1:01:47.611 --> 1:01:55.361 +So, for example, there's like engrams which +you can. + +1:01:55.835 --> 1:02:08.583 +Then you look into your other half and how +often do these N grams occur in your 2nd part + +1:02:08.583 --> 1:02:11.734 +of the training data? + +1:02:12.012 --> 1:02:22.558 +For example, an unseen N gram I expect to +occur, an engram which occurs one time. + +1:02:22.558 --> 1:02:25.774 +I expect that it occurs. + +1:02:27.527 --> 1:02:42.564 +Yeah, the number of zero counts are if take +my one grams and then just calculate how many + +1:02:42.564 --> 1:02:45.572 +possible bigrams. + +1:02:45.525 --> 1:02:50.729 +Yes, so in this case we are now not assuming +about having a more larger cattle because then, + +1:02:50.729 --> 1:02:52.127 +of course, it's getting. + +1:02:52.272 --> 1:02:54.730 +So you're doing that given the current gram. + +1:02:54.730 --> 1:03:06.057 +The cavalry is better to: So yeah, there's +another problem in how to deal with them. + +1:03:06.057 --> 1:03:11.150 +This is more about how to smuse the engram +counts to also deal. + +1:03:14.394 --> 1:03:18.329 +Certainly as I Think The. + +1:03:18.198 --> 1:03:25.197 +Yes, the last idea of doing is so called good +cheering, and and the I hear here is in it + +1:03:25.197 --> 1:03:32.747 +similar, so there is a typical mathematic approve, +but you can show that a very good estimation + +1:03:32.747 --> 1:03:34.713 +for the expected counts. + +1:03:34.654 --> 1:03:42.339 +Is that you take the number of engrams which +occur one time more divided by the number of + +1:03:42.339 --> 1:03:46.011 +engram which occur R times and R plus one. + +1:03:46.666 --> 1:03:49.263 +So this is then the estimation of. + +1:03:49.549 --> 1:04:05.911 +So if you are looking now at an engram which +occurs times then you are looking at how many + +1:04:05.911 --> 1:04:08.608 +engrams occur. + +1:04:09.009 --> 1:04:18.938 +It's very simple, so in this one you only +have to count all the bigrams, how many different + +1:04:18.938 --> 1:04:23.471 +bigrams out there, and that is very good. + +1:04:23.903 --> 1:04:33.137 +So if you are saying now about end drums which +occur or times,. 
+ +1:04:33.473 --> 1:04:46.626 +It might be that there are some occurring +times, but no times, and then. + +1:04:46.866 --> 1:04:54.721 +So what you normally do is you are doing for +small R, and for large R you do some curve + +1:04:54.721 --> 1:04:55.524 +fitting. + +1:04:56.016 --> 1:05:07.377 +In general this type of smoothing is important +for engrams which occur rarely. + +1:05:07.377 --> 1:05:15.719 +If an engram occurs so this is more important +for events. + +1:05:17.717 --> 1:05:25.652 +So here again you see you have the counts +and then based on that you get the adjusted + +1:05:25.652 --> 1:05:26.390 +counts. + +1:05:26.390 --> 1:05:34.786 +This is here and if you compare it's a test +count you see that it really works quite well. + +1:05:35.035 --> 1:05:41.093 +But for the low numbers it's a very good modeling +of how much how good this works. + +1:05:45.005 --> 1:05:50.018 +Then, of course, the question is how good +does it work in language modeling? + +1:05:50.018 --> 1:05:51.516 +We also want tomorrow. + +1:05:52.372 --> 1:05:54.996 +We can measure that perplexity. + +1:05:54.996 --> 1:05:59.261 +We learned that before and then we have everyone's. + +1:05:59.579 --> 1:06:07.326 +You saw that a lot of too much probability +mass is put to the events which have your probability. + +1:06:07.667 --> 1:06:11.098 +Then you have an alpha smoothing. + +1:06:11.098 --> 1:06:16.042 +Here's a start because it's not completely +fair. + +1:06:16.042 --> 1:06:20.281 +The alpha was maximized on the test data. + +1:06:20.480 --> 1:06:25.904 +But you see that like the leaded estimation +of the touring gives you a similar performance. + +1:06:26.226 --> 1:06:29.141 +So they seem to really work quite well. + +1:06:32.232 --> 1:06:41.552 +So this is about all assigning probability +mass to aimed grams, which we have not seen + +1:06:41.552 --> 1:06:50.657 +in order to also estimate their probability +before we're going to the interpolation. + +1:06:55.635 --> 1:07:00.207 +Good, so now we have. + +1:07:00.080 --> 1:07:11.818 +Done this estimation, and the problem is we +have this general. + +1:07:11.651 --> 1:07:19.470 +We want to have a longer context because we +can model longer than language better because + +1:07:19.470 --> 1:07:21.468 +long range dependency. + +1:07:21.701 --> 1:07:26.745 +On the other hand, we have limited data so +we want to have stored angrums because they + +1:07:26.745 --> 1:07:28.426 +reach angrums at first more. + +1:07:29.029 --> 1:07:43.664 +And about the smooth thing in the discounting +we did before, it always treats all angrams. + +1:07:44.024 --> 1:07:46.006 +So we didn't really look at the end drums. + +1:07:46.006 --> 1:07:48.174 +They were all classed into how often they +are. + +1:07:49.169 --> 1:08:00.006 +However, sometimes this might not be very +helpful, so for example look at the engram + +1:08:00.006 --> 1:08:06.253 +Scottish beer drinkers and Scottish beer eaters. + +1:08:06.686 --> 1:08:12.037 +Because we have not seen the trigram, so you +will estimate the trigram probability by the + +1:08:12.037 --> 1:08:14.593 +probability you assign to the zero county. + +1:08:15.455 --> 1:08:26.700 +However, if you look at the background probability +that you might have seen and might be helpful,. + +1:08:26.866 --> 1:08:34.538 +So be a drinker is more probable to see than +Scottish be a drinker, and be a drinker should + +1:08:34.538 --> 1:08:36.039 +be more probable. + +1:08:36.896 --> 1:08:39.919 +So this type of information is somehow ignored. 
+ +1:08:39.919 --> 1:08:45.271 +So if we have the Trigram language model, +we are only looking at trigrams divided by + +1:08:45.271 --> 1:08:46.089 +the Vigrams. + +1:08:46.089 --> 1:08:49.678 +But if we have not seen the Vigrams, we are +not looking. + +1:08:49.678 --> 1:08:53.456 +Oh, maybe we will have seen the Vigram and +we can back off. + +1:08:54.114 --> 1:09:01.978 +And that is what people do in interpolation +and back off. + +1:09:01.978 --> 1:09:09.164 +The idea is if we don't have seen the large +engrams. + +1:09:09.429 --> 1:09:16.169 +So don't have to go to a shorter sequence +and try to see if we came on in this probability. + +1:09:16.776 --> 1:09:20.730 +And this is the idea of interpolation. + +1:09:20.730 --> 1:09:25.291 +There's like two different ways of doing it. + +1:09:25.291 --> 1:09:26.507 +One is the. + +1:09:26.646 --> 1:09:29.465 +The easiest thing is like okay. + +1:09:29.465 --> 1:09:32.812 +If we have bigrams, we have trigrams. + +1:09:32.812 --> 1:09:35.103 +If we have programs, why? + +1:09:35.355 --> 1:09:46.544 +Mean, of course, we have the larger ones, +the larger context, but the short amounts are + +1:09:46.544 --> 1:09:49.596 +maybe better estimated. + +1:09:50.090 --> 1:10:00.487 +Time just by taking the probability of just +the word class of probability of and. + +1:10:01.261 --> 1:10:07.052 +And of course we need to know because otherwise +we don't have a probability distribution, but + +1:10:07.052 --> 1:10:09.332 +we can somehow optimize the weights. + +1:10:09.332 --> 1:10:15.930 +For example, the health out data set: And +thereby we have now a probability distribution + +1:10:15.930 --> 1:10:17.777 +which takes both into account. + +1:10:18.118 --> 1:10:23.705 +The thing about the Scottish be a drink business. + +1:10:23.705 --> 1:10:33.763 +The dry rum probability will be the same for +the post office because they both occur zero + +1:10:33.763 --> 1:10:34.546 +times. + +1:10:36.116 --> 1:10:45.332 +But the two grand verability will hopefully +be different because we might have seen beer + +1:10:45.332 --> 1:10:47.611 +eaters and therefore. + +1:10:48.668 --> 1:10:57.296 +The idea that sometimes it's better to have +different models and combine them instead. + +1:10:58.678 --> 1:10:59.976 +Another idea in style. + +1:11:00.000 --> 1:11:08.506 +Of this overall interpolation is you can also +do this type of recursive interpolation. + +1:11:08.969 --> 1:11:23.804 +The probability of the word given its history +is in the current language model probability. + +1:11:24.664 --> 1:11:30.686 +Thus one minus the weights of this two some +after one, and here it's an interpolated probability + +1:11:30.686 --> 1:11:36.832 +from the n minus one breath, and then of course +it goes recursively on until you are at a junigram + +1:11:36.832 --> 1:11:37.639 +probability. + +1:11:38.558 --> 1:11:49.513 +What you can also do, you can not only do +the same weights for all our words, but you + +1:11:49.513 --> 1:12:06.020 +can for example: For example, for engrams, +which you have seen very often, you put more + +1:12:06.020 --> 1:12:10.580 +weight on the trigrams. + +1:12:13.673 --> 1:12:29.892 +The other thing you can do is the back off +and the difference in back off is we are not + +1:12:29.892 --> 1:12:32.656 +interpolating. + +1:12:32.892 --> 1:12:41.954 +If we have seen the trigram probability so +if the trigram hound is bigger then we take + +1:12:41.954 --> 1:12:48.412 +the trigram probability and if we have seen +this one then we. 
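A toy sketch of the recursive interpolation just described (the probabilities, weights, and data structure below are invented for illustration; the lecture does not prescribe them): each order contributes with weight lambda, and the remainder goes to the next-shorter history, down to the unigram.

```python
def interpolate(word, history, models, lambdas):
    """P_I(w | h) = lambda_n * P_n(w | h) + (1 - lambda_n) * P_I(w | shorter h).
    models[k] maps (context_tuple, word) -> MLE probability of a (k+1)-gram model;
    lambdas[k] is the interpolation weight used at that order."""
    order = len(history)                       # n-gram order minus one
    p_here = models[order].get((tuple(history), word), 0.0)
    if order == 0:
        return p_here                          # unigram: recursion ends here
    lam = lambdas[order]
    return lam * p_here + (1 - lam) * interpolate(word, list(history)[1:], models, lambdas)

# Invented toy probabilities: the trigram is unseen, so the bigram and unigram step in.
models = [
    {((), "food"): 0.01},                      # unigram
    {(("dutch",), "food"): 0.4},               # bigram
    {},                                        # trigram (nothing seen)
]
print(interpolate("food", ["want", "dutch"], models, [None, 0.6, 0.7]))
# 0.3 * (0.6 * 0.4 + 0.4 * 0.01) = 0.0732
```

Back-off differs in that the lower-order estimate is consulted only when the higher-order n-gram was not seen at all, as the passage goes on to explain.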
+ +1:12:48.868 --> 1:12:54.092 +So that is the difference. + +1:12:54.092 --> 1:13:06.279 +We are always taking all the angle probabilities +and back off. + +1:13:07.147 --> 1:13:09.941 +Why do we need to do this just a minute? + +1:13:09.941 --> 1:13:13.621 +So why have we here just take the probability +of the. + +1:13:15.595 --> 1:13:18.711 +Yes, because otherwise the probabilities from +some people. + +1:13:19.059 --> 1:13:28.213 +In order to make them still sound one, we +have to take away a bit of a probability mass + +1:13:28.213 --> 1:13:29.773 +for the scene. + +1:13:29.709 --> 1:13:38.919 +The difference is we are no longer distributing +it equally as before to the unseen, but we + +1:13:38.919 --> 1:13:40.741 +are distributing. + +1:13:44.864 --> 1:13:56.220 +For example, this can be done with gutturing, +so the expected counts in goodturing we saw. + +1:13:57.697 --> 1:13:59.804 +The adjusted counts. + +1:13:59.804 --> 1:14:04.719 +They are always lower than the ones we see +here. + +1:14:04.719 --> 1:14:14.972 +These counts are always: See that so you can +now take this different and distribute this + +1:14:14.972 --> 1:14:18.852 +weights to the lower based input. + +1:14:23.323 --> 1:14:29.896 +Is how we can distribute things. + +1:14:29.896 --> 1:14:43.442 +Then there is one last thing people are doing, +especially how much. + +1:14:43.563 --> 1:14:55.464 +And there's one thing which is called well +written by Mozilla. + +1:14:55.315 --> 1:15:01.335 +In the background, like in the background, +it might make sense to look at the words and + +1:15:01.335 --> 1:15:04.893 +see how probable it is that you need to background. + +1:15:05.425 --> 1:15:11.232 +So look at these words five and one cent. + +1:15:11.232 --> 1:15:15.934 +Those occur exactly times in the. + +1:15:16.316 --> 1:15:27.804 +They would be treated exactly the same because +both occur at the same time, and it would be + +1:15:27.804 --> 1:15:29.053 +the same. + +1:15:29.809 --> 1:15:48.401 +However, it shouldn't really model the same. + +1:15:48.568 --> 1:15:57.447 +If you compare that for constant there are +four hundred different continuations of this + +1:15:57.447 --> 1:16:01.282 +work, so there is nearly always this. + +1:16:02.902 --> 1:16:11.203 +So if you're now seeing a new bigram or a +biogram with Isaac Constant or Spite starting + +1:16:11.203 --> 1:16:13.467 +and then another word,. + +1:16:15.215 --> 1:16:25.606 +In constant, it's very frequent that you see +new angrups because there are many different + +1:16:25.606 --> 1:16:27.222 +combinations. + +1:16:27.587 --> 1:16:35.421 +Therefore, it might look not only to look +at the counts, the end grams, but also how + +1:16:35.421 --> 1:16:37.449 +many extensions does. + +1:16:38.218 --> 1:16:43.222 +And this is done by witt velk smoothing. + +1:16:43.222 --> 1:16:51.032 +The idea is we count how many possible extensions +in this case. + +1:16:51.371 --> 1:17:01.966 +So we had for spive, we had possible extensions, +and for constant we had a lot more. + +1:17:02.382 --> 1:17:09.394 +And then how much we put into our backup model, +how much weight we put into the backup is, + +1:17:09.394 --> 1:17:13.170 +depending on this number of possible extensions. + +1:17:14.374 --> 1:17:15.557 +Style. + +1:17:15.557 --> 1:17:29.583 +We have it here, so this is the weight you +put on your lower end gram probability. 
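Witten-Bell smoothing as described here, in its standard interpolated form (my write-up of the textbook formula, not copied from the slides). N_{1+}(h •) denotes the number of distinct word types observed after history h:

```latex
\lambda_h \;=\; \frac{N_{1+}(h\,\bullet)}{c(h) + N_{1+}(h\,\bullet)},
\qquad
P_{\text{WB}}(w \mid h) \;=\; (1 - \lambda_h)\,\frac{c(h, w)}{c(h)} \;+\; \lambda_h\, P_{\text{WB}}(w \mid h')
```

A history with many distinct continuations gets a large lambda_h, so a lot of mass is reserved for backing off, while a history that is almost always followed by the same word gets a small one; this is exactly the contrast between the two example words whose counts are compared in the next passage.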
+ +1:17:29.583 --> 1:17:46.596 +For example: And if you compare these two +numbers, so for Spike you do how many extensions + +1:17:46.596 --> 1:17:55.333 +does Spike have divided by: While for constant +you have zero point three, you know,. + +1:17:55.815 --> 1:18:05.780 +So you're putting a lot more weight to like +it's not as bad to fall off to the back of + +1:18:05.780 --> 1:18:06.581 +model. + +1:18:06.581 --> 1:18:10.705 +So for the spy it's really unusual. + +1:18:10.730 --> 1:18:13.369 +For Constant there's a lot of probability +medicine. + +1:18:13.369 --> 1:18:15.906 +The chances that you're doing that is quite +high. + +1:18:20.000 --> 1:18:26.209 +Similarly, but just from the other way around, +it's now looking at this probability distribution. + +1:18:26.546 --> 1:18:37.103 +So now when we back off the probability distribution +for the lower angrums, we calculated exactly + +1:18:37.103 --> 1:18:40.227 +the same as the probability. + +1:18:40.320 --> 1:18:48.254 +However, they are used in a different way, +so the lower order end drums are only used + +1:18:48.254 --> 1:18:49.361 +if we have. + +1:18:50.410 --> 1:18:54.264 +So it's like you're modeling something different. + +1:18:54.264 --> 1:19:01.278 +You're not modeling how probable this engram +if we haven't seen the larger engram and that + +1:19:01.278 --> 1:19:04.361 +is tried by the diversity of histories. + +1:19:04.944 --> 1:19:14.714 +For example, if you look at York, that's a +quite frequent work. + +1:19:14.714 --> 1:19:18.530 +It occurs as many times. + +1:19:19.559 --> 1:19:27.985 +However, four hundred seventy three times +it was followed the way before it was mute. + +1:19:29.449 --> 1:19:40.237 +So if you now think the unigram model is only +used, the probability of York as a unigram + +1:19:40.237 --> 1:19:49.947 +model should be very, very low because: So +you should have a lower probability for your + +1:19:49.947 --> 1:19:56.292 +than, for example, for foods, although you +have seen both of them at the same time, and + +1:19:56.292 --> 1:20:02.853 +this is done by Knesser and Nye Smoothing where +you are not counting the words itself, but + +1:20:02.853 --> 1:20:05.377 +you count the number of mysteries. + +1:20:05.845 --> 1:20:15.233 +So how many other way around was it followed +by how many different words were before? + +1:20:15.233 --> 1:20:28.232 +Then instead of the normal way you count the +words: So you don't need to know all the formulas + +1:20:28.232 --> 1:20:28.864 +here. + +1:20:28.864 --> 1:20:33.498 +The more important thing is this intuition. + +1:20:34.874 --> 1:20:44.646 +More than it means already that I haven't +seen the larger end grammar, and therefore + +1:20:44.646 --> 1:20:49.704 +it might be better to model it differently. + +1:20:49.929 --> 1:20:56.976 +So if there's a new engram with something +in New York that's very unprofitable compared + +1:20:56.976 --> 1:20:57.297 +to. + +1:21:00.180 --> 1:21:06.130 +And yeah, this modified Kneffer Nice music +is what people took into use. + +1:21:06.130 --> 1:21:08.249 +That's the fall approach. + +1:21:08.728 --> 1:21:20.481 +Has an absolute discounting for small and +grams, and then bells smoothing, and for it + +1:21:20.481 --> 1:21:27.724 +uses the discounting of histories which we +just had. + +1:21:28.028 --> 1:21:32.207 +And there's even two versions of it, like +the backup and the interpolator. + +1:21:32.472 --> 1:21:34.264 +So that may be interesting. 
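The continuation probability that Kneser-Ney uses for the lower-order model, written out (standard form, my notation): N_{1+}(• w) is the number of distinct words that precede w in the training data.

```latex
P_{\text{cont}}(w) \;=\; \frac{N_{1+}(\bullet\, w)}{\sum_{w'} N_{1+}(\bullet\, w')}
```

"York" may be frequent as a token, but because it is almost always preceded by the same word, its continuation probability is small. That is the intuition behind the example above: the lower-order model is only consulted when the higher-order n-gram was unseen, so it should estimate how likely a word is to appear in a new context rather than how often it occurs overall.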
+ +1:21:34.264 --> 1:21:40.216 +These are here even works well for interpolation, +although your assumption is even no longer + +1:21:40.216 --> 1:21:45.592 +true because you're using the lower engrams +even if you've seen the higher engrams. + +1:21:45.592 --> 1:21:49.113 +But since you're then focusing on the higher +engrams,. + +1:21:49.929 --> 1:21:53.522 +So if you see that some beats on the perfectities,. + +1:21:54.754 --> 1:22:00.262 +So you see normally what interpolated movement +class of nineties gives you some of the best + +1:22:00.262 --> 1:22:00.980 +performing. + +1:22:02.022 --> 1:22:08.032 +You see the larger your end drum than it is +with interpolation. + +1:22:08.032 --> 1:22:15.168 +You also get significant better so you can +not only look at the last words. + +1:22:18.638 --> 1:22:32.725 +Good so much for these types of things, and +we will finish with some special things about + +1:22:32.725 --> 1:22:34.290 +language. + +1:22:38.678 --> 1:22:44.225 +One thing we talked about the unknown words, +so there is different ways of doing it because + +1:22:44.225 --> 1:22:49.409 +in all the estimations we were still assuming +mostly that we have a fixed vocabulary. + +1:22:50.270 --> 1:23:06.372 +So you can often, for example, create an unknown +choken and use that while statistical language. + +1:23:06.766 --> 1:23:16.292 +It was mainly useful language processing since +newer models are coming, but maybe it's surprising. + +1:23:18.578 --> 1:23:30.573 +What is also nice is that if you're going +to really hard launch and ramps, it's more + +1:23:30.573 --> 1:23:33.114 +about efficiency. + +1:23:33.093 --> 1:23:37.378 +And then you have to remember lock it in your +model. + +1:23:37.378 --> 1:23:41.422 +In a lot of situations it's not really important. + +1:23:41.661 --> 1:23:46.964 +It's more about ranking so which one is better +and if they don't sum up to one that's not + +1:23:46.964 --> 1:23:47.907 +that important. + +1:23:47.907 --> 1:23:53.563 +Of course then you cannot calculate any perplexity +anymore because if this is not a probability + +1:23:53.563 --> 1:23:58.807 +mass then the thing we had about the negative +example doesn't fit anymore and that's not + +1:23:58.807 --> 1:23:59.338 +working. + +1:23:59.619 --> 1:24:02.202 +However, anification is also very helpful. + +1:24:02.582 --> 1:24:13.750 +And that is why there is this stupid bag-off +presented remove all this complicated things + +1:24:13.750 --> 1:24:14.618 +which. + +1:24:15.055 --> 1:24:28.055 +And it just does once we directly take the +absolute account, and otherwise we're doing. + +1:24:28.548 --> 1:24:41.867 +Is no longer any discounting anymore, so it's +very, very simple and however they show you + +1:24:41.867 --> 1:24:47.935 +have to calculate a lot less statistics. + +1:24:50.750 --> 1:24:57.525 +In addition you can have other type of language +models. + +1:24:57.525 --> 1:25:08.412 +We had word based language models and they +normally go up to four or five for six brands. + +1:25:08.412 --> 1:25:10.831 +They are too large. + +1:25:11.531 --> 1:25:20.570 +So what people have then looked also into +is what is referred to as part of speech language + +1:25:20.570 --> 1:25:21.258 +model. + +1:25:21.258 --> 1:25:29.806 +So instead of looking at the word sequence +you're modeling directly the part of speech + +1:25:29.806 --> 1:25:30.788 +sequence. + +1:25:31.171 --> 1:25:34.987 +Then of course now you're only being modeling +syntax. 
+ +1:25:34.987 --> 1:25:41.134 +There's no cemented information anymore in +the paddle speech test but now you might go + +1:25:41.134 --> 1:25:47.423 +to a larger context link so you can do seven +H or nine grams and then you can write some + +1:25:47.423 --> 1:25:50.320 +of the long range dependencies in order. + +1:25:52.772 --> 1:25:59.833 +And there's other things people have done +like cash language models, so the idea in cash + +1:25:59.833 --> 1:26:07.052 +language model is that yes words that you have +recently seen are more frequently to do are + +1:26:07.052 --> 1:26:11.891 +more probable to reoccurr if you want to model +the dynamics. + +1:26:12.152 --> 1:26:20.734 +If you're just talking here, we talked about +language models in my presentation. + +1:26:20.734 --> 1:26:23.489 +There will be a lot more. + +1:26:23.883 --> 1:26:37.213 +Can do that by having a dynamic and a static +component, and then you have a dynamic component + +1:26:37.213 --> 1:26:41.042 +which looks at the bigram. + +1:26:41.261 --> 1:26:49.802 +And thereby, for example, if you once generate +language model of probability, it's increased + +1:26:49.802 --> 1:26:52.924 +and you're modeling that problem. + +1:26:56.816 --> 1:27:03.114 +Said the dynamic component is trained on the +text translated so far. + +1:27:04.564 --> 1:27:12.488 +To train them what you just have done, there's +no human feet there. + +1:27:12.712 --> 1:27:25.466 +The speech model all the time and then it +will repeat its errors and that is, of course,. + +1:27:25.966 --> 1:27:31.506 +A similar idea is people have looked into +trigger language model whereas one word occurs + +1:27:31.506 --> 1:27:34.931 +then you increase the probability of some other +words. + +1:27:34.931 --> 1:27:40.596 +So if you're talking about money that will +increase the probability of bank saving account + +1:27:40.596 --> 1:27:41.343 +dollar and. + +1:27:41.801 --> 1:27:47.352 +Because then you have to somehow model this +dependency, but it's somehow also an idea of + +1:27:47.352 --> 1:27:52.840 +modeling long range dependency, because if +one word occurs very often in your document, + +1:27:52.840 --> 1:27:58.203 +you like somehow like learning which other +words to occur because they are more often + +1:27:58.203 --> 1:27:59.201 +than by chance. + +1:28:02.822 --> 1:28:10.822 +Yes, then the last thing is, of course, especially +for languages which are, which are morphologically + +1:28:10.822 --> 1:28:11.292 +rich. + +1:28:11.292 --> 1:28:18.115 +You can do something similar to BPE so you +can now do more themes or so, and then more + +1:28:18.115 --> 1:28:22.821 +the morphine sequence because the morphines +are more often. + +1:28:23.023 --> 1:28:26.877 +However, the program is opposed that your +sequence length also gets longer. + +1:28:27.127 --> 1:28:33.185 +And so if they have a four gram language model, +it's not counting the last three words but + +1:28:33.185 --> 1:28:35.782 +only the last three more films, which. + +1:28:36.196 --> 1:28:39.833 +So of course then it's a bit challenging and +know how to deal with. + +1:28:40.680 --> 1:28:51.350 +What about language is finished by the idea +of a position at the end of the world? + +1:28:51.350 --> 1:28:58.807 +Yeah, but there you can typically do something +like that. + +1:28:59.159 --> 1:29:02.157 +It is not the one perfect solution. + +1:29:02.157 --> 1:29:05.989 +You have to do a bit of testing what is best. 
+ +1:29:06.246 --> 1:29:13.417 +One way of dealing with a large vocabulary +that you haven't seen is to split these words + +1:29:13.417 --> 1:29:20.508 +into parts and themes that either like more +linguistic motivated in more themes or more + +1:29:20.508 --> 1:29:25.826 +statistically motivated like we have in the +bike pair and coding. + +1:29:28.188 --> 1:29:33.216 +The representation of your text is different. + +1:29:33.216 --> 1:29:41.197 +How you are later doing all the counting and +the statistics is the same. + +1:29:41.197 --> 1:29:44.914 +What you assume is your sequence. + +1:29:45.805 --> 1:29:49.998 +That's the same thing for the other things +we had here. + +1:29:49.998 --> 1:29:55.390 +Here you don't have words, but everything +you're doing is done exactly. + +1:29:57.857 --> 1:29:59.457 +Some practical issues. + +1:29:59.457 --> 1:30:05.646 +Typically you're doing things on the lock +and you're adding because mild decline in very + +1:30:05.646 --> 1:30:09.819 +small values gives you sometimes problems with +calculation. + +1:30:10.230 --> 1:30:16.687 +Good thing is you don't have to care with +this mostly so there is very good two kids + +1:30:16.687 --> 1:30:23.448 +like Azarayan or Kendalan which when you can +just give your data and they will train the + +1:30:23.448 --> 1:30:30.286 +language more then do all the complicated maths +behind that and you are able to run them. + +1:30:31.911 --> 1:30:39.894 +So what you should keep from today is what +is a language model and how we can do maximum + +1:30:39.894 --> 1:30:44.199 +training on that and different language models. + +1:30:44.199 --> 1:30:49.939 +Similar ideas we use for a lot of different +statistical models. + +1:30:50.350 --> 1:30:52.267 +Where You Always Have the Problem. + +1:30:53.233 --> 1:31:01.608 +Different way of looking at it and doing it +will do it on Thursday when we will go to language. + diff --git a/demo_data/lectures/Lecture-06-09.05.2023/video.mp4 b/demo_data/lectures/Lecture-06-09.05.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2401628c3c7985a0dcf150f995024cdb79a80a0e --- /dev/null +++ b/demo_data/lectures/Lecture-06-09.05.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fe56576cf62256b2c62b8fdcf6e502ce1931907278fc420d397cd360774f72 +size 129548573 diff --git a/demo_data/lectures/Lecture-07-11.05.2023/English.vtt b/demo_data/lectures/Lecture-07-11.05.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..cf6cfc1cf2a51840a888d32c814a3ae0ee406525 --- /dev/null +++ b/demo_data/lectures/Lecture-07-11.05.2023/English.vtt @@ -0,0 +1,2593 @@ +WEBVTT + +0:00:01.301 --> 0:00:05.707 +Okay So Welcome to Today's Lecture. + +0:00:06.066 --> 0:00:12.592 +I'm sorry for the inconvenience. + +0:00:12.592 --> 0:00:19.910 +Sometimes they are project meetings. + +0:00:19.910 --> 0:00:25.843 +There will be one other time. + +0:00:26.806 --> 0:00:40.863 +So what we want to talk today about is want +to start with neural approaches to machine + +0:00:40.863 --> 0:00:42.964 +translation. + +0:00:43.123 --> 0:00:51.285 +I guess you have heard about other types of +neural models for other types of neural language + +0:00:51.285 --> 0:00:52.339 +processing. + +0:00:52.339 --> 0:00:59.887 +This was some of the first steps in introducing +neal networks to machine translation. + +0:01:00.600 --> 0:01:06.203 +They are similar to what you know they see +in as large language models. 
+ +0:01:06.666 --> 0:01:11.764 +And today look into what are these neuro-language +models? + +0:01:11.764 --> 0:01:13.874 +What is the difference? + +0:01:13.874 --> 0:01:15.983 +What is the motivation? + +0:01:16.316 --> 0:01:21.445 +And first will use them in statistics and +machine translation. + +0:01:21.445 --> 0:01:28.935 +So if you remember how fully like two or three +weeks ago we had this likely model where you + +0:01:28.935 --> 0:01:31.052 +can integrate easily any. + +0:01:31.351 --> 0:01:40.967 +We just have another model which evaluates +how good a system is or how good a fluent language + +0:01:40.967 --> 0:01:41.376 +is. + +0:01:41.376 --> 0:01:53.749 +The main advantage compared to the statistical +models we saw on Tuesday is: Next week we will + +0:01:53.749 --> 0:02:06.496 +then go for a neural machine translation where +we replace the whole model. + +0:02:11.211 --> 0:02:21.078 +Just as a remember from Tuesday, we've seen +the main challenge in language world was that + +0:02:21.078 --> 0:02:25.134 +most of the engrams we haven't seen. + +0:02:26.946 --> 0:02:33.967 +So this was therefore difficult to estimate +any probability because you've seen that normally + +0:02:33.967 --> 0:02:39.494 +if you have not seen the endgram you will assign +the probability of zero. + +0:02:39.980 --> 0:02:49.420 +However, this is not really very good because +we don't want to give zero probabilities to + +0:02:49.420 --> 0:02:54.979 +sentences, which still might be a very good +English. + +0:02:55.415 --> 0:03:02.167 +And then we learned a lot of techniques and +that is the main challenging statistical machine + +0:03:02.167 --> 0:03:04.490 +translate statistical language. + +0:03:04.490 --> 0:03:10.661 +What's how we can give a good estimate of +probability to events that we haven't seen + +0:03:10.661 --> 0:03:12.258 +smoothing techniques? + +0:03:12.258 --> 0:03:15.307 +We've seen this interpolation and begoff. + +0:03:15.435 --> 0:03:21.637 +And they invent or develop very specific techniques. + +0:03:21.637 --> 0:03:26.903 +To deal with that, however, it might not be. + +0:03:28.568 --> 0:03:43.190 +And therefore maybe we can do things different, +so if we have not seen an gram before in statistical + +0:03:43.190 --> 0:03:44.348 +models. + +0:03:45.225 --> 0:03:51.361 +Before and we can only get information from +exactly the same words. + +0:03:51.411 --> 0:04:06.782 +We don't have some on like approximate matching +like that, maybe in a sentence that cures similarly. + +0:04:06.782 --> 0:04:10.282 +So if you have seen a. + +0:04:11.191 --> 0:04:17.748 +And so you would like to have more something +like that where endgrams are represented, more + +0:04:17.748 --> 0:04:21.953 +in a general space, and we can generalize similar +numbers. + +0:04:22.262 --> 0:04:29.874 +So if you learn something about walk then +maybe we can use this knowledge and also apply. + +0:04:30.290 --> 0:04:42.596 +The same as we have done before, but we can +really better model how similar they are and + +0:04:42.596 --> 0:04:45.223 +transfer to other. + +0:04:47.047 --> 0:04:54.236 +And we maybe want to do that in a more hierarchical +approach that we know okay. + +0:04:54.236 --> 0:05:02.773 +Some words are similar but like go and walk +is somehow similar and I and P and G and therefore + +0:05:02.773 --> 0:05:06.996 +like maybe if we then merge them in an engram. + +0:05:07.387 --> 0:05:15.861 +If we learn something about our walk, then +it should tell us also something about Hugo. 
+ +0:05:15.861 --> 0:05:17.113 +He walks or. + +0:05:17.197 --> 0:05:27.327 +You see that there is some relations which +we need to integrate for you. + +0:05:27.327 --> 0:05:35.514 +We need to add the s, but maybe walks should +also be here. + +0:05:37.137 --> 0:05:45.149 +And luckily there is one really convincing +method in doing that: And that is by using + +0:05:45.149 --> 0:05:47.231 +a neural mechanism. + +0:05:47.387 --> 0:05:58.497 +That's what we will introduce today so we +can use this type of neural networks to try + +0:05:58.497 --> 0:06:04.053 +to learn this similarity and to learn how. + +0:06:04.324 --> 0:06:14.355 +And that is one of the main advantages that +we have by switching from the standard statistical + +0:06:14.355 --> 0:06:15.200 +models. + +0:06:15.115 --> 0:06:22.830 +To learn similarities between words and generalized, +and learn what is called hidden representations + +0:06:22.830 --> 0:06:29.705 +or representations of words, where we can measure +similarity in some dimensions of words. + +0:06:30.290 --> 0:06:42.384 +So we can measure in which way words are similar. + +0:06:42.822 --> 0:06:48.902 +We had it before and we've seen that words +were just easier. + +0:06:48.902 --> 0:06:51.991 +The only thing we did is like. + +0:06:52.192 --> 0:07:02.272 +But this energies don't have any meaning, +so it wasn't that word is more similar to words. + +0:07:02.582 --> 0:07:12.112 +So we couldn't learn anything about words +in the statistical model and that's a big challenge. + +0:07:12.192 --> 0:07:23.063 +About words even like in morphology, so going +goes is somehow more similar because the person + +0:07:23.063 --> 0:07:24.219 +singular. + +0:07:24.264 --> 0:07:34.924 +The basic models we have to now have no idea +about that and goes as similar to go than it + +0:07:34.924 --> 0:07:37.175 +might be to sleep. + +0:07:39.919 --> 0:07:44.073 +So what we want to do today. + +0:07:44.073 --> 0:07:53.096 +In order to go to this we will have a short +introduction into. + +0:07:53.954 --> 0:08:05.984 +It very short just to see how we use them +here, but that's a good thing, so most of you + +0:08:05.984 --> 0:08:08.445 +think it will be. + +0:08:08.928 --> 0:08:14.078 +And then we will first look into a feet forward +neural network language models. + +0:08:14.454 --> 0:08:23.706 +And there we will still have this approximation. + +0:08:23.706 --> 0:08:33.902 +We have before we are looking only at a fixed +window. + +0:08:34.154 --> 0:08:35.030 +The case. + +0:08:35.030 --> 0:08:38.270 +However, we have the umbellent here. + +0:08:38.270 --> 0:08:43.350 +That's why they're already better in order +to generalize. + +0:08:44.024 --> 0:08:53.169 +And then at the end we'll look at language +models where we then have the additional advantage. + +0:08:53.093 --> 0:09:04.317 +Case that we need to have a fixed history, +but in theory we can model arbitrary long dependencies. + +0:09:04.304 --> 0:09:12.687 +And we talked about on Tuesday where it is +not clear what type of information it is to. + +0:09:16.396 --> 0:09:24.981 +So in general molecular networks I normally +learn to prove that they perform some tasks. + +0:09:25.325 --> 0:09:33.472 +We have the structure and we are learning +them from samples so that is similar to what + +0:09:33.472 --> 0:09:34.971 +we have before. + +0:09:34.971 --> 0:09:42.275 +So now we have the same task here, a language +model giving input or forwards. + +0:09:42.642 --> 0:09:48.959 +And is somewhat originally motivated by human +brain. 
+ +0:09:48.959 --> 0:10:00.639 +However, when you now need to know about artificial +neural networks, it's hard to get similarity. + +0:10:00.540 --> 0:10:02.889 +There seemed to be not that point. + +0:10:03.123 --> 0:10:11.014 +So what they are mainly doing is summoning +multiplication and then one non-linear activation. + +0:10:12.692 --> 0:10:16.085 +So the basic units are these type of. + +0:10:17.937 --> 0:10:29.891 +Perceptron basic blocks which we have and +this does processing so we have a fixed number + +0:10:29.891 --> 0:10:36.070 +of input features and that will be important. + +0:10:36.096 --> 0:10:39.689 +So we have here numbers to xn as input. + +0:10:40.060 --> 0:10:53.221 +And this makes partly of course language processing +difficult. + +0:10:54.114 --> 0:10:57.609 +So we have to model this time on and then +go stand home and model. + +0:10:58.198 --> 0:11:02.099 +Then we are having weights, which are the +parameters and the number of weights exactly + +0:11:02.099 --> 0:11:03.668 +the same as the number of weights. + +0:11:04.164 --> 0:11:06.322 +Of input features. + +0:11:06.322 --> 0:11:15.068 +Sometimes he has his fires in there, and then +it's not really an input from. + +0:11:15.195 --> 0:11:19.205 +And what you then do is multiply. + +0:11:19.205 --> 0:11:26.164 +Each input resists weight and then you sum +it up and then. + +0:11:26.606 --> 0:11:34.357 +What is then additionally later important +is that we have an activation function and + +0:11:34.357 --> 0:11:42.473 +it's important that this activation function +is non linear, so we come to just a linear. + +0:11:43.243 --> 0:11:54.088 +And later it will be important that this is +differentiable because otherwise all the training. + +0:11:54.714 --> 0:12:01.907 +This model by itself is not very powerful. + +0:12:01.907 --> 0:12:10.437 +It was originally shown that this is not powerful. + +0:12:10.710 --> 0:12:19.463 +However, there is a very easy extension, the +multi layer perceptual, and then things get + +0:12:19.463 --> 0:12:20.939 +very powerful. + +0:12:21.081 --> 0:12:27.719 +The thing is you just connect a lot of these +in this layer of structures and we have our + +0:12:27.719 --> 0:12:35.029 +input layer where we have the inputs and our +hidden layer at least one where there is everywhere. + +0:12:35.395 --> 0:12:39.817 +And then we can combine them all to do that. + +0:12:40.260 --> 0:12:48.320 +The input layer is of course somewhat given +by a problem of dimension. + +0:12:48.320 --> 0:13:00.013 +The outward layer is also given by your dimension, +but the hidden layer is of course a hyperparameter. + +0:13:01.621 --> 0:13:08.802 +So let's start with the first question, now +more language related, and that is how we represent. + +0:13:09.149 --> 0:13:23.460 +So we've seen here we have the but the question +is now how can we put in a word into this? + +0:13:26.866 --> 0:13:34.117 +Noise: The first thing we're able to be better +is by the fact that like you are said,. + +0:13:34.314 --> 0:13:43.028 +That is not that easy because the continuous +vector will come to that. + +0:13:43.028 --> 0:13:50.392 +So from the neo-network we can directly put +in the bedding. + +0:13:50.630 --> 0:13:57.277 +But if we need to input a word into the needle +network, it has to be something which is easily + +0:13:57.277 --> 0:13:57.907 +defined. + +0:13:59.079 --> 0:14:12.492 +The one hood encoding, and then we have one +out of encoding, so one value is one, and all + +0:14:12.492 --> 0:14:15.324 +the others is the. 
+ +0:14:16.316 --> 0:14:25.936 +That means we are always dealing with fixed +vocabulary because what said is we cannot. + +0:14:26.246 --> 0:14:38.017 +So you cannot easily extend your vocabulary +because if you mean you would extend your vocabulary. + +0:14:39.980 --> 0:14:41.502 +That's also motivating. + +0:14:41.502 --> 0:14:43.722 +We're talked about biperriagoding. + +0:14:43.722 --> 0:14:45.434 +That's a nice thing there. + +0:14:45.434 --> 0:14:47.210 +We have a fixed vocabulary. + +0:14:48.048 --> 0:14:55.804 +The big advantage of this one encoding is +that we don't implicitly sum our implement + +0:14:55.804 --> 0:15:04.291 +similarity between words, but really re-learning +because if you first think about this, this + +0:15:04.291 --> 0:15:06.938 +is a very, very inefficient. + +0:15:07.227 --> 0:15:15.889 +So you need like to represent end words, you +need a dimension of an end dimensional vector. + +0:15:16.236 --> 0:15:24.846 +Imagine you could do binary encoding so you +could represent words as binary vectors. + +0:15:24.846 --> 0:15:26.467 +Then you would. + +0:15:26.806 --> 0:15:31.177 +Will be significantly more efficient. + +0:15:31.177 --> 0:15:36.813 +However, then you have some implicit similarity. + +0:15:36.813 --> 0:15:39.113 +Some numbers share. + +0:15:39.559 --> 0:15:46.958 +Would somehow be bad because you would force +someone to do this by hand or clear how to + +0:15:46.958 --> 0:15:47.631 +define. + +0:15:48.108 --> 0:15:55.135 +So therefore currently this is the most successful +approach to just do this one watch. + +0:15:55.095 --> 0:15:59.563 +Representations, so we take a fixed vocabulary. + +0:15:59.563 --> 0:16:06.171 +We map each word to the inise, and then we +represent a word like this. + +0:16:06.171 --> 0:16:13.246 +So if home will be one, the representation +will be one zero zero zero, and. + +0:16:14.514 --> 0:16:30.639 +But this dimension here is a vocabulary size +and that is quite high, so we are always trying + +0:16:30.639 --> 0:16:33.586 +to be efficient. + +0:16:33.853 --> 0:16:43.792 +We are doing then some type of efficiency +because typically we are having this next layer. + +0:16:44.104 --> 0:16:51.967 +It can be still maybe two hundred or five +hundred or one thousand neurons, but this is + +0:16:51.967 --> 0:16:53.323 +significantly. + +0:16:53.713 --> 0:17:03.792 +You can learn that directly and there we then +have similarity between words. + +0:17:03.792 --> 0:17:07.458 +Then it is that some words. + +0:17:07.807 --> 0:17:14.772 +But the nice thing is that this is then learned +that we are not need to hand define that. + +0:17:17.117 --> 0:17:32.742 +We'll come later to the explicit architecture +of the neural language one, and there we can + +0:17:32.742 --> 0:17:35.146 +see how it's. + +0:17:38.418 --> 0:17:44.857 +So we're seeing that the other one or our +representation always has the same similarity. + +0:17:45.105 --> 0:17:59.142 +Then we're having this continuous factor which +is a lot smaller dimension and that's important + +0:17:59.142 --> 0:18:00.768 +for later. + +0:18:01.121 --> 0:18:06.989 +What we are doing then is learning these representations +so that they are best for language. + +0:18:07.487 --> 0:18:14.968 +So the representations are implicitly training +the language for the cards. + +0:18:14.968 --> 0:18:19.058 +This is the best way for doing language. + +0:18:19.479 --> 0:18:32.564 +And the nice thing that was found out later +is these representations are really good. 
+ +0:18:33.153 --> 0:18:39.253 +And that is why they are now even called word +embeddings by themselves and used for other + +0:18:39.253 --> 0:18:39.727 +tasks. + +0:18:40.360 --> 0:18:49.821 +And they are somewhat describing very different +things so they can describe and semantic similarities. + +0:18:49.789 --> 0:18:58.650 +Are looking at the very example of today mass +vector space by adding words and doing some + +0:18:58.650 --> 0:19:00.618 +interesting things. + +0:19:00.940 --> 0:19:11.178 +So they got really like the first big improvement +when switching to neurostaff. + +0:19:11.491 --> 0:19:20.456 +Are like part of the model, but with more +complex representation, but they are the basic + +0:19:20.456 --> 0:19:21.261 +models. + +0:19:23.683 --> 0:19:36.979 +In the output layer we are also having one +output layer structure and a connection function. + +0:19:36.997 --> 0:19:46.525 +That is, for language learning we want to +predict what is the most common word. + +0:19:47.247 --> 0:19:56.453 +And that can be done very well with this so +called soft back layer, where again the dimension. + +0:19:56.376 --> 0:20:02.825 +Vocabulary size, so this is a vocabulary size, +and again the case neural represents the case + +0:20:02.825 --> 0:20:03.310 +class. + +0:20:03.310 --> 0:20:09.759 +So in our case we have again one round representation, +someone saying this is a core report. + +0:20:10.090 --> 0:20:17.255 +Our probability distribution is a probability +distribution over all works, so the case entry + +0:20:17.255 --> 0:20:21.338 +tells us how probable is that the next word +is this. + +0:20:22.682 --> 0:20:33.885 +So we need to have some probability distribution +at our output in order to achieve that this + +0:20:33.885 --> 0:20:37.017 +activation function goes. + +0:20:37.197 --> 0:20:46.944 +And we can achieve that with a soft max activation +we take the input to the form of the value, + +0:20:46.944 --> 0:20:47.970 +and then. + +0:20:48.288 --> 0:20:58.021 +So by having this type of activation function +we are really getting this type of probability. + +0:20:59.019 --> 0:21:15.200 +At the beginning was also very challenging +because again we have this inefficient representation. + +0:21:15.235 --> 0:21:29.799 +You can imagine that something over is maybe +a bit inefficient with cheap users, but definitely. + +0:21:36.316 --> 0:21:44.072 +And then for training the models that will +be fine, so we have to use architecture now. + +0:21:44.264 --> 0:21:48.491 +We need to minimize the arrow. + +0:21:48.491 --> 0:21:53.264 +Are we doing it taking the output? + +0:21:53.264 --> 0:21:58.174 +We are comparing it to our targets. + +0:21:58.298 --> 0:22:03.830 +So one important thing is by training them. + +0:22:03.830 --> 0:22:07.603 +How can we measure the error? + +0:22:07.603 --> 0:22:12.758 +So what is if we are training the ideas? + +0:22:13.033 --> 0:22:15.163 +And how well we are measuring. + +0:22:15.163 --> 0:22:19.768 +It is in natural language processing, typically +the cross entropy. + +0:22:19.960 --> 0:22:35.575 +And that means we are comparing the target +with the output. + +0:22:35.335 --> 0:22:44.430 +It gets optimized and you're seeing that this, +of course, makes it again very nice and easy + +0:22:44.430 --> 0:22:49.868 +because our target is again a one-hour representation. + +0:22:50.110 --> 0:23:00.116 +So all of these are always zero, and what +we are then doing is we are taking the one. 
+ +0:23:00.100 --> 0:23:04.615 +And we only need to multiply the one with +the logarithm here, and that is all the feedback + +0:23:04.615 --> 0:23:05.955 +signal we are taking here. + +0:23:06.946 --> 0:23:13.885 +Of course, this is not always influenced by +all the others. + +0:23:13.885 --> 0:23:17.933 +Why is this influenced by all the. + +0:23:24.304 --> 0:23:34.382 +Have the activation function, which is the +current activation divided by some of the others. + +0:23:34.354 --> 0:23:45.924 +Otherwise it could easily just increase this +volume and ignore the others, but if you increase + +0:23:45.924 --> 0:23:49.090 +one value all the others. + +0:23:51.351 --> 0:23:59.912 +Then we can do with neometrics one very nice +and easy type of training that is done in all + +0:23:59.912 --> 0:24:07.721 +the neometrics where we are now calculating +our error and especially the gradient. + +0:24:07.707 --> 0:24:11.640 +So in which direction does the error show? + +0:24:11.640 --> 0:24:18.682 +And then if we want to go to a smaller arrow +that's what we want to achieve. + +0:24:18.682 --> 0:24:26.638 +We are taking the inverse direction of the +gradient and thereby trying to minimize our + +0:24:26.638 --> 0:24:27.278 +error. + +0:24:27.287 --> 0:24:31.041 +And we have to do that, of course, for all +the weights. + +0:24:31.041 --> 0:24:36.672 +And to calculate the error of all the weights, +we won't do the defectvagation here. + +0:24:36.672 --> 0:24:41.432 +But but what you can do is you can propagate +the arrow which measured. + +0:24:41.432 --> 0:24:46.393 +At the end you can propagate it back its basic +mass and basic derivation. + +0:24:46.706 --> 0:24:58.854 +For each way in your model measure how much +you contribute to the error and then change + +0:24:58.854 --> 0:25:01.339 +it in a way that. + +0:25:04.524 --> 0:25:11.625 +So to summarize what for at least machine +translation on your machine translation should + +0:25:11.625 --> 0:25:19.044 +remember, you know, to understand on this problem +is that this is how a multilayer first the + +0:25:19.044 --> 0:25:20.640 +problem looks like. + +0:25:20.580 --> 0:25:28.251 +There are fully two layers and no connections. + +0:25:28.108 --> 0:25:29.759 +Across layers. + +0:25:29.829 --> 0:25:35.153 +And what they're doing is always just a waited +sum here and then in activation production. + +0:25:35.415 --> 0:25:38.792 +And in order to train you have this forward +and backward pass. + +0:25:39.039 --> 0:25:41.384 +So We Put in Here. + +0:25:41.281 --> 0:25:41.895 +Inputs. + +0:25:41.895 --> 0:25:45.347 +We have some random values at the beginning. + +0:25:45.347 --> 0:25:47.418 +Then calculate the output. + +0:25:47.418 --> 0:25:54.246 +We are measuring how our error is propagating +the arrow back and then changing our model + +0:25:54.246 --> 0:25:57.928 +in a way that we hopefully get a smaller arrow. + +0:25:57.928 --> 0:25:59.616 +And then that is how. + +0:26:01.962 --> 0:26:12.893 +So before we're coming into our neural networks +language models, how can we use this type of + +0:26:12.893 --> 0:26:17.595 +neural network to do language modeling? + +0:26:23.103 --> 0:26:33.157 +So how can we use them in natural language +processing, especially machine translation? + +0:26:33.157 --> 0:26:41.799 +The first idea of using them was to estimate: +So we have seen that the output can be monitored + +0:26:41.799 --> 0:26:42.599 +here as well. 
+ +0:26:43.603 --> 0:26:50.311 +A probability distribution and if we have +a full vocabulary we could mainly hear estimating + +0:26:50.311 --> 0:26:56.727 +how probable each next word is and then use +that in our language model fashion as we've + +0:26:56.727 --> 0:26:58.112 +done it last time. + +0:26:58.112 --> 0:27:03.215 +We got the probability of a full sentence +as a product of individual. + +0:27:04.544 --> 0:27:12.820 +And: That was done in the ninety seven years +and it's very easy to integrate it into this + +0:27:12.820 --> 0:27:14.545 +lot of the year model. + +0:27:14.545 --> 0:27:19.570 +So we have said that this is how the locker +here model looks like. + +0:27:19.570 --> 0:27:25.119 +So we are searching the best translation which +minimizes each waste time. + +0:27:25.125 --> 0:27:26.362 +The Future About You. + +0:27:26.646 --> 0:27:31.647 +We have that with minimum error rate training +if you can remember where we search for the + +0:27:31.647 --> 0:27:32.147 +optimal. + +0:27:32.512 --> 0:27:40.422 +The language model and many others, and we +can just add here a neuromodel, have a knock + +0:27:40.422 --> 0:27:41.591 +of features. + +0:27:41.861 --> 0:27:45.761 +So that is quite easy as said. + +0:27:45.761 --> 0:27:53.183 +That was how statistical machine translation +was improved. + +0:27:53.183 --> 0:27:57.082 +You just add one more feature. + +0:27:58.798 --> 0:28:07.631 +So how can we model the language modeling +with a network? + +0:28:07.631 --> 0:28:16.008 +So what we have to do is model the probability +of the. + +0:28:16.656 --> 0:28:25.047 +The problem in general in the head is that +mostly we haven't seen long sequences. + +0:28:25.085 --> 0:28:35.650 +Mostly we have to beg off to very short sequences +and we are working on this discrete space where + +0:28:35.650 --> 0:28:36.944 +similarity. + +0:28:37.337 --> 0:28:50.163 +So the idea is if we have now a real network, +we can make words into continuous representation. + +0:28:51.091 --> 0:29:00.480 +And the structure then looks like this, so +this is a basic still feed forward neural network. + +0:29:01.361 --> 0:29:10.645 +We are doing this at perximation again, so +we are not putting in all previous words, but + +0:29:10.645 --> 0:29:11.375 +it is. + +0:29:11.691 --> 0:29:25.856 +This is done because we said that in the real +network we can have only a fixed type of input. + +0:29:25.945 --> 0:29:31.886 +You can only do a fixed step and then we'll +be doing that exactly in minus one. + +0:29:33.593 --> 0:29:39.536 +So here you are, for example, three words +and three different words. + +0:29:39.536 --> 0:29:50.704 +One and all the others are: And then we're +having the first layer of the neural network, + +0:29:50.704 --> 0:29:56.230 +which like you learns is word embedding. + +0:29:57.437 --> 0:30:04.976 +There is one thing which is maybe special +compared to the standard neural member. + +0:30:05.345 --> 0:30:11.918 +So the representation of this word we want +to learn first of all position independence. + +0:30:11.918 --> 0:30:19.013 +So we just want to learn what is the general +meaning of the word independent of its neighbors. + +0:30:19.299 --> 0:30:26.239 +And therefore the representation you get here +should be the same as if in the second position. + +0:30:27.247 --> 0:30:36.865 +The nice thing you can achieve is that this +weights which you're using here you're reusing + +0:30:36.865 --> 0:30:41.727 +here and reusing here so we are forcing them. 
+ +0:30:42.322 --> 0:30:48.360 +You then learn your word embedding, which +is contextual, independent, so it's the same + +0:30:48.360 --> 0:30:49.678 +for each position. + +0:30:49.909 --> 0:31:03.482 +So that's the idea that you want to learn +the representation first of and you don't want + +0:31:03.482 --> 0:31:07.599 +to really use the context. + +0:31:08.348 --> 0:31:13.797 +That of course might have a different meaning +depending on where it stands, but we'll learn + +0:31:13.797 --> 0:31:14.153 +that. + +0:31:14.514 --> 0:31:20.386 +So first we are learning here representational +words, which is just the representation. + +0:31:20.760 --> 0:31:32.498 +Normally we said in neurons all input neurons +here are connected to all here, but we're reducing + +0:31:32.498 --> 0:31:37.338 +the complexity by saying these neurons. + +0:31:37.857 --> 0:31:47.912 +Then we have a lot denser representation that +is our three word embedded in here, and now + +0:31:47.912 --> 0:31:57.408 +we are learning this interaction between words, +a direction between words not based. + +0:31:57.677 --> 0:32:08.051 +So we have at least one connected layer here, +which takes a three embedding input and then + +0:32:08.051 --> 0:32:14.208 +learns a new embedding which now represents +the full. + +0:32:15.535 --> 0:32:16.551 +Layers. + +0:32:16.551 --> 0:32:27.854 +It is the output layer which now and then +again the probability distribution of all the. + +0:32:28.168 --> 0:32:48.612 +So here is your target prediction. + +0:32:48.688 --> 0:32:56.361 +The nice thing is that you learn everything +together, so you don't have to teach them what + +0:32:56.361 --> 0:32:58.722 +a good word representation. + +0:32:59.079 --> 0:33:08.306 +Training the whole number together, so it +learns what a good representation for a word + +0:33:08.306 --> 0:33:13.079 +you get in order to perform your final task. + +0:33:15.956 --> 0:33:19.190 +Yeah, that is the main idea. + +0:33:20.660 --> 0:33:32.731 +This is now a days often referred to as one +way of self supervise learning. + +0:33:33.053 --> 0:33:37.120 +The output is the next word and the input +is the previous word. + +0:33:37.377 --> 0:33:46.783 +But it's not really that we created labels, +but we artificially created a task out of unlabeled. + +0:33:46.806 --> 0:33:59.434 +We just had pure text, and then we created +the telescopes by predicting the next word, + +0:33:59.434 --> 0:34:18.797 +which is: Say we have like two sentences like +go home and the second one is go to prepare. + +0:34:18.858 --> 0:34:30.135 +And then we have to predict the next series +and my questions in the labels for the album. + +0:34:31.411 --> 0:34:42.752 +We model this as one vector with like probability +for possible weights starting again. + +0:34:44.044 --> 0:34:57.792 +Multiple examples, so then you would twice +train one to predict KRT, one to predict home, + +0:34:57.792 --> 0:35:02.374 +and then of course the easel. + +0:35:04.564 --> 0:35:13.568 +Is a very good point, so you are not aggregating +examples beforehand, but you are taking each. + +0:35:19.259 --> 0:35:37.204 +So when you do it simultaneously learn the +projection layer and the endgram for abilities + +0:35:37.204 --> 0:35:39.198 +and then. + +0:35:39.499 --> 0:35:47.684 +And later analyze it that these representations +are very powerful. + +0:35:47.684 --> 0:35:56.358 +The task is just a very important task to +model what is the next word. + +0:35:56.816 --> 0:35:59.842 +Is motivated by nowadays. 
+ +0:35:59.842 --> 0:36:10.666 +In order to get the meaning of the word you +have to look at its companies where the context. + +0:36:10.790 --> 0:36:16.048 +If you read texts in days of word which you +have never seen, you often can still estimate + +0:36:16.048 --> 0:36:21.130 +the meaning of this word because you do not +know how it is used, and this is typically + +0:36:21.130 --> 0:36:22.240 +used as a city or. + +0:36:22.602 --> 0:36:25.865 +Just imagine you read a text about some city. + +0:36:25.865 --> 0:36:32.037 +Even if you've never seen the city before, +you often know from the context of how it's + +0:36:32.037 --> 0:36:32.463 +used. + +0:36:34.094 --> 0:36:42.483 +So what is now the big advantage of using +neural neckworks? + +0:36:42.483 --> 0:36:51.851 +So just imagine we have to estimate that I +bought my first iPhone. + +0:36:52.052 --> 0:36:56.608 +So you have to monitor the probability of +ad hitting them. + +0:36:56.608 --> 0:37:00.237 +Now imagine iPhone, which you have never seen. + +0:37:00.600 --> 0:37:11.588 +So all the techniques we had last time at +the end, if you haven't seen iPhone you will + +0:37:11.588 --> 0:37:14.240 +always fall back to. + +0:37:15.055 --> 0:37:26.230 +You have no idea how to deal that you won't +have seen the diagram, the trigram, and all + +0:37:26.230 --> 0:37:27.754 +the others. + +0:37:28.588 --> 0:37:43.441 +If you're having this type of model, what +does it do if you have my first and then something? + +0:37:43.483 --> 0:37:50.270 +Maybe this representation is really messed +up because it's mainly on a cavalry word. + +0:37:50.730 --> 0:37:57.793 +However, you have still these two information +that two words before was first and therefore. + +0:37:58.098 --> 0:38:06.954 +So you have a lot of information in order +to estimate how good it is. + +0:38:06.954 --> 0:38:13.279 +There could be more information if you know +that. + +0:38:13.593 --> 0:38:25.168 +So all this type of modeling we can do that +we couldn't do beforehand because we always + +0:38:25.168 --> 0:38:25.957 +have. + +0:38:27.027 --> 0:38:40.466 +Good point, so typically you would have one +token for a vocabulary so that you could, for + +0:38:40.466 --> 0:38:45.857 +example: All you're doing by parent coding +when you have a fixed thing. + +0:38:46.226 --> 0:38:49.437 +Oh yeah, you have to do something like that +that that that's true. + +0:38:50.050 --> 0:38:55.420 +So yeah, auto vocabulary are by thanking where +you don't have other words written. + +0:38:55.735 --> 0:39:06.295 +But then, of course, you might be getting +very long previous things, and your sequence + +0:39:06.295 --> 0:39:11.272 +length gets very long for unknown words. + +0:39:17.357 --> 0:39:20.067 +Any more questions to the basic stable. + +0:39:23.783 --> 0:39:36.719 +For this model, what we then want to continue +is looking a bit into how complex or how we + +0:39:36.719 --> 0:39:39.162 +can make things. + +0:39:40.580 --> 0:39:49.477 +Because at the beginning there was definitely +a major challenge, it's still not that easy, + +0:39:49.477 --> 0:39:58.275 +and I mean our likeers followed the talk about +their environmental fingerprint and so on. + +0:39:58.478 --> 0:40:05.700 +So this calculation is not really heavy, and +if you build systems yourselves you have to + +0:40:05.700 --> 0:40:06.187 +wait. + +0:40:06.466 --> 0:40:14.683 +So it's good to know a bit about how complex +things are in order to do a good or efficient + +0:40:14.683 --> 0:40:15.405 +affair. 
+ +0:40:15.915 --> 0:40:24.211 +So one thing where most of the calculation +really happens is if you're doing it in a bad + +0:40:24.211 --> 0:40:24.677 +way. + +0:40:25.185 --> 0:40:33.523 +So in generally all these layers we are talking +about networks and zones fancy. + +0:40:33.523 --> 0:40:46.363 +In the end it is: So what you have to do in +order to calculate here, for example, these + +0:40:46.363 --> 0:40:52.333 +activations: So make it simple a bit. + +0:40:52.333 --> 0:41:06.636 +Let's see where outputs and you just do metric +multiplication between your weight matrix and + +0:41:06.636 --> 0:41:08.482 +your input. + +0:41:08.969 --> 0:41:20.992 +So that is why computers are so powerful for +neural networks because they are very good + +0:41:20.992 --> 0:41:22.358 +in doing. + +0:41:22.782 --> 0:41:28.013 +However, for some type for the embedding layer +this is really very inefficient. + +0:41:28.208 --> 0:41:39.652 +So because remember we're having this one +art encoding in this input, it's always like + +0:41:39.652 --> 0:41:42.940 +one and everything else. + +0:41:42.940 --> 0:41:47.018 +It's zero if we're doing this. + +0:41:47.387 --> 0:41:55.552 +So therefore you can do at least the forward +pass a lot more efficient if you don't really + +0:41:55.552 --> 0:42:01.833 +do this calculation, but you can select the +one color where there is. + +0:42:01.833 --> 0:42:07.216 +Therefore, you also see this is called your +word embedding. + +0:42:08.348 --> 0:42:19.542 +So the weight matrix of the embedding layer +is just that in each color you have the embedding + +0:42:19.542 --> 0:42:20.018 +of. + +0:42:20.580 --> 0:42:30.983 +So this is like how your initial weights look +like and how you can interpret or understand. + +0:42:32.692 --> 0:42:39.509 +And this is already relatively important because +remember this is a huge dimensional thing. + +0:42:39.509 --> 0:42:46.104 +So typically here we have the number of words +is ten thousand or so, so this is the word + +0:42:46.104 --> 0:42:51.365 +embeddings metrics, typically the most expensive +to calculate metrics. + +0:42:51.451 --> 0:42:59.741 +Because it's the largest one there, we have +ten thousand entries, while for the hours we + +0:42:59.741 --> 0:43:00.393 +maybe. + +0:43:00.660 --> 0:43:03.408 +So therefore the addition to a little bit +more to make this. + +0:43:06.206 --> 0:43:10.538 +Then you can go where else the calculations +are very difficult. + +0:43:10.830 --> 0:43:20.389 +So here we then have our network, so we have +the word embeddings. + +0:43:20.389 --> 0:43:29.514 +We have one hidden there, and then you can +look how difficult. + +0:43:30.270 --> 0:43:38.746 +Could save a lot of calculation by not really +calculating the selection because that is always. + +0:43:40.600 --> 0:43:46.096 +The number of calculations you have to do +here is so. + +0:43:46.096 --> 0:43:51.693 +The length of this layer is minus one type +projection. + +0:43:52.993 --> 0:43:56.321 +That is a hint size. + +0:43:56.321 --> 0:44:10.268 +So the first step of calculation for this +metrics modification is how much calculation. + +0:44:10.730 --> 0:44:18.806 +Then you have to do some activation function +and then you have to do again the calculation. + +0:44:19.339 --> 0:44:27.994 +Here we need the vocabulary size because we +need to calculate the probability for each + +0:44:27.994 --> 0:44:29.088 +next word. 
+ +0:44:29.889 --> 0:44:43.155 +And if you look at these numbers, so if you +have a projector size of and a vocabulary size + +0:44:43.155 --> 0:44:53.876 +of, you see: And that is why there has been +especially at the beginning some ideas how + +0:44:53.876 --> 0:44:55.589 +we can reduce. + +0:44:55.956 --> 0:45:01.942 +And if we really need to calculate all of +our capabilities, or if we can calculate only + +0:45:01.942 --> 0:45:02.350 +some. + +0:45:02.582 --> 0:45:10.871 +And there again the one important thing to +think about is for what will use my language + +0:45:10.871 --> 0:45:11.342 +mom. + +0:45:11.342 --> 0:45:19.630 +I can use it for generations and that's what +we will see next week in an achiever which + +0:45:19.630 --> 0:45:22.456 +really is guiding the search. + +0:45:23.123 --> 0:45:30.899 +If it just uses a feature, we do not want +to use it for generations, but we want to only + +0:45:30.899 --> 0:45:32.559 +know how probable. + +0:45:32.953 --> 0:45:39.325 +There we might not be really interested in +all the probabilities, but we already know + +0:45:39.325 --> 0:45:46.217 +we just want to know the probability of this +one word, and then it might be very inefficient + +0:45:46.217 --> 0:45:49.403 +to really calculate all the probabilities. + +0:45:51.231 --> 0:45:52.919 +And how can you do that so? + +0:45:52.919 --> 0:45:56.296 +Initially, for example, the people look into +shortness. + +0:45:56.756 --> 0:46:02.276 +So this calculation at the end is really very +expensive. + +0:46:02.276 --> 0:46:05.762 +So can we make that more efficient. + +0:46:05.945 --> 0:46:17.375 +And most words occur very rarely, and maybe +we don't need anger, and so there we may want + +0:46:17.375 --> 0:46:18.645 +to focus. + +0:46:19.019 --> 0:46:29.437 +And so they use the smaller vocabulary, which +is maybe. + +0:46:29.437 --> 0:46:34.646 +This layer is used from to. + +0:46:34.646 --> 0:46:37.623 +Then you merge. + +0:46:37.937 --> 0:46:45.162 +So you're taking if the word is in the shortest, +so in the two thousand most frequent words. + +0:46:45.825 --> 0:46:58.299 +Of this short word by some normalization here, +and otherwise you take a back of probability + +0:46:58.299 --> 0:46:59.655 +from the. + +0:47:00.020 --> 0:47:04.933 +It will not be as good, but the idea is okay. + +0:47:04.933 --> 0:47:14.013 +Then we don't have to calculate all these +probabilities here at the end, but we only + +0:47:14.013 --> 0:47:16.042 +have to calculate. + +0:47:19.599 --> 0:47:32.097 +With some type of cost because it means we +don't model the probability of the infrequent + +0:47:32.097 --> 0:47:39.399 +words, and maybe it's even very important to +model. + +0:47:39.299 --> 0:47:46.671 +And one idea is to do what is reported as +so so structured out there. + +0:47:46.606 --> 0:47:49.571 +Network language models you see some years +ago. + +0:47:49.571 --> 0:47:53.154 +People were very creative and giving names +to new models. + +0:47:53.813 --> 0:48:00.341 +And there the idea is that we model the output +vocabulary as a clustered treat. + +0:48:00.680 --> 0:48:06.919 +So you don't need to model all of our bodies +directly, but you are putting words into a + +0:48:06.919 --> 0:48:08.479 +sequence of clusters. + +0:48:08.969 --> 0:48:15.019 +So maybe a very intriguant world is first +in cluster three and then in cluster three. + +0:48:15.019 --> 0:48:21.211 +You have subclusters again and there is subclusters +seven and subclusters and there is. 
+ +0:48:21.541 --> 0:48:40.134 +And this is the path, so that is what was +the man in the past. + +0:48:40.340 --> 0:48:52.080 +And then you can calculate the probability +of the word again just by the product of the + +0:48:52.080 --> 0:48:55.548 +first class of the world. + +0:48:57.617 --> 0:49:07.789 +That it may be more clear where you have this +architecture, so this is all the same. + +0:49:07.789 --> 0:49:13.773 +But then you first predict here which main +class. + +0:49:14.154 --> 0:49:24.226 +Then you go to the appropriate subclass, then +you calculate the probability of the subclass + +0:49:24.226 --> 0:49:26.415 +and maybe the cell. + +0:49:27.687 --> 0:49:35.419 +Anybody have an idea why this is more efficient +or if you do it first, it looks a lot more. + +0:49:42.242 --> 0:49:51.788 +You have to do less calculations, so maybe +if you do it here you have to calculate the + +0:49:51.788 --> 0:49:59.468 +element there, but you don't have to do all +the one hundred thousand. + +0:49:59.980 --> 0:50:06.115 +The probabilities in the set classes that +you're going through and not for all of them. + +0:50:06.386 --> 0:50:18.067 +Therefore, it's more efficient if you don't +need all output proficient because you have + +0:50:18.067 --> 0:50:21.253 +to calculate the class. + +0:50:21.501 --> 0:50:28.936 +So it's only more efficient and scenarios +where you really need to use a language model + +0:50:28.936 --> 0:50:30.034 +to evaluate. + +0:50:35.275 --> 0:50:52.456 +How this works was that you can train first +in your language one on the short list. + +0:50:52.872 --> 0:51:03.547 +But on the input layer you have your full +vocabulary because at the input we saw that + +0:51:03.547 --> 0:51:06.650 +this is not complicated. + +0:51:06.906 --> 0:51:26.638 +And then you can cluster down all your words +here into classes and use that as your glasses. + +0:51:29.249 --> 0:51:34.148 +That is one idea of doing it. + +0:51:34.148 --> 0:51:44.928 +There is also a second idea of doing it, and +again we don't need. + +0:51:45.025 --> 0:51:53.401 +So sometimes it doesn't really need to be +a probability to evaluate. + +0:51:53.401 --> 0:51:56.557 +It's only important that. + +0:51:58.298 --> 0:52:04.908 +And: Here it's called self normalization what +people have done so. + +0:52:04.908 --> 0:52:11.562 +We have seen that the probability is in this +soft mechanism always to the input divided + +0:52:11.562 --> 0:52:18.216 +by our normalization, and the normalization +is a summary of the vocabulary to the power + +0:52:18.216 --> 0:52:19.274 +of the spell. + +0:52:19.759 --> 0:52:25.194 +So this is how we calculate the software. + +0:52:25.825 --> 0:52:41.179 +In self normalization of the idea, if this +would be zero then we don't need to calculate + +0:52:41.179 --> 0:52:42.214 +that. + +0:52:42.102 --> 0:52:54.272 +Will be zero, and then you don't even have +to calculate the normalization because it's. + +0:52:54.514 --> 0:53:08.653 +So how can we achieve that and then the nice +thing in your networks? + +0:53:09.009 --> 0:53:23.928 +And now we're just adding a second note with +some either permitted here. + +0:53:24.084 --> 0:53:29.551 +And the second lost just tells us he'll be +strained away. + +0:53:29.551 --> 0:53:31.625 +The locks at is zero. + +0:53:32.352 --> 0:53:38.614 +So then if it's nearly zero at the end we +don't need to calculate this and it's also + +0:53:38.614 --> 0:53:39.793 +very efficient. 
+ +0:53:40.540 --> 0:53:49.498 +One important thing is this, of course, is +only in inference. + +0:53:49.498 --> 0:54:04.700 +During tests we don't need to calculate that +because: You can do a bit of a hyperparameter + +0:54:04.700 --> 0:54:14.851 +here where you do the waiting, so how good +should it be estimating the probabilities and + +0:54:14.851 --> 0:54:16.790 +how much effort? + +0:54:18.318 --> 0:54:28.577 +The only disadvantage is no speed up during +training. + +0:54:28.577 --> 0:54:43.843 +There are other ways of doing that, for example: +Englishman is in case you get it. + +0:54:44.344 --> 0:54:48.540 +Then we are coming very, very briefly like +just one idea. + +0:54:48.828 --> 0:54:53.058 +That there is more things on different types +of language models. + +0:54:53.058 --> 0:54:58.002 +We are having a very short view on restricted +person-based language models. + +0:54:58.298 --> 0:55:08.931 +Talk about recurrent neural networks for language +mines because they have the advantage that + +0:55:08.931 --> 0:55:17.391 +we can even further improve by not having a +continuous representation on. + +0:55:18.238 --> 0:55:23.845 +So there's different types of neural networks. + +0:55:23.845 --> 0:55:30.169 +These are these boxing machines and the interesting. + +0:55:30.330 --> 0:55:39.291 +They have these: And they define like an energy +function on the network, which can be in restricted + +0:55:39.291 --> 0:55:44.372 +balsam machines efficiently calculated in general +and restricted needs. + +0:55:44.372 --> 0:55:51.147 +You only have connection between the input +and the hidden layer, but you don't have connections + +0:55:51.147 --> 0:55:53.123 +in the input or within the. + +0:55:53.393 --> 0:56:00.194 +So you see here you don't have an input output, +you just have an input, and you calculate. + +0:56:00.460 --> 0:56:15.612 +Which of course nicely fits with the idea +we're having, so you can then use this for + +0:56:15.612 --> 0:56:19.177 +an N Gram language. + +0:56:19.259 --> 0:56:25.189 +Retaining the flexibility of the input by +this type of neon networks. + +0:56:26.406 --> 0:56:30.589 +And the advantage of this type of model was +there's. + +0:56:30.550 --> 0:56:37.520 +Very, very fast to integrate it, so that one +was the first one which was used during the + +0:56:37.520 --> 0:56:38.616 +coding model. + +0:56:38.938 --> 0:56:45.454 +The engram language models were that they +were very good and gave performance. + +0:56:45.454 --> 0:56:50.072 +However, calculation still with all these +tricks takes. + +0:56:50.230 --> 0:56:58.214 +We have talked about embest lists so they +generated an embest list of the most probable + +0:56:58.214 --> 0:57:05.836 +outputs and then they took this and best list +scored each entry with a new network. + +0:57:06.146 --> 0:57:09.306 +A language model, and then only change the +order again. + +0:57:09.306 --> 0:57:10.887 +Select based on that which. + +0:57:11.231 --> 0:57:17.187 +The neighboring list is maybe only like hundred +entries. + +0:57:17.187 --> 0:57:21.786 +When decoding you look at several thousand. + +0:57:26.186 --> 0:57:35.196 +Let's look at the context so we have now seen +your language models. + +0:57:35.196 --> 0:57:43.676 +There is the big advantage we can use this +word similarity and. 
+ +0:57:44.084 --> 0:57:52.266 +Remember for engram language ones is not always +minus one words because sometimes you have + +0:57:52.266 --> 0:57:59.909 +to back off or interpolation to lower engrams +and you don't know the previous words. + +0:58:00.760 --> 0:58:04.742 +And however in neural models we always have +all of this importance. + +0:58:04.742 --> 0:58:05.504 +Can some of. + +0:58:07.147 --> 0:58:20.288 +The disadvantage is that you are still limited +in your context, and if you remember the sentence + +0:58:20.288 --> 0:58:22.998 +from last lecture,. + +0:58:22.882 --> 0:58:28.328 +Sometimes you need more context and there +is unlimited context that you might need and + +0:58:28.328 --> 0:58:34.086 +you can always create sentences where you may +need this five context in order to put a good + +0:58:34.086 --> 0:58:34.837 +estimation. + +0:58:35.315 --> 0:58:44.956 +Can also do it different in order to understand +that it makes sense to view language. + +0:58:45.445 --> 0:58:59.510 +So secret labeling tasks are a very common +type of task in language processing where you + +0:58:59.510 --> 0:59:03.461 +have the input sequence. + +0:59:03.323 --> 0:59:05.976 +So you have one output for each input. + +0:59:05.976 --> 0:59:12.371 +Machine translation is not a secret labeling +cast because the number of inputs and the number + +0:59:12.371 --> 0:59:14.072 +of outputs is different. + +0:59:14.072 --> 0:59:20.598 +So you put in a string German which has five +words and the output can be: See, for example, + +0:59:20.598 --> 0:59:24.078 +you always have the same number and the same +number of offices. + +0:59:24.944 --> 0:59:39.779 +And you can more language waddling as that, +and you just say the label for each word is + +0:59:39.779 --> 0:59:43.151 +always a next word. + +0:59:45.705 --> 0:59:50.312 +This is the more generous you can think of +it. + +0:59:50.312 --> 0:59:56.194 +For example, Paddle Speech Taking named Entity +Recognition. + +0:59:58.938 --> 1:00:08.476 +And if you look at now, this output token +and generally sequenced labeling can depend + +1:00:08.476 --> 1:00:26.322 +on: The input tokens are the same so we can +easily model it and they only depend on the + +1:00:26.322 --> 1:00:29.064 +input tokens. + +1:00:31.011 --> 1:00:42.306 +But we can always look at one specific type +of sequence labeling, unidirectional sequence + +1:00:42.306 --> 1:00:44.189 +labeling type. + +1:00:44.584 --> 1:01:00.855 +The probability of the next word only depends +on the previous words that we are having here. + +1:01:01.321 --> 1:01:05.998 +That's also not completely true in language. + +1:01:05.998 --> 1:01:14.418 +Well, the back context might also be helpful +by direction of the model's Google. + +1:01:14.654 --> 1:01:23.039 +We will always admire the probability of the +word given on its history. + +1:01:23.623 --> 1:01:30.562 +And currently there is approximation and sequence +labeling that we have this windowing approach. + +1:01:30.951 --> 1:01:43.016 +So in order to predict this type of word we +always look at the previous three words. + +1:01:43.016 --> 1:01:48.410 +This is this type of windowing model. + +1:01:49.389 --> 1:01:54.780 +If you're into neural networks you recognize +this type of structure. + +1:01:54.780 --> 1:01:57.515 +Also, the typical neural networks. + +1:01:58.938 --> 1:02:11.050 +Yes, yes, so like engram models you can, at +least in some way, prepare for that type of + +1:02:11.050 --> 1:02:12.289 +context. 
+ +1:02:14.334 --> 1:02:23.321 +Are also other types of neonamic structures +which we can use for sequins lately and which + +1:02:23.321 --> 1:02:30.710 +might help us where we don't have this type +of fixed size representation. + +1:02:32.812 --> 1:02:34.678 +That we can do so. + +1:02:34.678 --> 1:02:39.391 +The idea is in recurrent new networks traction. + +1:02:39.391 --> 1:02:43.221 +We are saving complete history in one. + +1:02:43.623 --> 1:02:56.946 +So again we have to do this fixed size representation +because the neural networks always need a habit. + +1:02:57.157 --> 1:03:09.028 +And then the network should look like that, +so we start with an initial value for our storage. + +1:03:09.028 --> 1:03:15.900 +We are giving our first input and calculating +the new. + +1:03:16.196 --> 1:03:35.895 +So again in your network with two types of +inputs: Then you can apply it to the next type + +1:03:35.895 --> 1:03:41.581 +of input and you're again having this. + +1:03:41.581 --> 1:03:46.391 +You're taking this hidden state. + +1:03:47.367 --> 1:03:53.306 +Nice thing is now that you can do now step +by step by step, so all the way over. + +1:03:55.495 --> 1:04:06.131 +The nice thing we are having here now is that +now we are having context information from + +1:04:06.131 --> 1:04:07.206 +all the. + +1:04:07.607 --> 1:04:14.181 +So if you're looking like based on which words +do you, you calculate the probability of varying. + +1:04:14.554 --> 1:04:20.090 +It depends on this part. + +1:04:20.090 --> 1:04:33.154 +It depends on and this hidden state was influenced +by two. + +1:04:33.473 --> 1:04:38.259 +So now we're having something new. + +1:04:38.259 --> 1:04:46.463 +We can model like the word probability not +only on a fixed. + +1:04:46.906 --> 1:04:53.565 +Because the hidden states we are having here +in our Oregon are influenced by all the trivia. + +1:04:56.296 --> 1:05:02.578 +So how is there to be Singapore? + +1:05:02.578 --> 1:05:16.286 +But then we have the initial idea about this +P of given on the history. + +1:05:16.736 --> 1:05:25.300 +So do not need to do any clustering here, +and you also see how things are put together + +1:05:25.300 --> 1:05:26.284 +in order. + +1:05:29.489 --> 1:05:43.449 +The green box this night since we are starting +from the left to the right. + +1:05:44.524 --> 1:05:51.483 +Voices: Yes, that's right, so there are clusters, +and here is also sometimes clustering happens. + +1:05:51.871 --> 1:05:58.687 +The small difference does matter again, so +if you have now a lot of different histories, + +1:05:58.687 --> 1:06:01.674 +the similarity which you have in here. + +1:06:01.674 --> 1:06:08.260 +If two of the histories are very similar, +these representations will be the same, and + +1:06:08.260 --> 1:06:10.787 +then you're treating them again. + +1:06:11.071 --> 1:06:15.789 +Because in order to do the final restriction +you only do a good base on the green box. + +1:06:16.156 --> 1:06:28.541 +So you are now still learning some type of +clustering in there, but you are learning it + +1:06:28.541 --> 1:06:30.230 +implicitly. + +1:06:30.570 --> 1:06:38.200 +The only restriction you're giving is you +have to stall everything that is important + +1:06:38.200 --> 1:06:39.008 +in this. + +1:06:39.359 --> 1:06:54.961 +So it's a different type of limitation, so +you calculate the probability based on the + +1:06:54.961 --> 1:06:57.138 +last words. 
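+
+NOTE: A minimal sketch of the recurrent update just described, assuming a
+plain RNN cell with a tanh activation; the sizes, weights and inputs below
+are made up. The point is only that a single hidden vector is updated step
+by step and therefore carries information from all previous words.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+emb_dim, hid_dim = 4, 3
+W_x = rng.normal(scale=0.5, size=(hid_dim, emb_dim))  # input weights
+W_h = rng.normal(scale=0.5, size=(hid_dim, hid_dim))  # recurrent weights
+b = np.zeros(hid_dim)
+
+def rnn_step(h_prev, x_t):
+    # two inputs: the previous hidden state ("storage") and the current word embedding
+    return np.tanh(W_h @ h_prev + W_x @ x_t + b)
+
+h = np.zeros(hid_dim)                      # initial value of the storage
+for x_t in rng.normal(size=(5, emb_dim)):  # five made-up word embeddings
+    h = rnn_step(h, x_t)                   # step by step over the sentence
+print(h)  # the final state was influenced by *all* previous inputs
+```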
+ +1:06:57.437 --> 1:07:04.430 +And that is how you still need to somehow +cluster things together in order to do efficiently. + +1:07:04.430 --> 1:07:09.563 +Of course, you need to do some type of clustering +because otherwise. + +1:07:09.970 --> 1:07:18.865 +But this is where things get merged together +in this type of hidden representation. + +1:07:18.865 --> 1:07:27.973 +So here the probability of the word first +only depends on this hidden representation. + +1:07:28.288 --> 1:07:33.104 +On the previous words, but they are some other +bottleneck in order to make a good estimation. + +1:07:34.474 --> 1:07:41.231 +So the idea is that we can store all our history +into or into one lecture. + +1:07:41.581 --> 1:07:44.812 +Which is the one that makes it more strong. + +1:07:44.812 --> 1:07:51.275 +Next we come to problems that of course at +some point it might be difficult if you have + +1:07:51.275 --> 1:07:57.811 +very long sequences and you always write all +the information you have on this one block. + +1:07:58.398 --> 1:08:02.233 +Then maybe things get overwritten or you cannot +store everything in there. + +1:08:02.662 --> 1:08:04.514 +So,. + +1:08:04.184 --> 1:08:09.569 +Therefore, yet for short things like single +sentences that works well, but especially if + +1:08:09.569 --> 1:08:15.197 +you think of other tasks and like symbolizations +with our document based on T where you need + +1:08:15.197 --> 1:08:20.582 +to consider the full document, these things +got got a bit more more more complicated and + +1:08:20.582 --> 1:08:23.063 +will learn another type of architecture. + +1:08:24.464 --> 1:08:30.462 +In order to understand these neighbors, it +is good to have all the bus use always. + +1:08:30.710 --> 1:08:33.998 +So this is the unrolled view. + +1:08:33.998 --> 1:08:43.753 +Somewhere you're over the type or in language +over the words you're unrolling a network. + +1:08:44.024 --> 1:08:52.096 +Here is the article and here is the network +which is connected by itself and that is recurrent. + +1:08:56.176 --> 1:09:04.982 +There is one challenge in this networks and +training. + +1:09:04.982 --> 1:09:11.994 +We can train them first of all as forward. + +1:09:12.272 --> 1:09:19.397 +So we don't really know how to train them, +but if you unroll them like this is a feet + +1:09:19.397 --> 1:09:20.142 +forward. + +1:09:20.540 --> 1:09:38.063 +Is exactly the same, so you can measure your +arrows here and be back to your arrows. + +1:09:38.378 --> 1:09:45.646 +If you unroll something, it's a feature in +your laptop and you can train it the same way. + +1:09:46.106 --> 1:09:57.606 +The only important thing is again, of course, +for different inputs. + +1:09:57.837 --> 1:10:05.145 +But since parameters are shared, it's somehow +a similar point you can train it. + +1:10:05.145 --> 1:10:08.800 +The training algorithm is very similar. + +1:10:10.310 --> 1:10:29.568 +One thing which makes things difficult is +what is referred to as the vanish ingredient. + +1:10:29.809 --> 1:10:32.799 +That's a very strong thing in the motivation +of using hardness. + +1:10:33.593 --> 1:10:44.604 +The influence here gets smaller and smaller, +and the modems are not really able to monitor. + +1:10:44.804 --> 1:10:51.939 +Because the gradient gets smaller and smaller, +and so the arrow here propagated to this one + +1:10:51.939 --> 1:10:58.919 +that contributes to the arrow is very small, +and therefore you don't do any changes there + +1:10:58.919 --> 1:10:59.617 +anymore. 
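+
+NOTE: A tiny numerical illustration (my own, with invented numbers) of the
+vanishing-gradient problem mentioned above: propagating the error back
+through many unrolled steps multiplies it again and again by factors smaller
+than one, so the contribution of early time steps almost disappears.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(1)
+W_h = rng.normal(scale=0.3, size=(3, 3))   # small recurrent weight matrix
+grad = np.ones(3)                          # error signal at the last time step
+
+for t in range(30):
+    tanh_deriv = 0.5                       # assumed average slope of the activation
+    grad = tanh_deriv * (W_h.T @ grad)     # one step further back in time
+    if t % 10 == 9:
+        print(f"after {t + 1:2d} steps back: |grad| = {np.linalg.norm(grad):.2e}")
+# The norm drops by many orders of magnitude, so the earliest words barely
+# change the weights any more -- the motivation for LSTMs discussed next.
+```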
+ +1:11:00.020 --> 1:11:06.703 +And yeah, that's why standard art men are +undifficult or have to pick them at custard. + +1:11:07.247 --> 1:11:11.462 +So everywhere talking to me about fire and +ants nowadays,. + +1:11:11.791 --> 1:11:23.333 +What we are typically meaning are LSDN's or +long short memories. + +1:11:23.333 --> 1:11:30.968 +You see they are by now quite old already. + +1:11:31.171 --> 1:11:39.019 +So there was a model in the language model +task. + +1:11:39.019 --> 1:11:44.784 +It's some more storing information. + +1:11:44.684 --> 1:11:51.556 +Because if you only look at the last words, +it's often no longer clear this is a question + +1:11:51.556 --> 1:11:52.548 +or a normal. + +1:11:53.013 --> 1:12:05.318 +So there you have these mechanisms with ripgate +in order to store things for a longer time + +1:12:05.318 --> 1:12:08.563 +into your hidden state. + +1:12:10.730 --> 1:12:20.162 +Here they are used in in in selling quite +a lot of works. + +1:12:21.541 --> 1:12:29.349 +For especially machine translation now, the +standard is to do transform base models which + +1:12:29.349 --> 1:12:30.477 +we'll learn. + +1:12:30.690 --> 1:12:38.962 +But for example, in architecture we have later +one lecture about efficiency. + +1:12:38.962 --> 1:12:42.830 +So how can we build very efficient? + +1:12:42.882 --> 1:12:53.074 +And there in the decoder in parts of the networks +they are still using. + +1:12:53.473 --> 1:12:57.518 +So it's not that yeah our hands are of no +importance in the body. + +1:12:59.239 --> 1:13:08.956 +In order to make them strong, there are some +more things which are helpful and should be: + +1:13:09.309 --> 1:13:19.683 +So one thing is there is a nice trick to make +this new network stronger and better. + +1:13:19.739 --> 1:13:21.523 +So of course it doesn't work always. + +1:13:21.523 --> 1:13:23.451 +They have to have enough training data. + +1:13:23.763 --> 1:13:28.959 +But in general there's the easiest way of +making your models bigger and stronger just + +1:13:28.959 --> 1:13:30.590 +to increase your pyramids. + +1:13:30.630 --> 1:13:43.236 +And you've seen that with a large language +models they are always bragging about. + +1:13:43.903 --> 1:13:56.463 +This is one way, so the question is how do +you get more parameters? + +1:13:56.463 --> 1:14:01.265 +There's ways of doing it. + +1:14:01.521 --> 1:14:10.029 +And the other thing is to make your networks +deeper so to have more legs in between. + +1:14:11.471 --> 1:14:13.827 +And then you can also get to get more calm. + +1:14:14.614 --> 1:14:23.340 +There's more traveling with this and it's +very similar to what we just saw with our hand. + +1:14:23.603 --> 1:14:34.253 +We have this problem of radiant flow that +if it flows so fast like a radiant gets very + +1:14:34.253 --> 1:14:35.477 +swollen,. + +1:14:35.795 --> 1:14:42.704 +Exactly the same thing happens in deep LSD +ends. + +1:14:42.704 --> 1:14:52.293 +If you take here the gradient, tell you what +is the right or wrong. + +1:14:52.612 --> 1:14:56.439 +With three layers it's no problem, but if +you're going to ten, twenty or hundred layers. + +1:14:57.797 --> 1:14:59.698 +That's Getting Typically Young. + +1:15:00.060 --> 1:15:07.000 +Are doing is using what is called decisional +connections. + +1:15:07.000 --> 1:15:15.855 +That's a very helpful idea, which is maybe +very surprising that it works. + +1:15:15.956 --> 1:15:20.309 +And so the idea is that these networks. 
+ +1:15:20.320 --> 1:15:29.982 +In between should no longer calculate what +is a new good representation, but they're more + +1:15:29.982 --> 1:15:31.378 +calculating. + +1:15:31.731 --> 1:15:37.588 +Therefore, in the end you're always the output +of a layer is added with the input. + +1:15:38.318 --> 1:15:48.824 +The knife is later if you are doing back propagation +with this very fast back propagation. + +1:15:49.209 --> 1:16:02.540 +Nowadays in very deep architectures, not only +on other but always has this residual or highway + +1:16:02.540 --> 1:16:04.224 +connection. + +1:16:04.704 --> 1:16:06.616 +Has two advantages. + +1:16:06.616 --> 1:16:15.409 +On the one hand, these layers don't need to +learn a representation, they only need to learn + +1:16:15.409 --> 1:16:18.754 +what to change the representation. + +1:16:22.082 --> 1:16:24.172 +Good. + +1:16:23.843 --> 1:16:31.768 +That much for the new map before, so the last +thing now means this. + +1:16:31.671 --> 1:16:33.750 +Language was are yeah. + +1:16:33.750 --> 1:16:41.976 +I were used in the molds itself and now were +seeing them again, but one thing which at the + +1:16:41.976 --> 1:16:53.558 +beginning they were reading was very essential +was: So people really train part of the language + +1:16:53.558 --> 1:16:59.999 +models only to get this type of embedding. + +1:16:59.999 --> 1:17:04.193 +Therefore, we want to look. + +1:17:09.229 --> 1:17:15.678 +So now some last words to the word embeddings. + +1:17:15.678 --> 1:17:27.204 +The interesting thing is that word embeddings +can be used for very different tasks. + +1:17:27.347 --> 1:17:31.329 +The knife wing is you can train that on just +large amounts of data. + +1:17:31.931 --> 1:17:41.569 +And then if you have these wooden beddings +we have seen that they reduce the parameters. + +1:17:41.982 --> 1:17:52.217 +So then you can train your small mark to do +any other task and therefore you are more efficient. + +1:17:52.532 --> 1:17:55.218 +These initial word embeddings is important. + +1:17:55.218 --> 1:18:00.529 +They really depend only on the word itself, +so if you look at the two meanings of can, + +1:18:00.529 --> 1:18:06.328 +the can of beans or I can do that, they will +have the same embedding, so some of the embedding + +1:18:06.328 --> 1:18:08.709 +has to save the ambiguity inside that. + +1:18:09.189 --> 1:18:12.486 +That cannot be resolved. + +1:18:12.486 --> 1:18:24.753 +Therefore, if you look at the higher levels +in the context, but in the word embedding layers + +1:18:24.753 --> 1:18:27.919 +that really depends on. + +1:18:29.489 --> 1:18:33.757 +However, even this one has quite very interesting. + +1:18:34.034 --> 1:18:39.558 +So that people like to visualize them. + +1:18:39.558 --> 1:18:47.208 +They're always difficult because if you look +at this. + +1:18:47.767 --> 1:18:52.879 +And drawing your five hundred damage, the +vector is still a bit challenging. + +1:18:53.113 --> 1:19:12.472 +So you cannot directly do that, so people +have to do it like they look at some type of. + +1:19:13.073 --> 1:19:17.209 +And of course then yes some information is +getting lost by a bunch of control. + +1:19:18.238 --> 1:19:24.802 +And you see, for example, this is the most +famous and common example, so what you can + +1:19:24.802 --> 1:19:31.289 +look is you can look at the difference between +the main and the female word English. + +1:19:31.289 --> 1:19:37.854 +This is here in your embedding of king, and +this is the embedding of queen, and this. 
+ +1:19:38.058 --> 1:19:40.394 +You can do that for a very different work. + +1:19:40.780 --> 1:19:45.407 +And that is where the masks come into, that +is what people then look into. + +1:19:45.725 --> 1:19:50.995 +So what you can now, for example, do is you +can calculate the difference between man and + +1:19:50.995 --> 1:19:51.410 +woman? + +1:19:52.232 --> 1:19:55.511 +Then you can take the embedding of tea. + +1:19:55.511 --> 1:20:02.806 +You can add on it the difference between man +and woman, and then you can notice what are + +1:20:02.806 --> 1:20:04.364 +the similar words. + +1:20:04.364 --> 1:20:08.954 +So you won't, of course, directly hit the +correct word. + +1:20:08.954 --> 1:20:10.512 +It's a continuous. + +1:20:10.790 --> 1:20:23.127 +But you can look what are the nearest neighbors +to this same, and often these words are near + +1:20:23.127 --> 1:20:24.056 +there. + +1:20:24.224 --> 1:20:33.913 +So it somehow learns that the difference between +these words is always the same. + +1:20:34.374 --> 1:20:37.746 +You can do that for different things. + +1:20:37.746 --> 1:20:41.296 +He also imagines that it's not perfect. + +1:20:41.296 --> 1:20:49.017 +He says the world tends to be swimming and +swimming, and with walking and walking you. + +1:20:49.469 --> 1:20:51.639 +So you can try to use them. + +1:20:51.639 --> 1:20:59.001 +It's no longer like saying yeah, but the interesting +thing is this is completely unsupervised. + +1:20:59.001 --> 1:21:03.961 +So nobody taught him the principle of their +gender in language. + +1:21:04.284 --> 1:21:09.910 +So it's purely trained on the task of doing +the next work prediction. + +1:21:10.230 --> 1:21:20.658 +And even for really cementing information +like the capital, this is the difference between + +1:21:20.658 --> 1:21:23.638 +the city and the capital. + +1:21:23.823 --> 1:21:25.518 +Visualization. + +1:21:25.518 --> 1:21:33.766 +Here we have done the same things of the difference +between country and. + +1:21:33.853 --> 1:21:41.991 +You see it's not perfect, but it's building +some kinds of a right direction, so you can't + +1:21:41.991 --> 1:21:43.347 +even use them. + +1:21:43.347 --> 1:21:51.304 +For example, for question answering, if you +have the difference between them, you apply + +1:21:51.304 --> 1:21:53.383 +that to a new country. + +1:21:54.834 --> 1:22:02.741 +So it seems these ones are able to really +learn a lot of information and collapse all + +1:22:02.741 --> 1:22:04.396 +this information. + +1:22:05.325 --> 1:22:11.769 +At just to do the next word prediction: And +that also explains a bit maybe or not explains + +1:22:11.769 --> 1:22:19.016 +wrong life by motivating why what is the main +advantage of this type of neural models that + +1:22:19.016 --> 1:22:26.025 +we can use this type of hidden representation, +transfer them and use them in different. + +1:22:28.568 --> 1:22:43.707 +So summarize what we did today, so what you +should hopefully have with you is for machine + +1:22:43.707 --> 1:22:45.893 +translation. + +1:22:45.805 --> 1:22:49.149 +Then how we can do language modern Chinese +literature? + +1:22:49.449 --> 1:22:55.617 +We looked at three different architectures: +We looked into the feet forward language mode + +1:22:55.617 --> 1:22:59.063 +and the one based on Bluetooth machines. + +1:22:59.039 --> 1:23:05.366 +And finally there are different architectures +to do in your networks. 
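+
+NOTE: To make the embedding-arithmetic example from a few minutes earlier
+concrete (king - man + woman landing near queen), here is a small sketch with
+a hand-made, hypothetical embedding table; real embeddings have hundreds of
+dimensions and are learned from data, and the analogy is only approximate.
+
+```python
+import numpy as np
+
+emb = {  # made-up 3-dimensional vectors; only their relative offsets matter
+    "man":   np.array([0.9, 0.1, 0.1]),
+    "woman": np.array([0.9, 0.9, 0.1]),
+    "king":  np.array([0.1, 0.1, 0.9]),
+    "queen": np.array([0.1, 0.9, 0.9]),
+    "city":  np.array([0.5, 0.5, 0.5]),
+}
+
+def nearest(vec, exclude=()):
+    """Vocabulary word whose embedding is closest to vec by cosine similarity."""
+    def cos(a, b):
+        return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
+    return max((w for w in emb if w not in exclude), key=lambda w: cos(emb[w], vec))
+
+# In general the shifted vector will not hit the target word exactly (the
+# space is continuous), so we look at its nearest neighbour.
+query = emb["king"] - emb["man"] + emb["woman"]
+print(nearest(query, exclude={"king", "man", "woman"}))  # -> queen
+```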
+ +1:23:05.366 --> 1:23:14.404 +We have seen feet for your networks and we'll +see the next lectures, the last type of architecture. + +1:23:15.915 --> 1:23:17.412 +Have Any Questions. + +1:23:20.680 --> 1:23:27.341 +Then thanks a lot, and next on Tuesday we +will be again in our order to know how to play. + diff --git a/demo_data/lectures/Lecture-07-11.05.2023/video.mp4 b/demo_data/lectures/Lecture-07-11.05.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e46376921a8976c5b0f0a91e7b73fba3152a7c42 --- /dev/null +++ b/demo_data/lectures/Lecture-07-11.05.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418 +size 150440033 diff --git a/demo_data/lectures/Lecture-07-16.05.2023/English.vtt b/demo_data/lectures/Lecture-07-16.05.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..22cf0d966035cf80581e27f808e7324b0be67f52 --- /dev/null +++ b/demo_data/lectures/Lecture-07-16.05.2023/English.vtt @@ -0,0 +1,5104 @@ +WEBVTT + +0:00:01.301 --> 0:00:05.707 +Okay So Welcome to Today's Lecture. + +0:00:06.066 --> 0:00:12.592 +I'm sorry for the inconvenience. + +0:00:12.592 --> 0:00:19.910 +Sometimes they are project meetings. + +0:00:19.910 --> 0:00:25.843 +There will be one other time. + +0:00:26.806 --> 0:00:40.863 +So what we want to talk today about is want +to start with neural approaches to machine + +0:00:40.863 --> 0:00:42.964 +translation. + +0:00:43.123 --> 0:00:51.285 +I guess you have heard about other types of +neural models for other types of neural language + +0:00:51.285 --> 0:00:52.339 +processing. + +0:00:52.339 --> 0:00:59.887 +This was some of the first steps in introducing +neal networks to machine translation. + +0:01:00.600 --> 0:01:06.203 +They are similar to what you know they see +in as large language models. + +0:01:06.666 --> 0:01:11.764 +And today look into what are these neuro-language +models? + +0:01:11.764 --> 0:01:13.874 +What is the difference? + +0:01:13.874 --> 0:01:15.983 +What is the motivation? + +0:01:16.316 --> 0:01:21.445 +And first will use them in statistics and +machine translation. + +0:01:21.445 --> 0:01:28.935 +So if you remember how fully like two or three +weeks ago we had this likely model where you + +0:01:28.935 --> 0:01:31.052 +can integrate easily any. + +0:01:31.351 --> 0:01:40.967 +We just have another model which evaluates +how good a system is or how good a fluent language + +0:01:40.967 --> 0:01:41.376 +is. + +0:01:41.376 --> 0:01:53.749 +The main advantage compared to the statistical +models we saw on Tuesday is: Next week we will + +0:01:53.749 --> 0:02:06.496 +then go for a neural machine translation where +we replace the whole model. + +0:02:11.211 --> 0:02:21.078 +Just as a remember from Tuesday, we've seen +the main challenge in language world was that + +0:02:21.078 --> 0:02:25.134 +most of the engrams we haven't seen. + +0:02:26.946 --> 0:02:33.967 +So this was therefore difficult to estimate +any probability because you've seen that normally + +0:02:33.967 --> 0:02:39.494 +if you have not seen the endgram you will assign +the probability of zero. + +0:02:39.980 --> 0:02:49.420 +However, this is not really very good because +we don't want to give zero probabilities to + +0:02:49.420 --> 0:02:54.979 +sentences, which still might be a very good +English. 
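+
+NOTE: As a concrete illustration of the zero-probability problem just
+mentioned (corpus and words invented): with pure maximum-likelihood counts,
+any n-gram that never occurred in the training data gets probability exactly
+zero, and a single zero factor makes the whole sentence probability zero.
+
+```python
+from collections import Counter
+
+corpus = "i like green tea . i like black tea .".split()
+bigrams = Counter(zip(corpus, corpus[1:]))   # counts of adjacent word pairs
+unigrams = Counter(corpus)
+
+def p_mle(word, prev):
+    """P(word | prev) estimated purely from counts, without any smoothing."""
+    return bigrams[(prev, word)] / unigrams[prev]
+
+print(p_mle("tea", "green"))    # 1.0 -- seen bigram
+print(p_mle("coffee", "like"))  # 0.0 -- unseen, although perfectly good English
+```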
+ +0:02:55.415 --> 0:03:02.167 +And then we learned a lot of techniques and +that is the main challenging statistical machine + +0:03:02.167 --> 0:03:04.490 +translate statistical language. + +0:03:04.490 --> 0:03:10.661 +What's how we can give a good estimate of +probability to events that we haven't seen + +0:03:10.661 --> 0:03:12.258 +smoothing techniques? + +0:03:12.258 --> 0:03:15.307 +We've seen this interpolation and begoff. + +0:03:15.435 --> 0:03:21.637 +And they invent or develop very specific techniques. + +0:03:21.637 --> 0:03:26.903 +To deal with that, however, it might not be. + +0:03:28.568 --> 0:03:43.190 +And therefore maybe we can do things different, +so if we have not seen an gram before in statistical + +0:03:43.190 --> 0:03:44.348 +models. + +0:03:45.225 --> 0:03:51.361 +Before and we can only get information from +exactly the same words. + +0:03:51.411 --> 0:04:06.782 +We don't have some on like approximate matching +like that, maybe in a sentence that cures similarly. + +0:04:06.782 --> 0:04:10.282 +So if you have seen a. + +0:04:11.191 --> 0:04:17.748 +And so you would like to have more something +like that where endgrams are represented, more + +0:04:17.748 --> 0:04:21.953 +in a general space, and we can generalize similar +numbers. + +0:04:22.262 --> 0:04:29.874 +So if you learn something about walk then +maybe we can use this knowledge and also apply. + +0:04:30.290 --> 0:04:42.596 +The same as we have done before, but we can +really better model how similar they are and + +0:04:42.596 --> 0:04:45.223 +transfer to other. + +0:04:47.047 --> 0:04:54.236 +And we maybe want to do that in a more hierarchical +approach that we know okay. + +0:04:54.236 --> 0:05:02.773 +Some words are similar but like go and walk +is somehow similar and I and P and G and therefore + +0:05:02.773 --> 0:05:06.996 +like maybe if we then merge them in an engram. + +0:05:07.387 --> 0:05:15.861 +If we learn something about our walk, then +it should tell us also something about Hugo. + +0:05:15.861 --> 0:05:17.113 +He walks or. + +0:05:17.197 --> 0:05:27.327 +You see that there is some relations which +we need to integrate for you. + +0:05:27.327 --> 0:05:35.514 +We need to add the s, but maybe walks should +also be here. + +0:05:37.137 --> 0:05:45.149 +And luckily there is one really convincing +method in doing that: And that is by using + +0:05:45.149 --> 0:05:47.231 +a neural mechanism. + +0:05:47.387 --> 0:05:58.497 +That's what we will introduce today so we +can use this type of neural networks to try + +0:05:58.497 --> 0:06:04.053 +to learn this similarity and to learn how. + +0:06:04.324 --> 0:06:14.355 +And that is one of the main advantages that +we have by switching from the standard statistical + +0:06:14.355 --> 0:06:15.200 +models. + +0:06:15.115 --> 0:06:22.830 +To learn similarities between words and generalized, +and learn what is called hidden representations + +0:06:22.830 --> 0:06:29.705 +or representations of words, where we can measure +similarity in some dimensions of words. + +0:06:30.290 --> 0:06:42.384 +So we can measure in which way words are similar. + +0:06:42.822 --> 0:06:48.902 +We had it before and we've seen that words +were just easier. + +0:06:48.902 --> 0:06:51.991 +The only thing we did is like. + +0:06:52.192 --> 0:07:02.272 +But this energies don't have any meaning, +so it wasn't that word is more similar to words. + +0:07:02.582 --> 0:07:12.112 +So we couldn't learn anything about words +in the statistical model and that's a big challenge. 
+ +0:07:12.192 --> 0:07:23.063 +About words even like in morphology, so going +goes is somehow more similar because the person + +0:07:23.063 --> 0:07:24.219 +singular. + +0:07:24.264 --> 0:07:34.924 +The basic models we have to now have no idea +about that and goes as similar to go than it + +0:07:34.924 --> 0:07:37.175 +might be to sleep. + +0:07:39.919 --> 0:07:44.073 +So what we want to do today. + +0:07:44.073 --> 0:07:53.096 +In order to go to this we will have a short +introduction into. + +0:07:53.954 --> 0:08:05.984 +It very short just to see how we use them +here, but that's a good thing, so most of you + +0:08:05.984 --> 0:08:08.445 +think it will be. + +0:08:08.928 --> 0:08:14.078 +And then we will first look into a feet forward +neural network language models. + +0:08:14.454 --> 0:08:23.706 +And there we will still have this approximation. + +0:08:23.706 --> 0:08:33.902 +We have before we are looking only at a fixed +window. + +0:08:34.154 --> 0:08:35.030 +The case. + +0:08:35.030 --> 0:08:38.270 +However, we have the umbellent here. + +0:08:38.270 --> 0:08:43.350 +That's why they're already better in order +to generalize. + +0:08:44.024 --> 0:08:53.169 +And then at the end we'll look at language +models where we then have the additional advantage. + +0:08:53.093 --> 0:09:04.317 +Case that we need to have a fixed history, +but in theory we can model arbitrary long dependencies. + +0:09:04.304 --> 0:09:12.687 +And we talked about on Tuesday where it is +not clear what type of information it is to. + +0:09:16.396 --> 0:09:24.981 +So in general molecular networks I normally +learn to prove that they perform some tasks. + +0:09:25.325 --> 0:09:33.472 +We have the structure and we are learning +them from samples so that is similar to what + +0:09:33.472 --> 0:09:34.971 +we have before. + +0:09:34.971 --> 0:09:42.275 +So now we have the same task here, a language +model giving input or forwards. + +0:09:42.642 --> 0:09:48.959 +And is somewhat originally motivated by human +brain. + +0:09:48.959 --> 0:10:00.639 +However, when you now need to know about artificial +neural networks, it's hard to get similarity. + +0:10:00.540 --> 0:10:02.889 +There seemed to be not that point. + +0:10:03.123 --> 0:10:11.014 +So what they are mainly doing is summoning +multiplication and then one non-linear activation. + +0:10:12.692 --> 0:10:16.085 +So the basic units are these type of. + +0:10:17.937 --> 0:10:29.891 +Perceptron basic blocks which we have and +this does processing so we have a fixed number + +0:10:29.891 --> 0:10:36.070 +of input features and that will be important. + +0:10:36.096 --> 0:10:39.689 +So we have here numbers to xn as input. + +0:10:40.060 --> 0:10:53.221 +And this makes partly of course language processing +difficult. + +0:10:54.114 --> 0:10:57.609 +So we have to model this time on and then +go stand home and model. + +0:10:58.198 --> 0:11:02.099 +Then we are having weights, which are the +parameters and the number of weights exactly + +0:11:02.099 --> 0:11:03.668 +the same as the number of weights. + +0:11:04.164 --> 0:11:06.322 +Of input features. + +0:11:06.322 --> 0:11:15.068 +Sometimes he has his fires in there, and then +it's not really an input from. + +0:11:15.195 --> 0:11:19.205 +And what you then do is multiply. + +0:11:19.205 --> 0:11:26.164 +Each input resists weight and then you sum +it up and then. 
+ +0:11:26.606 --> 0:11:34.357 +What is then additionally later important +is that we have an activation function and + +0:11:34.357 --> 0:11:42.473 +it's important that this activation function +is non linear, so we come to just a linear. + +0:11:43.243 --> 0:11:54.088 +And later it will be important that this is +differentiable because otherwise all the training. + +0:11:54.714 --> 0:12:01.907 +This model by itself is not very powerful. + +0:12:01.907 --> 0:12:10.437 +It was originally shown that this is not powerful. + +0:12:10.710 --> 0:12:19.463 +However, there is a very easy extension, the +multi layer perceptual, and then things get + +0:12:19.463 --> 0:12:20.939 +very powerful. + +0:12:21.081 --> 0:12:27.719 +The thing is you just connect a lot of these +in this layer of structures and we have our + +0:12:27.719 --> 0:12:35.029 +input layer where we have the inputs and our +hidden layer at least one where there is everywhere. + +0:12:35.395 --> 0:12:39.817 +And then we can combine them all to do that. + +0:12:40.260 --> 0:12:48.320 +The input layer is of course somewhat given +by a problem of dimension. + +0:12:48.320 --> 0:13:00.013 +The outward layer is also given by your dimension, +but the hidden layer is of course a hyperparameter. + +0:13:01.621 --> 0:13:08.802 +So let's start with the first question, now +more language related, and that is how we represent. + +0:13:09.149 --> 0:13:23.460 +So we've seen here we have the but the question +is now how can we put in a word into this? + +0:13:26.866 --> 0:13:34.117 +Noise: The first thing we're able to be better +is by the fact that like you are said,. + +0:13:34.314 --> 0:13:43.028 +That is not that easy because the continuous +vector will come to that. + +0:13:43.028 --> 0:13:50.392 +So from the neo-network we can directly put +in the bedding. + +0:13:50.630 --> 0:13:57.277 +But if we need to input a word into the needle +network, it has to be something which is easily + +0:13:57.277 --> 0:13:57.907 +defined. + +0:13:59.079 --> 0:14:12.492 +The one hood encoding, and then we have one +out of encoding, so one value is one, and all + +0:14:12.492 --> 0:14:15.324 +the others is the. + +0:14:16.316 --> 0:14:25.936 +That means we are always dealing with fixed +vocabulary because what said is we cannot. + +0:14:26.246 --> 0:14:38.017 +So you cannot easily extend your vocabulary +because if you mean you would extend your vocabulary. + +0:14:39.980 --> 0:14:41.502 +That's also motivating. + +0:14:41.502 --> 0:14:43.722 +We're talked about biperriagoding. + +0:14:43.722 --> 0:14:45.434 +That's a nice thing there. + +0:14:45.434 --> 0:14:47.210 +We have a fixed vocabulary. + +0:14:48.048 --> 0:14:55.804 +The big advantage of this one encoding is +that we don't implicitly sum our implement + +0:14:55.804 --> 0:15:04.291 +similarity between words, but really re-learning +because if you first think about this, this + +0:15:04.291 --> 0:15:06.938 +is a very, very inefficient. + +0:15:07.227 --> 0:15:15.889 +So you need like to represent end words, you +need a dimension of an end dimensional vector. + +0:15:16.236 --> 0:15:24.846 +Imagine you could do binary encoding so you +could represent words as binary vectors. + +0:15:24.846 --> 0:15:26.467 +Then you would. + +0:15:26.806 --> 0:15:31.177 +Will be significantly more efficient. + +0:15:31.177 --> 0:15:36.813 +However, then you have some implicit similarity. + +0:15:36.813 --> 0:15:39.113 +Some numbers share. 
+ +0:15:39.559 --> 0:15:46.958 +Would somehow be bad because you would force +someone to do this by hand or clear how to + +0:15:46.958 --> 0:15:47.631 +define. + +0:15:48.108 --> 0:15:55.135 +So therefore currently this is the most successful +approach to just do this one watch. + +0:15:55.095 --> 0:15:59.563 +Representations, so we take a fixed vocabulary. + +0:15:59.563 --> 0:16:06.171 +We map each word to the inise, and then we +represent a word like this. + +0:16:06.171 --> 0:16:13.246 +So if home will be one, the representation +will be one zero zero zero, and. + +0:16:14.514 --> 0:16:30.639 +But this dimension here is a vocabulary size +and that is quite high, so we are always trying + +0:16:30.639 --> 0:16:33.586 +to be efficient. + +0:16:33.853 --> 0:16:43.792 +We are doing then some type of efficiency +because typically we are having this next layer. + +0:16:44.104 --> 0:16:51.967 +It can be still maybe two hundred or five +hundred or one thousand neurons, but this is + +0:16:51.967 --> 0:16:53.323 +significantly. + +0:16:53.713 --> 0:17:03.792 +You can learn that directly and there we then +have similarity between words. + +0:17:03.792 --> 0:17:07.458 +Then it is that some words. + +0:17:07.807 --> 0:17:14.772 +But the nice thing is that this is then learned +that we are not need to hand define that. + +0:17:17.117 --> 0:17:32.742 +We'll come later to the explicit architecture +of the neural language one, and there we can + +0:17:32.742 --> 0:17:35.146 +see how it's. + +0:17:38.418 --> 0:17:44.857 +So we're seeing that the other one or our +representation always has the same similarity. + +0:17:45.105 --> 0:17:59.142 +Then we're having this continuous factor which +is a lot smaller dimension and that's important + +0:17:59.142 --> 0:18:00.768 +for later. + +0:18:01.121 --> 0:18:06.989 +What we are doing then is learning these representations +so that they are best for language. + +0:18:07.487 --> 0:18:14.968 +So the representations are implicitly training +the language for the cards. + +0:18:14.968 --> 0:18:19.058 +This is the best way for doing language. + +0:18:19.479 --> 0:18:32.564 +And the nice thing that was found out later +is these representations are really good. + +0:18:33.153 --> 0:18:39.253 +And that is why they are now even called word +embeddings by themselves and used for other + +0:18:39.253 --> 0:18:39.727 +tasks. + +0:18:40.360 --> 0:18:49.821 +And they are somewhat describing very different +things so they can describe and semantic similarities. + +0:18:49.789 --> 0:18:58.650 +Are looking at the very example of today mass +vector space by adding words and doing some + +0:18:58.650 --> 0:19:00.618 +interesting things. + +0:19:00.940 --> 0:19:11.178 +So they got really like the first big improvement +when switching to neurostaff. + +0:19:11.491 --> 0:19:20.456 +Are like part of the model, but with more +complex representation, but they are the basic + +0:19:20.456 --> 0:19:21.261 +models. + +0:19:23.683 --> 0:19:36.979 +In the output layer we are also having one +output layer structure and a connection function. + +0:19:36.997 --> 0:19:46.525 +That is, for language learning we want to +predict what is the most common word. + +0:19:47.247 --> 0:19:56.453 +And that can be done very well with this so +called soft back layer, where again the dimension. + +0:19:56.376 --> 0:20:02.825 +Vocabulary size, so this is a vocabulary size, +and again the case neural represents the case + +0:20:02.825 --> 0:20:03.310 +class. 
+ +0:20:03.310 --> 0:20:09.759 +So in our case we have again one round representation, +someone saying this is a core report. + +0:20:10.090 --> 0:20:17.255 +Our probability distribution is a probability +distribution over all works, so the case entry + +0:20:17.255 --> 0:20:21.338 +tells us how probable is that the next word +is this. + +0:20:22.682 --> 0:20:33.885 +So we need to have some probability distribution +at our output in order to achieve that this + +0:20:33.885 --> 0:20:37.017 +activation function goes. + +0:20:37.197 --> 0:20:46.944 +And we can achieve that with a soft max activation +we take the input to the form of the value, + +0:20:46.944 --> 0:20:47.970 +and then. + +0:20:48.288 --> 0:20:58.021 +So by having this type of activation function +we are really getting this type of probability. + +0:20:59.019 --> 0:21:15.200 +At the beginning was also very challenging +because again we have this inefficient representation. + +0:21:15.235 --> 0:21:29.799 +You can imagine that something over is maybe +a bit inefficient with cheap users, but definitely. + +0:21:36.316 --> 0:21:44.072 +And then for training the models that will +be fine, so we have to use architecture now. + +0:21:44.264 --> 0:21:48.491 +We need to minimize the arrow. + +0:21:48.491 --> 0:21:53.264 +Are we doing it taking the output? + +0:21:53.264 --> 0:21:58.174 +We are comparing it to our targets. + +0:21:58.298 --> 0:22:03.830 +So one important thing is by training them. + +0:22:03.830 --> 0:22:07.603 +How can we measure the error? + +0:22:07.603 --> 0:22:12.758 +So what is if we are training the ideas? + +0:22:13.033 --> 0:22:15.163 +And how well we are measuring. + +0:22:15.163 --> 0:22:19.768 +It is in natural language processing, typically +the cross entropy. + +0:22:19.960 --> 0:22:35.575 +And that means we are comparing the target +with the output. + +0:22:35.335 --> 0:22:44.430 +It gets optimized and you're seeing that this, +of course, makes it again very nice and easy + +0:22:44.430 --> 0:22:49.868 +because our target is again a one-hour representation. + +0:22:50.110 --> 0:23:00.116 +So all of these are always zero, and what +we are then doing is we are taking the one. + +0:23:00.100 --> 0:23:04.615 +And we only need to multiply the one with +the logarithm here, and that is all the feedback + +0:23:04.615 --> 0:23:05.955 +signal we are taking here. + +0:23:06.946 --> 0:23:13.885 +Of course, this is not always influenced by +all the others. + +0:23:13.885 --> 0:23:17.933 +Why is this influenced by all the. + +0:23:24.304 --> 0:23:34.382 +Have the activation function, which is the +current activation divided by some of the others. + +0:23:34.354 --> 0:23:45.924 +Otherwise it could easily just increase this +volume and ignore the others, but if you increase + +0:23:45.924 --> 0:23:49.090 +one value all the others. + +0:23:51.351 --> 0:23:59.912 +Then we can do with neometrics one very nice +and easy type of training that is done in all + +0:23:59.912 --> 0:24:07.721 +the neometrics where we are now calculating +our error and especially the gradient. + +0:24:07.707 --> 0:24:11.640 +So in which direction does the error show? + +0:24:11.640 --> 0:24:18.682 +And then if we want to go to a smaller arrow +that's what we want to achieve. + +0:24:18.682 --> 0:24:26.638 +We are taking the inverse direction of the +gradient and thereby trying to minimize our + +0:24:26.638 --> 0:24:27.278 +error. + +0:24:27.287 --> 0:24:31.041 +And we have to do that, of course, for all +the weights. 
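+
+NOTE: A compact sketch (my own code, made-up scores) of the two pieces
+described a bit further above: the softmax turns the output-layer scores into
+a probability distribution over the vocabulary, and the cross-entropy with a
+one-hot target reduces to the negative log-probability of the one correct
+next word.
+
+```python
+import numpy as np
+
+def softmax(z):
+    e = np.exp(z - z.max())   # subtracting the max is only for numerical stability
+    return e / e.sum()
+
+def cross_entropy(probs, target_index):
+    # with a one-hot target every term of -sum(t_i * log p_i) vanishes
+    # except the one for the correct word
+    return -np.log(probs[target_index])
+
+scores = np.array([2.0, 0.5, -1.0, 0.1])     # invented output-layer activations
+probs = softmax(scores)
+print(probs, probs.sum())                    # a distribution that sums to 1
+print(cross_entropy(probs, target_index=0))  # loss = -log P(correct next word)
+# Every output is still involved: changing any one score changes the
+# normalisation and therefore all probabilities.
+```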
+ +0:24:31.041 --> 0:24:36.672 +And to calculate the error of all the weights, +we won't do the defectvagation here. + +0:24:36.672 --> 0:24:41.432 +But but what you can do is you can propagate +the arrow which measured. + +0:24:41.432 --> 0:24:46.393 +At the end you can propagate it back its basic +mass and basic derivation. + +0:24:46.706 --> 0:24:58.854 +For each way in your model measure how much +you contribute to the error and then change + +0:24:58.854 --> 0:25:01.339 +it in a way that. + +0:25:04.524 --> 0:25:11.625 +So to summarize what for at least machine +translation on your machine translation should + +0:25:11.625 --> 0:25:19.044 +remember, you know, to understand on this problem +is that this is how a multilayer first the + +0:25:19.044 --> 0:25:20.640 +problem looks like. + +0:25:20.580 --> 0:25:28.251 +There are fully two layers and no connections. + +0:25:28.108 --> 0:25:29.759 +Across layers. + +0:25:29.829 --> 0:25:35.153 +And what they're doing is always just a waited +sum here and then in activation production. + +0:25:35.415 --> 0:25:38.792 +And in order to train you have this forward +and backward pass. + +0:25:39.039 --> 0:25:41.384 +So We Put in Here. + +0:25:41.281 --> 0:25:41.895 +Inputs. + +0:25:41.895 --> 0:25:45.347 +We have some random values at the beginning. + +0:25:45.347 --> 0:25:47.418 +Then calculate the output. + +0:25:47.418 --> 0:25:54.246 +We are measuring how our error is propagating +the arrow back and then changing our model + +0:25:54.246 --> 0:25:57.928 +in a way that we hopefully get a smaller arrow. + +0:25:57.928 --> 0:25:59.616 +And then that is how. + +0:26:01.962 --> 0:26:12.893 +So before we're coming into our neural networks +language models, how can we use this type of + +0:26:12.893 --> 0:26:17.595 +neural network to do language modeling? + +0:26:23.103 --> 0:26:33.157 +So how can we use them in natural language +processing, especially machine translation? + +0:26:33.157 --> 0:26:41.799 +The first idea of using them was to estimate: +So we have seen that the output can be monitored + +0:26:41.799 --> 0:26:42.599 +here as well. + +0:26:43.603 --> 0:26:50.311 +A probability distribution and if we have +a full vocabulary we could mainly hear estimating + +0:26:50.311 --> 0:26:56.727 +how probable each next word is and then use +that in our language model fashion as we've + +0:26:56.727 --> 0:26:58.112 +done it last time. + +0:26:58.112 --> 0:27:03.215 +We got the probability of a full sentence +as a product of individual. + +0:27:04.544 --> 0:27:12.820 +And: That was done in the ninety seven years +and it's very easy to integrate it into this + +0:27:12.820 --> 0:27:14.545 +lot of the year model. + +0:27:14.545 --> 0:27:19.570 +So we have said that this is how the locker +here model looks like. + +0:27:19.570 --> 0:27:25.119 +So we are searching the best translation which +minimizes each waste time. + +0:27:25.125 --> 0:27:26.362 +The Future About You. + +0:27:26.646 --> 0:27:31.647 +We have that with minimum error rate training +if you can remember where we search for the + +0:27:31.647 --> 0:27:32.147 +optimal. + +0:27:32.512 --> 0:27:40.422 +The language model and many others, and we +can just add here a neuromodel, have a knock + +0:27:40.422 --> 0:27:41.591 +of features. + +0:27:41.861 --> 0:27:45.761 +So that is quite easy as said. + +0:27:45.761 --> 0:27:53.183 +That was how statistical machine translation +was improved. + +0:27:53.183 --> 0:27:57.082 +You just add one more feature. 
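+
+NOTE: A rough sketch of what "just add one more feature" means in the
+log-linear framework recalled above; the candidate hypotheses, feature scores
+and weights below are invented, and the weights would normally be tuned with
+minimum error rate training.
+
+```python
+# Each candidate translation has a vector of feature scores (log-probabilities);
+# the neural language model is simply appended as one more feature, and the
+# best candidate maximises the weighted sum  sum_i lambda_i * h_i.
+candidates = {
+    "hypothesis A": {"tm": -4.1, "ngram_lm": -6.3, "neural_lm": -5.0},
+    "hypothesis B": {"tm": -4.4, "ngram_lm": -5.9, "neural_lm": -4.2},
+}
+weights = {"tm": 1.0, "ngram_lm": 0.6, "neural_lm": 0.8}
+
+def model_score(features):
+    return sum(weights[name] * value for name, value in features.items())
+
+print({c: round(model_score(f), 2) for c, f in candidates.items()})
+print("best:", max(candidates, key=lambda c: model_score(candidates[c])))
+```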
+ +0:27:58.798 --> 0:28:07.631 +So how can we model the language modeling +with a network? + +0:28:07.631 --> 0:28:16.008 +So what we have to do is model the probability +of the. + +0:28:16.656 --> 0:28:25.047 +The problem in general in the head is that +mostly we haven't seen long sequences. + +0:28:25.085 --> 0:28:35.650 +Mostly we have to beg off to very short sequences +and we are working on this discrete space where + +0:28:35.650 --> 0:28:36.944 +similarity. + +0:28:37.337 --> 0:28:50.163 +So the idea is if we have now a real network, +we can make words into continuous representation. + +0:28:51.091 --> 0:29:00.480 +And the structure then looks like this, so +this is a basic still feed forward neural network. + +0:29:01.361 --> 0:29:10.645 +We are doing this at perximation again, so +we are not putting in all previous words, but + +0:29:10.645 --> 0:29:11.375 +it is. + +0:29:11.691 --> 0:29:25.856 +This is done because we said that in the real +network we can have only a fixed type of input. + +0:29:25.945 --> 0:29:31.886 +You can only do a fixed step and then we'll +be doing that exactly in minus one. + +0:29:33.593 --> 0:29:39.536 +So here you are, for example, three words +and three different words. + +0:29:39.536 --> 0:29:50.704 +One and all the others are: And then we're +having the first layer of the neural network, + +0:29:50.704 --> 0:29:56.230 +which like you learns is word embedding. + +0:29:57.437 --> 0:30:04.976 +There is one thing which is maybe special +compared to the standard neural member. + +0:30:05.345 --> 0:30:11.918 +So the representation of this word we want +to learn first of all position independence. + +0:30:11.918 --> 0:30:19.013 +So we just want to learn what is the general +meaning of the word independent of its neighbors. + +0:30:19.299 --> 0:30:26.239 +And therefore the representation you get here +should be the same as if in the second position. + +0:30:27.247 --> 0:30:36.865 +The nice thing you can achieve is that this +weights which you're using here you're reusing + +0:30:36.865 --> 0:30:41.727 +here and reusing here so we are forcing them. + +0:30:42.322 --> 0:30:48.360 +You then learn your word embedding, which +is contextual, independent, so it's the same + +0:30:48.360 --> 0:30:49.678 +for each position. + +0:30:49.909 --> 0:31:03.482 +So that's the idea that you want to learn +the representation first of and you don't want + +0:31:03.482 --> 0:31:07.599 +to really use the context. + +0:31:08.348 --> 0:31:13.797 +That of course might have a different meaning +depending on where it stands, but we'll learn + +0:31:13.797 --> 0:31:14.153 +that. + +0:31:14.514 --> 0:31:20.386 +So first we are learning here representational +words, which is just the representation. + +0:31:20.760 --> 0:31:32.498 +Normally we said in neurons all input neurons +here are connected to all here, but we're reducing + +0:31:32.498 --> 0:31:37.338 +the complexity by saying these neurons. + +0:31:37.857 --> 0:31:47.912 +Then we have a lot denser representation that +is our three word embedded in here, and now + +0:31:47.912 --> 0:31:57.408 +we are learning this interaction between words, +a direction between words not based. + +0:31:57.677 --> 0:32:08.051 +So we have at least one connected layer here, +which takes a three embedding input and then + +0:32:08.051 --> 0:32:14.208 +learns a new embedding which now represents +the full. + +0:32:15.535 --> 0:32:16.551 +Layers. 
+ +0:32:16.551 --> 0:32:27.854 +It is the output layer which now and then +again the probability distribution of all the. + +0:32:28.168 --> 0:32:48.612 +So here is your target prediction. + +0:32:48.688 --> 0:32:56.361 +The nice thing is that you learn everything +together, so you don't have to teach them what + +0:32:56.361 --> 0:32:58.722 +a good word representation. + +0:32:59.079 --> 0:33:08.306 +Training the whole number together, so it +learns what a good representation for a word + +0:33:08.306 --> 0:33:13.079 +you get in order to perform your final task. + +0:33:15.956 --> 0:33:19.190 +Yeah, that is the main idea. + +0:33:20.660 --> 0:33:32.731 +This is now a days often referred to as one +way of self supervise learning. + +0:33:33.053 --> 0:33:37.120 +The output is the next word and the input +is the previous word. + +0:33:37.377 --> 0:33:46.783 +But it's not really that we created labels, +but we artificially created a task out of unlabeled. + +0:33:46.806 --> 0:33:59.434 +We just had pure text, and then we created +the telescopes by predicting the next word, + +0:33:59.434 --> 0:34:18.797 +which is: Say we have like two sentences like +go home and the second one is go to prepare. + +0:34:18.858 --> 0:34:30.135 +And then we have to predict the next series +and my questions in the labels for the album. + +0:34:31.411 --> 0:34:42.752 +We model this as one vector with like probability +for possible weights starting again. + +0:34:44.044 --> 0:34:57.792 +Multiple examples, so then you would twice +train one to predict KRT, one to predict home, + +0:34:57.792 --> 0:35:02.374 +and then of course the easel. + +0:35:04.564 --> 0:35:13.568 +Is a very good point, so you are not aggregating +examples beforehand, but you are taking each. + +0:35:19.259 --> 0:35:37.204 +So when you do it simultaneously learn the +projection layer and the endgram for abilities + +0:35:37.204 --> 0:35:39.198 +and then. + +0:35:39.499 --> 0:35:47.684 +And later analyze it that these representations +are very powerful. + +0:35:47.684 --> 0:35:56.358 +The task is just a very important task to +model what is the next word. + +0:35:56.816 --> 0:35:59.842 +Is motivated by nowadays. + +0:35:59.842 --> 0:36:10.666 +In order to get the meaning of the word you +have to look at its companies where the context. + +0:36:10.790 --> 0:36:16.048 +If you read texts in days of word which you +have never seen, you often can still estimate + +0:36:16.048 --> 0:36:21.130 +the meaning of this word because you do not +know how it is used, and this is typically + +0:36:21.130 --> 0:36:22.240 +used as a city or. + +0:36:22.602 --> 0:36:25.865 +Just imagine you read a text about some city. + +0:36:25.865 --> 0:36:32.037 +Even if you've never seen the city before, +you often know from the context of how it's + +0:36:32.037 --> 0:36:32.463 +used. + +0:36:34.094 --> 0:36:42.483 +So what is now the big advantage of using +neural neckworks? + +0:36:42.483 --> 0:36:51.851 +So just imagine we have to estimate that I +bought my first iPhone. + +0:36:52.052 --> 0:36:56.608 +So you have to monitor the probability of +ad hitting them. + +0:36:56.608 --> 0:37:00.237 +Now imagine iPhone, which you have never seen. + +0:37:00.600 --> 0:37:11.588 +So all the techniques we had last time at +the end, if you haven't seen iPhone you will + +0:37:11.588 --> 0:37:14.240 +always fall back to. + +0:37:15.055 --> 0:37:26.230 +You have no idea how to deal that you won't +have seen the diagram, the trigram, and all + +0:37:26.230 --> 0:37:27.754 +the others. 
+ +0:37:28.588 --> 0:37:43.441 +If you're having this type of model, what +does it do if you have my first and then something? + +0:37:43.483 --> 0:37:50.270 +Maybe this representation is really messed +up because it's mainly on a cavalry word. + +0:37:50.730 --> 0:37:57.793 +However, you have still these two information +that two words before was first and therefore. + +0:37:58.098 --> 0:38:06.954 +So you have a lot of information in order +to estimate how good it is. + +0:38:06.954 --> 0:38:13.279 +There could be more information if you know +that. + +0:38:13.593 --> 0:38:25.168 +So all this type of modeling we can do that +we couldn't do beforehand because we always + +0:38:25.168 --> 0:38:25.957 +have. + +0:38:27.027 --> 0:38:40.466 +Good point, so typically you would have one +token for a vocabulary so that you could, for + +0:38:40.466 --> 0:38:45.857 +example: All you're doing by parent coding +when you have a fixed thing. + +0:38:46.226 --> 0:38:49.437 +Oh yeah, you have to do something like that +that that that's true. + +0:38:50.050 --> 0:38:55.420 +So yeah, auto vocabulary are by thanking where +you don't have other words written. + +0:38:55.735 --> 0:39:06.295 +But then, of course, you might be getting +very long previous things, and your sequence + +0:39:06.295 --> 0:39:11.272 +length gets very long for unknown words. + +0:39:17.357 --> 0:39:20.067 +Any more questions to the basic stable. + +0:39:23.783 --> 0:39:36.719 +For this model, what we then want to continue +is looking a bit into how complex or how we + +0:39:36.719 --> 0:39:39.162 +can make things. + +0:39:40.580 --> 0:39:49.477 +Because at the beginning there was definitely +a major challenge, it's still not that easy, + +0:39:49.477 --> 0:39:58.275 +and I mean our likeers followed the talk about +their environmental fingerprint and so on. + +0:39:58.478 --> 0:40:05.700 +So this calculation is not really heavy, and +if you build systems yourselves you have to + +0:40:05.700 --> 0:40:06.187 +wait. + +0:40:06.466 --> 0:40:14.683 +So it's good to know a bit about how complex +things are in order to do a good or efficient + +0:40:14.683 --> 0:40:15.405 +affair. + +0:40:15.915 --> 0:40:24.211 +So one thing where most of the calculation +really happens is if you're doing it in a bad + +0:40:24.211 --> 0:40:24.677 +way. + +0:40:25.185 --> 0:40:33.523 +So in generally all these layers we are talking +about networks and zones fancy. + +0:40:33.523 --> 0:40:46.363 +In the end it is: So what you have to do in +order to calculate here, for example, these + +0:40:46.363 --> 0:40:52.333 +activations: So make it simple a bit. + +0:40:52.333 --> 0:41:06.636 +Let's see where outputs and you just do metric +multiplication between your weight matrix and + +0:41:06.636 --> 0:41:08.482 +your input. + +0:41:08.969 --> 0:41:20.992 +So that is why computers are so powerful for +neural networks because they are very good + +0:41:20.992 --> 0:41:22.358 +in doing. + +0:41:22.782 --> 0:41:28.013 +However, for some type for the embedding layer +this is really very inefficient. + +0:41:28.208 --> 0:41:39.652 +So because remember we're having this one +art encoding in this input, it's always like + +0:41:39.652 --> 0:41:42.940 +one and everything else. + +0:41:42.940 --> 0:41:47.018 +It's zero if we're doing this. + +0:41:47.387 --> 0:41:55.552 +So therefore you can do at least the forward +pass a lot more efficient if you don't really + +0:41:55.552 --> 0:42:01.833 +do this calculation, but you can select the +one color where there is. 
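+
+NOTE: A short sketch (sizes assumed for illustration) of the efficiency point
+made above: multiplying a one-hot input vector with the embedding weight
+matrix just selects one row (or column, depending on orientation), so the
+forward pass can be implemented as a simple table lookup instead of a full
+matrix multiplication.
+
+```python
+import numpy as np
+
+vocab_size, emb_dim = 10_000, 256
+rng = np.random.default_rng(0)
+W_emb = rng.normal(size=(vocab_size, emb_dim))  # one row per vocabulary word
+
+word_id = 42
+one_hot = np.zeros(vocab_size)
+one_hot[word_id] = 1.0
+
+via_matmul = one_hot @ W_emb    # ~ vocab_size * emb_dim multiplications ...
+via_lookup = W_emb[word_id]     # ... versus just reading out one row
+
+print(np.allclose(via_matmul, via_lookup))  # True: both give the same embedding
+```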
+ +0:42:01.833 --> 0:42:07.216 +Therefore, you also see this is called your +word embedding. + +0:42:08.348 --> 0:42:19.542 +So the weight matrix of the embedding layer +is just that in each color you have the embedding + +0:42:19.542 --> 0:42:20.018 +of. + +0:42:20.580 --> 0:42:30.983 +So this is like how your initial weights look +like and how you can interpret or understand. + +0:42:32.692 --> 0:42:39.509 +And this is already relatively important because +remember this is a huge dimensional thing. + +0:42:39.509 --> 0:42:46.104 +So typically here we have the number of words +is ten thousand or so, so this is the word + +0:42:46.104 --> 0:42:51.365 +embeddings metrics, typically the most expensive +to calculate metrics. + +0:42:51.451 --> 0:42:59.741 +Because it's the largest one there, we have +ten thousand entries, while for the hours we + +0:42:59.741 --> 0:43:00.393 +maybe. + +0:43:00.660 --> 0:43:03.408 +So therefore the addition to a little bit +more to make this. + +0:43:06.206 --> 0:43:10.538 +Then you can go where else the calculations +are very difficult. + +0:43:10.830 --> 0:43:20.389 +So here we then have our network, so we have +the word embeddings. + +0:43:20.389 --> 0:43:29.514 +We have one hidden there, and then you can +look how difficult. + +0:43:30.270 --> 0:43:38.746 +Could save a lot of calculation by not really +calculating the selection because that is always. + +0:43:40.600 --> 0:43:46.096 +The number of calculations you have to do +here is so. + +0:43:46.096 --> 0:43:51.693 +The length of this layer is minus one type +projection. + +0:43:52.993 --> 0:43:56.321 +That is a hint size. + +0:43:56.321 --> 0:44:10.268 +So the first step of calculation for this +metrics modification is how much calculation. + +0:44:10.730 --> 0:44:18.806 +Then you have to do some activation function +and then you have to do again the calculation. + +0:44:19.339 --> 0:44:27.994 +Here we need the vocabulary size because we +need to calculate the probability for each + +0:44:27.994 --> 0:44:29.088 +next word. + +0:44:29.889 --> 0:44:43.155 +And if you look at these numbers, so if you +have a projector size of and a vocabulary size + +0:44:43.155 --> 0:44:53.876 +of, you see: And that is why there has been +especially at the beginning some ideas how + +0:44:53.876 --> 0:44:55.589 +we can reduce. + +0:44:55.956 --> 0:45:01.942 +And if we really need to calculate all of +our capabilities, or if we can calculate only + +0:45:01.942 --> 0:45:02.350 +some. + +0:45:02.582 --> 0:45:10.871 +And there again the one important thing to +think about is for what will use my language + +0:45:10.871 --> 0:45:11.342 +mom. + +0:45:11.342 --> 0:45:19.630 +I can use it for generations and that's what +we will see next week in an achiever which + +0:45:19.630 --> 0:45:22.456 +really is guiding the search. + +0:45:23.123 --> 0:45:30.899 +If it just uses a feature, we do not want +to use it for generations, but we want to only + +0:45:30.899 --> 0:45:32.559 +know how probable. + +0:45:32.953 --> 0:45:39.325 +There we might not be really interested in +all the probabilities, but we already know + +0:45:39.325 --> 0:45:46.217 +we just want to know the probability of this +one word, and then it might be very inefficient + +0:45:46.217 --> 0:45:49.403 +to really calculate all the probabilities. + +0:45:51.231 --> 0:45:52.919 +And how can you do that so? + +0:45:52.919 --> 0:45:56.296 +Initially, for example, the people look into +shortness. 
+ +0:45:56.756 --> 0:46:02.276 +So this calculation at the end is really very +expensive. + +0:46:02.276 --> 0:46:05.762 +So can we make that more efficient. + +0:46:05.945 --> 0:46:17.375 +And most words occur very rarely, and maybe +we don't need anger, and so there we may want + +0:46:17.375 --> 0:46:18.645 +to focus. + +0:46:19.019 --> 0:46:29.437 +And so they use the smaller vocabulary, which +is maybe. + +0:46:29.437 --> 0:46:34.646 +This layer is used from to. + +0:46:34.646 --> 0:46:37.623 +Then you merge. + +0:46:37.937 --> 0:46:45.162 +So you're taking if the word is in the shortest, +so in the two thousand most frequent words. + +0:46:45.825 --> 0:46:58.299 +Of this short word by some normalization here, +and otherwise you take a back of probability + +0:46:58.299 --> 0:46:59.655 +from the. + +0:47:00.020 --> 0:47:04.933 +It will not be as good, but the idea is okay. + +0:47:04.933 --> 0:47:14.013 +Then we don't have to calculate all these +probabilities here at the end, but we only + +0:47:14.013 --> 0:47:16.042 +have to calculate. + +0:47:19.599 --> 0:47:32.097 +With some type of cost because it means we +don't model the probability of the infrequent + +0:47:32.097 --> 0:47:39.399 +words, and maybe it's even very important to +model. + +0:47:39.299 --> 0:47:46.671 +And one idea is to do what is reported as +so so structured out there. + +0:47:46.606 --> 0:47:49.571 +Network language models you see some years +ago. + +0:47:49.571 --> 0:47:53.154 +People were very creative and giving names +to new models. + +0:47:53.813 --> 0:48:00.341 +And there the idea is that we model the output +vocabulary as a clustered treat. + +0:48:00.680 --> 0:48:06.919 +So you don't need to model all of our bodies +directly, but you are putting words into a + +0:48:06.919 --> 0:48:08.479 +sequence of clusters. + +0:48:08.969 --> 0:48:15.019 +So maybe a very intriguant world is first +in cluster three and then in cluster three. + +0:48:15.019 --> 0:48:21.211 +You have subclusters again and there is subclusters +seven and subclusters and there is. + +0:48:21.541 --> 0:48:40.134 +And this is the path, so that is what was +the man in the past. + +0:48:40.340 --> 0:48:52.080 +And then you can calculate the probability +of the word again just by the product of the + +0:48:52.080 --> 0:48:55.548 +first class of the world. + +0:48:57.617 --> 0:49:07.789 +That it may be more clear where you have this +architecture, so this is all the same. + +0:49:07.789 --> 0:49:13.773 +But then you first predict here which main +class. + +0:49:14.154 --> 0:49:24.226 +Then you go to the appropriate subclass, then +you calculate the probability of the subclass + +0:49:24.226 --> 0:49:26.415 +and maybe the cell. + +0:49:27.687 --> 0:49:35.419 +Anybody have an idea why this is more efficient +or if you do it first, it looks a lot more. + +0:49:42.242 --> 0:49:51.788 +You have to do less calculations, so maybe +if you do it here you have to calculate the + +0:49:51.788 --> 0:49:59.468 +element there, but you don't have to do all +the one hundred thousand. + +0:49:59.980 --> 0:50:06.115 +The probabilities in the set classes that +you're going through and not for all of them. + +0:50:06.386 --> 0:50:18.067 +Therefore, it's more efficient if you don't +need all output proficient because you have + +0:50:18.067 --> 0:50:21.253 +to calculate the class. + +0:50:21.501 --> 0:50:28.936 +So it's only more efficient and scenarios +where you really need to use a language model + +0:50:28.936 --> 0:50:30.034 +to evaluate. 
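+
+NOTE: A small sketch of the class-factored ("structured") output layer
+described above, with invented classes and scores: the probability of a word
+is the product of the class probability and the word probability within that
+class, so only the class scores plus the scores of a single class need to be
+normalised instead of the full vocabulary.
+
+```python
+import numpy as np
+
+def softmax(z):
+    e = np.exp(z - np.max(z))
+    return e / e.sum()
+
+# Hypothetical split of a tiny vocabulary into classes; in practice the
+# clusters are learned or frequency-based and may have further sub-classes.
+classes = {"c1": ["the", "a", "and"], "c2": ["house", "tree"], "c3": ["runs", "sees"]}
+
+rng = np.random.default_rng(0)
+class_scores = rng.normal(size=len(classes))
+word_scores = {c: rng.normal(size=len(ws)) for c, ws in classes.items()}
+
+def p_word(word):
+    class_probs = softmax(class_scores)
+    for i, (c, words) in enumerate(classes.items()):
+        if word in words:
+            within = softmax(word_scores[c])          # normalise only this class
+            return class_probs[i] * within[words.index(word)]
+
+print(p_word("house"))
+print(sum(p_word(w) for ws in classes.values() for w in ws))  # sums to 1.0
+```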
+ +0:50:35.275 --> 0:50:52.456 +How this works was that you can train first +in your language one on the short list. + +0:50:52.872 --> 0:51:03.547 +But on the input layer you have your full +vocabulary because at the input we saw that + +0:51:03.547 --> 0:51:06.650 +this is not complicated. + +0:51:06.906 --> 0:51:26.638 +And then you can cluster down all your words +here into classes and use that as your glasses. + +0:51:29.249 --> 0:51:34.148 +That is one idea of doing it. + +0:51:34.148 --> 0:51:44.928 +There is also a second idea of doing it, and +again we don't need. + +0:51:45.025 --> 0:51:53.401 +So sometimes it doesn't really need to be +a probability to evaluate. + +0:51:53.401 --> 0:51:56.557 +It's only important that. + +0:51:58.298 --> 0:52:04.908 +And: Here it's called self normalization what +people have done so. + +0:52:04.908 --> 0:52:11.562 +We have seen that the probability is in this +soft mechanism always to the input divided + +0:52:11.562 --> 0:52:18.216 +by our normalization, and the normalization +is a summary of the vocabulary to the power + +0:52:18.216 --> 0:52:19.274 +of the spell. + +0:52:19.759 --> 0:52:25.194 +So this is how we calculate the software. + +0:52:25.825 --> 0:52:41.179 +In self normalization of the idea, if this +would be zero then we don't need to calculate + +0:52:41.179 --> 0:52:42.214 +that. + +0:52:42.102 --> 0:52:54.272 +Will be zero, and then you don't even have +to calculate the normalization because it's. + +0:52:54.514 --> 0:53:08.653 +So how can we achieve that and then the nice +thing in your networks? + +0:53:09.009 --> 0:53:23.928 +And now we're just adding a second note with +some either permitted here. + +0:53:24.084 --> 0:53:29.551 +And the second lost just tells us he'll be +strained away. + +0:53:29.551 --> 0:53:31.625 +The locks at is zero. + +0:53:32.352 --> 0:53:38.614 +So then if it's nearly zero at the end we +don't need to calculate this and it's also + +0:53:38.614 --> 0:53:39.793 +very efficient. + +0:53:40.540 --> 0:53:49.498 +One important thing is this, of course, is +only in inference. + +0:53:49.498 --> 0:54:04.700 +During tests we don't need to calculate that +because: You can do a bit of a hyperparameter + +0:54:04.700 --> 0:54:14.851 +here where you do the waiting, so how good +should it be estimating the probabilities and + +0:54:14.851 --> 0:54:16.790 +how much effort? + +0:54:18.318 --> 0:54:28.577 +The only disadvantage is no speed up during +training. + +0:54:28.577 --> 0:54:43.843 +There are other ways of doing that, for example: +Englishman is in case you get it. + +0:54:44.344 --> 0:54:48.540 +Then we are coming very, very briefly like +just one idea. + +0:54:48.828 --> 0:54:53.058 +That there is more things on different types +of language models. + +0:54:53.058 --> 0:54:58.002 +We are having a very short view on restricted +person-based language models. + +0:54:58.298 --> 0:55:08.931 +Talk about recurrent neural networks for language +mines because they have the advantage that + +0:55:08.931 --> 0:55:17.391 +we can even further improve by not having a +continuous representation on. + +0:55:18.238 --> 0:55:23.845 +So there's different types of neural networks. + +0:55:23.845 --> 0:55:30.169 +These are these boxing machines and the interesting. + +0:55:30.330 --> 0:55:39.291 +They have these: And they define like an energy +function on the network, which can be in restricted + +0:55:39.291 --> 0:55:44.372 +balsam machines efficiently calculated in general +and restricted needs. 
+ +0:55:44.372 --> 0:55:51.147 +You only have connection between the input +and the hidden layer, but you don't have connections + +0:55:51.147 --> 0:55:53.123 +in the input or within the. + +0:55:53.393 --> 0:56:00.194 +So you see here you don't have an input output, +you just have an input, and you calculate. + +0:56:00.460 --> 0:56:15.612 +Which of course nicely fits with the idea +we're having, so you can then use this for + +0:56:15.612 --> 0:56:19.177 +an N Gram language. + +0:56:19.259 --> 0:56:25.189 +Retaining the flexibility of the input by +this type of neon networks. + +0:56:26.406 --> 0:56:30.589 +And the advantage of this type of model was +there's. + +0:56:30.550 --> 0:56:37.520 +Very, very fast to integrate it, so that one +was the first one which was used during the + +0:56:37.520 --> 0:56:38.616 +coding model. + +0:56:38.938 --> 0:56:45.454 +The engram language models were that they +were very good and gave performance. + +0:56:45.454 --> 0:56:50.072 +However, calculation still with all these +tricks takes. + +0:56:50.230 --> 0:56:58.214 +We have talked about embest lists so they +generated an embest list of the most probable + +0:56:58.214 --> 0:57:05.836 +outputs and then they took this and best list +scored each entry with a new network. + +0:57:06.146 --> 0:57:09.306 +A language model, and then only change the +order again. + +0:57:09.306 --> 0:57:10.887 +Select based on that which. + +0:57:11.231 --> 0:57:17.187 +The neighboring list is maybe only like hundred +entries. + +0:57:17.187 --> 0:57:21.786 +When decoding you look at several thousand. + +0:57:26.186 --> 0:57:35.196 +Let's look at the context so we have now seen +your language models. + +0:57:35.196 --> 0:57:43.676 +There is the big advantage we can use this +word similarity and. + +0:57:44.084 --> 0:57:52.266 +Remember for engram language ones is not always +minus one words because sometimes you have + +0:57:52.266 --> 0:57:59.909 +to back off or interpolation to lower engrams +and you don't know the previous words. + +0:58:00.760 --> 0:58:04.742 +And however in neural models we always have +all of this importance. + +0:58:04.742 --> 0:58:05.504 +Can some of. + +0:58:07.147 --> 0:58:20.288 +The disadvantage is that you are still limited +in your context, and if you remember the sentence + +0:58:20.288 --> 0:58:22.998 +from last lecture,. + +0:58:22.882 --> 0:58:28.328 +Sometimes you need more context and there +is unlimited context that you might need and + +0:58:28.328 --> 0:58:34.086 +you can always create sentences where you may +need this five context in order to put a good + +0:58:34.086 --> 0:58:34.837 +estimation. + +0:58:35.315 --> 0:58:44.956 +Can also do it different in order to understand +that it makes sense to view language. + +0:58:45.445 --> 0:58:59.510 +So secret labeling tasks are a very common +type of task in language processing where you + +0:58:59.510 --> 0:59:03.461 +have the input sequence. + +0:59:03.323 --> 0:59:05.976 +So you have one output for each input. + +0:59:05.976 --> 0:59:12.371 +Machine translation is not a secret labeling +cast because the number of inputs and the number + +0:59:12.371 --> 0:59:14.072 +of outputs is different. + +0:59:14.072 --> 0:59:20.598 +So you put in a string German which has five +words and the output can be: See, for example, + +0:59:20.598 --> 0:59:24.078 +you always have the same number and the same +number of offices. 
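As the lecture turns to next, language modelling itself can be phrased as such a unidirectional sequence-labelling task: the "label" attached to each position is simply the word that follows, given everything seen so far. A tiny illustration (the sentence is made up):

# Framing next-word prediction as sequence labelling: for each position,
# the input is the history so far and the label is the next token.
tokens = ["<s>", "i", "go", "home", "</s>"]

training_examples = [(tokens[:i], tokens[i]) for i in range(1, len(tokens))]
for history, next_word in training_examples:
    print(f"history={history} -> label={next_word}")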
+ +0:59:24.944 --> 0:59:39.779 +And you can more language waddling as that, +and you just say the label for each word is + +0:59:39.779 --> 0:59:43.151 +always a next word. + +0:59:45.705 --> 0:59:50.312 +This is the more generous you can think of +it. + +0:59:50.312 --> 0:59:56.194 +For example, Paddle Speech Taking named Entity +Recognition. + +0:59:58.938 --> 1:00:08.476 +And if you look at now, this output token +and generally sequenced labeling can depend + +1:00:08.476 --> 1:00:26.322 +on: The input tokens are the same so we can +easily model it and they only depend on the + +1:00:26.322 --> 1:00:29.064 +input tokens. + +1:00:31.011 --> 1:00:42.306 +But we can always look at one specific type +of sequence labeling, unidirectional sequence + +1:00:42.306 --> 1:00:44.189 +labeling type. + +1:00:44.584 --> 1:01:00.855 +The probability of the next word only depends +on the previous words that we are having here. + +1:01:01.321 --> 1:01:05.998 +That's also not completely true in language. + +1:01:05.998 --> 1:01:14.418 +Well, the back context might also be helpful +by direction of the model's Google. + +1:01:14.654 --> 1:01:23.039 +We will always admire the probability of the +word given on its history. + +1:01:23.623 --> 1:01:30.562 +And currently there is approximation and sequence +labeling that we have this windowing approach. + +1:01:30.951 --> 1:01:43.016 +So in order to predict this type of word we +always look at the previous three words. + +1:01:43.016 --> 1:01:48.410 +This is this type of windowing model. + +1:01:49.389 --> 1:01:54.780 +If you're into neural networks you recognize +this type of structure. + +1:01:54.780 --> 1:01:57.515 +Also, the typical neural networks. + +1:01:58.938 --> 1:02:11.050 +Yes, yes, so like engram models you can, at +least in some way, prepare for that type of + +1:02:11.050 --> 1:02:12.289 +context. + +1:02:14.334 --> 1:02:23.321 +Are also other types of neonamic structures +which we can use for sequins lately and which + +1:02:23.321 --> 1:02:30.710 +might help us where we don't have this type +of fixed size representation. + +1:02:32.812 --> 1:02:34.678 +That we can do so. + +1:02:34.678 --> 1:02:39.391 +The idea is in recurrent new networks traction. + +1:02:39.391 --> 1:02:43.221 +We are saving complete history in one. + +1:02:43.623 --> 1:02:56.946 +So again we have to do this fixed size representation +because the neural networks always need a habit. + +1:02:57.157 --> 1:03:09.028 +And then the network should look like that, +so we start with an initial value for our storage. + +1:03:09.028 --> 1:03:15.900 +We are giving our first input and calculating +the new. + +1:03:16.196 --> 1:03:35.895 +So again in your network with two types of +inputs: Then you can apply it to the next type + +1:03:35.895 --> 1:03:41.581 +of input and you're again having this. + +1:03:41.581 --> 1:03:46.391 +You're taking this hidden state. + +1:03:47.367 --> 1:03:53.306 +Nice thing is now that you can do now step +by step by step, so all the way over. + +1:03:55.495 --> 1:04:06.131 +The nice thing we are having here now is that +now we are having context information from + +1:04:06.131 --> 1:04:07.206 +all the. + +1:04:07.607 --> 1:04:14.181 +So if you're looking like based on which words +do you, you calculate the probability of varying. + +1:04:14.554 --> 1:04:20.090 +It depends on this part. + +1:04:20.090 --> 1:04:33.154 +It depends on and this hidden state was influenced +by two. + +1:04:33.473 --> 1:04:38.259 +So now we're having something new. 
+ +1:04:38.259 --> 1:04:46.463 +We can model like the word probability not +only on a fixed. + +1:04:46.906 --> 1:04:53.565 +Because the hidden states we are having here +in our Oregon are influenced by all the trivia. + +1:04:56.296 --> 1:05:02.578 +So how is there to be Singapore? + +1:05:02.578 --> 1:05:16.286 +But then we have the initial idea about this +P of given on the history. + +1:05:16.736 --> 1:05:25.300 +So do not need to do any clustering here, +and you also see how things are put together + +1:05:25.300 --> 1:05:26.284 +in order. + +1:05:29.489 --> 1:05:43.449 +The green box this night since we are starting +from the left to the right. + +1:05:44.524 --> 1:05:51.483 +Voices: Yes, that's right, so there are clusters, +and here is also sometimes clustering happens. + +1:05:51.871 --> 1:05:58.687 +The small difference does matter again, so +if you have now a lot of different histories, + +1:05:58.687 --> 1:06:01.674 +the similarity which you have in here. + +1:06:01.674 --> 1:06:08.260 +If two of the histories are very similar, +these representations will be the same, and + +1:06:08.260 --> 1:06:10.787 +then you're treating them again. + +1:06:11.071 --> 1:06:15.789 +Because in order to do the final restriction +you only do a good base on the green box. + +1:06:16.156 --> 1:06:28.541 +So you are now still learning some type of +clustering in there, but you are learning it + +1:06:28.541 --> 1:06:30.230 +implicitly. + +1:06:30.570 --> 1:06:38.200 +The only restriction you're giving is you +have to stall everything that is important + +1:06:38.200 --> 1:06:39.008 +in this. + +1:06:39.359 --> 1:06:54.961 +So it's a different type of limitation, so +you calculate the probability based on the + +1:06:54.961 --> 1:06:57.138 +last words. + +1:06:57.437 --> 1:07:04.430 +And that is how you still need to somehow +cluster things together in order to do efficiently. + +1:07:04.430 --> 1:07:09.563 +Of course, you need to do some type of clustering +because otherwise. + +1:07:09.970 --> 1:07:18.865 +But this is where things get merged together +in this type of hidden representation. + +1:07:18.865 --> 1:07:27.973 +So here the probability of the word first +only depends on this hidden representation. + +1:07:28.288 --> 1:07:33.104 +On the previous words, but they are some other +bottleneck in order to make a good estimation. + +1:07:34.474 --> 1:07:41.231 +So the idea is that we can store all our history +into or into one lecture. + +1:07:41.581 --> 1:07:44.812 +Which is the one that makes it more strong. + +1:07:44.812 --> 1:07:51.275 +Next we come to problems that of course at +some point it might be difficult if you have + +1:07:51.275 --> 1:07:57.811 +very long sequences and you always write all +the information you have on this one block. + +1:07:58.398 --> 1:08:02.233 +Then maybe things get overwritten or you cannot +store everything in there. + +1:08:02.662 --> 1:08:04.514 +So,. + +1:08:04.184 --> 1:08:09.569 +Therefore, yet for short things like single +sentences that works well, but especially if + +1:08:09.569 --> 1:08:15.197 +you think of other tasks and like symbolizations +with our document based on T where you need + +1:08:15.197 --> 1:08:20.582 +to consider the full document, these things +got got a bit more more more complicated and + +1:08:20.582 --> 1:08:23.063 +will learn another type of architecture. + +1:08:24.464 --> 1:08:30.462 +In order to understand these neighbors, it +is good to have all the bus use always. 
+ +1:08:30.710 --> 1:08:33.998 +So this is the unrolled view. + +1:08:33.998 --> 1:08:43.753 +Somewhere you're over the type or in language +over the words you're unrolling a network. + +1:08:44.024 --> 1:08:52.096 +Here is the article and here is the network +which is connected by itself and that is recurrent. + +1:08:56.176 --> 1:09:04.982 +There is one challenge in this networks and +training. + +1:09:04.982 --> 1:09:11.994 +We can train them first of all as forward. + +1:09:12.272 --> 1:09:19.397 +So we don't really know how to train them, +but if you unroll them like this is a feet + +1:09:19.397 --> 1:09:20.142 +forward. + +1:09:20.540 --> 1:09:38.063 +Is exactly the same, so you can measure your +arrows here and be back to your arrows. + +1:09:38.378 --> 1:09:45.646 +If you unroll something, it's a feature in +your laptop and you can train it the same way. + +1:09:46.106 --> 1:09:57.606 +The only important thing is again, of course, +for different inputs. + +1:09:57.837 --> 1:10:05.145 +But since parameters are shared, it's somehow +a similar point you can train it. + +1:10:05.145 --> 1:10:08.800 +The training algorithm is very similar. + +1:10:10.310 --> 1:10:29.568 +One thing which makes things difficult is +what is referred to as the vanish ingredient. + +1:10:29.809 --> 1:10:32.799 +That's a very strong thing in the motivation +of using hardness. + +1:10:33.593 --> 1:10:44.604 +The influence here gets smaller and smaller, +and the modems are not really able to monitor. + +1:10:44.804 --> 1:10:51.939 +Because the gradient gets smaller and smaller, +and so the arrow here propagated to this one + +1:10:51.939 --> 1:10:58.919 +that contributes to the arrow is very small, +and therefore you don't do any changes there + +1:10:58.919 --> 1:10:59.617 +anymore. + +1:11:00.020 --> 1:11:06.703 +And yeah, that's why standard art men are +undifficult or have to pick them at custard. + +1:11:07.247 --> 1:11:11.462 +So everywhere talking to me about fire and +ants nowadays,. + +1:11:11.791 --> 1:11:23.333 +What we are typically meaning are LSDN's or +long short memories. + +1:11:23.333 --> 1:11:30.968 +You see they are by now quite old already. + +1:11:31.171 --> 1:11:39.019 +So there was a model in the language model +task. + +1:11:39.019 --> 1:11:44.784 +It's some more storing information. + +1:11:44.684 --> 1:11:51.556 +Because if you only look at the last words, +it's often no longer clear this is a question + +1:11:51.556 --> 1:11:52.548 +or a normal. + +1:11:53.013 --> 1:12:05.318 +So there you have these mechanisms with ripgate +in order to store things for a longer time + +1:12:05.318 --> 1:12:08.563 +into your hidden state. + +1:12:10.730 --> 1:12:20.162 +Here they are used in in in selling quite +a lot of works. + +1:12:21.541 --> 1:12:29.349 +For especially machine translation now, the +standard is to do transform base models which + +1:12:29.349 --> 1:12:30.477 +we'll learn. + +1:12:30.690 --> 1:12:38.962 +But for example, in architecture we have later +one lecture about efficiency. + +1:12:38.962 --> 1:12:42.830 +So how can we build very efficient? + +1:12:42.882 --> 1:12:53.074 +And there in the decoder in parts of the networks +they are still using. + +1:12:53.473 --> 1:12:57.518 +So it's not that yeah our hands are of no +importance in the body. 
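To make the recurrent idea concrete, here is a minimal recurrent language model in PyTorch: an embedding layer, an LSTM whose hidden state carries the whole history, and a vocabulary-sized output layer. All sizes and names are illustrative assumptions, not a reference implementation.

import torch
import torch.nn as nn

class RecurrentLM(nn.Module):
    # h_t summarises all previous words, so P(w_t | w_1..w_{t-1}) = softmax(W h_{t-1}).
    def __init__(self, vocab_size=10_000, emb_dim=128, hidden=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden, batch_first=True)
        self.out = nn.Linear(hidden, vocab_size)

    def forward(self, word_ids):                  # (batch, time)
        x = self.embed(word_ids)                   # (batch, time, emb_dim)
        h, _ = self.rnn(x)                         # hidden state at every time step
        return self.out(h)                         # logits for the next word at each step

model = RecurrentLM()
logits = model(torch.randint(0, 10_000, (2, 7)))   # two toy sequences of 7 tokens
print(logits.shape)                                # torch.Size([2, 7, 10000])

Unrolled over time this is just a deep feed-forward network with shared weights, which is why it can be trained with ordinary backpropagation as described above.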
+ +1:12:59.239 --> 1:13:08.956 +In order to make them strong, there are some +more things which are helpful and should be: + +1:13:09.309 --> 1:13:19.683 +So one thing is there is a nice trick to make +this new network stronger and better. + +1:13:19.739 --> 1:13:21.523 +So of course it doesn't work always. + +1:13:21.523 --> 1:13:23.451 +They have to have enough training data. + +1:13:23.763 --> 1:13:28.959 +But in general there's the easiest way of +making your models bigger and stronger just + +1:13:28.959 --> 1:13:30.590 +to increase your pyramids. + +1:13:30.630 --> 1:13:43.236 +And you've seen that with a large language +models they are always bragging about. + +1:13:43.903 --> 1:13:56.463 +This is one way, so the question is how do +you get more parameters? + +1:13:56.463 --> 1:14:01.265 +There's ways of doing it. + +1:14:01.521 --> 1:14:10.029 +And the other thing is to make your networks +deeper so to have more legs in between. + +1:14:11.471 --> 1:14:13.827 +And then you can also get to get more calm. + +1:14:14.614 --> 1:14:23.340 +There's more traveling with this and it's +very similar to what we just saw with our hand. + +1:14:23.603 --> 1:14:34.253 +We have this problem of radiant flow that +if it flows so fast like a radiant gets very + +1:14:34.253 --> 1:14:35.477 +swollen,. + +1:14:35.795 --> 1:14:42.704 +Exactly the same thing happens in deep LSD +ends. + +1:14:42.704 --> 1:14:52.293 +If you take here the gradient, tell you what +is the right or wrong. + +1:14:52.612 --> 1:14:56.439 +With three layers it's no problem, but if +you're going to ten, twenty or hundred layers. + +1:14:57.797 --> 1:14:59.698 +That's Getting Typically Young. + +1:15:00.060 --> 1:15:07.000 +Are doing is using what is called decisional +connections. + +1:15:07.000 --> 1:15:15.855 +That's a very helpful idea, which is maybe +very surprising that it works. + +1:15:15.956 --> 1:15:20.309 +And so the idea is that these networks. + +1:15:20.320 --> 1:15:29.982 +In between should no longer calculate what +is a new good representation, but they're more + +1:15:29.982 --> 1:15:31.378 +calculating. + +1:15:31.731 --> 1:15:37.588 +Therefore, in the end you're always the output +of a layer is added with the input. + +1:15:38.318 --> 1:15:48.824 +The knife is later if you are doing back propagation +with this very fast back propagation. + +1:15:49.209 --> 1:16:02.540 +Nowadays in very deep architectures, not only +on other but always has this residual or highway + +1:16:02.540 --> 1:16:04.224 +connection. + +1:16:04.704 --> 1:16:06.616 +Has two advantages. + +1:16:06.616 --> 1:16:15.409 +On the one hand, these layers don't need to +learn a representation, they only need to learn + +1:16:15.409 --> 1:16:18.754 +what to change the representation. + +1:16:22.082 --> 1:16:24.172 +Good. + +1:16:23.843 --> 1:16:31.768 +That much for the new map before, so the last +thing now means this. + +1:16:31.671 --> 1:16:33.750 +Language was are yeah. + +1:16:33.750 --> 1:16:41.976 +I were used in the molds itself and now were +seeing them again, but one thing which at the + +1:16:41.976 --> 1:16:53.558 +beginning they were reading was very essential +was: So people really train part of the language + +1:16:53.558 --> 1:16:59.999 +models only to get this type of embedding. + +1:16:59.999 --> 1:17:04.193 +Therefore, we want to look. + +1:17:09.229 --> 1:17:15.678 +So now some last words to the word embeddings. + +1:17:15.678 --> 1:17:27.204 +The interesting thing is that word embeddings +can be used for very different tasks. 
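A small sketch of that reuse: the embedding matrix that falls out of language-model training initialises the input layer of some other, smaller model. The shapes, the placeholder matrix and the freeze/fine-tune choice below are assumptions for illustration.

import torch
import torch.nn as nn

vocab_size, emb_dim = 10_000, 128

# Placeholder standing in for embeddings learned by a language model on raw text.
pretrained = torch.randn(vocab_size, emb_dim)

# A downstream model (e.g. a small text classifier) reuses those embeddings;
# freeze=True would keep them fixed, freeze=False fine-tunes them further.
embed = nn.Embedding.from_pretrained(pretrained, freeze=False)
classifier = nn.Sequential(nn.Linear(emb_dim, 64), nn.ReLU(), nn.Linear(64, 2))

tokens = torch.randint(0, vocab_size, (5,))      # a toy 5-word input
features = embed(tokens).mean(dim=0)             # average the word vectors
print(classifier(features))                      # scores for 2 classes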
+ +1:17:27.347 --> 1:17:31.329 +The knife wing is you can train that on just +large amounts of data. + +1:17:31.931 --> 1:17:41.569 +And then if you have these wooden beddings +we have seen that they reduce the parameters. + +1:17:41.982 --> 1:17:52.217 +So then you can train your small mark to do +any other task and therefore you are more efficient. + +1:17:52.532 --> 1:17:55.218 +These initial word embeddings is important. + +1:17:55.218 --> 1:18:00.529 +They really depend only on the word itself, +so if you look at the two meanings of can, + +1:18:00.529 --> 1:18:06.328 +the can of beans or I can do that, they will +have the same embedding, so some of the embedding + +1:18:06.328 --> 1:18:08.709 +has to save the ambiguity inside that. + +1:18:09.189 --> 1:18:12.486 +That cannot be resolved. + +1:18:12.486 --> 1:18:24.753 +Therefore, if you look at the higher levels +in the context, but in the word embedding layers + +1:18:24.753 --> 1:18:27.919 +that really depends on. + +1:18:29.489 --> 1:18:33.757 +However, even this one has quite very interesting. + +1:18:34.034 --> 1:18:39.558 +So that people like to visualize them. + +1:18:39.558 --> 1:18:47.208 +They're always difficult because if you look +at this. + +1:18:47.767 --> 1:18:52.879 +And drawing your five hundred damage, the +vector is still a bit challenging. + +1:18:53.113 --> 1:19:12.472 +So you cannot directly do that, so people +have to do it like they look at some type of. + +1:19:13.073 --> 1:19:17.209 +And of course then yes some information is +getting lost by a bunch of control. + +1:19:18.238 --> 1:19:24.802 +And you see, for example, this is the most +famous and common example, so what you can + +1:19:24.802 --> 1:19:31.289 +look is you can look at the difference between +the main and the female word English. + +1:19:31.289 --> 1:19:37.854 +This is here in your embedding of king, and +this is the embedding of queen, and this. + +1:19:38.058 --> 1:19:40.394 +You can do that for a very different work. + +1:19:40.780 --> 1:19:45.407 +And that is where the masks come into, that +is what people then look into. + +1:19:45.725 --> 1:19:50.995 +So what you can now, for example, do is you +can calculate the difference between man and + +1:19:50.995 --> 1:19:51.410 +woman? + +1:19:52.232 --> 1:19:55.511 +Then you can take the embedding of tea. + +1:19:55.511 --> 1:20:02.806 +You can add on it the difference between man +and woman, and then you can notice what are + +1:20:02.806 --> 1:20:04.364 +the similar words. + +1:20:04.364 --> 1:20:08.954 +So you won't, of course, directly hit the +correct word. + +1:20:08.954 --> 1:20:10.512 +It's a continuous. + +1:20:10.790 --> 1:20:23.127 +But you can look what are the nearest neighbors +to this same, and often these words are near + +1:20:23.127 --> 1:20:24.056 +there. + +1:20:24.224 --> 1:20:33.913 +So it somehow learns that the difference between +these words is always the same. + +1:20:34.374 --> 1:20:37.746 +You can do that for different things. + +1:20:37.746 --> 1:20:41.296 +He also imagines that it's not perfect. + +1:20:41.296 --> 1:20:49.017 +He says the world tends to be swimming and +swimming, and with walking and walking you. + +1:20:49.469 --> 1:20:51.639 +So you can try to use them. + +1:20:51.639 --> 1:20:59.001 +It's no longer like saying yeah, but the interesting +thing is this is completely unsupervised. + +1:20:59.001 --> 1:21:03.961 +So nobody taught him the principle of their +gender in language. 
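A toy version of that vector arithmetic, with made-up three-dimensional embeddings purely for illustration (real embeddings would come from a trained model and have hundreds of dimensions):

import numpy as np

# Made-up toy embeddings; the values are invented for this example only.
emb = {
    "king":  np.array([0.9, 0.8, 0.1]),
    "queen": np.array([0.9, 0.1, 0.8]),
    "man":   np.array([0.1, 0.9, 0.1]),
    "woman": np.array([0.1, 0.1, 0.9]),
    "cat":   np.array([0.5, 0.5, 0.5]),
}

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

# king - man + woman, then look for the nearest remaining word in the space
query = emb["king"] - emb["man"] + emb["woman"]
best = max((w for w in emb if w not in {"king", "man", "woman"}),
           key=lambda w: cosine(emb[w], query))
print(best)   # with these toy vectors: "queen"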
+ +1:21:04.284 --> 1:21:09.910 +So it's purely trained on the task of doing +the next work prediction. + +1:21:10.230 --> 1:21:20.658 +And even for really cementing information +like the capital, this is the difference between + +1:21:20.658 --> 1:21:23.638 +the city and the capital. + +1:21:23.823 --> 1:21:25.518 +Visualization. + +1:21:25.518 --> 1:21:33.766 +Here we have done the same things of the difference +between country and. + +1:21:33.853 --> 1:21:41.991 +You see it's not perfect, but it's building +some kinds of a right direction, so you can't + +1:21:41.991 --> 1:21:43.347 +even use them. + +1:21:43.347 --> 1:21:51.304 +For example, for question answering, if you +have the difference between them, you apply + +1:21:51.304 --> 1:21:53.383 +that to a new country. + +1:21:54.834 --> 1:22:02.741 +So it seems these ones are able to really +learn a lot of information and collapse all + +1:22:02.741 --> 1:22:04.396 +this information. + +1:22:05.325 --> 1:22:11.769 +At just to do the next word prediction: And +that also explains a bit maybe or not explains + +1:22:11.769 --> 1:22:19.016 +wrong life by motivating why what is the main +advantage of this type of neural models that + +1:22:19.016 --> 1:22:26.025 +we can use this type of hidden representation, +transfer them and use them in different. + +1:22:28.568 --> 1:22:43.707 +So summarize what we did today, so what you +should hopefully have with you is for machine + +1:22:43.707 --> 1:22:45.893 +translation. + +1:22:45.805 --> 1:22:49.149 +Then how we can do language modern Chinese +literature? + +1:22:49.449 --> 1:22:55.617 +We looked at three different architectures: +We looked into the feet forward language mode + +1:22:55.617 --> 1:22:59.063 +and the one based on Bluetooth machines. + +1:22:59.039 --> 1:23:05.366 +And finally there are different architectures +to do in your networks. + +1:23:05.366 --> 1:23:14.404 +We have seen feet for your networks and we'll +see the next lectures, the last type of architecture. + +1:23:15.915 --> 1:23:17.412 +Have Any Questions. + +1:23:20.680 --> 1:23:27.341 +Then thanks a lot, and next on Tuesday we +will be again in our order to know how to play. + +0:00:01.301 --> 0:00:05.687 +Okay, so we're welcome to today's lecture. + +0:00:06.066 --> 0:00:18.128 +A bit desperate in a small room and I'm sorry +for the inconvenience. + +0:00:18.128 --> 0:00:25.820 +Sometimes there are project meetings where. + +0:00:26.806 --> 0:00:40.863 +So what we want to talk today about is want +to start with neural approaches to machine + +0:00:40.863 --> 0:00:42.964 +translation. + +0:00:43.123 --> 0:00:55.779 +Guess I've heard about other types of neural +models for natural language processing. + +0:00:55.779 --> 0:00:59.948 +This was some of the first. + +0:01:00.600 --> 0:01:06.203 +They are similar to what you know they see +in as large language models. + +0:01:06.666 --> 0:01:14.810 +And we want today look into what are these +neural language models, how we can build them, + +0:01:14.810 --> 0:01:15.986 +what is the. + +0:01:16.316 --> 0:01:23.002 +And first we'll show how to use them in statistical +machine translation. + +0:01:23.002 --> 0:01:31.062 +If you remember weeks ago, we had this log-linear +model where you can integrate easily. + +0:01:31.351 --> 0:01:42.756 +And that was how they first were used, so +we just had another model that evaluates how + +0:01:42.756 --> 0:01:49.180 +good a system is or how good a lot of languages. 
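Written out, this integration just adds the neural language model as one more weighted feature in the log-linear model, next to the phrase table and the n-gram language model; the notation below is a standard way of writing it rather than a quote from the slides:

\hat{e} = \arg\max_{e} \sum_{i} \lambda_i \, h_i(e, f),
\qquad
h_{\text{NNLM}}(e) = \sum_{t} \log P_{\text{NN}}(e_t \mid e_{t-n+1}, \dots, e_{t-1})

with the feature weights \lambda_i tuned as before, for example with minimum error rate training.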
+ +0:01:50.690 --> 0:02:04.468 +And next week we will go for a neuromachine +translation where we replace the whole model + +0:02:04.468 --> 0:02:06.481 +by one huge. + +0:02:11.211 --> 0:02:20.668 +So just as a member from Tuesday we've seen, +the main challenge in language modeling was + +0:02:20.668 --> 0:02:25.131 +that most of the anthrax we haven't seen. + +0:02:26.946 --> 0:02:34.167 +So this was therefore difficult to estimate +any probability because we've seen that yet + +0:02:34.167 --> 0:02:39.501 +normally if you've seen had not seen the N +gram you will assign. + +0:02:39.980 --> 0:02:53.385 +However, this is not really very good because +we don't want to give zero probabilities to + +0:02:53.385 --> 0:02:55.023 +sentences. + +0:02:55.415 --> 0:03:10.397 +And then we learned a lot of techniques and +that is the main challenge in statistical language. + +0:03:10.397 --> 0:03:15.391 +How we can give somehow a good. + +0:03:15.435 --> 0:03:23.835 +And they developed very specific, very good +techniques to deal with that. + +0:03:23.835 --> 0:03:26.900 +However, this is the best. + +0:03:28.568 --> 0:03:33.907 +And therefore we can do things different. + +0:03:33.907 --> 0:03:44.331 +If we have not seen an N gram before in statistical +models, we have to have seen. + +0:03:45.225 --> 0:03:51.361 +Before, and we can only get information from +exactly the same word. + +0:03:51.411 --> 0:03:57.567 +We don't have an approximate matching like +that. + +0:03:57.567 --> 0:04:10.255 +Maybe it stood together in some way or similar, +and in a sentence we might generalize the knowledge. + +0:04:11.191 --> 0:04:21.227 +Would like to have more something like that +where engrams are represented more in a general + +0:04:21.227 --> 0:04:21.990 +space. + +0:04:22.262 --> 0:04:29.877 +So if you learn something about eyewalk then +maybe we can use this knowledge and also. + +0:04:30.290 --> 0:04:43.034 +And thereby no longer treat all or at least +a lot of the ingrams as we've done before. + +0:04:43.034 --> 0:04:45.231 +We can really. + +0:04:47.047 --> 0:04:56.157 +And we maybe want to even do that in a more +hierarchical approach, but we know okay some + +0:04:56.157 --> 0:05:05.268 +words are similar like go and walk is somehow +similar and and therefore like maybe if we + +0:05:05.268 --> 0:05:07.009 +then merge them. + +0:05:07.387 --> 0:05:16.104 +If we learn something about work, then it +should tell us also something about Hugo or + +0:05:16.104 --> 0:05:17.118 +he walks. + +0:05:17.197 --> 0:05:18.970 +We see already. + +0:05:18.970 --> 0:05:22.295 +It's, of course, not so easy. + +0:05:22.295 --> 0:05:31.828 +We see that there is some relations which +we need to integrate, for example, for you. + +0:05:31.828 --> 0:05:35.486 +We need to add the S, but maybe. + +0:05:37.137 --> 0:05:42.984 +And luckily there is one really yeah, convincing +methods in doing that. + +0:05:42.963 --> 0:05:47.239 +And that is by using an evil neck or. + +0:05:47.387 --> 0:05:57.618 +That's what we will introduce today so we +can use this type of neural networks to try + +0:05:57.618 --> 0:06:04.042 +to learn this similarity and to learn how some +words. + +0:06:04.324 --> 0:06:13.711 +And that is one of the main advantages that +we have by switching from the standard statistical + +0:06:13.711 --> 0:06:15.193 +models to the. + +0:06:15.115 --> 0:06:22.840 +To learn similarities between words and generalized +and learn what we call hidden representations. 
+ +0:06:22.840 --> 0:06:29.707 +So somehow representations of words where +we can measure similarity in some dimensions. + +0:06:30.290 --> 0:06:42.275 +So in representations where as a tubically +continuous vector or a vector of a fixed size. + +0:06:42.822 --> 0:06:52.002 +We had it before and we've seen that the only +thing we did is we don't want to do. + +0:06:52.192 --> 0:06:59.648 +But these indices don't have any meaning, +so it wasn't that word five is more similar + +0:06:59.648 --> 0:07:02.248 +to words twenty than to word. + +0:07:02.582 --> 0:07:09.059 +So we couldn't learn anything about words +in the statistical model. + +0:07:09.059 --> 0:07:12.107 +That's a big challenge because. + +0:07:12.192 --> 0:07:24.232 +If you think about words even in morphology, +so go and go is more similar because the person. + +0:07:24.264 --> 0:07:36.265 +While the basic models we have up to now, +they have no idea about that and goes as similar + +0:07:36.265 --> 0:07:37.188 +to go. + +0:07:39.919 --> 0:07:53.102 +So what we want to do today, in order to go +to this, we will have a short introduction. + +0:07:53.954 --> 0:08:06.667 +It very short just to see how we use them +here, but that's the good thing that are important + +0:08:06.667 --> 0:08:08.445 +for dealing. + +0:08:08.928 --> 0:08:14.083 +And then we'll first look into feet forward, +new network language models. + +0:08:14.454 --> 0:08:21.221 +And there we will still have this approximation +we had before, then we are looking only at + +0:08:21.221 --> 0:08:22.336 +fixed windows. + +0:08:22.336 --> 0:08:28.805 +So if you remember we have this classroom +of language models, and to determine what is + +0:08:28.805 --> 0:08:33.788 +the probability of a word, we only look at +the past and minus one. + +0:08:34.154 --> 0:08:36.878 +This is the theory of the case. + +0:08:36.878 --> 0:08:43.348 +However, we have the ability and that's why +they're really better in order. + +0:08:44.024 --> 0:08:51.953 +And then at the end we'll look at current +network language models where we then have + +0:08:51.953 --> 0:08:53.166 +a different. + +0:08:53.093 --> 0:09:01.922 +And thereby it is no longer the case that +we need to have a fixed history, but in theory + +0:09:01.922 --> 0:09:04.303 +we can model arbitrary. + +0:09:04.304 --> 0:09:06.854 +And we can log this phenomenon. + +0:09:06.854 --> 0:09:12.672 +We talked about a Tuesday where it's not clear +what type of information. + +0:09:16.396 --> 0:09:24.982 +So yeah, generally new networks are normally +learned to improve and perform some tasks. + +0:09:25.325 --> 0:09:38.934 +We have this structure and we are learning +them from samples so that is similar to what + +0:09:38.934 --> 0:09:42.336 +we had before so now. + +0:09:42.642 --> 0:09:49.361 +And is somehow originally motivated by the +human brain. + +0:09:49.361 --> 0:10:00.640 +However, when you now need to know artificial +neural networks, it's hard to get a similarity. + +0:10:00.540 --> 0:10:02.884 +There seems to be not that important. + +0:10:03.123 --> 0:10:11.013 +So what they are mainly doing is doing summoning +multiplication and then one linear activation. + +0:10:12.692 --> 0:10:16.078 +So so the basic units are these type of. + +0:10:17.937 --> 0:10:29.837 +Perceptron is a basic block which we have +and this does exactly the processing. + +0:10:29.837 --> 0:10:36.084 +We have a fixed number of input features. + +0:10:36.096 --> 0:10:39.668 +So we have here numbers six zero to x and +as input. 
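A perceptron of this kind, as spelled out in the next few sentences, multiplies its fixed number of inputs with one weight each, sums them up together with a bias, and applies an activation function; a minimal sketch with made-up numbers:

import math

def perceptron(x, w, b):
    # weighted sum of a FIXED number of inputs, then a differentiable activation
    z = sum(xi * wi for xi, wi in zip(x, w)) + b
    return 1.0 / (1.0 + math.exp(-z))          # sigmoid activation

x = [0.2, 0.7, 0.1]        # x_0 .. x_2: the fixed-size input vector
w = [0.5, -0.3, 0.8]       # one weight per input (made-up values)
print(perceptron(x, w, b=0.1))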
+ +0:10:40.060 --> 0:10:48.096 +And this makes language processing difficult +because we know that it's not the case. + +0:10:48.096 --> 0:10:53.107 +If we're dealing with language, it doesn't +have any. + +0:10:54.114 --> 0:10:57.609 +So we have to model this somehow and understand +how we model this. + +0:10:58.198 --> 0:11:03.681 +Then we have the weights, which are the parameters +and the number of weights exactly the same. + +0:11:04.164 --> 0:11:15.069 +Of input features sometimes you have the spires +in there that always and then it's not really. + +0:11:15.195 --> 0:11:19.656 +And what you then do is very simple. + +0:11:19.656 --> 0:11:26.166 +It's just like the weight it sounds, so you +multiply. + +0:11:26.606 --> 0:11:38.405 +What is then additionally important is we +have an activation function and it's important + +0:11:38.405 --> 0:11:42.514 +that this activation function. + +0:11:43.243 --> 0:11:54.088 +And later it will be important that this is +differentiable because otherwise all the training. + +0:11:54.714 --> 0:12:01.471 +This model by itself is not very powerful. + +0:12:01.471 --> 0:12:10.427 +We have the X Or problem and with this simple +you can't. + +0:12:10.710 --> 0:12:15.489 +However, there is a very easy and nice extension. + +0:12:15.489 --> 0:12:20.936 +The multi layer perception and things get +very powerful. + +0:12:21.081 --> 0:12:32.953 +The thing is you just connect a lot of these +in these layers of structures where we have + +0:12:32.953 --> 0:12:35.088 +the inputs and. + +0:12:35.395 --> 0:12:47.297 +And then we can combine them, or to do them: +The input layer is of course given by your + +0:12:47.297 --> 0:12:51.880 +problem with the dimension. + +0:12:51.880 --> 0:13:00.063 +The output layer is also given by your dimension. + +0:13:01.621 --> 0:13:08.802 +So let's start with the first question, now +more language related, and that is how we represent. + +0:13:09.149 --> 0:13:19.282 +So we have seen here input to x, but the question +is now okay. + +0:13:19.282 --> 0:13:23.464 +How can we put into this? + +0:13:26.866 --> 0:13:34.123 +The first thing that we're able to do is we're +going to set it in the inspector. + +0:13:34.314 --> 0:13:45.651 +Yeah, and that is not that easy because the +continuous vector will come to that. + +0:13:45.651 --> 0:13:47.051 +We can't. + +0:13:47.051 --> 0:13:50.410 +We don't want to do it. + +0:13:50.630 --> 0:13:57.237 +But if we need to input the word into the +needle network, it has to be something easily + +0:13:57.237 --> 0:13:57.912 +defined. + +0:13:59.079 --> 0:14:11.511 +One is the typical thing, the one-hour encoded +vector, so we have a vector where the dimension + +0:14:11.511 --> 0:14:15.306 +is the vocabulary, and then. + +0:14:16.316 --> 0:14:25.938 +So the first thing you are ready to see that +means we are always dealing with fixed. + +0:14:26.246 --> 0:14:34.961 +So you cannot easily extend your vocabulary, +but if you mean your vocabulary would increase + +0:14:34.961 --> 0:14:37.992 +the size of this input vector,. + +0:14:39.980 --> 0:14:42.423 +That's maybe also motivating. + +0:14:42.423 --> 0:14:45.355 +We'll talk about bike parade going. + +0:14:45.355 --> 0:14:47.228 +That's the nice thing. + +0:14:48.048 --> 0:15:01.803 +The big advantage of this one putt encoding +is that we don't implement similarity between + +0:15:01.803 --> 0:15:06.999 +words, but we're really learning. + +0:15:07.227 --> 0:15:11.219 +So you need like to represent any words. 
+ +0:15:11.219 --> 0:15:15.893 +You need a dimension of and dimensional vector. + +0:15:16.236 --> 0:15:26.480 +Imagine you could eat no binary encoding, +so you could represent words as binary vectors. + +0:15:26.806 --> 0:15:32.348 +So you will be significantly more efficient. + +0:15:32.348 --> 0:15:39.122 +However, you have some more digits than other +numbers. + +0:15:39.559 --> 0:15:46.482 +Would somehow be bad because you would force +the one to do this and it's by hand not clear + +0:15:46.482 --> 0:15:47.623 +how to define. + +0:15:48.108 --> 0:15:55.135 +So therefore currently this is the most successful +approach to just do this one patch. + +0:15:55.095 --> 0:15:59.344 +We take a fixed vocabulary. + +0:15:59.344 --> 0:16:10.269 +We map each word to the initial and then we +represent a word like this. + +0:16:10.269 --> 0:16:13.304 +The representation. + +0:16:14.514 --> 0:16:27.019 +But this dimension here is a secondary size, +and if you think ten thousand that's quite + +0:16:27.019 --> 0:16:33.555 +high, so we're always trying to be efficient. + +0:16:33.853 --> 0:16:42.515 +And we are doing the same type of efficiency +because then we are having a very small one + +0:16:42.515 --> 0:16:43.781 +compared to. + +0:16:44.104 --> 0:16:53.332 +It can be still a maybe or neurons, but this +is significantly smaller, of course, as before. + +0:16:53.713 --> 0:17:04.751 +So you are learning there this word as you +said, but you can learn it directly, and there + +0:17:04.751 --> 0:17:07.449 +we have similarities. + +0:17:07.807 --> 0:17:14.772 +But the nice thing is that this is then learned, +and we do not need to like hand define. + +0:17:17.117 --> 0:17:32.377 +So yes, so that is how we're typically adding +at least a single word into the language world. + +0:17:32.377 --> 0:17:43.337 +Then we can see: So we're seeing that you +have the one hard representation always of + +0:17:43.337 --> 0:17:44.857 +the same similarity. + +0:17:45.105 --> 0:18:00.803 +Then we're having this continuous vector which +is a lot smaller dimension and that's. + +0:18:01.121 --> 0:18:06.984 +What we are doing then is learning these representations +so that they are best for language modeling. + +0:18:07.487 --> 0:18:19.107 +So the representations are implicitly because +we're training on the language. + +0:18:19.479 --> 0:18:30.115 +And the nice thing was found out later is +these representations are really, really good + +0:18:30.115 --> 0:18:32.533 +for a lot of other. + +0:18:33.153 --> 0:18:39.729 +And that is why they are now called word embedded +space themselves, and used for other tasks. + +0:18:40.360 --> 0:18:49.827 +And they are somehow describing different +things so they can describe and semantic similarities. + +0:18:49.789 --> 0:18:58.281 +We are looking at the very example of today +that you can do in this vector space by adding + +0:18:58.281 --> 0:19:00.613 +some interesting things. + +0:19:00.940 --> 0:19:11.174 +And so they got really was a first big improvement +when switching to neural staff. + +0:19:11.491 --> 0:19:20.736 +They are like part of the model still with +more complex representation alert, but they + +0:19:20.736 --> 0:19:21.267 +are. + +0:19:23.683 --> 0:19:34.975 +Then we are having the output layer, and in +the output layer we also have output structure + +0:19:34.975 --> 0:19:36.960 +and activation. + +0:19:36.997 --> 0:19:44.784 +That is the language we want to predict, which +word should be the next. + +0:19:44.784 --> 0:19:46.514 +We always have. 
+ +0:19:47.247 --> 0:19:56.454 +And that can be done very well with the softball +softbacked layer, where again the dimension. + +0:19:56.376 --> 0:20:03.971 +Is the vocabulary, so this is a vocabulary +size, and again the case neuro represents the + +0:20:03.971 --> 0:20:09.775 +case class, so in our case we have again a +one-hour representation. + +0:20:10.090 --> 0:20:18.929 +Ours is a probability distribution and the +end is a probability distribution of all works. + +0:20:18.929 --> 0:20:28.044 +The case entry tells us: So we need to have +some of our probability distribution at our + +0:20:28.044 --> 0:20:36.215 +output, and in order to achieve that this activation +function goes, it needs to be that all the + +0:20:36.215 --> 0:20:36.981 +outputs. + +0:20:37.197 --> 0:20:47.993 +And we can achieve that with a softmax activation +we take each of the value and then. + +0:20:48.288 --> 0:20:58.020 +So by having this type of activation function +we are really getting that at the end we always. + +0:20:59.019 --> 0:21:12.340 +The beginning was very challenging because +again we have this inefficient representation + +0:21:12.340 --> 0:21:15.184 +of our vocabulary. + +0:21:15.235 --> 0:21:27.500 +And then you can imagine escalating over to +something over a thousand is maybe a bit inefficient + +0:21:27.500 --> 0:21:29.776 +with cheap users. + +0:21:36.316 --> 0:21:43.664 +And then yeah, for training the models, that +is how we refine, so we have this architecture + +0:21:43.664 --> 0:21:44.063 +now. + +0:21:44.264 --> 0:21:52.496 +We need to minimize the arrow by taking the +output. + +0:21:52.496 --> 0:21:58.196 +We are comparing it to our targets. + +0:21:58.298 --> 0:22:07.670 +So one important thing is, of course, how +can we measure the error? + +0:22:07.670 --> 0:22:12.770 +So what if we're training the ideas? + +0:22:13.033 --> 0:22:19.770 +And how well when measuring it is in natural +language processing, typically the cross entropy. + +0:22:19.960 --> 0:22:32.847 +That means we are comparing the target with +the output, so we're taking the value multiplying + +0:22:32.847 --> 0:22:35.452 +with the horizons. + +0:22:35.335 --> 0:22:43.454 +Which gets optimized and you're seeing that +this, of course, makes it again very nice and + +0:22:43.454 --> 0:22:49.859 +easy because our target, we said, is again +a one-hound representation. + +0:22:50.110 --> 0:23:00.111 +So except for one, all of these are always +zero, and what we are doing is taking the one. + +0:23:00.100 --> 0:23:05.970 +And we only need to multiply the one with +the logarism here, and that is all the feedback. + +0:23:06.946 --> 0:23:14.194 +Of course, this is not always influenced by +all the others. + +0:23:14.194 --> 0:23:17.938 +Why is this influenced by all? + +0:23:24.304 --> 0:23:33.554 +Think Mac the activation function, which is +the current activation divided by some of the + +0:23:33.554 --> 0:23:34.377 +others. + +0:23:34.354 --> 0:23:44.027 +Because otherwise it could of course easily +just increase this value and ignore the others, + +0:23:44.027 --> 0:23:49.074 +but if you increase one value or the other, +so. + +0:23:51.351 --> 0:24:04.433 +And then we can do with neon networks one +very nice and easy type of training that is + +0:24:04.433 --> 0:24:07.779 +done in all the neon. + +0:24:07.707 --> 0:24:12.664 +So in which direction does the arrow show? + +0:24:12.664 --> 0:24:23.152 +And then if we want to go to a smaller like +smaller arrow, that's what we want to achieve. 
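In code, one such step looks like the toy numpy sketch below (sizes and values are made up): compute the softmax output, compare it with the one-hot target via cross-entropy, and then move the weights a small step against the gradient of that error.

import numpy as np

rng = np.random.default_rng(0)
hidden = rng.normal(size=4)           # toy hidden-layer activation (4 units)
W = 0.1 * rng.normal(size=(3, 4))     # output weights for a 3-word toy vocabulary
target = np.array([0.0, 1.0, 0.0])    # one-hot target: the true next word is word 1

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def cross_entropy(W):
    # with a one-hot target, only the target entry contributes to the sum
    return -np.log(softmax(W @ hidden)[1])

print("loss before:", cross_entropy(W))
grad_W = np.outer(softmax(W @ hidden) - target, hidden)  # dLoss/dW for softmax + CE
W -= 0.1 * grad_W                                        # one gradient-descent step
print("loss after :", cross_entropy(W))                  # a bit smaller than before

The loss after the update is slightly smaller than before, which is exactly the "step in the direction of a smaller error" idea.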
+ +0:24:23.152 --> 0:24:27.302 +We're trying to minimize our arrow. + +0:24:27.287 --> 0:24:32.875 +And we have to do that, of course, for all +the weights, and to calculate the error of + +0:24:32.875 --> 0:24:36.709 +all the weights we want in the back of the +baggation here. + +0:24:36.709 --> 0:24:41.322 +But what you can do is you can propagate the +arrow which you measured. + +0:24:41.322 --> 0:24:43.792 +At the end you can propagate it back. + +0:24:43.792 --> 0:24:46.391 +That's basic mass and basic derivation. + +0:24:46.706 --> 0:24:59.557 +Then you can do each weight in your model +and measure how much it contributes to this + +0:24:59.557 --> 0:25:01.350 +individual. + +0:25:04.524 --> 0:25:17.712 +To summarize what your machine translation +should be, to understand all this problem is + +0:25:17.712 --> 0:25:20.710 +that this is how a. + +0:25:20.580 --> 0:25:23.056 +The notes are perfect thrones. + +0:25:23.056 --> 0:25:28.167 +They are fully connected between two layers +and no connections. + +0:25:28.108 --> 0:25:29.759 +Across layers. + +0:25:29.829 --> 0:25:35.152 +And what they're doing is always just to wait +for some here and then an activation function. + +0:25:35.415 --> 0:25:38.794 +And in order to train you have this sword +in backwards past. + +0:25:39.039 --> 0:25:41.384 +So we put in here. + +0:25:41.281 --> 0:25:46.540 +Our inputs have some random values at the +beginning. + +0:25:46.540 --> 0:25:49.219 +They calculate the output. + +0:25:49.219 --> 0:25:58.646 +We are measuring how big our error is, propagating +the arrow back, and then changing our model + +0:25:58.646 --> 0:25:59.638 +in a way. + +0:26:01.962 --> 0:26:14.267 +So before we're coming into the neural networks, +how can we use this type of neural network + +0:26:14.267 --> 0:26:17.611 +to do language modeling? + +0:26:23.103 --> 0:26:25.520 +So the question is now okay. + +0:26:25.520 --> 0:26:33.023 +How can we use them in natural language processing +and especially in machine translation? + +0:26:33.023 --> 0:26:38.441 +The first idea of using them was to estimate +the language model. + +0:26:38.999 --> 0:26:42.599 +So we have seen that the output can be monitored +here as well. + +0:26:43.603 --> 0:26:49.308 +Has a probability distribution, and if we +have a full vocabulary, we could mainly hear + +0:26:49.308 --> 0:26:55.209 +estimate how probable each next word is, and +then use that in our language model fashion, + +0:26:55.209 --> 0:27:02.225 +as we've done it last time, we've got the probability +of a full sentence as a product of all probabilities + +0:27:02.225 --> 0:27:03.208 +of individual. + +0:27:04.544 --> 0:27:06.695 +And UM. + +0:27:06.446 --> 0:27:09.776 +That was done and in ninety seven years. + +0:27:09.776 --> 0:27:17.410 +It's very easy to integrate it into this Locklear +model, so we have said that this is how the + +0:27:17.410 --> 0:27:24.638 +Locklear model looks like, so we're searching +the best translation, which minimizes each + +0:27:24.638 --> 0:27:25.126 +wage. + +0:27:25.125 --> 0:27:26.371 +The feature value. + +0:27:26.646 --> 0:27:31.642 +We have that with the minimum error training, +if you can remember when we search for the + +0:27:31.642 --> 0:27:32.148 +optimal. + +0:27:32.512 --> 0:27:40.927 +We have the phrasetable probabilities, the +language model, and we can just add here and + +0:27:40.927 --> 0:27:41.597 +there. + +0:27:41.861 --> 0:27:46.077 +So that is quite easy as said. 
+ +0:27:46.077 --> 0:27:54.101 +That was how statistical machine translation +was improved. + +0:27:54.101 --> 0:27:57.092 +Add one more feature. + +0:27:58.798 --> 0:28:11.220 +So how can we model the language mark for +Belty with your network? + +0:28:11.220 --> 0:28:22.994 +So what we have to do is: And the problem +in generally in the head is that most we haven't + +0:28:22.994 --> 0:28:25.042 +seen long sequences. + +0:28:25.085 --> 0:28:36.956 +Mostly we have to beg off to very short sequences +and we are working on this discrete space where. + +0:28:37.337 --> 0:28:48.199 +So the idea is if we have a meal network we +can map words into continuous representation + +0:28:48.199 --> 0:28:50.152 +and that helps. + +0:28:51.091 --> 0:28:59.598 +And the structure then looks like this, so +this is the basic still feed forward neural + +0:28:59.598 --> 0:29:00.478 +network. + +0:29:01.361 --> 0:29:10.744 +We are doing this at Proximation again, so +we are not putting in all previous words, but + +0:29:10.744 --> 0:29:11.376 +it's. + +0:29:11.691 --> 0:29:25.089 +And this is done because in your network we +can have only a fixed type of input, so we + +0:29:25.089 --> 0:29:31.538 +can: Can only do a fixed set, and they are +going to be doing exactly the same in minus + +0:29:31.538 --> 0:29:31.879 +one. + +0:29:33.593 --> 0:29:41.026 +And then we have, for example, three words +and three different words, which are in these + +0:29:41.026 --> 0:29:54.583 +positions: And then we're having the first +layer of the neural network, which learns words + +0:29:54.583 --> 0:29:56.247 +and words. + +0:29:57.437 --> 0:30:04.976 +There is one thing which is maybe special +compared to the standard neural memory. + +0:30:05.345 --> 0:30:13.163 +So the representation of this word we want +to learn first of all position independence, + +0:30:13.163 --> 0:30:19.027 +so we just want to learn what is the general +meaning of the word. + +0:30:19.299 --> 0:30:26.244 +Therefore, the representation you get here +should be the same as if you put it in there. + +0:30:27.247 --> 0:30:35.069 +The nice thing is you can achieve that in +networks the same way you achieve it. + +0:30:35.069 --> 0:30:41.719 +This way you're reusing ears so we are forcing +them to always stay. + +0:30:42.322 --> 0:30:49.689 +And that's why you then learn your word embedding, +which is contextual and independent, so. + +0:30:49.909 --> 0:31:05.561 +So the idea is you have the diagram go home +and you don't want to use the context. + +0:31:05.561 --> 0:31:07.635 +First you. + +0:31:08.348 --> 0:31:14.155 +That of course it might have a different meaning +depending on where it stands, but learn that. + +0:31:14.514 --> 0:31:19.623 +First, we're learning key representation of +the words, which is just the representation + +0:31:19.623 --> 0:31:20.378 +of the word. + +0:31:20.760 --> 0:31:37.428 +So it's also not like normally all input neurons +are connected to all neurons. + +0:31:37.857 --> 0:31:47.209 +This is the first layer of representation, +and then we have a lot denser representation, + +0:31:47.209 --> 0:31:56.666 +that is, our three word embeddings here, and +now we are learning this interaction between + +0:31:56.666 --> 0:31:57.402 +words. + +0:31:57.677 --> 0:32:08.265 +So now we have at least one connected, fully +connected layer here, which takes the three + +0:32:08.265 --> 0:32:14.213 +imbedded input and then learns the new embedding. 
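Putting these pieces together, a compact PyTorch sketch of such a feed-forward language model: one shared embedding matrix for the n-1 context positions, a hidden layer over the concatenated embeddings, and the vocabulary-sized output layer that is described next. All sizes are illustrative assumptions.

import torch
import torch.nn as nn

class FeedForwardLM(nn.Module):
    def __init__(self, vocab_size=10_000, context=3, emb_dim=100, hidden=100):
        super().__init__()
        # one shared embedding matrix: a word gets the same vector
        # no matter in which context position it appears
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(context * emb_dim, hidden)
        self.out = nn.Linear(hidden, vocab_size)    # softmax output layer (see below)

    def forward(self, context_ids):                 # (batch, context)
        e = self.embed(context_ids)                  # (batch, context, emb_dim)
        h = torch.tanh(self.hidden(e.flatten(1)))    # concatenate, then hidden layer
        return self.out(h)                           # logits for the next word

model = FeedForwardLM()
next_word_logits = model(torch.tensor([[12, 7, 431]]))  # three context word ids
print(next_word_logits.shape)                           # torch.Size([1, 10000])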
+ +0:32:15.535 --> 0:32:27.871 +And then if you had one of several layers +of lining which is your output layer, then. + +0:32:28.168 --> 0:32:46.222 +So here the size is a vocabulary size, and +then you put as target what is the probability + +0:32:46.222 --> 0:32:48.228 +for each. + +0:32:48.688 --> 0:32:56.778 +The nice thing is that you learn everything +together, so you're not learning what is a + +0:32:56.778 --> 0:32:58.731 +good representation. + +0:32:59.079 --> 0:33:12.019 +When you are training the whole network together, +it learns what representation for a word you + +0:33:12.019 --> 0:33:13.109 +get in. + +0:33:15.956 --> 0:33:19.176 +It's Yeah That Is the Main Idea. + +0:33:20.660 --> 0:33:32.695 +Nowadays often referred to as one way of self-supervised +learning, why self-supervisory learning? + +0:33:33.053 --> 0:33:37.120 +The output is the next word and the input +is the previous word. + +0:33:37.377 --> 0:33:46.778 +But somehow it's self-supervised because it's +not really that we created labels, but we artificially. + +0:33:46.806 --> 0:34:01.003 +We just have pure text, and then we created +the task. + +0:34:05.905 --> 0:34:12.413 +Say we have two sentences like go home again. + +0:34:12.413 --> 0:34:18.780 +Second one is go to creative again, so both. + +0:34:18.858 --> 0:34:31.765 +The starboard bygo and then we have to predict +the next four years and my question is: Be + +0:34:31.765 --> 0:34:40.734 +modeled this ability as one vector with like +probability or possible works. + +0:34:40.734 --> 0:34:42.740 +We have musical. + +0:34:44.044 --> 0:34:56.438 +You have multiple examples, so you would twice +train, once you predict, once you predict, + +0:34:56.438 --> 0:35:02.359 +and then, of course, the best performance. + +0:35:04.564 --> 0:35:11.772 +A very good point, so you're not aggregating +examples beforehand, but you're taking each + +0:35:11.772 --> 0:35:13.554 +example individually. + +0:35:19.259 --> 0:35:33.406 +So what you do is you simultaneously learn +the projection layer which represents this + +0:35:33.406 --> 0:35:39.163 +word and the N gram probabilities. + +0:35:39.499 --> 0:35:48.390 +And what people then later analyzed is that +these representations are very powerful. + +0:35:48.390 --> 0:35:56.340 +The task is just a very important task to +model like what is the next word. + +0:35:56.816 --> 0:36:09.429 +It's a bit motivated by people saying in order +to get the meaning of the word you have to + +0:36:09.429 --> 0:36:10.690 +look at. + +0:36:10.790 --> 0:36:18.467 +If you read the text in there, which you have +never seen, you can still estimate the meaning + +0:36:18.467 --> 0:36:22.264 +of this word because you know how it is used. + +0:36:22.602 --> 0:36:26.667 +Just imagine you read this text about some +city. + +0:36:26.667 --> 0:36:32.475 +Even if you've never seen the city before +heard, you often know from. + +0:36:34.094 --> 0:36:44.809 +So what is now the big advantage of using +neural networks? + +0:36:44.809 --> 0:36:57.570 +Just imagine we have to estimate this: So +you have to monitor the probability of ad hip + +0:36:57.570 --> 0:37:00.272 +and now imagine iPhone. + +0:37:00.600 --> 0:37:06.837 +So all the techniques we have at the last +time. + +0:37:06.837 --> 0:37:14.243 +At the end, if you haven't seen iPhone, you +will always. + +0:37:15.055 --> 0:37:19.502 +Because you haven't seen the previous words, +so you have no idea how to do that. 
+ +0:37:19.502 --> 0:37:24.388 +You won't have seen the diagram, the trigram +and all the others, so the probability here + +0:37:24.388 --> 0:37:27.682 +will just be based on the probability of ad, +so it uses no. + +0:37:28.588 --> 0:37:38.328 +If you're having this type of model, what +does it do so? + +0:37:38.328 --> 0:37:43.454 +This is the last three words. + +0:37:43.483 --> 0:37:49.837 +Maybe this representation is messed up because +it's mainly on a particular word or source + +0:37:49.837 --> 0:37:50.260 +that. + +0:37:50.730 --> 0:37:57.792 +Now anyway you have these two information +that were two words before was first and therefore: + +0:37:58.098 --> 0:38:07.214 +So you have a lot of information here to estimate +how good it is. + +0:38:07.214 --> 0:38:13.291 +Of course, there could be more information. + +0:38:13.593 --> 0:38:25.958 +So all this type of modeling we can do and +that we couldn't do beforehand because we always. + +0:38:27.027 --> 0:38:31.905 +Don't guess how we do it now. + +0:38:31.905 --> 0:38:41.824 +Typically you would have one talking for awkward +vocabulary. + +0:38:42.602 --> 0:38:45.855 +All you're doing by carrying coding when it +has a fixed dancing. + +0:38:46.226 --> 0:38:49.439 +Yeah, you have to do something like that that +the opposite way. + +0:38:50.050 --> 0:38:55.413 +So yeah, all the vocabulary are by thankcoding +where you don't have have all the vocabulary. + +0:38:55.735 --> 0:39:07.665 +But then, of course, the back pairing coating +is better with arbitrary context because a + +0:39:07.665 --> 0:39:11.285 +problem with back pairing. + +0:39:17.357 --> 0:39:20.052 +Anymore questions to the basic same little +things. + +0:39:23.783 --> 0:39:36.162 +This model we then want to continue is to +look into how complex that is or can make things + +0:39:36.162 --> 0:39:39.155 +maybe more efficient. + +0:39:40.580 --> 0:39:47.404 +At the beginning there was definitely a major +challenge. + +0:39:47.404 --> 0:39:50.516 +It's still not that easy. + +0:39:50.516 --> 0:39:58.297 +All guess follow the talk about their environmental +fingerprint. + +0:39:58.478 --> 0:40:05.686 +So this calculation is normally heavy, and +if you build systems yourself, you have to + +0:40:05.686 --> 0:40:06.189 +wait. + +0:40:06.466 --> 0:40:15.412 +So it's good to know a bit about how complex +things are in order to do a good or efficient. + +0:40:15.915 --> 0:40:24.706 +So one thing where most of the calculation +really happens is if you're. + +0:40:25.185 --> 0:40:34.649 +So in generally all these layers, of course, +we're talking about networks and the zones + +0:40:34.649 --> 0:40:35.402 +fancy. + +0:40:35.835 --> 0:40:48.305 +So what you have to do in order to calculate +here these activations, you have this weight. + +0:40:48.488 --> 0:41:05.021 +So to make it simple, let's see we have three +outputs, and then you just do a metric identification + +0:41:05.021 --> 0:41:08.493 +between your weight. + +0:41:08.969 --> 0:41:19.641 +That is why the use is so powerful for neural +networks because they are very good in doing + +0:41:19.641 --> 0:41:22.339 +metric multiplication. + +0:41:22.782 --> 0:41:28.017 +However, for some type of embedding layer +this is really very inefficient. + +0:41:28.208 --> 0:41:37.547 +So in this input we are doing this calculation. + +0:41:37.547 --> 0:41:47.081 +What we are mainly doing is selecting one +color. 
+ +0:41:47.387 --> 0:42:03.570 +So therefore you can do at least the forward +pass a lot more efficient if you don't really + +0:42:03.570 --> 0:42:07.304 +do this calculation. + +0:42:08.348 --> 0:42:20.032 +So the weight metrics of the first embedding +layer is just that in each color you have. + +0:42:20.580 --> 0:42:30.990 +So this is how your initial weights look like +and how you can interpret or understand. + +0:42:32.692 --> 0:42:42.042 +And this is already relatively important because +remember this is a huge dimensional thing, + +0:42:42.042 --> 0:42:51.392 +so typically here we have the number of words +ten thousand, so this is the word embeddings. + +0:42:51.451 --> 0:43:00.400 +Because it's the largest one there, we have +entries, while for the others we maybe have. + +0:43:00.660 --> 0:43:03.402 +So they are a little bit efficient and are +important to make this in. + +0:43:06.206 --> 0:43:10.529 +And then you can look at where else the calculations +are very difficult. + +0:43:10.830 --> 0:43:20.294 +So here we have our individual network, so +here are the word embeddings. + +0:43:20.294 --> 0:43:29.498 +Then we have one hidden layer, and then you +can look at how difficult. + +0:43:30.270 --> 0:43:38.742 +We could save a lot of calculations by calculating +that by just doing like do the selection because: + +0:43:40.600 --> 0:43:51.748 +And then the number of calculations you have +to do here is the length. + +0:43:52.993 --> 0:44:06.206 +Then we have here the hint size that is the +hint size, so the first step of calculation + +0:44:06.206 --> 0:44:10.260 +for this metric is an age. + +0:44:10.730 --> 0:44:22.030 +Then you have to do some activation function +which is this: This is the hidden size hymn + +0:44:22.030 --> 0:44:29.081 +because we need the vocabulary socks to calculate +the probability for each. + +0:44:29.889 --> 0:44:40.474 +And if you look at this number, so if you +have a projection sign of one hundred and a + +0:44:40.474 --> 0:44:45.027 +vocabulary sign of one hundred, you. + +0:44:45.425 --> 0:44:53.958 +And that's why there has been especially at +the beginning some ideas on how we can reduce + +0:44:53.958 --> 0:44:55.570 +the calculation. + +0:44:55.956 --> 0:45:02.352 +And if we really need to calculate all our +capabilities, or if we can calculate only some. + +0:45:02.582 --> 0:45:13.061 +And there again one important thing to think +about is for what you will use my language. + +0:45:13.061 --> 0:45:21.891 +One can use it for generations and that's +where we will see the next week. + +0:45:21.891 --> 0:45:22.480 +And. + +0:45:23.123 --> 0:45:32.164 +Initially, if it's just used as a feature, +we do not want to use it for generation, but + +0:45:32.164 --> 0:45:32.575 +we. + +0:45:32.953 --> 0:45:41.913 +And there we might not be interested in all +the probabilities, but we already know all + +0:45:41.913 --> 0:45:49.432 +the probability of this one word, and then +it might be very inefficient. + +0:45:51.231 --> 0:45:53.638 +And how can you do that so initially? + +0:45:53.638 --> 0:45:56.299 +For example, people look into shortlists. + +0:45:56.756 --> 0:46:03.321 +So the idea was this calculation at the end +is really very expensive. + +0:46:03.321 --> 0:46:05.759 +So can we make that more. + +0:46:05.945 --> 0:46:17.135 +And the idea was okay, and most birds occur +very rarely, and some beef birds occur very, + +0:46:17.135 --> 0:46:18.644 +very often. 
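[Editor's note] To make the cost argument above concrete, here is a rough multiplication count per forward step. The exact layer sizes in the transcript are garbled, so the numbers below are illustrative assumptions only:

```python
# Back-of-the-envelope multiplication counts for one forward step of the
# feed-forward language model (illustrative sizes, not the lecture's exact numbers).
context, emb_dim, hidden_dim, vocab_size = 3, 100, 100, 10000

embedding_lookup = 0                              # row selection only, no multiplications
hidden_layer = (context * emb_dim) * hidden_dim   # 30,000 multiply-adds
output_layer = hidden_dim * vocab_size            # 1,000,000 multiply-adds

print(hidden_layer, output_layer)   # the vocabulary-sized output layer dominates
```

Under these assumptions the output layer accounts for the vast majority of the work, which motivates the shortlist and structured-output-layer ideas discussed next.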
+ +0:46:19.019 --> 0:46:37.644 +And so they use the smaller imagery, which +is maybe very small, and then you merge a new. + +0:46:37.937 --> 0:46:45.174 +So you're taking if the word is in the shortness, +so in the most frequent words. + +0:46:45.825 --> 0:46:58.287 +You're taking the probability of this short +word by some normalization here, and otherwise + +0:46:58.287 --> 0:46:59.656 +you take. + +0:47:00.020 --> 0:47:00.836 +Course. + +0:47:00.836 --> 0:47:09.814 +It will not be as good, but then we don't +have to calculate all the capabilities at the + +0:47:09.814 --> 0:47:16.037 +end, but we only have to calculate it for the +most frequent. + +0:47:19.599 --> 0:47:39.477 +Machines about that, but of course we don't +model the probability of the infrequent words. + +0:47:39.299 --> 0:47:46.658 +And one idea is to do what is reported as +soles for the structure of the layer. + +0:47:46.606 --> 0:47:53.169 +You see how some years ago people were very +creative in giving names to newer models. + +0:47:53.813 --> 0:48:00.338 +And there the idea is that we model the out +group vocabulary as a clustered strip. + +0:48:00.680 --> 0:48:08.498 +So you don't need to mold all of your bodies +directly, but you are putting words into. + +0:48:08.969 --> 0:48:20.623 +A very intricate word is first in and then +in and then in and that is in sub-sub-clusters + +0:48:20.623 --> 0:48:21.270 +and. + +0:48:21.541 --> 0:48:29.936 +And this is what was mentioned in the past +of the work, so these are the subclasses that + +0:48:29.936 --> 0:48:30.973 +always go. + +0:48:30.973 --> 0:48:39.934 +So if it's in cluster one at the first position +then you only look at all the words which are: + +0:48:40.340 --> 0:48:50.069 +And then you can calculate the probability +of a word again just by the product over these, + +0:48:50.069 --> 0:48:55.522 +so the probability of the word is the first +class. + +0:48:57.617 --> 0:49:12.331 +It's maybe more clear where you have the sole +architecture, so what you will do is first + +0:49:12.331 --> 0:49:13.818 +predict. + +0:49:14.154 --> 0:49:26.435 +Then you go to the appropriate sub-class, +then you calculate the probability of the sub-class. + +0:49:27.687 --> 0:49:34.932 +Anybody have an idea why this is more, more +efficient, or if people do it first, it looks + +0:49:34.932 --> 0:49:35.415 +more. + +0:49:42.242 --> 0:49:56.913 +Yes, so you have to do less calculations, +or maybe here you have to calculate the element + +0:49:56.913 --> 0:49:59.522 +there, but you. + +0:49:59.980 --> 0:50:06.116 +The capabilities in the set classes that you're +going through and not for all of them. + +0:50:06.386 --> 0:50:16.688 +Therefore, it's only more efficient if you +don't need all awkward preferences because + +0:50:16.688 --> 0:50:21.240 +you have to even calculate the class. + +0:50:21.501 --> 0:50:30.040 +So it's only more efficient in scenarios where +you really need to use a language to evaluate. + +0:50:35.275 --> 0:50:54.856 +How this works is that on the output layer +you only have a vocabulary of: But on the input + +0:50:54.856 --> 0:51:05.126 +layer you have always your full vocabulary +because at the input we saw that this is not + +0:51:05.126 --> 0:51:06.643 +complicated. + +0:51:06.906 --> 0:51:19.778 +And then you can cluster down all your words, +embedding series of classes, and use that as + +0:51:19.778 --> 0:51:23.031 +your classes for that. + +0:51:23.031 --> 0:51:26.567 +So yeah, you have words. + +0:51:29.249 --> 0:51:32.593 +Is one idea of doing it. 
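[Editor's note] The class-factored ("structured") output layer sketched above can be written as a product of a class probability and a within-class word probability. The notation below is assumed and shows one level of clusters; the lecture also mentions sub-clusters, which simply add more factors:

```latex
% One-level class factorization of the output layer (notation assumed):
P(w \mid h) = P\big(c(w) \mid h\big) \cdot P\big(w \mid c(w), h\big)
% With |V| = 10000 words split into |C| = 100 classes of roughly 100 words each,
% scoring one known word needs two softmaxes of size ~100 instead of one of size 10000;
% as noted above, this only helps when you do not need the full distribution.
```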
+ +0:51:32.593 --> 0:51:44.898 +There is also a second idea of doing it again, +the idea that we don't need the probability. + +0:51:45.025 --> 0:51:53.401 +So sometimes it doesn't really need to be +a probability to evaluate. + +0:51:53.401 --> 0:52:05.492 +It's only important that: And: Here is called +self-normalization. + +0:52:05.492 --> 0:52:19.349 +What people have done so is in the softmax +is always to the input divided by normalization. + +0:52:19.759 --> 0:52:25.194 +So this is how we calculate the soft mix. + +0:52:25.825 --> 0:52:42.224 +And in self-normalization now, the idea is +that we don't need to calculate the logarithm. + +0:52:42.102 --> 0:52:54.284 +That would be zero, and then you don't even +have to calculate the normalization. + +0:52:54.514 --> 0:53:01.016 +So how can we achieve that? + +0:53:01.016 --> 0:53:08.680 +And then there's the nice thing. + +0:53:09.009 --> 0:53:14.743 +And our novel Lots and more to maximize probability. + +0:53:14.743 --> 0:53:23.831 +We have this cross entry lot that probability +is higher, and now we're just adding. + +0:53:24.084 --> 0:53:31.617 +And the second loss just tells us you're pleased +training the way the lock set is zero. + +0:53:32.352 --> 0:53:38.625 +So then if it's nearly zero at the end you +don't need to calculate this and it's also + +0:53:38.625 --> 0:53:39.792 +very efficient. + +0:53:40.540 --> 0:53:57.335 +One important thing is this is only an inference, +so during tests we don't need to calculate. + +0:54:00.480 --> 0:54:15.006 +You can do a bit of a hyperparameter here +where you do the waiting and how much effort + +0:54:15.006 --> 0:54:16.843 +should be. + +0:54:18.318 --> 0:54:35.037 +The only disadvantage is that it's no speed +up during training and there are other ways + +0:54:35.037 --> 0:54:37.887 +of doing that. + +0:54:41.801 --> 0:54:43.900 +I'm with you all. + +0:54:44.344 --> 0:54:48.540 +Then we are coming very, very briefly like +this one here. + +0:54:48.828 --> 0:54:53.692 +There are more things on different types of +languages. + +0:54:53.692 --> 0:54:58.026 +We are having a very short view of a restricted. + +0:54:58.298 --> 0:55:09.737 +And then we'll talk about recurrent neural +networks for our language minds because they + +0:55:09.737 --> 0:55:17.407 +have the advantage now that we can't even further +improve. + +0:55:18.238 --> 0:55:24.395 +There's also different types of neural networks. + +0:55:24.395 --> 0:55:30.175 +These ballroom machines are not having input. + +0:55:30.330 --> 0:55:39.271 +They have these binary units: And they define +an energy function on the network, which can + +0:55:39.271 --> 0:55:46.832 +be in respect of bottom machines efficiently +calculated, and restricted needs. + +0:55:46.832 --> 0:55:53.148 +You only have connections between the input +and the hidden layer. + +0:55:53.393 --> 0:56:00.190 +So you see here you don't have input and output, +you just have an input and you calculate what. + +0:56:00.460 --> 0:56:16.429 +Which of course nicely fits with the idea +we're having, so you can use this for N gram + +0:56:16.429 --> 0:56:19.182 +language ones. + +0:56:19.259 --> 0:56:25.187 +Decaying this credibility of the input by +this type of neural networks. + +0:56:26.406 --> 0:56:30.582 +And the advantage of this type of model of +board that is. + +0:56:30.550 --> 0:56:38.629 +Very fast to integrate it, so that one was +the first one which was used during decoding. 
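[Editor's note] Returning to the self-normalization idea described a few lines above: the softmax divides by a normalizer Z(h), and an extra loss term pushes log Z(h) toward zero so that the normalization can be skipped at inference time. One possible formulation, with assumed notation and a weighting hyperparameter alpha:

```latex
% Softmax with an explicit normalizer, plus the self-normalization loss term:
P(w \mid h) = \frac{\exp z_w(h)}{Z(h)}, \qquad Z(h) = \sum_{w'} \exp z_{w'}(h)
\mathcal{L} = -\log P(w^{*} \mid h) + \alpha \big(\log Z(h)\big)^{2}
% If training drives log Z(h) toward zero, inference can score a word with
% exp(z_w(h)) alone and skip the sum over the full vocabulary; as stated above,
% training itself is not sped up.
```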
+ +0:56:38.938 --> 0:56:50.103 +The problem of it is that the Enron language +models were very good at performing the calculation. + +0:56:50.230 --> 0:57:00.114 +So what people typically did is we talked +about a best list, so they generated a most + +0:57:00.114 --> 0:57:05.860 +probable output, and then they scored each +entry. + +0:57:06.146 --> 0:57:10.884 +A language model, and then only like change +the order against that based on that which. + +0:57:11.231 --> 0:57:20.731 +The knifing is maybe only hundred entries, +while during decoding you will look at several + +0:57:20.731 --> 0:57:21.787 +thousand. + +0:57:26.186 --> 0:57:40.437 +This but let's look at the context, so we +have now seen your language models. + +0:57:40.437 --> 0:57:43.726 +There is the big. + +0:57:44.084 --> 0:57:57.552 +Remember ingram language is not always words +because sometimes you have to back off or interpolation + +0:57:57.552 --> 0:57:59.953 +to lower ingrams. + +0:58:00.760 --> 0:58:05.504 +However, in neural models we always have all +of these inputs and some of these. + +0:58:07.147 --> 0:58:21.262 +The disadvantage is that you are still limited +in your context, and if you remember the sentence + +0:58:21.262 --> 0:58:23.008 +from last,. + +0:58:22.882 --> 0:58:28.445 +Sometimes you need more context and there's +unlimited contexts that you might need and + +0:58:28.445 --> 0:58:34.838 +you can always create sentences where you need +this file context in order to put a good estimation. + +0:58:35.315 --> 0:58:44.955 +Can we also do it different in order to better +understand that it makes sense to view? + +0:58:45.445 --> 0:58:57.621 +So sequence labeling tasks are a very common +type of towns in natural language processing + +0:58:57.621 --> 0:59:03.438 +where you have an input sequence and then. + +0:59:03.323 --> 0:59:08.663 +I've token so you have one output for each +input so machine translation is not a secret + +0:59:08.663 --> 0:59:14.063 +labeling cast because the number of inputs +and the number of outputs is different so you + +0:59:14.063 --> 0:59:19.099 +put in a string German which has five words +and the output can be six or seven or. + +0:59:19.619 --> 0:59:20.155 +Secrets. + +0:59:20.155 --> 0:59:24.083 +Lately you always have the same number of +and the same number of. + +0:59:24.944 --> 0:59:40.940 +And you can model language modeling as that, +and you just say a label for each word is always + +0:59:40.940 --> 0:59:43.153 +a next word. + +0:59:45.705 --> 0:59:54.823 +This is the more general you can think of +it, for example how to speech taking entity + +0:59:54.823 --> 0:59:56.202 +recognition. + +0:59:58.938 --> 1:00:08.081 +And if you look at now fruit cut token in +generally sequence, they can depend on import + +1:00:08.081 --> 1:00:08.912 +tokens. + +1:00:09.869 --> 1:00:11.260 +Nice thing. + +1:00:11.260 --> 1:00:21.918 +In our case, the output tokens are the same +so we can easily model it that they only depend + +1:00:21.918 --> 1:00:24.814 +on all the input tokens. + +1:00:24.814 --> 1:00:28.984 +So we have this whether it's or so. + +1:00:31.011 --> 1:00:42.945 +But we can always do a look at what specific +type of sequence labeling, unidirectional sequence + +1:00:42.945 --> 1:00:44.188 +labeling. + +1:00:44.584 --> 1:00:58.215 +And that's exactly how we want the language +of the next word only depends on all the previous + +1:00:58.215 --> 1:01:00.825 +words that we're. 
+ +1:01:01.321 --> 1:01:12.899 +Mean, of course, that's not completely true +in a language that the bad context might also + +1:01:12.899 --> 1:01:14.442 +be helpful. + +1:01:14.654 --> 1:01:22.468 +We will model always the probability of a +word given on its history, and therefore we + +1:01:22.468 --> 1:01:23.013 +need. + +1:01:23.623 --> 1:01:29.896 +And currently we did there this approximation +in sequence labeling that we have this windowing + +1:01:29.896 --> 1:01:30.556 +approach. + +1:01:30.951 --> 1:01:43.975 +So in order to predict this type of word we +always look at the previous three words and + +1:01:43.975 --> 1:01:48.416 +then to do this one we again. + +1:01:49.389 --> 1:01:55.137 +If you are into neural networks you recognize +this type of structure. + +1:01:55.137 --> 1:01:57.519 +Also are the typical neural. + +1:01:58.938 --> 1:02:09.688 +Yes, so this is like Engram, Louis Couperus, +and at least in some way compared to the original, + +1:02:09.688 --> 1:02:12.264 +you're always looking. + +1:02:14.334 --> 1:02:30.781 +However, there are also other types of neural +network structures which we can use for sequence. + +1:02:32.812 --> 1:02:34.678 +That we can do so. + +1:02:34.678 --> 1:02:39.686 +The idea is in recurrent neural network structure. + +1:02:39.686 --> 1:02:43.221 +We are saving the complete history. + +1:02:43.623 --> 1:02:55.118 +So again we have to do like this fix size +representation because neural networks always + +1:02:55.118 --> 1:02:56.947 +need to have. + +1:02:57.157 --> 1:03:05.258 +And then we start with an initial value for +our storage. + +1:03:05.258 --> 1:03:15.917 +We are giving our first input and then calculating +the new representation. + +1:03:16.196 --> 1:03:26.328 +If you look at this, it's just again your +network was two types of inputs: in your work, + +1:03:26.328 --> 1:03:29.743 +in your initial hidden state. + +1:03:30.210 --> 1:03:46.468 +Then you can apply it to the next type of +input and you're again having. + +1:03:47.367 --> 1:03:53.306 +Nice thing is now that you can do now step +by step by step, so all the way over. + +1:03:55.495 --> 1:04:05.245 +The nice thing that we are having here now +is that we are having context information from + +1:04:05.245 --> 1:04:07.195 +all the previous. + +1:04:07.607 --> 1:04:13.582 +So if you're looking like based on which words +do you use here, calculate your ability of + +1:04:13.582 --> 1:04:14.180 +varying. + +1:04:14.554 --> 1:04:20.128 +It depends on is based on this path. + +1:04:20.128 --> 1:04:33.083 +It depends on and this hidden state was influenced +by this one and this hidden state. + +1:04:33.473 --> 1:04:37.798 +So now we're having something new. + +1:04:37.798 --> 1:04:46.449 +We can really model the word probability not +only on a fixed context. + +1:04:46.906 --> 1:04:53.570 +Because the in-states we're having here in +our area are influenced by all the trivia. + +1:04:56.296 --> 1:05:00.909 +So how is that to mean? + +1:05:00.909 --> 1:05:16.288 +If you're not thinking about the history of +clustering, we said the clustering. + +1:05:16.736 --> 1:05:24.261 +So do not need to do any clustering here, +and we also see how things are put together + +1:05:24.261 --> 1:05:26.273 +in order to really do. + +1:05:29.489 --> 1:05:43.433 +In the green box this way since we are starting +from the left point to the right. + +1:05:44.524 --> 1:05:48.398 +And that's right, so they're clustered in +some parts. 
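[Editor's note] A minimal sketch of the recurrence just described: one fixed-size hidden state is updated word by word, so every state depends on the entire history. Bias terms and the output softmax over the vocabulary are omitted, and the names (W_in, W_rec) are assumptions, not the lecture's notation:

```python
import numpy as np

def rnn_states(word_embs, W_in, W_rec, h0):
    """Elman-style recurrence: the new hidden state is computed from the current
    word embedding and the previous hidden state, step by step through the sentence."""
    h, states = h0, []
    for x in word_embs:                         # one step per word
        h = np.tanh(W_in @ x + W_rec @ h)       # new state from current input + old state
        states.append(h)                        # each state summarizes all previous words
    return states
    # a softmax over the vocabulary on top of each state (omitted here) would give
    # P(next word | all previous words)

d_emb, d_hid, sent_len = 8, 16, 6
rng = np.random.default_rng(0)
states = rnn_states(
    word_embs=rng.standard_normal((sent_len, d_emb)),
    W_in=rng.standard_normal((d_hid, d_emb)),
    W_rec=rng.standard_normal((d_hid, d_hid)),
    h0=np.zeros(d_hid),
)
print(len(states), states[-1].shape)            # 6 hidden states of size 16
```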
+ +1:05:48.398 --> 1:05:58.196 +Here is some type of clustering happening: +It's continuous representations, but a smaller + +1:05:58.196 --> 1:06:02.636 +difference doesn't matter again. + +1:06:02.636 --> 1:06:10.845 +So if you have a lot of different histories, +the similarity. + +1:06:11.071 --> 1:06:15.791 +Because in order to do the final restriction +you only do it based on the green box. + +1:06:16.156 --> 1:06:24.284 +So you are now again still learning some type +of clasp. + +1:06:24.284 --> 1:06:30.235 +You don't have to do this hard decision. + +1:06:30.570 --> 1:06:39.013 +The only restriction you are giving is you +have to install everything that is important. + +1:06:39.359 --> 1:06:54.961 +So it's a different type of limitation, so +you calculate the probability based on the + +1:06:54.961 --> 1:06:57.138 +last words. + +1:06:57.437 --> 1:07:09.645 +That is how you still need some cluster things +in order to do it efficiently. + +1:07:09.970 --> 1:07:25.311 +But this is where things get merged together +in this type of hidden representation, which + +1:07:25.311 --> 1:07:28.038 +is then merged. + +1:07:28.288 --> 1:07:33.104 +On the previous words, but they are some other +bottleneck in order to make a good estimation. + +1:07:34.474 --> 1:07:41.242 +So the idea is that we can store all our history +into one lecture. + +1:07:41.581 --> 1:07:47.351 +Which is very good and makes it more strong. + +1:07:47.351 --> 1:07:51.711 +Next we come to problems of that. + +1:07:51.711 --> 1:07:57.865 +Of course, at some point it might be difficult. + +1:07:58.398 --> 1:08:02.230 +Then maybe things get all overwritten, or +you cannot store everything in there. + +1:08:02.662 --> 1:08:04.514 +So,. + +1:08:04.184 --> 1:08:10.252 +Therefore, yet for short things like signal +sentences that works well, but especially if + +1:08:10.252 --> 1:08:16.184 +you think of other tasks like harmonisation +where a document based on T where you need + +1:08:16.184 --> 1:08:22.457 +to consider a full document, these things got +a bit more complicated and we learned another + +1:08:22.457 --> 1:08:23.071 +type of. + +1:08:24.464 --> 1:08:30.455 +For the further in order to understand these +networks, it's good to have both views always. + +1:08:30.710 --> 1:08:39.426 +So this is the unroll view, so you have this +type of network. + +1:08:39.426 --> 1:08:48.532 +Therefore, it can be shown as: We have here +the output and here's your network which is + +1:08:48.532 --> 1:08:52.091 +connected by itself and that is a recurrent. + +1:08:56.176 --> 1:09:11.033 +There is one challenge in these networks and +that is the training so the nice thing is train + +1:09:11.033 --> 1:09:11.991 +them. + +1:09:12.272 --> 1:09:20.147 +So the idea is we don't really know how to +train them, but if you unroll them like this,. + +1:09:20.540 --> 1:09:38.054 +It's exactly the same so you can measure your +arrows and then you propagate your arrows. + +1:09:38.378 --> 1:09:45.647 +Now the nice thing is if you unroll something, +it's a feet forward and you can train it. + +1:09:46.106 --> 1:09:56.493 +The only important thing is, of course, for +different inputs you have to take that into + +1:09:56.493 --> 1:09:57.555 +account. + +1:09:57.837 --> 1:10:07.621 +But since parameters are shared, it's somehow +similar and you can train that the training + +1:10:07.621 --> 1:10:08.817 +algorithm. + +1:10:10.310 --> 1:10:16.113 +One thing which makes things difficult is +what is referred to as the vanishing gradient. 
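[Editor's note] The vanishing-gradient problem named in the last sentence (and discussed further below) can be illustrated with a toy scalar computation: when the network is unrolled, the error signal is multiplied by roughly the same recurrent factor at every time step, so a factor below one makes the contribution of distant words fade away. This is a deliberately simplified, assumed illustration, not the lecture's derivation:

```python
# Toy scalar illustration of the vanishing gradient in an unrolled RNN:
# the backpropagated signal is multiplied by (roughly) the same recurrent
# factor once per time step; < 1 vanishes, > 1 explodes.
factor = 0.8          # e.g. recurrent weight times the activation derivative
grad = 1.0
for t in range(1, 51):
    grad *= factor
    if t % 10 == 0:
        print(t, grad)   # roughly 0.11, 0.012, 0.0012, ... -> almost no signal from far away
```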
+ +1:10:16.113 --> 1:10:21.720 +So we are saying there is a big advantage +of these models and that's why we are using + +1:10:21.720 --> 1:10:22.111 +that. + +1:10:22.111 --> 1:10:27.980 +The output here does not only depend on the +current input of a last three but on anything + +1:10:27.980 --> 1:10:29.414 +that was said before. + +1:10:29.809 --> 1:10:32.803 +That's a very strong thing is the motivation +of using art. + +1:10:33.593 --> 1:10:44.599 +However, if you're using standard, the influence +here gets smaller and smaller, and the models. + +1:10:44.804 --> 1:10:55.945 +Because the gradients get smaller and smaller, +and so the arrow here propagated to this one, + +1:10:55.945 --> 1:10:59.659 +this contributes to the arrow. + +1:11:00.020 --> 1:11:06.710 +And yeah, that's why standard R&S are +difficult or have to become boosters. + +1:11:07.247 --> 1:11:11.481 +So if we are talking about our ends nowadays,. + +1:11:11.791 --> 1:11:19.532 +What we are typically meaning are long short +memories. + +1:11:19.532 --> 1:11:30.931 +You see there by now quite old already, but +they have special gating mechanisms. + +1:11:31.171 --> 1:11:41.911 +So in the language model tasks, for example +in some other story information, all this sentence + +1:11:41.911 --> 1:11:44.737 +started with a question. + +1:11:44.684 --> 1:11:51.886 +Because if you only look at the five last +five words, it's often no longer clear as a + +1:11:51.886 --> 1:11:52.556 +normal. + +1:11:53.013 --> 1:12:06.287 +So there you have these mechanisms with the +right gate in order to store things for a longer + +1:12:06.287 --> 1:12:08.571 +time into your. + +1:12:10.730 --> 1:12:20.147 +Here they are used in, in, in, in selling +quite a lot of works. + +1:12:21.541 --> 1:12:30.487 +For especially text machine translation now, +the standard is to do transformer base models. + +1:12:30.690 --> 1:12:42.857 +But for example, this type of in architecture +we have later one lecture about efficiency. + +1:12:42.882 --> 1:12:53.044 +And there in the decoder and partial networks +they are still using our edges because then. + +1:12:53.473 --> 1:12:57.542 +So it's not that our ends are of no importance. + +1:12:59.239 --> 1:13:08.956 +In order to make them strong, there are some +more things which are helpful and should be: + +1:13:09.309 --> 1:13:19.668 +So one thing is it's a very easy and nice trick +to make this neon network stronger and better. + +1:13:19.739 --> 1:13:21.619 +So, of course, it doesn't work always. + +1:13:21.619 --> 1:13:23.451 +They have to have enough training to. + +1:13:23.763 --> 1:13:29.583 +But in general that is the easiest way of +making your mouth bigger and stronger is to + +1:13:29.583 --> 1:13:30.598 +increase your. + +1:13:30.630 --> 1:13:43.244 +And you've seen that with a large size model +they are always braggling about. + +1:13:43.903 --> 1:13:53.657 +This is one way so the question is how do +you get more parameters? + +1:13:53.657 --> 1:14:05.951 +There's two ways you can make your representations: +And the other thing is its octave deep learning, + +1:14:05.951 --> 1:14:10.020 +so the other thing is to make your networks. + +1:14:11.471 --> 1:14:13.831 +And then you can also get more work off. + +1:14:14.614 --> 1:14:19.931 +There's one problem with this and with more +deeper networks. + +1:14:19.931 --> 1:14:23.330 +It's very similar to what we saw with. 
+ +1:14:23.603 --> 1:14:34.755 +With the we have this problem of radiant flow +that if it flows so fast like the radiant gets + +1:14:34.755 --> 1:14:35.475 +very. + +1:14:35.795 --> 1:14:41.114 +Exactly the same thing happens in deep. + +1:14:41.114 --> 1:14:52.285 +If you take the gradient and tell it's the +right or wrong, then you're propagating. + +1:14:52.612 --> 1:14:53.228 +Three layers. + +1:14:53.228 --> 1:14:56.440 +It's no problem, but if you're going to ten, +twenty or a hundred layers. + +1:14:57.797 --> 1:14:59.690 +That is getting typically a problem. + +1:15:00.060 --> 1:15:10.659 +People are doing and they are using what is +called visual connections. + +1:15:10.659 --> 1:15:15.885 +That's a very helpful idea, which. + +1:15:15.956 --> 1:15:20.309 +And so the idea is that these networks. + +1:15:20.320 --> 1:15:30.694 +In between should calculate really what is +a new representation, but they are calculating + +1:15:30.694 --> 1:15:31.386 +what. + +1:15:31.731 --> 1:15:37.585 +And therefore in the end you'll always the +output of a layer is added with the input. + +1:15:38.318 --> 1:15:48.824 +The nice thing is that later, if you are doing +back propagation with this very fast back,. + +1:15:49.209 --> 1:16:01.896 +So that is what you're seeing nowadays in +very deep architectures, not only as others, + +1:16:01.896 --> 1:16:04.229 +but you always. + +1:16:04.704 --> 1:16:07.388 +Has two advantages. + +1:16:07.388 --> 1:16:15.304 +On the one hand, it's more easy to learn a +representation. + +1:16:15.304 --> 1:16:18.792 +On the other hand, these. + +1:16:22.082 --> 1:16:24.114 +Goods. + +1:16:23.843 --> 1:16:31.763 +That much for the new record before, so the +last thing now means this. + +1:16:31.671 --> 1:16:36.400 +Language was used in the molds itself. + +1:16:36.400 --> 1:16:46.707 +Now we're seeing them again, but one thing +that at the beginning was very essential. + +1:16:46.967 --> 1:16:57.655 +So people really train part in the language +models only to get this type of embeddings + +1:16:57.655 --> 1:17:04.166 +and therefore we want to look a bit more into +these. + +1:17:09.229 --> 1:17:13.456 +Some laugh words to the word embeddings. + +1:17:13.456 --> 1:17:22.117 +The interesting thing is that word embeddings +can be used for very different tasks. + +1:17:22.117 --> 1:17:27.170 +The advantage is we can train the word embedded. + +1:17:27.347 --> 1:17:31.334 +The knife is you can train that on just large +amounts of data. + +1:17:31.931 --> 1:17:40.937 +And then if you have these wooden beddings +you don't have a layer of ten thousand any + +1:17:40.937 --> 1:17:41.566 +more. + +1:17:41.982 --> 1:17:52.231 +So then you can train a small market to do +any other tasks and therefore you're more. + +1:17:52.532 --> 1:17:58.761 +Initial word embeddings really depend only +on the word itself. + +1:17:58.761 --> 1:18:07.363 +If you look at the two meanings of can, the +can of beans, or can they do that, some of + +1:18:07.363 --> 1:18:08.747 +the embedded. + +1:18:09.189 --> 1:18:12.395 +That cannot be resolved. + +1:18:12.395 --> 1:18:23.939 +Therefore, you need to know the context, and +if you look at the higher levels that people + +1:18:23.939 --> 1:18:27.916 +are doing in the context, but. + +1:18:29.489 --> 1:18:33.757 +However, even this one has quite very interesting. + +1:18:34.034 --> 1:18:44.644 +So people like to visualize that they're always +a bit difficult because if you look at this + +1:18:44.644 --> 1:18:47.182 +word, vector or word. 
+ +1:18:47.767 --> 1:18:52.879 +And drawing your five hundred dimensional +vector is still a bit challenging. + +1:18:53.113 --> 1:19:12.464 +So you cannot directly do that, so what people +have to do is learn some type of dimension. + +1:19:13.073 --> 1:19:17.216 +And of course then yes some information gets +lost but you can try it. + +1:19:18.238 --> 1:19:28.122 +And you see, for example, this is the most +famous and common example, so what you can + +1:19:28.122 --> 1:19:37.892 +look is you can look at the difference between +the male and the female word English. + +1:19:38.058 --> 1:19:40.389 +And you can do that for a very different work. + +1:19:40.780 --> 1:19:45.403 +And that is where, where the masks come into +that, what people then look into. + +1:19:45.725 --> 1:19:50.995 +So what you can now, for example, do is you +can calculate the difference between man and + +1:19:50.995 --> 1:19:51.410 +woman. + +1:19:52.232 --> 1:19:56.356 +And what you can do then you can take the +embedding of peeing. + +1:19:56.356 --> 1:20:02.378 +You can add on it the difference between men +and women and where people get really excited. + +1:20:02.378 --> 1:20:05.586 +Then you can look at what are the similar +words. + +1:20:05.586 --> 1:20:09.252 +So you won't, of course, directly hit the +correct word. + +1:20:09.252 --> 1:20:10.495 +It's a continuous. + +1:20:10.790 --> 1:20:24.062 +But you can look at what are the nearest neighbors +to the same, and often these words are near. + +1:20:24.224 --> 1:20:33.911 +So it's somehow weird that the difference +between these works is always the same. + +1:20:34.374 --> 1:20:37.308 +Can do different things. + +1:20:37.308 --> 1:20:47.520 +You can also imagine that the work tends to +be assuming and swim, and with walking and + +1:20:47.520 --> 1:20:49.046 +walking you. + +1:20:49.469 --> 1:20:53.040 +So you can try to use him. + +1:20:53.040 --> 1:20:56.346 +It's no longer like say. + +1:20:56.346 --> 1:21:04.016 +The interesting thing is nobody taught him +the principle. + +1:21:04.284 --> 1:21:09.910 +So it's purely trained on the task of doing +the next work prediction. + +1:21:10.230 --> 1:21:23.669 +And even for some information like the capital, +this is the difference between the capital. + +1:21:23.823 --> 1:21:33.760 +Is another visualization here where you have +done the same things on the difference between. + +1:21:33.853 --> 1:21:41.342 +And you see it's not perfect, but it's building +in my directory, so you can even use that for + +1:21:41.342 --> 1:21:42.936 +pressure answering. + +1:21:42.936 --> 1:21:50.345 +If you have no three countries, the capital, +you can do what is the difference between them. + +1:21:50.345 --> 1:21:53.372 +You apply that to a new country, and. + +1:21:54.834 --> 1:22:02.280 +So these models are able to really learn a +lot of information and collapse this information + +1:22:02.280 --> 1:22:04.385 +into this representation. + +1:22:05.325 --> 1:22:07.679 +And just to do the next two are predictions. + +1:22:07.707 --> 1:22:22.358 +And that also explains a bit maybe or explains +strongly, but motivates what is the main advantage + +1:22:22.358 --> 1:22:26.095 +of this type of neurons. + +1:22:28.568 --> 1:22:46.104 +So to summarize what we did today, so what +you should hopefully have with you is: Then + +1:22:46.104 --> 1:22:49.148 +how we can do language modeling with new networks. 
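[Editor's note] The analogy arithmetic described above (take an embedding, add the man/woman difference, then look at the nearest neighbors) is easy to write down. Below is a toy NumPy sketch with hand-made 3-dimensional vectors purely for illustration; real embeddings are learned and have hundreds of dimensions:

```python
import numpy as np

# Toy "embeddings", hand-made so the analogy works; for illustration only.
emb = {
    "man":   np.array([ 1.0, 0.0, 0.2]),
    "woman": np.array([ 1.0, 1.0, 0.2]),
    "king":  np.array([ 0.2, 0.0, 1.0]),
    "queen": np.array([ 0.2, 1.0, 1.0]),
    "apple": np.array([-1.0, 0.3, -0.5]),
}

def nearest(vec, exclude=()):
    """Return the vocabulary word whose embedding has the highest cosine similarity."""
    cos = lambda a, b: a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
    return max((w for w in emb if w not in exclude), key=lambda w: cos(vec, emb[w]))

# "king" + ("woman" - "man") should land near "queen"
query = emb["king"] + (emb["woman"] - emb["man"])
print(nearest(query, exclude={"king", "man", "woman"}))   # queen
```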
+ +1:22:49.449 --> 1:22:55.445 +We looked at three different architectures: +We looked into the feet forward language one, + +1:22:55.445 --> 1:22:59.059 +the R&N, and the one based the balsamic. + +1:22:59.039 --> 1:23:04.559 +And finally, there are different architectures +to do in neural networks. + +1:23:04.559 --> 1:23:10.986 +We have seen feet for neural networks and +base neural networks, and we'll see in the + +1:23:10.986 --> 1:23:14.389 +next lectures the last type of architecture. + +1:23:15.915 --> 1:23:17.438 +Any questions. + +1:23:20.680 --> 1:23:27.360 +Then thanks a lot, and next I'm just there, +we'll be again on order to. + diff --git a/demo_data/lectures/Lecture-07-16.05.2023/video.mp4 b/demo_data/lectures/Lecture-07-16.05.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e46376921a8976c5b0f0a91e7b73fba3152a7c42 --- /dev/null +++ b/demo_data/lectures/Lecture-07-16.05.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418 +size 150440033 diff --git a/demo_data/lectures/Lecture-09-25.05.2023/English.vtt b/demo_data/lectures/Lecture-09-25.05.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..ca862156faebf5e2e913fdcddd8b732ef551e437 --- /dev/null +++ b/demo_data/lectures/Lecture-09-25.05.2023/English.vtt @@ -0,0 +1,3031 @@ +WEBVTT + +0:00:01.721 --> 0:00:05.064 +Hey, and then welcome to today's lecture. + +0:00:06.126 --> 0:00:13.861 +What we want to do today is we will finish +with what we have done last time, so we started + +0:00:13.861 --> 0:00:22.192 +looking at the new machine translation system, +but we have had all the components of the sequence + +0:00:22.192 --> 0:00:22.787 +model. + +0:00:22.722 --> 0:00:29.361 +We're still missing is the transformer based +architecture so that maybe the self attention. + +0:00:29.849 --> 0:00:31.958 +Then we want to look at the beginning today. + +0:00:32.572 --> 0:00:39.315 +And then the main part of the day's lecture +will be decoding. + +0:00:39.315 --> 0:00:43.992 +That means we know how to train the model. + +0:00:44.624 --> 0:00:47.507 +So decoding sewage all they can be. + +0:00:47.667 --> 0:00:53.359 +Be useful that and the idea is how we find +that and what challenges are there. + +0:00:53.359 --> 0:00:59.051 +Since it's unregressive, we will see that +it's not as easy as for other tasks. + +0:00:59.359 --> 0:01:08.206 +While generating the translation step by step, +we might make additional arrows that lead. + +0:01:09.069 --> 0:01:16.464 +But let's start with a self attention, so +what we looked at into was an base model. + +0:01:16.816 --> 0:01:27.931 +And then in our based models you always take +the last new state, you take your input, you + +0:01:27.931 --> 0:01:31.513 +generate a new hidden state. + +0:01:31.513 --> 0:01:35.218 +This is more like a standard. + +0:01:35.675 --> 0:01:41.088 +And one challenge in this is that we always +store all our history in one signal hidden + +0:01:41.088 --> 0:01:41.523 +stick. + +0:01:41.781 --> 0:01:50.235 +We saw that this is a problem when going from +encoder to decoder, and that is why we then + +0:01:50.235 --> 0:01:58.031 +introduced the attention mechanism so that +we can look back and see all the parts. 
+ +0:01:59.579 --> 0:02:06.059 +However, in the decoder we still have this +issue so we are still storing all information + +0:02:06.059 --> 0:02:12.394 +in one hidden state and we might do things +like here that we start to overwrite things + +0:02:12.394 --> 0:02:13.486 +and we forgot. + +0:02:14.254 --> 0:02:23.575 +So the idea is, can we do something similar +which we do between encoder and decoder within + +0:02:23.575 --> 0:02:24.907 +the decoder? + +0:02:26.526 --> 0:02:33.732 +And the idea is each time we're generating +here in New York State, it will not only depend + +0:02:33.732 --> 0:02:40.780 +on the previous one, but we will focus on the +whole sequence and look at different parts + +0:02:40.780 --> 0:02:46.165 +as we did in attention in order to generate +our new representation. + +0:02:46.206 --> 0:02:53.903 +So each time we generate a new representation +we will look into what is important now to + +0:02:53.903 --> 0:02:54.941 +understand. + +0:02:55.135 --> 0:03:00.558 +You may want to understand what much is important. + +0:03:00.558 --> 0:03:08.534 +You might want to look to vary and to like +so that it's much about liking. + +0:03:08.808 --> 0:03:24.076 +So the idea is that we are not staring everything +in each time we are looking at the full sequence. + +0:03:25.125 --> 0:03:35.160 +And that is achieved by no longer going really +secret, and the hidden states here aren't dependent + +0:03:35.160 --> 0:03:37.086 +on the same layer. + +0:03:37.086 --> 0:03:42.864 +But instead we are always looking at the previous +layer. + +0:03:42.942 --> 0:03:45.510 +We will always have more information that +we are coming. + +0:03:47.147 --> 0:03:51.572 +So how does this censor work in detail? + +0:03:51.572 --> 0:03:56.107 +So we started with our initial mistakes. + +0:03:56.107 --> 0:04:08.338 +So, for example: Now where we had the three +terms already, the query, the key and the value, + +0:04:08.338 --> 0:04:12.597 +it was motivated by our database. + +0:04:12.772 --> 0:04:20.746 +We are comparing it to the keys to all the +other values, and then we are merging the values. + +0:04:21.321 --> 0:04:35.735 +There was a difference between the decoder +and the encoder. + +0:04:35.775 --> 0:04:41.981 +You can assume all the same because we are +curving ourselves. + +0:04:41.981 --> 0:04:49.489 +However, we can make them different but just +learning a linear projection. + +0:04:49.529 --> 0:05:01.836 +So you learn here some projection based on +what need to do in order to ask which question. + +0:05:02.062 --> 0:05:11.800 +That is, the query and the key is to what +do want to compare and provide others, and + +0:05:11.800 --> 0:05:13.748 +which values do. + +0:05:14.014 --> 0:05:23.017 +This is not like hand defined, but learn, +so it's like three linear projections that + +0:05:23.017 --> 0:05:26.618 +you apply on all of these hidden. + +0:05:26.618 --> 0:05:32.338 +That is the first thing based on your initial +hidden. + +0:05:32.612 --> 0:05:37.249 +And now you can do exactly as before, you +can do the attention. + +0:05:37.637 --> 0:05:40.023 +How did the attention work? + +0:05:40.023 --> 0:05:45.390 +The first thing is we are comparing our query +to all the keys. + +0:05:45.445 --> 0:05:52.713 +And that is now the difference before the +quarry was from the decoder, the keys were + +0:05:52.713 --> 0:05:54.253 +from the encoder. + +0:05:54.253 --> 0:06:02.547 +Now it's like all from the same, so we started +the first in state to the keys of all the others. 
+ +0:06:02.582 --> 0:06:06.217 +We're learning some value here. + +0:06:06.217 --> 0:06:12.806 +How important are these information to better +understand? + +0:06:13.974 --> 0:06:19.103 +And these are just like floating point numbers. + +0:06:19.103 --> 0:06:21.668 +They are normalized so. + +0:06:22.762 --> 0:06:30.160 +And that is the first step, so let's go first +for the first curve. + +0:06:30.470 --> 0:06:41.937 +What we can then do is multiply each value +as we have done before with the importance + +0:06:41.937 --> 0:06:43.937 +of each state. + +0:06:45.145 --> 0:06:47.686 +And then we have in here the new hit step. + +0:06:48.308 --> 0:06:57.862 +See now this new hidden status is depending +on all the hidden state of all the sequences + +0:06:57.862 --> 0:06:59.686 +of the previous. + +0:06:59.879 --> 0:07:01.739 +One important thing. + +0:07:01.739 --> 0:07:08.737 +This one doesn't really depend, so the hidden +states here don't depend on the. + +0:07:09.029 --> 0:07:15.000 +So it only depends on the hidden state of +the previous layer, but it depends on all the + +0:07:15.000 --> 0:07:18.664 +hidden states, and that is of course a big +advantage. + +0:07:18.664 --> 0:07:25.111 +So on the one hand information can directly +flow from each hidden state before the information + +0:07:25.111 --> 0:07:27.214 +flow was always a bit limited. + +0:07:28.828 --> 0:07:35.100 +And the independence is important so we can +calculate all these in the states in parallel. + +0:07:35.100 --> 0:07:41.371 +That's another big advantage of self attention +that we can calculate all the hidden states + +0:07:41.371 --> 0:07:46.815 +in one layer in parallel and therefore it's +the ad designed for GPUs and fast. + +0:07:47.587 --> 0:07:50.235 +Then we can do the same thing for the second +in the state. + +0:07:50.530 --> 0:08:06.866 +And the only difference here is how we calculate +what is occurring. + +0:08:07.227 --> 0:08:15.733 +Getting these values is different because +we use the different query and then getting + +0:08:15.733 --> 0:08:17.316 +our new hidden. + +0:08:18.258 --> 0:08:26.036 +Yes, this is the word of words that underneath +this case might, but this is simple. + +0:08:26.036 --> 0:08:26.498 +Not. + +0:08:27.127 --> 0:08:33.359 +That's a very good question that is like on +the initial thing. + +0:08:33.359 --> 0:08:38.503 +That is exactly not one of you in the architecture. + +0:08:38.503 --> 0:08:44.042 +Maybe first you would think of a very big +disadvantage. + +0:08:44.384 --> 0:08:49.804 +So this hidden state would be the same if +the movie would be different. + +0:08:50.650 --> 0:08:59.983 +And of course this estate is a site someone +should like, so if the estate would be here + +0:08:59.983 --> 0:09:06.452 +except for this correspondence the word order +is completely. + +0:09:06.706 --> 0:09:17.133 +Therefore, just doing self attention wouldn't +work at all because we know word order is important + +0:09:17.133 --> 0:09:21.707 +and there is a complete different meaning. + +0:09:22.262 --> 0:09:26.277 +We introduce the word position again. + +0:09:26.277 --> 0:09:33.038 +The main idea is if the position is already +in your embeddings. + +0:09:33.533 --> 0:09:39.296 +Then of course the position is there and you +don't lose it anymore. + +0:09:39.296 --> 0:09:46.922 +So mainly if your life representation here +encodes at the second position and your output + +0:09:46.922 --> 0:09:48.533 +will be different. 
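[Editor's note] Putting the last two passages together, a single attention head can be sketched in a few lines of NumPy: learned projections give queries, keys and values, every query is compared with every key, the scores are normalized, and the values are summed with those weights. The 1/sqrt(d_k) scaling is the usual Transformer convention and is assumed here rather than taken from the lecture; multi-head attention, discussed next, just runs several such heads with different projections and concatenates the results:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def self_attention(X, Wq, Wk, Wv):
    """Single-head self-attention over one sequence.
    X: (seq_len, d_model); Wq, Wk, Wv: (d_model, d_k) learned projections."""
    Q, K, V = X @ Wq, X @ Wk, X @ Wv
    scores = Q @ K.T / np.sqrt(K.shape[-1])   # compare every query with every key
    weights = softmax(scores, axis=-1)        # normalized importance of each position
    return weights @ V                        # weighted sum of the values -> new hidden states

seq_len, d_model, d_k = 5, 16, 8
rng = np.random.default_rng(0)
X = rng.standard_normal((seq_len, d_model))
Wq, Wk, Wv = (rng.standard_normal((d_model, d_k)) for _ in range(3))
out = self_attention(X, Wq, Wk, Wv)
print(out.shape)   # (5, 8): one new representation per position, all computable in parallel
```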
+ +0:09:49.049 --> 0:09:54.585 +And that's how you encode it, but that's essential +in order to get this work. + +0:09:57.137 --> 0:10:08.752 +But before we are coming to the next slide, +one other thing that is typically done is multi-head + +0:10:08.752 --> 0:10:10.069 +attention. + +0:10:10.430 --> 0:10:15.662 +And it might be that in order to understand +much, it might be good that in some way we + +0:10:15.662 --> 0:10:19.872 +focus on life, and in some way we can focus +on vary, but not equally. + +0:10:19.872 --> 0:10:25.345 +But maybe it's like to understand again on +different dimensions we should look into these. + +0:10:25.905 --> 0:10:31.393 +And therefore what we're doing is we're just +doing the self attention at once, but we're + +0:10:31.393 --> 0:10:35.031 +doing it end times or based on your multi head +attentions. + +0:10:35.031 --> 0:10:41.299 +So in typical examples, the number of heads +people are talking about is like: So you're + +0:10:41.299 --> 0:10:50.638 +doing this process and have different queries +and keys so you can focus. + +0:10:50.790 --> 0:10:52.887 +How can you generate eight different? + +0:10:53.593 --> 0:11:07.595 +Things it's quite easy here, so instead of +having one linear projection you can have age + +0:11:07.595 --> 0:11:09.326 +different. + +0:11:09.569 --> 0:11:13.844 +And it might be that sometimes you're looking +more into one thing, and sometimes you're Looking + +0:11:13.844 --> 0:11:14.779 +more into the other. + +0:11:15.055 --> 0:11:24.751 +So that's of course nice with this type of +learned approach because we can automatically + +0:11:24.751 --> 0:11:25.514 +learn. + +0:11:29.529 --> 0:11:36.629 +And what you correctly said is its positional +independence, so it doesn't really matter the + +0:11:36.629 --> 0:11:39.176 +order which should be important. + +0:11:39.379 --> 0:11:47.686 +So how can we do that and the idea is we are +just encoding it directly into the embedding + +0:11:47.686 --> 0:11:52.024 +so into the starting so that a representation. + +0:11:52.512 --> 0:11:55.873 +How do we get that so we started with our +embeddings? + +0:11:55.873 --> 0:11:58.300 +Just imagine this is embedding of eye. + +0:11:59.259 --> 0:12:06.169 +And then we are having additionally this positional +encoding. + +0:12:06.169 --> 0:12:10.181 +In this position, encoding is just. + +0:12:10.670 --> 0:12:19.564 +With different wavelength, so with different +lengths of your signal as you see here. + +0:12:20.160 --> 0:12:37.531 +And the number of functions you have is exactly +the number of dimensions you have in your embedded. + +0:12:38.118 --> 0:12:51.091 +And what will then do is take the first one, +and based on your position you multiply your + +0:12:51.091 --> 0:12:51.955 +word. + +0:12:52.212 --> 0:13:02.518 +And you see now if you put it in this position, +of course it will get a different value. + +0:13:03.003 --> 0:13:12.347 +And thereby in each position a different function +is multiplied. + +0:13:12.347 --> 0:13:19.823 +This is a representation for at the first +position. + +0:13:20.020 --> 0:13:34.922 +If you have it in the input already encoded +then of course the model is able to keep the + +0:13:34.922 --> 0:13:38.605 +position information. + +0:13:38.758 --> 0:13:48.045 +But your embeddings can also learn your embeddings +in a way that they are optimal collaborating + +0:13:48.045 --> 0:13:49.786 +with these types. + +0:13:51.451 --> 0:13:59.351 +Is that somehow clear where he is there? 
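[Editor's note] For reference, the standard sinusoidal position encoding of the original Transformer matches the description above: sine and cosine functions with different wavelengths, one pair per embedding dimension, combined with the word embedding (in the standard formulation they are added). A small NumPy sketch, assuming an even model dimension:

```python
import numpy as np

def positional_encoding(max_len, d_model):
    """Sinusoidal position encodings: each dimension uses a sine or cosine with a
    different wavelength, so every position gets a distinct pattern that is added
    to the word embedding and keeps the position information in the input."""
    pos = np.arange(max_len)[:, None]                  # (max_len, 1)
    i = np.arange(d_model // 2)[None, :]               # (1, d_model/2)
    angles = pos / np.power(10000.0, 2 * i / d_model)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)                       # even dimensions
    pe[:, 1::2] = np.cos(angles)                       # odd dimensions
    return pe

pe = positional_encoding(max_len=50, d_model=16)
# inputs = word_embeddings + pe[:sentence_length]
# -> the same word at a different position yields a different input vector
```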
+ +0:14:06.006 --> 0:14:13.630 +Am the first position and second position? + +0:14:16.576 --> 0:14:17.697 +Have a long wait period. + +0:14:17.697 --> 0:14:19.624 +I'm not going to tell you how to turn the. + +0:14:21.441 --> 0:14:26.927 +Be completely issued because if you have a +very short wavelength there might be quite + +0:14:26.927 --> 0:14:28.011 +big differences. + +0:14:28.308 --> 0:14:33.577 +And it might also be that then it depends, +of course, like what type of world embedding + +0:14:33.577 --> 0:14:34.834 +you've learned like. + +0:14:34.834 --> 0:14:37.588 +Is the dimension where you have long changes? + +0:14:37.588 --> 0:14:43.097 +Is the report for your embedding or not so +that's what I mean so that the model can somehow + +0:14:43.097 --> 0:14:47.707 +learn that by putting more information into +one of the embedding dimensions? + +0:14:48.128 --> 0:14:54.560 +So incorporated and would assume it's learning +it a bit haven't seen. + +0:14:54.560 --> 0:14:57.409 +Details studied how different. + +0:14:58.078 --> 0:15:07.863 +It's also a bit difficult because really measuring +how similar or different a world isn't that + +0:15:07.863 --> 0:15:08.480 +easy. + +0:15:08.480 --> 0:15:13.115 +You can do, of course, the average distance. + +0:15:14.114 --> 0:15:21.393 +Them, so are the weight tags not at model +two, or is there fixed weight tags that the + +0:15:21.393 --> 0:15:21.986 +model. + +0:15:24.164 --> 0:15:30.165 +To believe they are fixed and the mono learns +there's a different way of doing it. + +0:15:30.165 --> 0:15:32.985 +The other thing you can do is you can. + +0:15:33.213 --> 0:15:36.945 +So you can learn the second embedding which +says this is position one. + +0:15:36.945 --> 0:15:38.628 +This is position two and so on. + +0:15:38.628 --> 0:15:42.571 +Like for words you could learn fixed embeddings +and then add them upwards. + +0:15:42.571 --> 0:15:45.094 +So then it would have the same thing it's +done. + +0:15:45.094 --> 0:15:46.935 +There is one disadvantage of this. + +0:15:46.935 --> 0:15:51.403 +There is anybody an idea what could be the +disadvantage of a more learned embedding. + +0:15:54.955 --> 0:16:00.000 +Here maybe extra play this finger and ethnic +stuff that will be an art. + +0:16:00.000 --> 0:16:01.751 +This will be an art for. + +0:16:02.502 --> 0:16:08.323 +You would only be good at positions you have +seen often and especially for long sequences. + +0:16:08.323 --> 0:16:14.016 +You might have seen the positions very rarely +and then normally not performing that well + +0:16:14.016 --> 0:16:17.981 +while here it can better learn a more general +representation. + +0:16:18.298 --> 0:16:22.522 +So that is another thing which we won't discuss +here. + +0:16:22.522 --> 0:16:25.964 +Guess is what is called relative attention. + +0:16:25.945 --> 0:16:32.570 +And in this case you don't learn absolute +positions, but in your calculation of the similarity + +0:16:32.570 --> 0:16:39.194 +you take again the relative distance into account +and have a different similarity depending on + +0:16:39.194 --> 0:16:40.449 +how far they are. + +0:16:40.660 --> 0:16:45.898 +And then you don't need to encode it beforehand, +but you would more happen within your comparison. + +0:16:46.186 --> 0:16:53.471 +So when you compare how similar things you +print, of course also take the relative position. + +0:16:55.715 --> 0:17:03.187 +Because there are multiple ways to use the +one, to multiply all the embedding, or to use + +0:17:03.187 --> 0:17:03.607 +all. 
+ +0:17:17.557 --> 0:17:21.931 +The encoder can be bidirectional. + +0:17:21.931 --> 0:17:30.679 +We have everything from the beginning so we +can have a model where. + +0:17:31.111 --> 0:17:36.455 +Decoder training of course has also everything +available but during inference you always have + +0:17:36.455 --> 0:17:41.628 +only the past available so you can only look +into the previous one and not into the future + +0:17:41.628 --> 0:17:46.062 +because if you generate word by word you don't +know what it will be there in. + +0:17:46.866 --> 0:17:53.180 +And so we also have to consider this somehow +in the attention, and until now we look more + +0:17:53.180 --> 0:17:54.653 +at the ecoder style. + +0:17:54.653 --> 0:17:58.652 +So if you look at this type of model, it's +by direction. + +0:17:58.652 --> 0:18:03.773 +So for this hill state we are looking into +the past and into the future. + +0:18:04.404 --> 0:18:14.436 +So the question is, can we have to do this +like unidirectional so that you only look into + +0:18:14.436 --> 0:18:15.551 +the past? + +0:18:15.551 --> 0:18:22.573 +And the nice thing is, this is even easier +than for our hands. + +0:18:23.123 --> 0:18:29.738 +So we would have different types of parameters +and models because you have a forward direction. + +0:18:31.211 --> 0:18:35.679 +For attention, that is very simple. + +0:18:35.679 --> 0:18:39.403 +We are doing what is masking. + +0:18:39.403 --> 0:18:45.609 +If you want to have a backward model, these +ones. + +0:18:45.845 --> 0:18:54.355 +So on the first hit stage it's been over, +so it's maybe only looking at its health. + +0:18:54.894 --> 0:19:05.310 +By the second it looks on the second and the +third, so you're always selling all values + +0:19:05.310 --> 0:19:07.085 +in the future. + +0:19:07.507 --> 0:19:13.318 +And thereby you can have with the same parameters +the same model. + +0:19:13.318 --> 0:19:15.783 +You can have then a unique. + +0:19:16.156 --> 0:19:29.895 +In the decoder you do the masked self attention +where you only look into the past and you don't + +0:19:29.895 --> 0:19:30.753 +look. + +0:19:32.212 --> 0:19:36.400 +Then we only have, of course, looked onto +itself. + +0:19:36.616 --> 0:19:50.903 +So the question: How can we combine forward +and decoder and then we can do a decoder and + +0:19:50.903 --> 0:19:54.114 +just have a second? + +0:19:54.374 --> 0:20:00.286 +And then we're doing the cross attention which +attacks from the decoder to the anchoder. + +0:20:00.540 --> 0:20:10.239 +So in this time it's again that the queries +is a current state of decoder, while the keys + +0:20:10.239 --> 0:20:22.833 +are: You can do both onto yourself to get the +meaning on the target side and to get the meaning. + +0:20:23.423 --> 0:20:25.928 +So see then the full picture. + +0:20:25.928 --> 0:20:33.026 +This is now the typical picture of the transformer +and where you use self attention. + +0:20:33.026 --> 0:20:36.700 +So what you have is have your power hidden. + +0:20:37.217 --> 0:20:43.254 +What you then apply is here the position they're +coding: We have then doing the self attention + +0:20:43.254 --> 0:20:46.734 +to all the others, and this can be bi-directional. + +0:20:47.707 --> 0:20:54.918 +You normally do another feed forward layer +just like to make things to learn additional + +0:20:54.918 --> 0:20:55.574 +things. 
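[Editor's note] The masking described above, where the decoder's self-attention may only look into the past, is usually implemented by setting the scores of all future positions to minus infinity before the softmax, so their weights become exactly zero. A small NumPy sketch with random scores standing in for the query-key comparison:

```python
import numpy as np

seq_len = 5
scores = np.random.default_rng(0).standard_normal((seq_len, seq_len))  # stand-in for Q K^T / sqrt(d_k)

# Causal ("masked") self-attention: position i may only attend to positions j <= i.
mask = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)  # True above the diagonal = future
scores[mask] = -np.inf                                        # softmax turns -inf into weight 0

weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)
print(np.round(weights, 2))   # lower-triangular matrix: no attention to future positions
```

With the same parameters, dropping the mask gives the bidirectional (encoder-style) attention, which is why one model can be used in both ways.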
+ +0:20:55.574 --> 0:21:02.785 +You're just having also a feed forward layer +which takes your heel stable and generates + +0:21:02.785 --> 0:21:07.128 +your heel state because we are making things +deeper. + +0:21:07.747 --> 0:21:15.648 +Then this blue part you can stack over several +times so you can have layers so that. + +0:21:16.336 --> 0:21:30.256 +In addition to these blue arrows, so we talked +about this in R&S that if you are now back + +0:21:30.256 --> 0:21:35.883 +propagating your arrow from the top,. + +0:21:36.436 --> 0:21:48.578 +In order to prevent that we are not really +learning how to transform that, but instead + +0:21:48.578 --> 0:21:51.230 +we have to change. + +0:21:51.671 --> 0:22:00.597 +You're calculating what should be changed +with this one. + +0:22:00.597 --> 0:22:09.365 +The backwards clip each layer and the learning +is just. + +0:22:10.750 --> 0:22:21.632 +The encoder before we go to the decoder. + +0:22:21.632 --> 0:22:30.655 +We have any additional questions. + +0:22:31.471 --> 0:22:33.220 +That's a Very Good Point. + +0:22:33.553 --> 0:22:38.709 +Yeah, you normally take always that at least +the default architecture to only look at the + +0:22:38.709 --> 0:22:38.996 +top. + +0:22:40.000 --> 0:22:40.388 +Coder. + +0:22:40.388 --> 0:22:42.383 +Of course, you can do other things. + +0:22:42.383 --> 0:22:45.100 +We investigated, for example, the lowest layout. + +0:22:45.100 --> 0:22:49.424 +The decoder is looking at the lowest level +of the incoder and not of the top. + +0:22:49.749 --> 0:23:05.342 +You can average or you can even learn theoretically +that what you can also do is attending to all. + +0:23:05.785 --> 0:23:11.180 +Can attend to all possible layers and states. + +0:23:11.180 --> 0:23:18.335 +But what the default thing is is that you +only have the top. + +0:23:20.580 --> 0:23:31.999 +The decoder when we're doing is firstly doing +the same position and coding, then we're doing + +0:23:31.999 --> 0:23:36.419 +self attention in the decoder side. + +0:23:37.837 --> 0:23:43.396 +Of course here it's not important we're doing +the mask self attention so that we're only + +0:23:43.396 --> 0:23:45.708 +attending to the past and we're not. + +0:23:47.287 --> 0:24:02.698 +Here you see the difference, so in this case +the keys and values are from the encoder and + +0:24:02.698 --> 0:24:03.554 +the. + +0:24:03.843 --> 0:24:12.103 +You're comparing it to all the counter hidden +states calculating the similarity and then + +0:24:12.103 --> 0:24:13.866 +you do the weight. + +0:24:14.294 --> 0:24:17.236 +And that is an edit to what is here. + +0:24:18.418 --> 0:24:29.778 +Then you have a linen layer and again this +green one is sticked several times and then. + +0:24:32.232 --> 0:24:36.987 +Question, so each code is off. + +0:24:36.987 --> 0:24:46.039 +Every one of those has the last layer of thing, +so in the. + +0:24:46.246 --> 0:24:51.007 +All with and only to the last or the top layer +of the anchor. + +0:24:57.197 --> 0:25:00.127 +Good So That Would Be. + +0:25:01.501 --> 0:25:12.513 +To sequence models we have looked at attention +and before we are decoding do you have any + +0:25:12.513 --> 0:25:18.020 +more questions to this type of architecture. + +0:25:20.480 --> 0:25:30.049 +Transformer was first used in machine translation, +but now it's a standard thing for doing nearly + +0:25:30.049 --> 0:25:32.490 +any tie sequence models. + +0:25:33.013 --> 0:25:35.984 +Even large language models. + +0:25:35.984 --> 0:25:38.531 +They are a bit similar. 
+ +0:25:38.531 --> 0:25:45.111 +They are just throwing away the anchor and +cross the tension. + +0:25:45.505 --> 0:25:59.329 +And that is maybe interesting that it's important +to have this attention because you cannot store + +0:25:59.329 --> 0:26:01.021 +everything. + +0:26:01.361 --> 0:26:05.357 +The interesting thing with the attention is +now we can attend to everything. + +0:26:05.745 --> 0:26:13.403 +So you can again go back to your initial model +and have just a simple sequence model and then + +0:26:13.403 --> 0:26:14.055 +target. + +0:26:14.694 --> 0:26:24.277 +There would be a more language model style +or people call it Decoder Only model where + +0:26:24.277 --> 0:26:26.617 +you throw this away. + +0:26:27.247 --> 0:26:30.327 +The nice thing is because of your self attention. + +0:26:30.327 --> 0:26:34.208 +You have the original problem why you introduce +the attention. + +0:26:34.208 --> 0:26:39.691 +You don't have that anymore because it's not +everything is summarized, but each time you + +0:26:39.691 --> 0:26:44.866 +generate, you're looking back at all the previous +words, the source and the target. + +0:26:45.805 --> 0:26:51.734 +And there is a lot of work on is a really +important to have encoded a decoded model or + +0:26:51.734 --> 0:26:54.800 +is a decoded only model as good if you have. + +0:26:54.800 --> 0:27:00.048 +But the comparison is not that easy because +how many parameters do you have? + +0:27:00.360 --> 0:27:08.832 +So think the general idea at the moment is, +at least for machine translation, it's normally + +0:27:08.832 --> 0:27:17.765 +a bit better to have an encoded decoder model +and not a decoder model where you just concatenate + +0:27:17.765 --> 0:27:20.252 +the source and the target. + +0:27:21.581 --> 0:27:24.073 +But there is not really a big difference anymore. + +0:27:24.244 --> 0:27:29.891 +Because this big issue, which we had initially +with it that everything is stored in the working + +0:27:29.891 --> 0:27:31.009 +state, is nothing. + +0:27:31.211 --> 0:27:45.046 +Of course, the advantage maybe here is that +you give it a bias at your same language information. + +0:27:45.285 --> 0:27:53.702 +While in an encoder only model this all is +merged into one thing and sometimes it is good + +0:27:53.702 --> 0:28:02.120 +to give models a bit of bias okay you should +maybe treat things separately and you should + +0:28:02.120 --> 0:28:03.617 +look different. + +0:28:04.144 --> 0:28:11.612 +And of course one other difference, one other +disadvantage, maybe of an encoder owning one. + +0:28:16.396 --> 0:28:19.634 +You think about the suicide sentence and how +it's treated. + +0:28:21.061 --> 0:28:33.787 +Architecture: Anchorer can both be in the +sentence for every state and cause a little + +0:28:33.787 --> 0:28:35.563 +difference. + +0:28:35.475 --> 0:28:43.178 +If you only have a decoder that has to be +unidirectional because for the decoder side + +0:28:43.178 --> 0:28:51.239 +for the generation you need it and so your +input is read state by state so you don't have + +0:28:51.239 --> 0:28:54.463 +positional bidirection information. + +0:28:56.596 --> 0:29:05.551 +Again, it receives a sequence of embeddings +with position encoding. + +0:29:05.551 --> 0:29:11.082 +The piece is like long vector has output. + +0:29:11.031 --> 0:29:17.148 +Don't understand how you can set footworks +to this part of each other through inputs. + +0:29:17.097 --> 0:29:20.060 +Other than cola is the same as the food consume. 
+ +0:29:21.681 --> 0:29:27.438 +Okay, it's very good bye, so this one hand +coding is only done on the top layer. + +0:29:27.727 --> 0:29:32.012 +So this green one is only repeated. + +0:29:32.012 --> 0:29:38.558 +You have the word embedding or the position +embedding. + +0:29:38.558 --> 0:29:42.961 +You have one layer of decoder which. + +0:29:43.283 --> 0:29:48.245 +Then you stick in the second one, the third +one, the fourth one, and then on the top. + +0:29:48.208 --> 0:29:55.188 +Layer: You put this projection layer which +takes a one thousand dimensional backtalk and + +0:29:55.188 --> 0:30:02.089 +generates based on your vocabulary maybe in +ten thousand soft max layer which gives you + +0:30:02.089 --> 0:30:04.442 +the probability of all words. + +0:30:06.066 --> 0:30:22.369 +It's a very good part part of the mass tape +ladies, but it wouldn't be for the X-rays. + +0:30:22.262 --> 0:30:27.015 +Aquarium filters to be like monsoon roding +as they get by the river. + +0:30:27.647 --> 0:30:33.140 +Yes, there is work on that think we will discuss +that in the pre-trained models. + +0:30:33.493 --> 0:30:39.756 +It's called where you exactly do that. + +0:30:39.756 --> 0:30:48.588 +If you have more metric side, it's like diagonal +here. + +0:30:48.708 --> 0:30:53.018 +And it's a full metric, so here everybody's +attending to each position. + +0:30:53.018 --> 0:30:54.694 +Here you're only attending. + +0:30:54.975 --> 0:31:05.744 +Then you can do the previous one where this +one is decoded, not everything but everything. + +0:31:06.166 --> 0:31:13.961 +So you have a bit more that is possible, and +we'll have that in the lecture on pre-train + +0:31:13.961 --> 0:31:14.662 +models. + +0:31:18.478 --> 0:31:27.440 +So we now know how to build a translation +system, but of course we don't want to have + +0:31:27.440 --> 0:31:30.774 +a translation system by itself. + +0:31:31.251 --> 0:31:40.037 +Now given this model an input sentence, how +can we generate an output mind? + +0:31:40.037 --> 0:31:49.398 +The general idea is still: So what we really +want to do is we start with the model. + +0:31:49.398 --> 0:31:53.893 +We generate different possible translations. + +0:31:54.014 --> 0:31:59.754 +We score them the lock probability that we're +getting, so for each input and output pair + +0:31:59.754 --> 0:32:05.430 +we can calculate the lock probability, which +is a product of all probabilities for each + +0:32:05.430 --> 0:32:09.493 +word in there, and then we can find what is +the most probable. + +0:32:09.949 --> 0:32:15.410 +However, that's a bit complicated we will +see because we can't look at all possible translations. + +0:32:15.795 --> 0:32:28.842 +So there is infinite or a number of possible +translations, so we have to do it somehow in + +0:32:28.842 --> 0:32:31.596 +more intelligence. + +0:32:32.872 --> 0:32:37.821 +So what we want to do today in the rest of +the lecture? + +0:32:37.821 --> 0:32:40.295 +What is the search problem? + +0:32:40.295 --> 0:32:44.713 +Then we will look at different search algorithms. + +0:32:45.825 --> 0:32:56.636 +Will compare model and search errors, so there +can be errors on the model where the model + +0:32:56.636 --> 0:33:03.483 +is not giving the highest score to the best +translation. + +0:33:03.903 --> 0:33:21.069 +This is always like searching the best translation +out of one model, which is often also interesting. + +0:33:24.004 --> 0:33:29.570 +And how do we do the search? 
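The projection layer plus softmax, and the scoring of a whole translation by the (log-)probabilities of its words, can be sketched as follows. The 1,000-dimensional decoder output and the 10,000-word vocabulary are the illustrative numbers mentioned in the lecture; everything else (NumPy, the untrained weight matrix, the function names) is an assumption made for this sketch.

```python
# Minimal sketch: project a decoder state to vocabulary logits, turn them into
# probabilities with a softmax, and score a candidate translation by the sum of
# its token log-probabilities.
import numpy as np

hidden_dim, vocab_size = 1000, 10000                # illustrative numbers from the lecture
W = np.random.randn(vocab_size, hidden_dim) * 0.01  # projection layer (untrained here)

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def step_probs(decoder_state):
    return softmax(W @ decoder_state)               # P(next word | prefix, source)

def sequence_log_prob(decoder_states, token_ids):
    # log P(y | x) = sum_t log P(y_t | y_<t, x): the product of word
    # probabilities becomes a sum in log space.
    return sum(np.log(step_probs(s)[t]) for s, t in zip(decoder_states, token_ids))

states = [np.random.randn(hidden_dim) for _ in range(3)]
print(sequence_log_prob(states, [5, 42, 7]))
```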
+ +0:33:29.570 --> 0:33:41.853 +We want to find the translation where the +reference is minimal. + +0:33:42.042 --> 0:33:44.041 +So the nice thing is SMT. + +0:33:44.041 --> 0:33:51.347 +It wasn't the case, but in neuromachine translation +we can't find any possible translation, so + +0:33:51.347 --> 0:33:53.808 +at least within our vocabulary. + +0:33:53.808 --> 0:33:58.114 +But if we have BPE we can really generate +any possible. + +0:33:58.078 --> 0:34:04.604 +Translation and cereal: We could always minimize +that, but yeah, we can't do it that easy because + +0:34:04.604 --> 0:34:07.734 +of course we don't have the reference at hand. + +0:34:07.747 --> 0:34:10.384 +If it has a reference, it's not a problem. + +0:34:10.384 --> 0:34:13.694 +We know what we are searching for, but we +don't know. + +0:34:14.054 --> 0:34:23.886 +So how can we then model this by just finding +the translation with the highest probability? + +0:34:23.886 --> 0:34:29.015 +Looking at it, we want to find the translation. + +0:34:29.169 --> 0:34:32.525 +Idea is our model is a good approximation. + +0:34:32.525 --> 0:34:34.399 +That's how we train it. + +0:34:34.399 --> 0:34:36.584 +What is a good translation? + +0:34:36.584 --> 0:34:43.687 +And if we find translation with the highest +probability, this should also give us the best + +0:34:43.687 --> 0:34:44.702 +translation. + +0:34:45.265 --> 0:34:56.965 +And that is then, of course, the difference +between the search error is that the model + +0:34:56.965 --> 0:35:02.076 +doesn't predict the best translation. + +0:35:02.622 --> 0:35:08.777 +How can we do the basic search first of all +in basic search that seems to be very easy + +0:35:08.777 --> 0:35:15.003 +so what we can do is we can do the forward +pass for the whole encoder and that's how it + +0:35:15.003 --> 0:35:21.724 +starts the input sentences known you can put +the input sentence and calculate all your estates + +0:35:21.724 --> 0:35:22.573 +and hidden? + +0:35:23.083 --> 0:35:35.508 +Then you can put in your sentence start and +you can generate. + +0:35:35.508 --> 0:35:41.721 +Here you have the probability. + +0:35:41.801 --> 0:35:52.624 +A good idea we would see later that as a typical +algorithm is guess what you all would do, you + +0:35:52.624 --> 0:35:54.788 +would then select. + +0:35:55.235 --> 0:36:06.265 +So if you generate here a probability distribution +over all the words in your vocabulary then + +0:36:06.265 --> 0:36:08.025 +you can solve. + +0:36:08.688 --> 0:36:13.147 +Yeah, this is how our auto condition is done +in our system. + +0:36:14.794 --> 0:36:19.463 +Yeah, this is also why there you have to have +a model of possible extending. + +0:36:19.463 --> 0:36:24.314 +It's more of a language model, but then this +is one algorithm to do the search. + +0:36:24.314 --> 0:36:26.801 +They maybe have also more advanced ones. + +0:36:26.801 --> 0:36:32.076 +We will see that so this search and other +completion should be exactly the same as the + +0:36:32.076 --> 0:36:33.774 +search machine translation. + +0:36:34.914 --> 0:36:40.480 +So we'll see that this is not optimal, so +hopefully it's not that this way, but for this + +0:36:40.480 --> 0:36:41.043 +problem. + +0:36:41.941 --> 0:36:47.437 +And what you can do then you can select this +word. + +0:36:47.437 --> 0:36:50.778 +This was the best translation. 
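Greedy search as just described, in sketch form: encode the input once, then repeatedly feed the partial output, pick the single most probable next word, and stop once the end-of-sentence token is produced. `model.encode` and `model.step` are hypothetical placeholders standing in for a sequence-to-sequence model, not a real API.

```python
# Greedy decoding sketch: always take the most probable next word and feed it
# back into the decoder for the following step.
import numpy as np

def greedy_decode(model, src_tokens, bos_id, eos_id, max_len=100):
    enc_states = model.encode(src_tokens)       # one forward pass over the known input
    output = [bos_id]                           # start with the sentence-start token
    for _ in range(max_len):
        probs = model.step(enc_states, output)  # distribution over the vocabulary
        next_tok = int(np.argmax(probs))        # pick the single most probable word
        output.append(next_tok)
        if next_tok == eos_id:                  # stop when the end token is generated
            break
    return output[1:]
```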
+ +0:36:51.111 --> 0:36:57.675 +Because the decoder, of course, in the next +step needs not to know what is the best word + +0:36:57.675 --> 0:37:02.396 +here, it inputs it and generates that flexibility +distribution. + +0:37:03.423 --> 0:37:14.608 +And then your new distribution, and you can +do the same thing, there's the best word there, + +0:37:14.608 --> 0:37:15.216 +and. + +0:37:15.435 --> 0:37:22.647 +So you can continue doing that and always +get the hopefully the best translation in. + +0:37:23.483 --> 0:37:30.839 +The first question is, of course, how long +are you doing it? + +0:37:30.839 --> 0:37:33.854 +Now we could go forever. + +0:37:36.476 --> 0:37:52.596 +We had this token at the input and we put +the stop token at the output. + +0:37:53.974 --> 0:38:07.217 +And this is important because if we wouldn't +do that then we wouldn't have a good idea. + +0:38:10.930 --> 0:38:16.193 +So that seems to be a good idea, but is it +really? + +0:38:16.193 --> 0:38:21.044 +Do we find the most probable sentence in this? + +0:38:23.763 --> 0:38:25.154 +Or my dear healed proverb,. + +0:38:27.547 --> 0:38:41.823 +We are always selecting the highest probability +one, so it seems to be that this is a very + +0:38:41.823 --> 0:38:45.902 +good solution to anybody. + +0:38:46.406 --> 0:38:49.909 +Yes, that is actually the problem. + +0:38:49.909 --> 0:38:56.416 +You might do early decisions and you don't +have the global view. + +0:38:56.796 --> 0:39:02.813 +And this problem happens because it is an +outer regressive model. + +0:39:03.223 --> 0:39:13.275 +So it happens because yeah, the output we +generate is the input in the next step. + +0:39:13.793 --> 0:39:19.493 +And this, of course, is leading to problems. + +0:39:19.493 --> 0:39:27.474 +If we always take the best solution, it doesn't +mean you have. + +0:39:27.727 --> 0:39:33.941 +It would be different if you have a problem +where the output is not influencing your input. + +0:39:34.294 --> 0:39:44.079 +Then this solution will give you the best +model, but since the output is influencing + +0:39:44.079 --> 0:39:47.762 +your next input and the model,. + +0:39:48.268 --> 0:39:51.599 +Because one question might not be why do we +have this type of model? + +0:39:51.771 --> 0:39:58.946 +So why do we really need to put here in the +last source word? + +0:39:58.946 --> 0:40:06.078 +You can also put in: And then always predict +the word and the nice thing is then you wouldn't + +0:40:06.078 --> 0:40:11.846 +need to do beams or a difficult search because +then the output here wouldn't influence what + +0:40:11.846 --> 0:40:12.975 +is inputted here. + +0:40:15.435 --> 0:40:20.219 +Idea whether that might not be the best idea. + +0:40:20.219 --> 0:40:24.588 +You'll just be translating each word and. + +0:40:26.626 --> 0:40:37.815 +The second one is right, yes, you're not generating +a Korean sentence. + +0:40:38.058 --> 0:40:48.197 +We'll also see that later it's called non +auto-progressive translation, so there is work + +0:40:48.197 --> 0:40:49.223 +on that. + +0:40:49.529 --> 0:41:02.142 +So you might know it roughly because you know +it's based on this hidden state, but it can + +0:41:02.142 --> 0:41:08.588 +be that in the end you have your probability. + +0:41:09.189 --> 0:41:14.633 +And then you're not modeling the dependencies +within a work within the target sentence. + +0:41:14.633 --> 0:41:27.547 +For example: You can express things in German, +then you don't know which one you really select. 
+ +0:41:27.547 --> 0:41:32.156 +That influences what you later. + +0:41:33.393 --> 0:41:46.411 +Then you try to find a better way not only +based on the English sentence and the words + +0:41:46.411 --> 0:41:48.057 +that come. + +0:41:49.709 --> 0:42:00.954 +Yes, that is more like a two-step decoding, +but that is, of course, a lot more like computational. + +0:42:01.181 --> 0:42:15.978 +The first thing you can do, which is typically +done, is doing not really search. + +0:42:16.176 --> 0:42:32.968 +So first look at what the problem of research +is to make it a bit more clear. + +0:42:34.254 --> 0:42:53.163 +And now you can extend them and you can extend +these and the joint probabilities. + +0:42:54.334 --> 0:42:59.063 +The other thing is the second word. + +0:42:59.063 --> 0:43:03.397 +You can do the second word dusk. + +0:43:03.397 --> 0:43:07.338 +Now you see the problem here. + +0:43:07.707 --> 0:43:17.507 +It is true that these have the highest probability, +but for these you have an extension. + +0:43:18.078 --> 0:43:31.585 +So the problem is just because in one position +one hypothesis, so you can always call this + +0:43:31.585 --> 0:43:34.702 +partial translation. + +0:43:34.874 --> 0:43:41.269 +The blue one begin is higher, but the green +one can be better extended and it will overtake. + +0:43:45.525 --> 0:43:54.672 +So the problem is if we are doing this greedy +search is that we might not end up in really + +0:43:54.672 --> 0:43:55.275 +good. + +0:43:55.956 --> 0:44:00.916 +So the first thing we could not do is like +yeah, we can just try. + +0:44:00.880 --> 0:44:06.049 +All combinations that are there, so there +is the other direction. + +0:44:06.049 --> 0:44:13.020 +So if the solution to to check the first one +is to just try all and it doesn't give us a + +0:44:13.020 --> 0:44:17.876 +good result, maybe what we have to do is just +try everything. + +0:44:18.318 --> 0:44:23.120 +The nice thing is if we try everything, we'll +definitely find the best translation. + +0:44:23.463 --> 0:44:26.094 +So we won't have a search error. + +0:44:26.094 --> 0:44:28.167 +We'll come to that later. + +0:44:28.167 --> 0:44:32.472 +The interesting thing is our translation performance. + +0:44:33.353 --> 0:44:37.039 +But we will definitely find the most probable +translation. + +0:44:38.598 --> 0:44:44.552 +However, it's not really possible because +the number of combinations is just too high. + +0:44:44.764 --> 0:44:57.127 +So the number of congregations is your vocabulary +science times the lengths of your sentences. + +0:44:57.157 --> 0:45:03.665 +Ten thousand or so you can imagine that very +soon you will have so many possibilities here + +0:45:03.665 --> 0:45:05.597 +that you cannot check all. + +0:45:06.226 --> 0:45:13.460 +So this is not really an implication or an +algorithm that you can use for applying machine + +0:45:13.460 --> 0:45:14.493 +translation. + +0:45:15.135 --> 0:45:24.657 +So maybe we have to do something in between +and yeah, not look at all but only look at + +0:45:24.657 --> 0:45:25.314 +some. + +0:45:26.826 --> 0:45:29.342 +And the easiest thing for that is okay. + +0:45:29.342 --> 0:45:34.877 +Just do sampling, so if we don't know what +to look at, maybe it's good to randomly pick + +0:45:34.877 --> 0:45:35.255 +some. + +0:45:35.255 --> 0:45:40.601 +That's not only a very good algorithm, so +the basic idea will always randomly select + +0:45:40.601 --> 0:45:42.865 +the word, of course, based on bits. 
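To put a number on why trying every combination is impossible: the set of candidate output sequences grows exponentially, i.e. vocabulary size to the power of the output length. A back-of-the-envelope computation with the illustrative figures of a 10,000-word vocabulary and a 20-word sentence:

```python
# Rough size of the exhaustive search space (illustrative numbers only).
vocab_size, target_length = 10_000, 20
candidates = vocab_size ** target_length
print(f"{candidates:.2e} possible output sequences")  # ~1e80, far too many to check
```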
+ +0:45:43.223 --> 0:45:52.434 +We are doing that or times, and then we are +looking which one at the end has the highest. + +0:45:52.672 --> 0:45:59.060 +So we are not doing anymore really searching +for the best one, but we are more randomly + +0:45:59.060 --> 0:46:05.158 +doing selections with the idea that we always +select the best one at the beginning. + +0:46:05.158 --> 0:46:11.764 +So maybe it's better to do random, but of +course one important thing is how do we randomly + +0:46:11.764 --> 0:46:12.344 +select? + +0:46:12.452 --> 0:46:15.756 +If we just do uniform distribution, it would +be very bad. + +0:46:15.756 --> 0:46:18.034 +You'll only have very bad translations. + +0:46:18.398 --> 0:46:23.261 +Because in each position if you think about +it you have ten thousand possibilities. + +0:46:23.903 --> 0:46:28.729 +Most of them are really bad decisions and +you shouldn't do that. + +0:46:28.729 --> 0:46:35.189 +There is always only a very small number, +at least compared to the 10 000 translation. + +0:46:35.395 --> 0:46:43.826 +So if you have the sentence here, this is +an English sentence. + +0:46:43.826 --> 0:46:47.841 +You can start with these and. + +0:46:48.408 --> 0:46:58.345 +You're thinking about setting legal documents +in a legal document. + +0:46:58.345 --> 0:47:02.350 +You should not change the. + +0:47:03.603 --> 0:47:11.032 +The problem is we have a neural network, we +have a black box, so it's anyway a bit random. + +0:47:12.092 --> 0:47:24.341 +It is considered, but you will see that if +you make it intelligent for clear sentences, + +0:47:24.341 --> 0:47:26.986 +there is not that. + +0:47:27.787 --> 0:47:35.600 +Is an issue we should consider that this one +might lead to more randomness, but it might + +0:47:35.600 --> 0:47:39.286 +also be positive for machine translation. + +0:47:40.080 --> 0:47:46.395 +Least can't directly think of a good implication +where it's positive, but if you most think + +0:47:46.395 --> 0:47:52.778 +about dialogue systems, for example, whereas +the similar architecture is nowadays also used, + +0:47:52.778 --> 0:47:55.524 +you predict what the system should say. + +0:47:55.695 --> 0:48:00.885 +Then you want to have randomness because it's +not always saying the same thing. + +0:48:01.341 --> 0:48:08.370 +Machine translation is typically not you want +to have consistency, so if you have the same + +0:48:08.370 --> 0:48:09.606 +input normally. + +0:48:09.889 --> 0:48:14.528 +Therefore, sampling is not a mathieu. + +0:48:14.528 --> 0:48:22.584 +There are some things you will later see as +a preprocessing step. + +0:48:23.003 --> 0:48:27.832 +But of course it's important how you can make +this process not too random. + +0:48:29.269 --> 0:48:41.619 +Therefore, the first thing is don't take a +uniform distribution, but we have a very nice + +0:48:41.619 --> 0:48:43.562 +distribution. + +0:48:43.843 --> 0:48:46.621 +So I'm like randomly taking a word. + +0:48:46.621 --> 0:48:51.328 +We are looking at output distribution and +now taking a word. + +0:48:51.731 --> 0:49:03.901 +So that means we are taking the word these, +we are taking the word does, and all these. + +0:49:04.444 --> 0:49:06.095 +How can you do that? + +0:49:06.095 --> 0:49:09.948 +You randomly draw a number between zero and +one. + +0:49:10.390 --> 0:49:23.686 +And then you have ordered your words in some +way, and then you take the words before the + +0:49:23.686 --> 0:49:26.375 +sum of the words. 
+ +0:49:26.806 --> 0:49:34.981 +So the easiest thing is you have zero point +five, zero point two five, and zero point two + +0:49:34.981 --> 0:49:35.526 +five. + +0:49:35.526 --> 0:49:43.428 +If you have a number smaller than you take +the first word, it takes a second word, and + +0:49:43.428 --> 0:49:45.336 +if it's higher than. + +0:49:45.845 --> 0:49:57.707 +Therefore, you can very easily get a distribution +distributed according to this probability mass + +0:49:57.707 --> 0:49:59.541 +and no longer. + +0:49:59.799 --> 0:50:12.479 +You can't even do that a bit more and more +focus on the important part if we are not randomly + +0:50:12.479 --> 0:50:19.494 +drawing from all words, but we are looking +only at. + +0:50:21.361 --> 0:50:24.278 +You have an idea why this is an important +stamp. + +0:50:24.278 --> 0:50:29.459 +Although we say I'm only throwing away the +words which have a very low probability, so + +0:50:29.459 --> 0:50:32.555 +anyway the probability of taking them is quite +low. + +0:50:32.555 --> 0:50:35.234 +So normally that shouldn't matter that much. + +0:50:36.256 --> 0:50:38.830 +There's ten thousand words. + +0:50:40.300 --> 0:50:42.074 +Of course, they admire thousand nine hundred. + +0:50:42.074 --> 0:50:44.002 +They're going to build a good people steal +it up. + +0:50:45.085 --> 0:50:47.425 +Hi, I'm Sarah Hauer and I'm Sig Hauer and +We're Professional. + +0:50:47.867 --> 0:50:55.299 +Yes, that's exactly why you do this most sampling +or so that you don't take the lowest. + +0:50:55.415 --> 0:50:59.694 +Probability words, but you only look at the +most probable ones and then like. + +0:50:59.694 --> 0:51:04.632 +Of course you have to rescale your probability +mass then so that it's still a probability + +0:51:04.632 --> 0:51:08.417 +because now it's a probability distribution +over ten thousand words. + +0:51:08.417 --> 0:51:13.355 +If you only take ten of them or so it's no +longer a probability distribution, you rescale + +0:51:13.355 --> 0:51:15.330 +them and you can still do that and. + +0:51:16.756 --> 0:51:20.095 +That is what is done assembling. + +0:51:20.095 --> 0:51:26.267 +It's not the most common thing, but it's done +several times. + +0:51:28.088 --> 0:51:40.625 +Then the search, which is somehow a standard, +and if you're doing some type of machine translation. + +0:51:41.181 --> 0:51:50.162 +And the basic idea is that in research we +select for the most probable and only continue + +0:51:50.162 --> 0:51:51.171 +with the. + +0:51:51.691 --> 0:51:53.970 +You can easily generalize this. + +0:51:53.970 --> 0:52:00.451 +We are not only continuing the most probable +one, but we are continuing the most probable. + +0:52:00.880 --> 0:52:21.376 +The. + +0:52:17.697 --> 0:52:26.920 +You should say we are sampling how many examples +it makes sense to take the one with the highest. + +0:52:27.127 --> 0:52:33.947 +But that is important that once you do a mistake +you might want to not influence that much. + +0:52:39.899 --> 0:52:45.815 +So the idea is if we're keeping the end best +hypotheses and not only the first fact. + +0:52:46.586 --> 0:52:51.558 +And the nice thing is in statistical machine +translation. + +0:52:51.558 --> 0:52:54.473 +We have exactly the same problem. + +0:52:54.473 --> 0:52:57.731 +You would do the same thing, however. + +0:52:57.731 --> 0:53:03.388 +Since the model wasn't that strong you needed +a quite large beam. + +0:53:03.984 --> 0:53:18.944 +Machine translation models are really strong +and you get already a very good performance. 
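The sampling procedure described above, drawing a number between zero and one and walking the cumulative probabilities, optionally restricted to the k most probable words with the remaining mass rescaled, might look like the following sketch. NumPy and the function name are assumptions made here.

```python
# Sketch: sample the next word from the model's output distribution, optionally
# keeping only the k most probable words and rescaling so the kept probabilities
# again sum to one.
import numpy as np

def sample_next_word(probs, k=None, rng=None):
    rng = rng or np.random.default_rng()
    probs = np.asarray(probs, dtype=float)
    if k is not None:
        keep = np.argsort(probs)[-k:]        # indices of the k most probable words
        filtered = np.zeros_like(probs)
        filtered[keep] = probs[keep]
        probs = filtered / filtered.sum()    # rescale to a proper distribution
    r = rng.random()                         # random number between 0 and 1
    return int(np.searchsorted(np.cumsum(probs), r))

# With the probabilities used as an example in the lecture (0.5, 0.25, 0.25):
print(sample_next_word([0.5, 0.25, 0.25]))
```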
+ +0:53:19.899 --> 0:53:22.835 +So how does it work? + +0:53:22.835 --> 0:53:35.134 +We can't relate to our capabilities, but now +we are not storing the most probable ones. + +0:53:36.156 --> 0:53:45.163 +Done that we extend all these hypothesis and +of course there is now a bit difficult because + +0:53:45.163 --> 0:53:54.073 +now we always have to switch what is the input +so the search gets more complicated and the + +0:53:54.073 --> 0:53:55.933 +first one is easy. + +0:53:56.276 --> 0:54:09.816 +In this case we have to once put in here these +and then somehow delete this one and instead + +0:54:09.816 --> 0:54:12.759 +put that into that. + +0:54:13.093 --> 0:54:24.318 +Otherwise you could only store your current +network states here and just continue by going + +0:54:24.318 --> 0:54:25.428 +forward. + +0:54:26.766 --> 0:54:34.357 +So now you have done the first two, and then +you have known the best. + +0:54:34.357 --> 0:54:37.285 +Can you now just continue? + +0:54:39.239 --> 0:54:53.511 +Yes, that's very important, otherwise all +your beam search doesn't really help because + +0:54:53.511 --> 0:54:57.120 +you would still have. + +0:54:57.317 --> 0:55:06.472 +So now you have to do one important step and +then reduce again to end. + +0:55:06.472 --> 0:55:13.822 +So in our case to make things easier we have +the inputs. + +0:55:14.014 --> 0:55:19.072 +Otherwise you will have two to the power of +length possibilities, so it is still exponential. + +0:55:19.559 --> 0:55:26.637 +But by always throwing them away you keep +your beans fixed. + +0:55:26.637 --> 0:55:31.709 +The items now differ in the last position. + +0:55:32.492 --> 0:55:42.078 +They are completely different, but you are +always searching what is the best one. + +0:55:44.564 --> 0:55:50.791 +So another way of hearing it is like this, +so just imagine you start with the empty sentence. + +0:55:50.791 --> 0:55:55.296 +Then you have three possible extensions: A, +B, and end of sentence. + +0:55:55.296 --> 0:55:59.205 +It's throwing away the worst one, continuing +with the two. + +0:55:59.699 --> 0:56:13.136 +Then you want to stay too, so in this state +it's either or and then you continue. + +0:56:13.293 --> 0:56:24.924 +So you always have this exponential growing +tree by destroying most of them away and only + +0:56:24.924 --> 0:56:26.475 +continuing. + +0:56:26.806 --> 0:56:42.455 +And thereby you can hopefully do less errors +because in these examples you always see this + +0:56:42.455 --> 0:56:43.315 +one. + +0:56:43.503 --> 0:56:47.406 +So you're preventing some errors, but of course +it's not perfect. + +0:56:47.447 --> 0:56:56.829 +You can still do errors because it could be +not the second one but the fourth one. + +0:56:57.017 --> 0:57:03.272 +Now just the idea is that you make yeah less +errors and prevent that. + +0:57:07.667 --> 0:57:11.191 +Then the question is how much does it help? + +0:57:11.191 --> 0:57:14.074 +And here is some examples for that. + +0:57:14.074 --> 0:57:16.716 +So for S & T it was really like. + +0:57:16.716 --> 0:57:23.523 +Typically the larger beam you have a larger +third space and you have a better score. + +0:57:23.763 --> 0:57:27.370 +So the larger you get, the bigger your emails, +the better you will. + +0:57:27.370 --> 0:57:30.023 +Typically maybe use something like three hundred. + +0:57:30.250 --> 0:57:38.777 +And it's mainly a trade-off between quality +and speed because the larger your beams, the + +0:57:38.777 --> 0:57:43.184 +more time it takes and you want to finish it. 
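A simplified version of the beam search just described: keep the n best partial hypotheses, extend each of them, and prune back to n after every step; hypotheses that produce the end-of-sentence token are set aside as finished. `model.step` is a hypothetical placeholder returning the next-word distribution given the encoded source and a partial output.

```python
# Beam search sketch: expand every kept hypothesis, then prune back to the
# beam_size best by accumulated log-probability.
import numpy as np

def beam_search(model, enc_states, bos_id, eos_id, beam_size=4, max_len=100):
    beams = [([bos_id], 0.0)]                    # (partial output, log-probability)
    finished = []
    for _ in range(max_len):
        candidates = []
        for tokens, score in beams:
            log_probs = np.log(model.step(enc_states, tokens))
            for tok in np.argsort(log_probs)[-beam_size:]:   # best extensions of this beam
                candidates.append((tokens + [int(tok)], score + float(log_probs[tok])))
        candidates.sort(key=lambda c: c[1], reverse=True)    # prune back to beam_size
        beams = []
        for tokens, score in candidates[:beam_size]:
            (finished if tokens[-1] == eos_id else beams).append((tokens, score))
        if not beams:                                        # every hypothesis has ended
            break
    return max(finished + beams, key=lambda c: c[1])[0]
```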
+ +0:57:43.184 --> 0:57:49.124 +So your quality improvements are getting smaller +and smaller. + +0:57:49.349 --> 0:57:57.164 +So the difference between a beam of one and +ten is bigger than the difference between a. + +0:57:58.098 --> 0:58:14.203 +And the interesting thing is we're seeing +a bit of a different view, and we're seeing + +0:58:14.203 --> 0:58:16.263 +typically. + +0:58:16.776 --> 0:58:24.376 +And then especially if you look at the green +ones, this is unnormalized. + +0:58:24.376 --> 0:58:26.770 +You're seeing a sharp. + +0:58:27.207 --> 0:58:32.284 +So your translation quality here measured +in blue will go down again. + +0:58:33.373 --> 0:58:35.663 +That is now a question. + +0:58:35.663 --> 0:58:37.762 +Why is that the case? + +0:58:37.762 --> 0:58:43.678 +Why should we are seeing more and more possible +translations? + +0:58:46.226 --> 0:58:48.743 +If we have a bigger stretch and we are going. + +0:58:52.612 --> 0:58:56.312 +I'm going to be using my examples before we +also look at the bar. + +0:58:56.656 --> 0:58:59.194 +A good idea. + +0:59:00.000 --> 0:59:18.521 +But it's not everything because we in the +end always in this list we're selecting. + +0:59:18.538 --> 0:59:19.382 +So this is here. + +0:59:19.382 --> 0:59:21.170 +We don't do any regions to do that. + +0:59:21.601 --> 0:59:29.287 +So the probabilities at the end we always +give out the hypothesis with the highest probabilities. + +0:59:30.250 --> 0:59:33.623 +That is always the case. + +0:59:33.623 --> 0:59:43.338 +If you have a beam of this should be a subset +of the items you look at. + +0:59:44.224 --> 0:59:52.571 +So if you increase your biomeat you're just +looking at more and you're always taking the + +0:59:52.571 --> 0:59:54.728 +wine with the highest. + +0:59:57.737 --> 1:00:07.014 +Maybe they are all the probability that they +will be comparable to don't really have. + +1:00:08.388 --> 1:00:14.010 +But the probabilities are the same, not that +easy. + +1:00:14.010 --> 1:00:23.931 +One morning maybe you will have more examples +where we look at some stuff that's not seen + +1:00:23.931 --> 1:00:26.356 +in the trading space. + +1:00:28.428 --> 1:00:36.478 +That's mainly the answer why we give a hyperability +math we will see, but that is first of all + +1:00:36.478 --> 1:00:43.087 +the biggest issues, so here is a blue score, +so that is somewhat translation. + +1:00:43.883 --> 1:00:48.673 +This will go down by the probability of the +highest one that only goes out where stays + +1:00:48.673 --> 1:00:49.224 +at least. + +1:00:49.609 --> 1:00:57.971 +The problem is if we are searching more, we +are finding high processes which have a high + +1:00:57.971 --> 1:00:59.193 +translation. + +1:00:59.579 --> 1:01:10.375 +So we are finding these things which we wouldn't +find and we'll see why this is happening. + +1:01:10.375 --> 1:01:15.714 +So somehow we are reducing our search error. + +1:01:16.336 --> 1:01:25.300 +However, we also have a model error and we +don't assign the highest probability to translation + +1:01:25.300 --> 1:01:27.942 +quality to the really best. + +1:01:28.548 --> 1:01:31.460 +They don't always add up. + +1:01:31.460 --> 1:01:34.932 +Of course somehow they add up. + +1:01:34.932 --> 1:01:41.653 +If your bottle is worse then your performance +will even go. 
+ +1:01:42.202 --> 1:01:49.718 +But sometimes it's happening that by increasing +search errors we are missing out the really + +1:01:49.718 --> 1:01:57.969 +bad translations which have a high probability +and we are only finding the decently good probability + +1:01:57.969 --> 1:01:58.460 +mass. + +1:01:59.159 --> 1:02:03.859 +So they are a bit independent of each other +and you can make those types of arrows. + +1:02:04.224 --> 1:02:09.858 +That's why, for example, doing exact search +will give you the translation with the highest + +1:02:09.858 --> 1:02:15.245 +probability, but there has been work on it +that you then even have a lower translation + +1:02:15.245 --> 1:02:21.436 +quality because then you find some random translation +which has a very high translation probability + +1:02:21.436 --> 1:02:22.984 +by which I'm really bad. + +1:02:23.063 --> 1:02:29.036 +Because our model is not perfect and giving +a perfect translation probability over air,. + +1:02:31.431 --> 1:02:34.537 +So why is this happening? + +1:02:34.537 --> 1:02:42.301 +And one issue with this is the so called label +or length spiral. + +1:02:42.782 --> 1:02:47.115 +And we are in each step of decoding. + +1:02:47.115 --> 1:02:55.312 +We are modeling the probability of the next +word given the input and. + +1:02:55.895 --> 1:03:06.037 +So if you have this picture, so you always +hear you have the probability of the next word. + +1:03:06.446 --> 1:03:16.147 +That's that's what your modeling, and of course +the model is not perfect. + +1:03:16.576 --> 1:03:22.765 +So it can be that if we at one time do a bitter +wrong prediction not for the first one but + +1:03:22.765 --> 1:03:28.749 +maybe for the 5th or 6th thing, then we're +giving it an exceptional high probability we + +1:03:28.749 --> 1:03:30.178 +cannot recover from. + +1:03:30.230 --> 1:03:34.891 +Because this high probability will stay there +forever and we just multiply other things to + +1:03:34.891 --> 1:03:39.910 +it, but we cannot like later say all this probability +was a bit too high, we shouldn't have done. + +1:03:41.541 --> 1:03:48.984 +And this leads to that the more the longer +your translation is, the more often you use + +1:03:48.984 --> 1:03:51.637 +this probability distribution. + +1:03:52.112 --> 1:04:03.321 +The typical example is this one, so you have +the probability of the translation. + +1:04:04.104 --> 1:04:12.608 +And this probability is quite low as you see, +and maybe there are a lot of other things. + +1:04:13.053 --> 1:04:25.658 +However, it might still be overestimated that +it's still a bit too high. + +1:04:26.066 --> 1:04:33.042 +The problem is if you know the project translation +is a very long one, but probability mask gets + +1:04:33.042 --> 1:04:33.545 +lower. + +1:04:34.314 --> 1:04:45.399 +Because each time you multiply your probability +to it, so your sequence probability gets lower + +1:04:45.399 --> 1:04:46.683 +and lower. + +1:04:48.588 --> 1:04:59.776 +And this means that at some point you might +get over this, and it might be a lower probability. + +1:05:00.180 --> 1:05:09.651 +And if you then have this probability at the +beginning away, but it wasn't your beam, then + +1:05:09.651 --> 1:05:14.958 +at this point you would select the empty sentence. + +1:05:15.535 --> 1:05:25.379 +So this has happened because this short translation +is seen and it's not thrown away. + +1:05:28.268 --> 1:05:31.121 +So,. 
+ +1:05:31.151 --> 1:05:41.256 +If you have a very sore beam that can be prevented, +but if you have a large beam, this one is in + +1:05:41.256 --> 1:05:41.986 +there. + +1:05:42.302 --> 1:05:52.029 +This in general seems reasonable that shorter +pronunciations instead of longer sentences + +1:05:52.029 --> 1:05:54.543 +because non-religious. + +1:05:56.376 --> 1:06:01.561 +It's a bit depending on whether the translation +should be a bit related to your input. + +1:06:02.402 --> 1:06:18.053 +And since we are always multiplying things, +the longer the sequences we are getting smaller, + +1:06:18.053 --> 1:06:18.726 +it. + +1:06:19.359 --> 1:06:29.340 +It's somewhat right for human main too, but +the models tend to overestimate because of + +1:06:29.340 --> 1:06:34.388 +this short translation of long translation. + +1:06:35.375 --> 1:06:46.474 +Then, of course, that means that it's not +easy to stay on a computer because eventually + +1:06:46.474 --> 1:06:48.114 +it suggests. + +1:06:51.571 --> 1:06:59.247 +First of all there is another way and that's +typically used but you don't have to do really + +1:06:59.247 --> 1:07:07.089 +because this is normally not a second position +and if it's like on the 20th position you only + +1:07:07.089 --> 1:07:09.592 +have to have some bean lower. + +1:07:10.030 --> 1:07:17.729 +But you are right because these issues get +larger, the larger your input is, and then + +1:07:17.729 --> 1:07:20.235 +you might make more errors. + +1:07:20.235 --> 1:07:27.577 +So therefore this is true, but it's not as +simple that this one is always in the. + +1:07:28.408 --> 1:07:45.430 +That the translation for it goes down with +higher insert sizes has there been more control. + +1:07:47.507 --> 1:07:51.435 +In this work you see a dozen knocks. + +1:07:51.435 --> 1:07:53.027 +Knots go down. + +1:07:53.027 --> 1:08:00.246 +That's light green here, but at least you +don't see the sharp rock. + +1:08:00.820 --> 1:08:07.897 +So if you do some type of normalization, at +least you can assess this probability and limit + +1:08:07.897 --> 1:08:08.204 +it. + +1:08:15.675 --> 1:08:24.828 +There is other reasons why, like initial, +it's not only the length, but there can be + +1:08:24.828 --> 1:08:26.874 +other reasons why. + +1:08:27.067 --> 1:08:37.316 +And if you just take it too large, you're +looking too often at ways in between, but it's + +1:08:37.316 --> 1:08:40.195 +better to ignore things. + +1:08:41.101 --> 1:08:44.487 +But that's more a hand gravy argument. + +1:08:44.487 --> 1:08:47.874 +Agree so don't know if the exact word. + +1:08:48.648 --> 1:08:53.223 +You need to do the normalization and there +are different ways of doing it. + +1:08:53.223 --> 1:08:54.199 +It's mainly OK. + +1:08:54.199 --> 1:08:59.445 +We're just now not taking the translation +with the highest probability, but we during + +1:08:59.445 --> 1:09:04.935 +the coding have another feature saying not +only take the one with the highest probability + +1:09:04.935 --> 1:09:08.169 +but also prefer translations which are a bit +longer. + +1:09:08.488 --> 1:09:16.933 +You can do that different in a way to divide +by the center length. + +1:09:16.933 --> 1:09:23.109 +We take not the highest but the highest average. 
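A small sketch of the two fixes mentioned here: dividing the score by the output length (ranking by the highest average log-probability instead of the highest sum) and adding a per-word reward. The concrete numbers are only illustrative and chosen to show how a short hypothesis wins under the raw score but loses after normalisation.

```python
# Length normalisation sketch: raw sums of log-probabilities favour short
# outputs, because every extra word multiplies in another probability.
def raw_score(log_probs):
    return sum(log_probs)

def length_normalised_score(log_probs):
    return sum(log_probs) / len(log_probs)        # highest average, not highest sum

def reward_score(log_probs, word_reward=0.2):     # reward value is illustrative
    return sum(log_probs) + word_reward * len(log_probs)

short = [-0.4, -0.5]                              # e.g. an early end-of-sentence
long_ = [-0.3, -0.4, -0.3, -0.4, -0.3]
print(raw_score(short) > raw_score(long_))                               # True: short wins
print(length_normalised_score(long_) > length_normalised_score(short))  # True: long wins
```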
+ +1:09:23.563 --> 1:09:28.841 +Of course, if both are the same lengths, it +doesn't matter if M is the same lengths in + +1:09:28.841 --> 1:09:34.483 +all cases, but if you compare a translation +with seven or eight words, there is a difference + +1:09:34.483 --> 1:09:39.700 +if you want to have the one with the highest +probability or with the highest average. + +1:09:41.021 --> 1:09:50.993 +So that is the first one can have some reward +model for each word, add a bit of the score, + +1:09:50.993 --> 1:09:51.540 +and. + +1:09:51.711 --> 1:10:03.258 +And then, of course, you have to find you +that there is also more complex ones here. + +1:10:03.903 --> 1:10:08.226 +So there is different ways of doing that, +and of course that's important. + +1:10:08.428 --> 1:10:11.493 +But in all of that, the main idea is OK. + +1:10:11.493 --> 1:10:18.520 +We are like knowing of the arrow that the +model seems to prevent or prefer short translation. + +1:10:18.520 --> 1:10:24.799 +We circumvent that by OK we are adding we +are no longer searching for the best one. + +1:10:24.764 --> 1:10:30.071 +But we're searching for the one best one and +some additional constraints, so mainly you + +1:10:30.071 --> 1:10:32.122 +are doing here during the coding. + +1:10:32.122 --> 1:10:37.428 +You're not completely trusting your model, +but you're adding some buyers or constraints + +1:10:37.428 --> 1:10:39.599 +into what should also be fulfilled. + +1:10:40.000 --> 1:10:42.543 +That can be, for example, that the length +should be recently. + +1:10:49.369 --> 1:10:51.071 +Any More Questions to That. + +1:10:56.736 --> 1:11:04.001 +Last idea which gets recently quite a bit +more interest also is what is called minimum + +1:11:04.001 --> 1:11:11.682 +base risk decoding and there is maybe not the +one correct translation but there are several + +1:11:11.682 --> 1:11:13.937 +good correct translations. + +1:11:14.294 --> 1:11:21.731 +And the idea is now we don't want to find +the one translation, which is maybe the highest + +1:11:21.731 --> 1:11:22.805 +probability. + +1:11:23.203 --> 1:11:31.707 +Instead we are looking at all the high translation, +all translation with high probability and then + +1:11:31.707 --> 1:11:39.524 +we want to take one representative out of this +so we're just most similar to all the other + +1:11:39.524 --> 1:11:42.187 +hydrobility translation again. + +1:11:43.643 --> 1:11:46.642 +So how does it work? + +1:11:46.642 --> 1:11:55.638 +First you could have imagined you have reference +translations. + +1:11:55.996 --> 1:12:13.017 +You have a set of reference translations and +then what you want to get is you want to have. + +1:12:13.073 --> 1:12:28.641 +As a probability distribution you measure +the similarity of reference and the hypothesis. + +1:12:28.748 --> 1:12:31.408 +So you have two sets of translation. + +1:12:31.408 --> 1:12:34.786 +You have the human translations of a sentence. + +1:12:35.675 --> 1:12:39.251 +That's of course not realistic, but first +from the idea. + +1:12:39.251 --> 1:12:42.324 +Then you have your set of possible translations. + +1:12:42.622 --> 1:12:52.994 +And now you're not saying okay, we have only +one human, but we have several humans with + +1:12:52.994 --> 1:12:56.294 +different types of quality. + +1:12:56.796 --> 1:13:07.798 +You have to have two metrics here, the similarity +between the automatic translation and the quality + +1:13:07.798 --> 1:13:09.339 +of the human. 
+ +1:13:10.951 --> 1:13:17.451 +Of course, we have the same problem that we +don't have the human reference, so we have. + +1:13:18.058 --> 1:13:29.751 +So when we are doing it, instead of estimating +the quality based on the human, we use our + +1:13:29.751 --> 1:13:30.660 +model. + +1:13:31.271 --> 1:13:37.612 +So we can't be like humans, so we take the +model probability. + +1:13:37.612 --> 1:13:40.782 +We take the set here first of. + +1:13:41.681 --> 1:13:48.755 +Then we are comparing each hypothesis to this +one, so you have two sets. + +1:13:48.755 --> 1:13:53.987 +Just imagine here you take all possible translations. + +1:13:53.987 --> 1:13:58.735 +Here you take your hypothesis in comparing +them. + +1:13:58.678 --> 1:14:03.798 +And then you're taking estimating the quality +based on the outcome. + +1:14:04.304 --> 1:14:06.874 +So the overall idea is okay. + +1:14:06.874 --> 1:14:14.672 +We are not finding the best hypothesis but +finding the hypothesis which is most similar + +1:14:14.672 --> 1:14:17.065 +to many good translations. + +1:14:19.599 --> 1:14:21.826 +Why would you do that? + +1:14:21.826 --> 1:14:25.119 +It's a bit like a smoothing idea. + +1:14:25.119 --> 1:14:28.605 +Imagine this is the probability of. + +1:14:29.529 --> 1:14:36.634 +So if you would do beam search or mini search +or anything, if you just take the highest probability + +1:14:36.634 --> 1:14:39.049 +one, you would take this red one. + +1:14:39.799 --> 1:14:45.686 +Has this type of probability distribution. + +1:14:45.686 --> 1:14:58.555 +Then it might be better to take some of these +models because it's a bit lower in probability. + +1:14:58.618 --> 1:15:12.501 +So what you're mainly doing is you're doing +some smoothing of your probability distribution. + +1:15:15.935 --> 1:15:17.010 +How can you do that? + +1:15:17.010 --> 1:15:20.131 +Of course, we cannot do this again compared +to all the hype. + +1:15:21.141 --> 1:15:29.472 +But what we can do is we have just two sets +and we're just taking them the same. + +1:15:29.472 --> 1:15:38.421 +So we're having our penny data of the hypothesis +and the sum of the soider references. + +1:15:39.179 --> 1:15:55.707 +And we can just take the same clue so we can +just compare the utility of the. + +1:15:56.656 --> 1:16:16.182 +And then, of course, the question is how do +we measure the quality of the hypothesis? + +1:16:16.396 --> 1:16:28.148 +Course: You could also take here the probability +of this pee of given, but you can also say + +1:16:28.148 --> 1:16:30.958 +we only take the top. + +1:16:31.211 --> 1:16:39.665 +And where we don't want to really rely on +how good they are, we filtered out all the + +1:16:39.665 --> 1:16:40.659 +bad ones. + +1:16:40.940 --> 1:16:54.657 +So that is the first question for the minimum +base rhythm, and what are your pseudo references? + +1:16:55.255 --> 1:17:06.968 +So how do you set the quality of all these +references here in the independent sampling? + +1:17:06.968 --> 1:17:10.163 +They all have the same. + +1:17:10.750 --> 1:17:12.308 +There's Also Work Where You Can Take That. + +1:17:13.453 --> 1:17:17.952 +And then the second question you have to do +is, of course,. + +1:17:17.917 --> 1:17:26.190 +How do you prepare now two hypothesisms so +you have now Y and H which are post generated + +1:17:26.190 --> 1:17:34.927 +by the system and you want to find the H which +is most similar to all the other translations. 
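Minimum Bayes risk decoding as described above, in sketch form: from a set of model-generated hypotheses and a set of pseudo-references, pick the hypothesis with the highest average similarity to the pseudo-references. The `utility` function below is a trivial word-overlap placeholder; in practice a sentence-level or neural MT metric would be plugged in, and the sets would be produced by sampling or beam search.

```python
# Minimum Bayes risk decoding sketch with a placeholder similarity function.
def utility(hypothesis, reference):
    # placeholder similarity: word overlap; a real MT metric would be used in practice
    h, r = set(hypothesis.split()), set(reference.split())
    return len(h & r) / max(len(h | r), 1)

def mbr_decode(hypotheses, pseudo_references):
    def expected_utility(h):
        # uniform weighting over pseudo-references, as in the simplest variant
        return sum(utility(h, r) for r in pseudo_references) / len(pseudo_references)
    return max(hypotheses, key=expected_utility)

samples = ["das ist gut", "das ist sehr gut", "alles gut"]
print(mbr_decode(samples, samples))   # the hypotheses can double as pseudo-references
```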
+ +1:17:35.335 --> 1:17:41.812 +So it's mainly like this model here, which +says how similar is age to all the other whites. + +1:17:42.942 --> 1:17:50.127 +So you have to again use some type of similarity +metric, which says how similar to possible. + +1:17:52.172 --> 1:17:53.775 +How can you do that? + +1:17:53.775 --> 1:17:58.355 +We luckily knew how to compare a reference +to a hypothesis. + +1:17:58.355 --> 1:18:00.493 +We have evaluation metrics. + +1:18:00.493 --> 1:18:03.700 +You can do something like sentence level. + +1:18:04.044 --> 1:18:13.501 +But especially if you're looking into neuromodels +you should have a stromometric so you can use + +1:18:13.501 --> 1:18:17.836 +a neural metric which directly compares to. + +1:18:22.842 --> 1:18:29.292 +Yes, so that is, is the main idea of minimum +base risk to, so the important idea you should + +1:18:29.292 --> 1:18:35.743 +keep in mind is that it's doing somehow the +smoothing by not taking the highest probability + +1:18:35.743 --> 1:18:40.510 +one, but by comparing like by taking a set +of high probability one. + +1:18:40.640 --> 1:18:45.042 +And then looking for the translation, which +is most similar to all of that. + +1:18:45.445 --> 1:18:49.888 +And thereby doing a bit more smoothing because +you look at this one. + +1:18:49.888 --> 1:18:55.169 +If you have this one, for example, it would +be more similar to all of these ones. + +1:18:55.169 --> 1:19:00.965 +But if you take this one, it's higher probability, +but it's very dissimilar to all these. + +1:19:05.445 --> 1:19:17.609 +Hey, that is all for decoding before we finish +with your combination of models. + +1:19:18.678 --> 1:19:20.877 +Sort of set of pseudo-reperences. + +1:19:20.877 --> 1:19:24.368 +Thomas Brown writes a little bit of type research +or. + +1:19:24.944 --> 1:19:27.087 +For example, you can do beam search. + +1:19:27.087 --> 1:19:28.825 +You can do sampling for that. + +1:19:28.825 --> 1:19:31.257 +Oh yeah, we had mentioned sampling there. + +1:19:31.257 --> 1:19:34.500 +I don't know somebody asking for what sampling +is good. + +1:19:34.500 --> 1:19:37.280 +So there's, of course, another important issue. + +1:19:37.280 --> 1:19:40.117 +How do you get a good representative set of +age? + +1:19:40.620 --> 1:19:47.147 +If you do beam search, it might be that you +end up with two similar ones, and maybe it's + +1:19:47.147 --> 1:19:49.274 +prevented by doing sampling. + +1:19:49.274 --> 1:19:55.288 +But maybe in sampling you find worse ones, +but yet some type of model is helpful. + +1:19:56.416 --> 1:20:04.863 +Search method use more transformed based translation +points. + +1:20:04.863 --> 1:20:09.848 +Nowadays beam search is definitely. + +1:20:10.130 --> 1:20:13.749 +There is work on this. + +1:20:13.749 --> 1:20:27.283 +The problem is that the MBR is often a lot +more like heavy because you have to sample + +1:20:27.283 --> 1:20:29.486 +translations. + +1:20:31.871 --> 1:20:40.946 +If you are bustling then we take a pen or +a pen for the most possible one. + +1:20:40.946 --> 1:20:43.003 +Now we put them. + +1:20:43.623 --> 1:20:46.262 +Bit and then we say okay, you don't have to +be fine. + +1:20:46.262 --> 1:20:47.657 +I'm going to put it to you. + +1:20:48.428 --> 1:20:52.690 +Yes, so that is what you can also do. + +1:20:52.690 --> 1:21:00.092 +Instead of taking uniform per ability, you +could take the modest. 
+ +1:21:01.041 --> 1:21:14.303 +The uniform is a bit more robust because if +you had this one it might be that there is + +1:21:14.303 --> 1:21:17.810 +some crazy exceptions. + +1:21:17.897 --> 1:21:21.088 +And then it would still relax. + +1:21:21.088 --> 1:21:28.294 +So if you look at this picture, the probability +here would be higher. + +1:21:28.294 --> 1:21:31.794 +But yeah, that's a bit of tuning. + +1:21:33.073 --> 1:21:42.980 +In this case, and yes, it is like modeling +also the ants that. + +1:21:49.169 --> 1:21:56.265 +The last thing is now we always have considered +one model. + +1:21:56.265 --> 1:22:04.084 +It's also some prints helpful to not only +look at one model but. + +1:22:04.384 --> 1:22:10.453 +So in general there's many ways of how you +can make several models and with it's even + +1:22:10.453 --> 1:22:17.370 +easier you can just start three different random +municipalizations you get three different models + +1:22:17.370 --> 1:22:18.428 +and typically. + +1:22:19.019 --> 1:22:27.299 +And then the question is, can we combine their +strength into one model and use that then? + +1:22:29.669 --> 1:22:39.281 +And that can be done and it can be either +online or ensemble, and the more offline thing + +1:22:39.281 --> 1:22:41.549 +is called reranking. + +1:22:42.462 --> 1:22:52.800 +So the idea is, for example, an ensemble that +you combine different initializations. + +1:22:52.800 --> 1:23:02.043 +Of course, you can also do other things like +having different architecture. + +1:23:02.222 --> 1:23:08.922 +But the easiest thing you can change always +in generating two motors is to have different. + +1:23:09.209 --> 1:23:24.054 +And then the question is how can you combine +that? + +1:23:26.006 --> 1:23:34.245 +And the easiest thing, as said, is the bottle +of soda. + +1:23:34.245 --> 1:23:39.488 +What you mainly do is in parallel. + +1:23:39.488 --> 1:23:43.833 +You decode all of the money. + +1:23:44.444 --> 1:23:59.084 +So the probability of the output and you can +join this one to a joint one by just summing + +1:23:59.084 --> 1:24:04.126 +up over your key models again. + +1:24:04.084 --> 1:24:10.374 +So you still have a pro bonding distribution, +but you are not taking only one output here, + +1:24:10.374 --> 1:24:10.719 +but. + +1:24:11.491 --> 1:24:20.049 +So that's one you can easily combine different +models, and the nice thing is it typically + +1:24:20.049 --> 1:24:20.715 +works. + +1:24:21.141 --> 1:24:27.487 +You additional improvement with only more +calculation but not more human work. + +1:24:27.487 --> 1:24:33.753 +You just do the same thing for times and you're +getting a better performance. + +1:24:33.793 --> 1:24:41.623 +Like having more layers and so on, the advantage +of bigger models is of course you have to have + +1:24:41.623 --> 1:24:46.272 +the big models only joint and decoding during +inference. + +1:24:46.272 --> 1:24:52.634 +There you have to load models in parallel +because you have to do your search. + +1:24:52.672 --> 1:24:57.557 +Normally there is more memory resources for +training than you need for insurance. + +1:25:00.000 --> 1:25:12.637 +You have to train four models and the decoding +speed is also slower because you need to decode + +1:25:12.637 --> 1:25:14.367 +four models. + +1:25:14.874 --> 1:25:25.670 +There is one other very important thing and +the models have to be very similar, at least + +1:25:25.670 --> 1:25:27.368 +in some ways. + +1:25:27.887 --> 1:25:28.506 +Course. 
+ +1:25:28.506 --> 1:25:34.611 +You can only combine this one if you have +the same words because you are just. + +1:25:34.874 --> 1:25:43.110 +So just imagine you have two different sizes +because you want to compare them or a director + +1:25:43.110 --> 1:25:44.273 +based model. + +1:25:44.724 --> 1:25:53.327 +That's at least not easily possible here because +once your output would be here a word and the + +1:25:53.327 --> 1:25:56.406 +other one would have to sum over. + +1:25:56.636 --> 1:26:07.324 +So this ensemble typically only works if you +have the same output vocabulary. + +1:26:07.707 --> 1:26:16.636 +Your input can be different because that is +only done once and then. + +1:26:16.636 --> 1:26:23.752 +Your hardware vocabulary has to be the same +otherwise. + +1:26:27.507 --> 1:26:41.522 +There's even a surprising effect of improving +your performance and it's again some kind of + +1:26:41.522 --> 1:26:43.217 +smoothing. + +1:26:43.483 --> 1:26:52.122 +So normally during training what we are doing +is we can save the checkpoints after each epoch. + +1:26:52.412 --> 1:27:01.774 +And you have this type of curve where your +Arab performance normally should go down, and + +1:27:01.774 --> 1:27:09.874 +if you do early stopping it means that at the +end you select not the lowest. + +1:27:11.571 --> 1:27:21.467 +However, some type of smoothing is there again. + +1:27:21.467 --> 1:27:31.157 +Sometimes what you can do is take an ensemble. + +1:27:31.491 --> 1:27:38.798 +That is not as good, but you still have four +different bottles, and they give you a little. + +1:27:39.259 --> 1:27:42.212 +So,. + +1:27:43.723 --> 1:27:48.340 +It's some are helping you, so now they're +supposed to be something different, you know. + +1:27:49.489 --> 1:27:53.812 +Oh didn't do that, so that is a checkpoint. + +1:27:53.812 --> 1:27:59.117 +There is one thing interesting, which is even +faster. + +1:27:59.419 --> 1:28:12.255 +Normally let's give you better performance +because this one might be again like a smooth + +1:28:12.255 --> 1:28:13.697 +ensemble. + +1:28:16.736 --> 1:28:22.364 +Of course, there is also some problems with +this, so I said. + +1:28:22.364 --> 1:28:30.022 +For example, maybe you want to do different +web representations with Cherokee and. + +1:28:30.590 --> 1:28:37.189 +You want to do right to left decoding so you +normally do like I go home but then your translation + +1:28:37.189 --> 1:28:39.613 +depends only on the previous words. + +1:28:39.613 --> 1:28:45.942 +If you want to model on the future you could +do the inverse direction and generate the target + +1:28:45.942 --> 1:28:47.895 +sentence from right to left. + +1:28:48.728 --> 1:28:50.839 +But it's not easy to combine these things. + +1:28:51.571 --> 1:28:56.976 +In order to do this, or what is also sometimes +interesting is doing in verse translation. + +1:28:57.637 --> 1:29:07.841 +You can combine these types of models in the +next election. + +1:29:07.841 --> 1:29:13.963 +That is only a bit which we can do. + +1:29:14.494 --> 1:29:29.593 +Next time what you should remember is how +search works and do you have any final questions. + +1:29:33.773 --> 1:29:43.393 +Then I wish you a happy holiday for next week +and then Monday there is another practical + +1:29:43.393 --> 1:29:50.958 +and then Thursday in two weeks so we'll have +the next lecture Monday. 
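As an illustration of the ensembling discussed above (several models with a shared output vocabulary, for example different random initialisations or several checkpoints of one training run, whose output distributions are averaged at every decoding step), here is a minimal sketch; the `models` objects and their `step` method are placeholders, not a real API.

```python
# Ensemble decoding sketch: every model predicts a distribution over the shared
# vocabulary for the next word, and the distributions are averaged before the
# search (greedy, beam, ...) picks the next token.
import numpy as np

def ensemble_step(models, enc_states_per_model, partial_output):
    probs = [m.step(enc, partial_output)
             for m, enc in zip(models, enc_states_per_model)]
    return np.mean(probs, axis=0)   # combined distribution over the vocabulary

# Note: this only works if all models share the same output vocabulary; with
# different (sub)word vocabularies the distributions would not line up.
```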
+ diff --git a/demo_data/lectures/Lecture-09-25.05.2023/video.mp4 b/demo_data/lectures/Lecture-09-25.05.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6e8c6b74b6369481feabc8c1f492b73d07025cfc --- /dev/null +++ b/demo_data/lectures/Lecture-09-25.05.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb17280ddd03304eacdea7239b8a65b617c0c5bc9a4ab92e07100370c09187af +size 119262060 diff --git a/demo_data/lectures/Lecture-10-13.06.2023/English.vtt b/demo_data/lectures/Lecture-10-13.06.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..08cdb5eae2b0d841596391b3c2e0a2adc893d9f0 --- /dev/null +++ b/demo_data/lectures/Lecture-10-13.06.2023/English.vtt @@ -0,0 +1,2450 @@ +WEBVTT + +0:00:00.860 --> 0:00:04.211 +Okay Again Welcome. + +0:00:04.524 --> 0:00:09.256 +So today I'll be doing the lecture. + +0:00:09.256 --> 0:00:12.279 +My name is Danny Liro. + +0:00:12.279 --> 0:00:16.747 +I'm one of the PhD students with. + +0:00:17.137 --> 0:00:25.942 +And specifically how to learn representations +that are common across languages and use that + +0:00:25.942 --> 0:00:29.004 +to help low resource languages. + +0:00:29.689 --> 0:00:39.445 +So hope today we can explore a little bit +about motoring machine translation and hopefully. + +0:00:40.100 --> 0:00:50.940 +So today what we are going to do first we +are going to look at. + +0:00:52.152 --> 0:01:02.491 +Second, we will be looking into more details +as in how we achieve modeling or machine translation + +0:01:02.491 --> 0:01:06.183 +and what are the techniques there. + +0:01:06.183 --> 0:01:12.197 +At last, we are going to look at the current +challenges. + +0:01:13.573 --> 0:01:15.976 +Alright, so some definitions. + +0:01:15.976 --> 0:01:19.819 +First, what is modeling or machine translation? + +0:01:21.201 --> 0:01:28.637 +So for a multilingual machine translation +system, it's basically a system that is able + +0:01:28.637 --> 0:01:34.279 +to handle multiple source languages or multiple +target languages. + +0:01:34.254 --> 0:01:44.798 +You see here you've got source on the source +side, some German Chinese, Spanish and English. + +0:01:45.485 --> 0:01:50.615 +Physically, it's also a quite interesting +machine learning challenge actually. + +0:01:51.031 --> 0:02:05.528 +So if you consider each translation pair as +a different task in machine learning, then + +0:02:05.528 --> 0:02:08.194 +a multilingual. + +0:02:08.628 --> 0:02:17.290 +Where it has to specialize in all these different +translation directions and try to be good. + +0:02:17.917 --> 0:02:26.890 +So this is basically about multi-task learning, +and here when translation direction being one + +0:02:26.890 --> 0:02:27.462 +task. + +0:02:28.428 --> 0:02:35.096 +Interesting question to ask here is like do +we get synergy like different tasks helping + +0:02:35.096 --> 0:02:39.415 +each other, the knowledge of one task helping +the other? + +0:02:39.539 --> 0:02:48.156 +Or do we get more interference in English +to German, and now I get worse at English to + +0:02:48.156 --> 0:02:49.047 +Chinese. + +0:02:49.629 --> 0:02:55.070 +So this is also a very interesting question +that we'll look into later. + +0:02:56.096 --> 0:02:58.605 +Now a little bit of context. + +0:02:59.519 --> 0:03:04.733 +We care about multilingual machine translation. + +0:03:04.733 --> 0:03:10.599 +Part of the thing is that machine translation +models. 
+ +0:03:11.291 --> 0:03:22.659 +If you consider all the languages in the world, +there are a read it here roughly seven thousand + +0:03:22.659 --> 0:03:23.962 +languages. + +0:03:24.684 --> 0:03:37.764 +So consider this number, and if you think +about this many languages out there, how many + +0:03:37.764 --> 0:03:39.548 +directions. + +0:03:40.220 --> 0:03:46.897 +So this means to cover end languages. + +0:03:46.897 --> 0:03:59.374 +We're going to end up with a prodretic in +square number of directions. + +0:03:59.779 --> 0:04:02.290 +This Is Very Bad, Padre Is Very Bad. + +0:04:03.203 --> 0:04:14.078 +The prosthetic situation going on means that +for a lot of translation directions, if you + +0:04:14.078 --> 0:04:16.278 +consider all the. + +0:04:17.177 --> 0:04:34.950 +For many of them we aren't going to have any +parallel data as in existing translated data. + +0:04:35.675 --> 0:04:40.001 +So this is a very data scarce situation. + +0:04:40.001 --> 0:04:49.709 +We're not going to get parallel data in blue +wear, especially likely when you have a system + +0:04:49.709 --> 0:04:52.558 +that covers tan languages. + +0:04:52.912 --> 0:05:04.437 +If this access actually goes towards thousands +that are realistic, we are going to end up + +0:05:04.437 --> 0:05:06.614 +with some holes. + +0:05:07.667 --> 0:05:15.400 +So now we are going to ask: Can we use motel +inquality to help this kind of glow resource? + +0:05:15.875 --> 0:05:22.858 +So when useful concept there is mutual intelligibility, +don't know if you've heard of this. + +0:05:23.203 --> 0:05:30.264 +Basically isn't linguistic when you say somebody +who's speaking one language can directly without + +0:05:30.264 --> 0:05:33.218 +learning understands the other language. + +0:05:33.218 --> 0:05:39.343 +So if you're a German speaker maybe Dutch +or Danish and all that kind of stuff would + +0:05:39.343 --> 0:05:39.631 +be. + +0:05:40.000 --> 0:05:45.990 +Useful or like directly understandable partially +to you. + +0:05:46.586 --> 0:05:52.082 +That is, thanks to this kind of mutual enthology +ability that is basically based on language + +0:05:52.082 --> 0:05:52.791 +similarity. + +0:05:53.893 --> 0:05:57.105 +And then there's knowledge sharing this concept. + +0:05:57.105 --> 0:06:01.234 +I mean, it's quite intuitive, basically a +very German speaker. + +0:06:01.234 --> 0:06:06.805 +If you start learning Dutch or Danish and +all these Mordic languages, I think you're + +0:06:06.805 --> 0:06:11.196 +going to be faster than just a native English +speaker or anything. + +0:06:11.952 --> 0:06:18.751 +So hopefully our model is also able to do +this, but we'll see later what the real situation. + +0:06:19.799 --> 0:06:27.221 +So we said multilingual is good multilingual +transmission, it's nice and there's a lot of + +0:06:27.221 --> 0:06:28.210 +potentials. + +0:06:28.969 --> 0:06:32.205 +So it's a long path towards there. + +0:06:32.205 --> 0:06:37.569 +Think all the efforts started in so quite +some years ago. + +0:06:37.958 --> 0:06:54.639 +At first people started with models with language +specific modules. + +0:06:54.454 --> 0:06:58.747 +So we talked about the input of the decoder +architecture in the previous lecturer area. + +0:07:00.100 --> 0:07:06.749 +And with this separation of the inputter and +the decoder, it gives it a natural way to split + +0:07:06.749 --> 0:07:07.679 +the modules. + +0:07:09.069 --> 0:07:20.805 +So basically what's happening going on here +is dedicated to each toes language and dedicated. 
+ +0:07:21.281 --> 0:07:34.252 +Now given parallel data of body good data +English German data we just activate this German + +0:07:34.252 --> 0:07:39.241 +inputter and activate this and an. + +0:07:40.680 --> 0:07:48.236 +So now we are training basically like corresponding +parts of the encoder decoders. + +0:07:48.236 --> 0:07:55.278 +It has some advantages: First, we have a multilingual +system. + +0:07:55.278 --> 0:08:03.898 +Of course, second modularity is also an advantage +in software engineering. + +0:08:03.898 --> 0:08:10.565 +We want to decouple things if the German input +is broken. + +0:08:11.011 --> 0:08:19.313 +So modularity is advantage in this case, but +again if we think about scalability, if we + +0:08:19.313 --> 0:08:27.521 +think about languages out there that we talked +about, scalability isn't a great thing. + +0:08:27.947 --> 0:08:37.016 +We also talked about sharing knowledge or +sharing representations for different languages. + +0:08:37.317 --> 0:08:41.968 +We have a separate thing for each language. + +0:08:41.968 --> 0:08:46.513 +How likely is it that we are sharing much? + +0:08:46.513 --> 0:08:52.538 +So these are potential disadvantages with +this approach. + +0:08:53.073 --> 0:09:01.181 +So yeah we talked about, we want to have knowledge +transfer, we want to have similar languages + +0:09:01.181 --> 0:09:02.888 +helping each other. + +0:09:02.822 --> 0:09:06.095 +This is somehow a more reachable goal. + +0:09:06.095 --> 0:09:13.564 +If you have a shared in corner and a shared +in physically, a full perimeter shared model + +0:09:13.564 --> 0:09:21.285 +for all the translation pairs out there, and +there's also another game, so if you just have + +0:09:21.285 --> 0:09:21.705 +one. + +0:09:22.582 --> 0:09:26.084 +Lock of model for all the translation directions +out there. + +0:09:26.606 --> 0:09:38.966 +It's easier to deploy in the sense that if +you are serving a model you don't have a thousand + +0:09:38.966 --> 0:09:42.555 +small modules to maintain. + +0:09:42.762 --> 0:09:52.448 +So in terms of engineering somehow these kind +of fully primitive shared models have: So this + +0:09:52.448 --> 0:09:59.819 +is also where the parent research has been +going towards in recent years. + +0:10:00.460 --> 0:10:16.614 +So the rest of the electro are also going +to focus on this kind of model. + +0:10:17.037 --> 0:10:30.901 +So the first type of multilinguali is this +kind of many to one abbreviated kind of situation. + +0:10:30.901 --> 0:10:34.441 +Basically what's going. + +0:10:35.355 --> 0:10:49.804 +So one news case that you can think of here +is if you're subtitled for international movies + +0:10:49.804 --> 0:10:51.688 +in Germany. + +0:10:53.073 --> 0:11:02.863 +Then flipping the situation there is also +many configurations where we only have when + +0:11:02.863 --> 0:11:04.798 +source language. + +0:11:06.046 --> 0:11:13.716 +There's also many use cases like if you think +about the lecture translator here you've seen. + +0:11:14.914 --> 0:11:21.842 +So here most of the lecturers are in German +and now we want to translate it into. + +0:11:21.842 --> 0:11:28.432 +I think on the user end we only support English +but they're also supportable. + +0:11:28.608 --> 0:11:38.988 +So in this kind of used case, if you have +one speaker and you want to serve or expand + +0:11:38.988 --> 0:11:41.281 +to many audience,. + +0:11:42.802 --> 0:11:50.542 +But of course, combining everything, there's +the many to many situation here. 
+ +0:11:50.542 --> 0:11:54.015 +You can think of Google Translate. + +0:11:54.015 --> 0:11:58.777 +They are doing basically any selected language. + +0:11:59.159 --> 0:12:03.760 +And this is also more difficult. + +0:12:03.760 --> 0:12:14.774 +If you consider the data you need to get and +concerns, we'll cover this later. + +0:12:15.135 --> 0:12:21.034 +But first we are going to start with many +to one translations. + +0:12:21.741 --> 0:12:30.436 +Say this is the most similar to the bilingual +translation situation you saw earlier, but + +0:12:30.436 --> 0:12:39.423 +now one difference is we need a vocabulary +or tokens that can represent all these different + +0:12:39.423 --> 0:12:40.498 +languages. + +0:12:41.301 --> 0:12:44.200 +So we need a joint more telecom global vocabulary. + +0:12:44.924 --> 0:12:48.794 +So let's just quickly recall what word embedding +is to do. + +0:12:49.189 --> 0:12:54.561 +Basically we need to represent it. + +0:12:54.561 --> 0:13:04.077 +We have to get some vector representation +for discrete words. + +0:13:04.784 --> 0:13:16.911 +And when we embed a token, we are retrieving +the corresponding vector out of this little. + +0:13:17.697 --> 0:13:19.625 +And then we put it. + +0:13:19.625 --> 0:13:26.082 +We feed a sequence of vectors into the inputter +as the next steps. + +0:13:26.987 --> 0:13:34.973 +Now if it's motelingual you can imagine that +vocabulary suddenly gets very, very big because + +0:13:34.973 --> 0:13:36.262 +the languages. + +0:13:37.877 --> 0:13:46.141 +So what is quite useful here is the by pair +like subwords you talked about by pairing. + +0:13:46.406 --> 0:13:55.992 +So in this case we are still limiting ourselves +to a finite number of vocabularies that we + +0:13:55.992 --> 0:13:59.785 +are exploding the vocabulary table. + +0:14:01.181 --> 0:14:11.631 +So when we learn these kinds of subwords, +what happens basically? + +0:14:11.631 --> 0:14:17.015 +We look at all the training data. + +0:14:18.558 --> 0:14:20.856 +So think about this. + +0:14:20.856 --> 0:14:28.077 +If we do this now on a bunch of Mozilla data, +are there concerns? + +0:14:30.050 --> 0:14:36.811 +Maybe we have an underground status head, +so we get over English mergers and nocularities. + +0:14:37.337 --> 0:14:39.271 +Yeah Exactly Thanks. + +0:14:39.539 --> 0:14:46.602 +So what we have to pay attention to here is +learn this motilingual vocabulary. + +0:14:46.602 --> 0:14:52.891 +We should pay attention: All the languages +are more or less balanced, not that you only + +0:14:52.891 --> 0:14:58.912 +learning words is for for English or some bigger +languages, and then neglecting other other + +0:14:58.912 --> 0:15:00.025 +languages, yeah. + +0:15:01.021 --> 0:15:04.068 +Of course, this is not going to solve everything. + +0:15:04.068 --> 0:15:09.614 +Even if we get a perfectly uniform distribution +out of all the languages out, there is not + +0:15:09.614 --> 0:15:13.454 +going to mean that we are ending up with a +perfect vocabulary. + +0:15:14.154 --> 0:15:20.068 +There are also language differences read, +so if you consider more European languages. + +0:15:20.180 --> 0:15:27.081 +There will be many shared subcomponents like +how you write a certain word, somewhat similar. + +0:15:27.267 --> 0:15:34.556 +But then there are other languages with completely +different scripts like Arabic, Cyrillic scripts + +0:15:34.556 --> 0:15:40.594 +or Eastern Asian scripts where you get a vocabulary +like the characters set with. 
+ +0:15:40.940 --> 0:15:43.531 +Tens of thousands of characters. + +0:15:43.531 --> 0:15:50.362 +So these are also individual concerns that +one has to think about my building specific + +0:15:50.362 --> 0:15:51.069 +systems. + +0:15:51.591 --> 0:16:02.660 +But overall, the rule of thumb is that when +you do a mottling tokenizer vocabulary, there's + +0:16:02.660 --> 0:16:04.344 +more or less. + +0:16:05.385 --> 0:16:17.566 +And there's actually some paper showing that +the performance of the final system is going + +0:16:17.566 --> 0:16:25.280 +to start to degrade if you have a disproportionate +data. + +0:16:27.207 --> 0:16:33.186 +Of course there is currently the trend of +using pre-train models. + +0:16:33.186 --> 0:16:39.890 +If you take a pre-train model somewhere then +you don't have this concern. + +0:16:40.580 --> 0:16:47.810 +Making sure that you use the same organizers +that they used so that there is no train test + +0:16:47.810 --> 0:16:48.287 +time. + +0:16:48.888 --> 0:16:53.634 +Yeah for a pre-trainer, we're going to talk +about a little bit later as well. + +0:16:54.734 --> 0:16:59.960 +Alright: So now where's a Martin Luther vocabulary? + +0:17:00.920 --> 0:17:04.187 +There are several good things, obviously. + +0:17:04.187 --> 0:17:10.953 +So one thing is that if we have words that +are in the textful form like we said, there + +0:17:10.953 --> 0:17:16.242 +are European languages that share some vocabulary, +then it's great. + +0:17:16.242 --> 0:17:19.897 +Then we have the first step towards knowledge. + +0:17:20.000 --> 0:17:30.464 +For example, the word pineapple for some reason +is also in Eastern European languages. + +0:17:30.464 --> 0:17:34.915 +In Cyrillic scripts that's also the. + +0:17:36.116 --> 0:17:42.054 +But however, there is also ambiguity if you've +embracing together or dye. + +0:17:42.054 --> 0:17:46.066 +Of course, they mean different things for +German. + +0:17:46.246 --> 0:17:53.276 +Then, of course, that's possible to rely on +further context. + +0:17:53.276 --> 0:17:59.154 +It's not a problem, it's something to think +about. + +0:18:00.200 --> 0:18:11.061 +And when we go higher to cover more vocabulary +entries, we might need to go bigger in the + +0:18:11.061 --> 0:18:13.233 +vocabulary count. + +0:18:13.653 --> 0:18:28.561 +So there is always sort of a bottleneck as +the number of languages increase. + +0:18:30.110 --> 0:18:32.836 +Right, so what is the result? + +0:18:32.836 --> 0:18:38.289 +What are these crustling over inventings actually +learning? + +0:18:40.160 --> 0:18:44.658 +So normally to inspect them it's quite hard. + +0:18:44.658 --> 0:18:53.853 +It's like high dimensional vectors with dimensions, +but researchers also try to project it. + +0:18:54.454 --> 0:19:05.074 +So in this case it is a little bit small, +but in this case for English and French there + +0:19:05.074 --> 0:19:07.367 +are many injuries. + +0:19:07.467 --> 0:19:20.014 +My example is like different words with the +same word in morphological forms. + +0:19:20.014 --> 0:19:26.126 +Basically, it's like a morphological. + +0:19:26.546 --> 0:19:32.727 +There are also words in different languages +like think there is research for English and + +0:19:32.727 --> 0:19:33.282 +French. + +0:19:33.954 --> 0:19:41.508 +So the take away from this plot is that somehow +we learn a bit of semantic meanings beyond + +0:19:41.508 --> 0:19:43.086 +the textual forms. + +0:19:45.905 --> 0:19:50.851 +But then this looks good and this gives us +hope. 
+ +0:19:52.252 --> 0:20:05.240 +That if we consider what is the baseline here, +the baseline we compare to is a bilingual system + +0:20:05.240 --> 0:20:09.164 +without any multilinguality. + +0:20:10.290 --> 0:20:19.176 +This looks good because if we compare for +many Central European languages, Eastern and + +0:20:19.176 --> 0:20:28.354 +Central European languages to English, we compare: +And we see that the Mini Two English has actually + +0:20:28.354 --> 0:20:30.573 +always gained quite a bit over it. + +0:20:31.751 --> 0:20:38.876 +But there is also later investigation on whether +it is actually out of mountain linguality or + +0:20:38.876 --> 0:20:39.254 +not. + +0:20:39.639 --> 0:20:46.692 +So this is a spoiler won't tell much about +it until the second half, but just remember + +0:20:46.692 --> 0:20:47.908 +there is this. + +0:20:49.449 --> 0:20:53.601 +Now move on to many translations. + +0:20:53.601 --> 0:21:01.783 +Let's recall in a normal transformer or any +encoder decoder setup. + +0:21:02.242 --> 0:21:08.839 +We have an inkluder that creates sort of contextual +representation for the sort of sentence. + +0:21:09.949 --> 0:21:17.787 +Is more or less the context for generating +the target sentence red. + +0:21:17.787 --> 0:21:28.392 +Now on the target side we get the first open, +then we feed it again and then get the second + +0:21:28.392 --> 0:21:29.544 +decoding. + +0:21:31.651 --> 0:21:35.039 +And now we have multiple target languages. + +0:21:35.039 --> 0:21:39.057 +Does anybody see a problem with this architecture? + +0:21:48.268 --> 0:21:57.791 +Specifically, it's in the decoder, so now +have a German sentiments encoded. + +0:21:57.791 --> 0:22:01.927 +It now want to generate Spanish. + +0:22:07.367 --> 0:22:11.551 +So the problem is how does the model know +which language to generate? + +0:22:12.112 --> 0:22:24.053 +If you just give it a generic start token, +there is nowhere where we are telling the model. + +0:22:24.944 --> 0:22:30.277 +So that this can only be a guess, and this +model will definitely not run well. + +0:22:32.492 --> 0:22:40.021 +So this comes to the question: How do we indicate +the one's intended language to the model? + +0:22:41.441 --> 0:22:52.602 +One first idea is what people tried is basically +now in a source where not only including the + +0:22:52.602 --> 0:22:53.552 +source. + +0:22:53.933 --> 0:23:01.172 +To Spanish things like this, so basically +the source is already informed. + +0:23:01.172 --> 0:23:12.342 +The source sentence is already supplemented +with: Now this is also called a target forcing + +0:23:12.342 --> 0:23:19.248 +in the sense that we try to force it to give +the right target. + +0:23:20.080 --> 0:23:24.622 +This is one approach. + +0:23:24.622 --> 0:23:38.044 +Another approach is basically based on the +idea that if we have. + +0:23:38.438 --> 0:23:52.177 +So if we create a context of our world, the +incode output shouldn't really differ. + +0:23:52.472 --> 0:24:02.397 +So out of this motivation people have moved +this signaling mechanism. + +0:24:02.397 --> 0:24:09.911 +They basically replaced the traditional start +token. + +0:24:10.330 --> 0:24:17.493 +So here we are not kids starting into the +generic start talking anymore instead language + +0:24:17.493 --> 0:24:18.298 +specific. + +0:24:18.938 --> 0:24:21.805 +So this is also another way to achieve this. + +0:24:23.283 --> 0:24:27.714 +But there are still more challenging cases. 
+ +0:24:27.714 --> 0:24:35.570 +Sometimes here it can be called as General +English or German when it's there. + +0:24:35.570 --> 0:24:39.700 +Later on it goes further and further on. + +0:24:40.320 --> 0:24:46.752 +Basically this information is not strong enough +to always enforce the target language, especially + +0:24:46.752 --> 0:24:48.392 +in zero shot conditions. + +0:24:48.392 --> 0:24:54.168 +We'll look into this later so we'll get this +kind of target translation into generating + +0:24:54.168 --> 0:24:57.843 +and generating and then going into some wrong +language. + +0:24:59.219 --> 0:25:12.542 +So another technique actually developed here +some years ago was to inject this language. + +0:25:12.872 --> 0:25:19.834 +So when we are feeding doing the auto-aggressive +decoding normally, we only feed the upherb. + +0:25:20.000 --> 0:25:22.327 +Into the depoter. + +0:25:22.327 --> 0:25:33.704 +But if we also add a language embedding for +the target language, on top of that we have + +0:25:33.704 --> 0:25:37.066 +the language information. + +0:25:37.397 --> 0:25:44.335 +And this has shown to perform quite a bit +better, especially in conditions where the + +0:25:44.335 --> 0:25:44.906 +model. + +0:25:46.126 --> 0:25:56.040 +So yeah, we introduced three ways to enforce +the Tardid language: And now with this we're + +0:25:56.040 --> 0:26:02.607 +going to move on to the more interesting case +of many too many translations. + +0:26:03.503 --> 0:26:14.021 +Am so here we just consider a system that +translates two directions: English to English + +0:26:14.021 --> 0:26:15.575 +and English. + +0:26:16.676 --> 0:26:21.416 +Now we have target languages read. + +0:26:21.416 --> 0:26:29.541 +Can you see where we're enforcing the target +language here? + +0:26:29.541 --> 0:26:33.468 +In this case what technique? + +0:26:34.934 --> 0:26:45.338 +So here we are enforcing the characteristic +language with the yelling we train this system. + +0:26:46.526 --> 0:27:00.647 +And at the inference time we are able to generate +English to French, but in addition to this + +0:27:00.647 --> 0:27:12.910 +we are also able to: We will be able to do +zero shot inference that basically translates + +0:27:12.910 --> 0:27:17.916 +a direction that is not seen in training. + +0:27:19.319 --> 0:27:25.489 +So this is so called zero shot translation +using a modeling wall system. + +0:27:26.606 --> 0:27:34.644 +Of course, we have to reach several things +before we are able to control the language, + +0:27:34.644 --> 0:27:36.769 +otherwise it's no use. + +0:27:37.317 --> 0:27:51.087 +Second, we should also have some kind of language +independent representation. + +0:27:51.731 --> 0:27:53.196 +Why is this? + +0:27:53.196 --> 0:27:55.112 +Why is this big? + +0:27:55.112 --> 0:28:00.633 +Because if women drink generally French up +here? + +0:28:00.940 --> 0:28:05.870 +It was trained to translate from some English. + +0:28:07.187 --> 0:28:15.246 +But now we use Anchored Germans in the French, +so intuitively we need these representations + +0:28:15.246 --> 0:28:22.429 +to be similar enough, not that they are so +far attracted that we cannot use this. + +0:28:25.085 --> 0:28:32.059 +So there are several works out there showing +that if you do a standard transformer architecture + +0:28:32.059 --> 0:28:39.107 +this language independent property is not really +there and you need to add additional approaches + +0:28:39.107 --> 0:28:40.633 +in order to enforce. 
+ +0:28:41.201 --> 0:28:51.422 +So you can, for example, add an additional +training objective: That says, we invoked SARSN, + +0:28:51.422 --> 0:29:00.305 +be invoked by German, and the invoked English +have to be the same or be as close to each + +0:29:00.305 --> 0:29:02.201 +other as possible. + +0:29:02.882 --> 0:29:17.576 +So if we take the output and the output for +another language, how can we formulate this + +0:29:17.576 --> 0:29:18.745 +as an. + +0:29:20.981 --> 0:29:27.027 +We can take the translation to the encoder +and whatever you translate. + +0:29:27.027 --> 0:29:32.817 +The embeddings also must be similar and that's +the great direction. + +0:29:33.253 --> 0:29:42.877 +So one thing to take care of here is the length +for the same sentence in German and English + +0:29:42.877 --> 0:29:44.969 +is not necessarily. + +0:29:45.305 --> 0:30:00.858 +So if we just do a word to word matching, +we can always do pulling to a fixed length + +0:30:00.858 --> 0:30:03.786 +representation. + +0:30:04.004 --> 0:30:08.392 +Or there are more advanced techniques that +involve some alignments. + +0:30:08.848 --> 0:30:23.456 +So this is useful in the sense that in this +part in experiments we have shown it improves + +0:30:23.456 --> 0:30:27.189 +zero shot translation. + +0:30:27.447 --> 0:30:36.628 +This is on the data condition of English to +Malay, Java and Filipino, so kind of made to + +0:30:36.628 --> 0:30:39.722 +low resource language family. + +0:30:40.100 --> 0:30:50.876 +And there we assume that we get parallel English +to all of them, but among all these. + +0:30:51.451 --> 0:31:03.592 +So the blue bar is a Vanilla Transformer model, +and the purple bar is when we add a language. + +0:31:04.544 --> 0:31:12.547 +You see that in supervised conditions it's +not changing much, but in zero shots there's + +0:31:12.547 --> 0:31:13.183 +quite. + +0:31:15.215 --> 0:31:22.649 +Yeah, so far we said zero shots is doable +and it's even more achievable if we enforce + +0:31:22.649 --> 0:31:26.366 +some language independent representations. + +0:31:26.366 --> 0:31:29.823 +However, there's one practical concern. + +0:31:29.823 --> 0:31:33.800 +Don't know if you also had the same question. + +0:31:34.514 --> 0:31:39.835 +If you have two languages, you don't have +direct parallel. + +0:31:39.835 --> 0:31:43.893 +One's into English and one's out of English. + +0:31:45.685 --> 0:31:52.845 +It's actually this kind of approach is called +pivoting as in pivoting over an intermediate + +0:31:52.845 --> 0:31:53.632 +language. + +0:31:55.935 --> 0:32:00.058 +Yeah, that it definitely has advantages in +the sense that we're going. + +0:32:00.440 --> 0:32:11.507 +Now if we go over these two steps every direction +was trained with supervised data so you could + +0:32:11.507 --> 0:32:18.193 +always assume that when we are working with +a supervised. + +0:32:18.718 --> 0:32:26.868 +So in this case we can expect more robust +inference time behavior. + +0:32:26.868 --> 0:32:31.613 +However, there are also disadvantages. + +0:32:31.531 --> 0:32:38.860 +An inference where passing through the model +ties so that's doubling the inference time + +0:32:38.860 --> 0:32:39.943 +computation. 
+ +0:32:40.500 --> 0:32:47.878 +You might think okay doubling then what, but +if you consider if your company like Google, + +0:32:47.878 --> 0:32:54.929 +Google Translate and all your life traffic +suddenly becomes twice as big, this is not + +0:32:54.929 --> 0:33:00.422 +something scalable that you want to see, especially +in production. + +0:33:01.641 --> 0:33:11.577 +A problem with this is making information +loss because if we go over these games when + +0:33:11.577 --> 0:33:20.936 +a chain of kids pass the word to each other, +in the end it's losing information. + +0:33:22.082 --> 0:33:24.595 +Can give it an example here. + +0:33:24.595 --> 0:33:27.803 +It's also from a master thesis here. + +0:33:27.803 --> 0:33:30.316 +It's on gender preservation. + +0:33:30.770 --> 0:33:39.863 +Basically, some languages like Italian and +French have different word forms based on the + +0:33:39.863 --> 0:33:40.782 +speaker. + +0:33:41.001 --> 0:33:55.987 +So if a male person says feel alienated, this +word for alienated would be exclusive and a + +0:33:55.987 --> 0:33:58.484 +female person. + +0:34:00.620 --> 0:34:05.730 +Now imagine that we pivot through anguish. + +0:34:05.730 --> 0:34:08.701 +The information is lost. + +0:34:08.701 --> 0:34:11.910 +We don't know what gender. + +0:34:12.492 --> 0:34:19.626 +When we go out into branch again, there are +different forms. + +0:34:19.626 --> 0:34:29.195 +Depending on the speaker gender, we can: So +this is one problem. + +0:34:31.871 --> 0:34:44.122 +This is especially the case because English +compared to many other languages is relatively + +0:34:44.122 --> 0:34:45.199 +simple. + +0:34:45.205 --> 0:34:53.373 +Gendered where it forms like this, it also +doesn't have many cases, so going through English + +0:34:53.373 --> 0:34:56.183 +many information would be lost. + +0:34:57.877 --> 0:35:12.796 +And another thing is if you have similar languages +that you are translating out of my systems + +0:35:12.796 --> 0:35:15.494 +that translates. + +0:35:16.496 --> 0:35:24.426 +This is the output of going from Dutch to +German again. + +0:35:24.426 --> 0:35:30.231 +If you read the German, how many of you? + +0:35:32.552 --> 0:35:51.679 +Good and the problem here is that we are going +over English and then the English to German. + +0:35:51.831 --> 0:36:06.332 +However, if we go direct in this case zero +shot translation you see that word forgive. + +0:36:06.546 --> 0:36:09.836 +In this case, the outward translation is better. + +0:36:10.150 --> 0:36:20.335 +And we believe this has to do with using the +language similarity between the two languages. + +0:36:20.335 --> 0:36:26.757 +There is also quantitative results we found +when born in. + +0:36:27.988 --> 0:36:33.780 +The models are always doing better when translating +similar languages compared to the. + +0:36:35.535 --> 0:36:42.093 +Yeah, so in this first half what we talked +about basically first, we started with how + +0:36:42.093 --> 0:36:49.719 +motilinguality or motilingual machine translation +could enable knowledge transfer between languages + +0:36:49.719 --> 0:36:53.990 +and help with conditions where we don't have +much data. + +0:36:55.235 --> 0:37:02.826 +Now it looks at three types of multilingual +translation, so one is many to one, one to + +0:37:02.826 --> 0:37:03.350 +many. 
+ +0:37:05.285 --> 0:37:13.397 +We got there first about a shared vocabulary +based on different languages and how these + +0:37:13.397 --> 0:37:22.154 +cross lingual word embeddings capture semantic +meanings rather than just on a text proof form. + +0:37:25.505 --> 0:37:37.637 +Then we looked at how to signal the target +language, how to ask for the model to generate, + +0:37:37.637 --> 0:37:43.636 +and then we looked at zero shot translation. + +0:37:45.325 --> 0:37:58.187 +You now before go into the second half are +there questions about the first okay good. + +0:38:00.140 --> 0:38:10.932 +In the second half of this lecture we'll be +looking into challenges like what is still + +0:38:10.932 --> 0:38:12.916 +unsolved about. + +0:38:13.113 --> 0:38:18.620 +There are some aspects to look at it. + +0:38:18.620 --> 0:38:26.591 +The first is modeling, the second is more +engineering. + +0:38:28.248 --> 0:38:33.002 +Okay, so we talked about this question several +times. + +0:38:33.002 --> 0:38:35.644 +How does motilinguality help? + +0:38:35.644 --> 0:38:37.405 +Where does it help? + +0:38:38.298 --> 0:38:45.416 +Here want to show results of an experiment +based on over a hundred languages. + +0:38:46.266 --> 0:38:58.603 +Here you can see the data amount so they use +parallel data to English and it's very. + +0:38:58.999 --> 0:39:00.514 +This is already lock scale. + +0:39:00.961 --> 0:39:12.982 +So for higher resource languages like English +to French, German to Spanish you get over billion + +0:39:12.982 --> 0:39:14.359 +sentences. + +0:39:14.254 --> 0:39:21.003 +In parallel, and when we go more to the right +to the more low resource spectrum on the other + +0:39:21.003 --> 0:39:26.519 +hand, there are languages that maybe many of +us have new and heard of like. + +0:39:26.466 --> 0:39:29.589 +Do You Want to Move Back? + +0:39:30.570 --> 0:39:33.270 +Hawaiian Indians have heard of it. + +0:39:34.414 --> 0:39:39.497 +So on that spectrum we only have like thirty +thousand sentences. + +0:39:40.400 --> 0:39:48.389 +So what this means is when we train, we have +to up sample these guys. + +0:39:48.389 --> 0:39:51.585 +The model didn't even know. + +0:39:52.732 --> 0:40:05.777 +Yeah, so on this graph on how we read it is +this horizontal line and zero is basically + +0:40:05.777 --> 0:40:07.577 +indicating. + +0:40:07.747 --> 0:40:14.761 +Because we want to see where mottling quality +helps only compare to what happens when there + +0:40:14.761 --> 0:40:15.371 +is not. + +0:40:16.356 --> 0:40:29.108 +So upper like higher than the zero line it +means we're gaining. + +0:40:29.309 --> 0:40:34.154 +The same like for these languages. + +0:40:34.154 --> 0:40:40.799 +This side means we are a high resource for +the. + +0:40:40.981 --> 0:40:46.675 +Yeah sorry, think I've somehow removed the +the ex-O as he does. + +0:40:48.008 --> 0:40:58.502 +Yeah alright, what happens now if we look +at many into English? + +0:40:58.698 --> 0:41:08.741 +On the low resource spectrum by going multilingua +we gain a lot over the Palumbo system. + +0:41:10.010 --> 0:41:16.658 +Overall, if you consider the average for all +of the languages, it's still again. + +0:41:17.817 --> 0:41:27.301 +Now we're looking at the green line so you +can ignore the blue line. + +0:41:27.301 --> 0:41:32.249 +Basically we have to do our sample. + +0:41:33.753 --> 0:41:41.188 +Yeah, so if you just even consider the average, +it's still a game form over by link. 
+ +0:41:42.983 --> 0:41:57.821 +However, if we go to the English to many systems +looking at the gains, we only get minor improvements. + +0:41:59.039 --> 0:42:12.160 +So why is it the case that Going Mott Lingu +isn't really helping universally? + +0:42:16.016 --> 0:42:18.546 +Do you have some intuitions on yeah? + +0:42:18.698 --> 0:42:38.257 +It's easier to understand something that generates +if we consider what the model has to generate. + +0:42:38.718 --> 0:42:40.091 +I See It Like. + +0:42:40.460 --> 0:42:49.769 +Generating is a bit like writing or speaking, +while inputing on the source side is more like + +0:42:49.769 --> 0:42:50.670 +reading. + +0:42:50.650 --> 0:42:57.971 +So one is more passive and the other is more +active and don't know if you have similar experience. + +0:42:57.971 --> 0:43:05.144 +I think speaking and writing is always a little +bit more difficult than just passively listening + +0:43:05.144 --> 0:43:06.032 +or reading. + +0:43:06.032 --> 0:43:09.803 +But this is a very pendwavy kind of understanding. + +0:43:10.390 --> 0:43:11.854 +And fed. + +0:43:12.032 --> 0:43:20.309 +In terms of the model, if we consider what +is the difference for the target side for many + +0:43:20.309 --> 0:43:26.703 +to English: One difference is that there's +a data difference. + +0:43:27.167 --> 0:43:33.438 +So if you just consider a modern English system +with German to English and Spanish to English,. + +0:43:34.975 --> 0:43:44.321 +One thing we have to keep in mind is that +the parallel data is not all the same, so on + +0:43:44.321 --> 0:43:49.156 +the target side there are different English. + +0:43:49.769 --> 0:43:54.481 +So the situation rather looks like this. + +0:43:54.481 --> 0:43:59.193 +What this means is that we are going to. + +0:44:00.820 --> 0:44:04.635 +We also add more data on the target side for +English. + +0:44:06.967 --> 0:44:18.581 +Now since the target side data is not identical, +how do we do a controlled experiment to remove + +0:44:18.581 --> 0:44:21.121 +the multilinguality? + +0:44:24.644 --> 0:44:42.794 +So what people tried as a control experiment +is to keep all the English same as the above + +0:44:42.794 --> 0:44:44.205 +setup. + +0:44:44.684 --> 0:44:49.700 +So they take the English on English data of +the same branch to German. + +0:44:50.090 --> 0:44:55.533 +And then the general synthetic data for Germans. + +0:44:55.533 --> 0:45:05.864 +So now we have a bilingual system again, but +on the target side we still have the previously + +0:45:05.864 --> 0:45:08.419 +enriched English data. + +0:45:10.290 --> 0:45:25.092 +Now back to this picture that we've seen before, +this mysterious orange line here is basically + +0:45:25.092 --> 0:45:26.962 +the result. + +0:45:27.907 --> 0:45:36.594 +And somewhat struckly and perhaps sadly for +believers of multilinguality. + +0:45:36.594 --> 0:45:39.176 +This is also gaining. + +0:45:41.001 --> 0:45:52.775 +So what this means is for the many English +is gaining not really because of multilinguality + +0:45:52.775 --> 0:45:55.463 +but just because of. + +0:45:55.976 --> 0:46:10.650 +And this means that there is still quite a +lot to do if we really want to gain from just + +0:46:10.650 --> 0:46:13.618 +shared knowledge. + +0:46:14.514 --> 0:46:27.599 +But this also gives hope because there are +still many things to research in this area + +0:46:27.599 --> 0:46:28.360 +now. + +0:46:28.708 --> 0:46:40.984 +So we've seen adding more languages helps +with somewhat data side effect and can it hurt. 
+ +0:46:40.984 --> 0:46:45.621 +So if we just add more languages. + +0:46:47.007 --> 0:46:48.408 +We've seen this. + +0:46:48.408 --> 0:46:52.694 +This is the picture for the Manitou English +system. + +0:46:53.793 --> 0:47:09.328 +Comparing to this valuable face line, we see +that for these high resource languages we are + +0:47:09.328 --> 0:47:12.743 +not doing as great. + +0:47:15.956 --> 0:47:18.664 +So why are we losing here? + +0:47:18.664 --> 0:47:25.285 +It's been showing that this performance last +is somewhat related. + +0:47:26.026 --> 0:47:37.373 +In the sense that the motto has to learn so +much that at some point it has to sacrifice + +0:47:37.373 --> 0:47:39.308 +capacity from. + +0:47:41.001 --> 0:47:57.081 +So what to do to basically grow a bigger brain +to tackle this is to add some dedicated capacity + +0:47:57.081 --> 0:47:59.426 +per language. + +0:48:00.100 --> 0:48:15.600 +Here it's like a simplified graph of a transformer +architecture, so this is the encoder within + +0:48:15.600 --> 0:48:16.579 +time. + +0:48:17.357 --> 0:48:27.108 +But additionally here these little colorable +blouse are now the language-specific capable + +0:48:27.108 --> 0:48:28.516 +of capacity. + +0:48:29.169 --> 0:48:42.504 +There are language specific in the sense that +if you get the Chinese to English, the pattern. + +0:48:43.103 --> 0:48:54.900 +We are also going to language specific parts +that in this case consists of a down projection. + +0:48:56.416 --> 0:49:07.177 +So this is also called adaptors, something +that is plugged into an existing model and + +0:49:07.177 --> 0:49:11.556 +it adapts towards a specific task. + +0:49:12.232 --> 0:49:22.593 +And this is conditionally activated in the +sense that if you get a different input sentence. + +0:49:27.307 --> 0:49:34.173 +So this was first proposed in by some folks +selling Google. + +0:49:34.173 --> 0:49:36.690 +Does this scale well? + +0:49:39.619 --> 0:49:56.621 +Yes exactly, so this is a translation periscusive +cannon adapter, and this is not going to scale + +0:49:56.621 --> 0:49:57.672 +well. + +0:49:58.959 --> 0:50:13.676 +So this also brought people to try some more +simple architecture. + +0:50:16.196 --> 0:50:22.788 +Yeah, this is also an alternative, in this +case called monolingual adapters. + +0:50:24.184 --> 0:50:32.097 +Any of these adapters so again have this low +resource. + +0:50:32.097 --> 0:50:42.025 +The zero line is bilingual baseline, but the +lines are interpolated. + +0:50:43.783 --> 0:50:48.767 +The red one is the mottling word original +mottling word model. + +0:50:49.929 --> 0:50:57.582 +And if we put the adapters in like a basic +virginal adapter that goes to the blue liner,. + +0:50:58.078 --> 0:51:08.582 +You see the lids gaining performance for the +high resource languages. + +0:51:08.582 --> 0:51:16.086 +If they even scale a lot, this further increases. + +0:51:16.556 --> 0:51:22.770 +So this is also a side kind of this. + +0:51:23.103 --> 0:51:27.807 +From the side shows that it's really a capacity +bottom up. + +0:51:28.488 --> 0:51:30.590 +Like If You Eleanor. + +0:51:31.151 --> 0:51:34.313 +Resource they regain their performance. + +0:51:38.959 --> 0:51:50.514 +For smaller languages, but it's just. + +0:51:50.770 --> 0:52:03.258 +Think in the original modeling, the smaller +languages they weren't constrained by capacity. + +0:52:05.445 --> 0:52:13.412 +So guess for the smaller languages, the difficulty +is more the data rather than the model capacity. 
+ +0:52:13.573 --> 0:52:26.597 +So in general you always want to have more +or less data matching your model capacity. + +0:52:27.647 --> 0:52:33.255 +Yeah, here think the bigger challenge for +lower roots was the data. + +0:52:34.874 --> 0:52:39.397 +You also mention it a little bit. + +0:52:39.397 --> 0:52:46.979 +Are these adapters per language or how many +adapters do? + +0:52:47.267 --> 0:52:55.378 +And do we have to design them differently +so that we learn to share more like a language + +0:52:55.378 --> 0:52:56.107 +family? + +0:52:56.576 --> 0:53:15.680 +So one downside of the adaptor we talked about +is that basically there is no way to go over. + +0:53:16.516 --> 0:53:31.391 +So then a recent kind of additional approach +for these language specific capacity is so + +0:53:31.391 --> 0:53:36.124 +called routing or learning. + +0:53:36.256 --> 0:53:42.438 +Basically, we have these language specific +components. + +0:53:42.438 --> 0:53:45.923 +We also have a shared adapter. + +0:53:45.923 --> 0:53:52.574 +The model should learn: So in this case maybe +we could imagine for the lower resource case + +0:53:52.574 --> 0:53:54.027 +that we just talked about. + +0:53:54.094 --> 0:54:04.838 +Sense to go there because there's not much +to do with language specific anyway than it's + +0:54:04.838 --> 0:54:10.270 +better to make use of similarity with other. + +0:54:11.111 --> 0:54:30.493 +So this architecture is more data driven instead +of what we specify prior to training. + +0:54:31.871 --> 0:54:33.998 +So how do we learn this? + +0:54:35.095 --> 0:54:49.286 +Basically, in terms of the mask, we want to +basically have a binary rule that goes either + +0:54:49.286 --> 0:54:50.548 +to the. + +0:54:51.311 --> 0:54:56.501 +But how do we get a valued zero or one mean +we can? + +0:54:56.501 --> 0:54:58.498 +We can do a signal. + +0:54:58.999 --> 0:55:13.376 +However, one thing is we don't want to get +stuck in the middle, so we don't want black. + +0:55:14.434 --> 0:55:28.830 +It is also bad because it is not going to +be the same training and test time by the way. + +0:55:31.151 --> 0:55:50.483 +So here the question is how do we force basically +the model to always go there prior to activation? + +0:55:54.894 --> 0:56:02.463 +Found it interesting because it sounds like +a trick for me. + +0:56:02.463 --> 0:56:05.491 +This approach has been. + +0:56:06.026 --> 0:56:15.844 +So what they do is prior to going through +this activation, and they add some bosom noise. + +0:56:17.257 --> 0:56:31.610 +If there is always noise prior to activation +then the model will be encouraged to preserve + +0:56:31.610 --> 0:56:34.291 +the information. + +0:56:36.356 --> 0:56:44.067 +Was a very interesting thing that found out +while preparing this, so wanted to share this + +0:56:44.067 --> 0:56:44.410 +as. + +0:56:44.544 --> 0:56:48.937 +So basically you can create a battery gate +with this technique. + +0:56:50.390 --> 0:57:01.668 +And if you add these language specific routing: +Here they also have some that can control how + +0:57:01.668 --> 0:57:07.790 +much is shared and how much is language specific. + +0:57:07.727 --> 0:57:16.374 +Here the seals are the is the routing with +the red and orange lines, so. + +0:57:16.576 --> 0:57:22.752 +So you can see that poor for many and many +to one there in both cases quite some games. + +0:57:23.063 --> 0:57:30.717 +So that is the overall picture and just find +the idea of the routing quite interesting. + +0:57:30.991 --> 0:57:32.363 +And UM. 
+ +0:57:32.212 --> 0:57:38.348 +It's also getting a bit more increasingly +used as there are the so called mixture of + +0:57:38.348 --> 0:57:39.431 +expert models. + +0:57:39.499 --> 0:57:51.801 +The model learns where to route the input +so they are all conditionally activated when + +0:57:51.801 --> 0:57:53.074 +you are. + +0:57:53.213 --> 0:57:59.089 +But this is not really something specific +to mortal inquality, so won't talk too much + +0:57:59.089 --> 0:57:59.567 +about. + +0:58:00.620 --> 0:58:02.115 +No. + +0:58:01.761 --> 0:58:09.640 +From this parrot is first that we talked about +the listing of the capacity bottleneck. + +0:58:10.570 --> 0:58:19.808 +Where we can partly compensate by adapters +or adding language specific capacity, there's + +0:58:19.808 --> 0:58:23.026 +the idea of negative transfer. + +0:58:24.844 --> 0:58:35.915 +When we add any additional capacity, how can +we improve the knowledge sharing? + +0:58:38.318 --> 0:58:46.662 +Also, for this one too many directions that +seem to be hopeless for multilinguality, can + +0:58:46.662 --> 0:58:47.881 +we actually? + +0:58:49.129 --> 0:58:52.171 +Yeah, these are all open things still in the +area. + +0:58:53.673 --> 0:59:04.030 +Now next part, I'm going to talk about some +data challenges for Model Ewell. + +0:59:04.030 --> 0:59:07.662 +We talk about Model Ewell. + +0:59:08.488 --> 0:59:14.967 +But there are these lower resource languages +that don't have well curated parallel data. + +0:59:16.216 --> 0:59:27.539 +When alternative people resort to Pro Data +from the Internet, there's a lot of noise. + +0:59:27.927 --> 0:59:36.244 +And in this paper last year they did some +manual analyses of several popular cross data + +0:59:36.244 --> 0:59:36.811 +sets. + +0:59:37.437 --> 0:59:55.262 +And you'll see that there are a lot of wrong +translations, non-linguistic contents, pornographic + +0:59:55.262 --> 0:59:57.100 +contents. + +0:59:57.777 --> 1:00:04.661 +So as you can imagine, they say what you eat. + +1:00:04.661 --> 1:00:20.116 +If you use this kind of data to train a model, +you can: So there are also many techniques + +1:00:20.116 --> 1:00:28.819 +for filtering and filtering these noisy data +sets. + +1:00:29.809 --> 1:00:36.982 +So to filter these out we can use an additional +classifier that basically are trained to classify + +1:00:36.982 --> 1:00:43.496 +which language to sentences and then kick out +all the sentences with the wrong language. + +1:00:45.105 --> 1:00:49.331 +Another thing is the length ratio. + +1:00:49.331 --> 1:01:00.200 +Basically, the assumption there is that if +two sentences are translations of each other,. + +1:01:01.901 --> 1:01:08.718 +So often people use maybe a ratio of three +and then it eliminates the rest. + +1:01:09.909 --> 1:01:20.187 +Also, the other idea maybe similar to the +language classifier is basically to heaven + +1:01:20.187 --> 1:01:24.540 +allowed character set per language. + +1:01:24.540 --> 1:01:28.289 +So if you're trying to filter. + +1:01:28.568 --> 1:01:34.622 +Don't know Cyrillic spribs or Arabic spribs, +then it's maybe a good idea to remove them. + +1:01:35.775 --> 1:01:43.123 +This is not all there are many other ideas +using some pre-trained neural networks to compare + +1:01:43.123 --> 1:01:50.629 +the representations, but just to give you an +idea of what our basic techniques were filtering. + +1:01:50.991 --> 1:01:53.458 +Is quite important. + +1:01:53.458 --> 1:02:02.465 +We have seen in our experience that if you +do these thoroughly there is. 
+ +1:02:03.883 --> 1:02:17.814 +So after all, even if we do web crawling, +there is still a bit of data scarcity problem. + +1:02:18.118 --> 1:02:30.760 +So there are many bad things that can happen +when there's too little training data. + +1:02:30.760 --> 1:02:35.425 +The first is low performances. + +1:02:35.735 --> 1:02:55.562 +So they did it on many English system index +languages, all together with here means: So + +1:02:55.562 --> 1:03:04.079 +we really need to get that area of a lot of +data in order to get that ideal performance. + +1:03:04.884 --> 1:03:20.639 +There are also many horrible things that can +happen in general when you train a model across + +1:03:20.639 --> 1:03:24.874 +different training runs. + +1:03:26.946 --> 1:03:36.733 +So one solution to tackle this problem, the +data scarcity problem, is by fine tuning some + +1:03:36.733 --> 1:03:38.146 +pre-trained. + +1:03:38.979 --> 1:03:46.245 +And basically the idea is you've got the pre-trained +model that can already do translation. + +1:03:46.846 --> 1:03:54.214 +Then you find units on your own training data +and you end up with a more specialized model. + +1:03:55.155 --> 1:03:59.369 +So why does pretraining help? + +1:03:59.369 --> 1:04:11.448 +One argument is that if you do pretraining +then the motto has seen over more data and + +1:04:11.448 --> 1:04:12.713 +learned. + +1:04:13.313 --> 1:04:19.135 +Say more generalizable representations that +can help more downstream tasks. + +1:04:19.719 --> 1:04:28.063 +So in this case we are basically trying to +make use of the more meaningful and generalizable + +1:04:28.063 --> 1:04:29.499 +representation. + +1:04:30.490 --> 1:04:45.103 +So for machine translation there are several +open source models out there that can handle + +1:04:45.103 --> 1:04:46.889 +languages. + +1:04:48.188 --> 1:04:49.912 +Two hundred model. + +1:04:49.912 --> 1:04:53.452 +They also cover two hundred languages. + +1:04:53.452 --> 1:04:57.628 +That means that's quite a lot of translation. + +1:04:57.978 --> 1:05:06.218 +However, one thing to remember is that these +lados are more like a how do you call them. + +1:05:06.146 --> 1:05:12.812 +Jackson Waltry is a master of none in the +sense that they are very good as coverage, + +1:05:12.812 --> 1:05:20.498 +but if you look at specific translation directions +they might be not as good as dedicated models. + +1:05:21.521 --> 1:05:34.170 +So here I'm going to have some results by +comparing random initialization versus the + +1:05:34.170 --> 1:05:36.104 +first thing. + +1:05:36.396 --> 1:05:46.420 +The third line is the result of basically +finding a pre-train model that is one of the + +1:05:46.420 --> 1:05:47.342 +family. + +1:05:47.947 --> 1:05:51.822 +So in this case you could see the. + +1:05:51.831 --> 1:05:58.374 +If we just look at the second line, that is +the pre trade model out of the box, you see + +1:05:58.374 --> 1:06:04.842 +that if we just use it out of the box, the +performance everywhere isn't super great as + +1:06:04.842 --> 1:06:06.180 +dedicated models. + +1:06:07.867 --> 1:06:21.167 +But then here that ex-here means English: +So the first takeaway here is that if we do + +1:06:21.167 --> 1:06:31.560 +pre-train financing again when we do it into +English,. + +1:06:33.433 --> 1:06:40.438 +Here is that we are forgetting. + +1:06:40.438 --> 1:06:50.509 +When we do further training there is no data. + +1:06:50.770 --> 1:07:04.865 +So even if we initialize the pre-trained bottle +and continue training, if we don't see translation. 
+ +1:07:05.345 --> 1:07:13.826 +So this is bad machine learning people termed +it as perfect forgetting in the sense that + +1:07:13.826 --> 1:07:20.115 +if you have a model that is trained to do some +task and then you. + +1:07:20.860 --> 1:07:22.487 +This Is Also Pretty Bad. + +1:07:24.244 --> 1:07:32.341 +Is especially bad if you consider training +data actually grows over time. + +1:07:32.341 --> 1:07:35.404 +It's not like you have one. + +1:07:36.336 --> 1:07:46.756 +So in practice we do not always train systems +from stretch so it's more like you have an + +1:07:46.756 --> 1:07:54.951 +existing system and later we want to expand +the translation coverage. + +1:07:57.277 --> 1:08:08.932 +Here and the key question is how do we continue +training from an existing system in doing so? + +1:08:09.909 --> 1:08:12.288 +Approaches. + +1:08:12.288 --> 1:08:27.945 +One very simple one is to include a portion +of your previous training so that. + +1:08:28.148 --> 1:08:34.333 +So if you consider you have an English German +system and now you want to explain it to English + +1:08:34.333 --> 1:08:34.919 +French,. + +1:08:36.036 --> 1:08:42.308 +Like so nice going English, French and English +German, so when you train it you still include + +1:08:42.308 --> 1:08:45.578 +a small proportion of your previous German +data. + +1:08:45.578 --> 1:08:51.117 +Hopefully your model is not forgetting that +much about the previously lent German. + +1:08:53.073 --> 1:08:58.876 +Idea here is what we saw earlier. + +1:08:58.876 --> 1:09:09.800 +We can also add adaptors and only train them +while keeping the. + +1:09:10.170 --> 1:09:26.860 +So this means we're going to end up with a +generic model that was not anyhow changed. + +1:09:27.447 --> 1:09:37.972 +So in this way it's also more module and more +suitable to the incremental learning kind of. + +1:09:38.758 --> 1:09:49.666 +Right in this part, the takeaways guess are +first data filtering. + +1:09:49.666 --> 1:09:55.120 +His Internet data is very noisy. + +1:09:56.496 --> 1:10:05.061 +Second, it's about paint tuning pre-fine models +and how we can or cannot avoid catastrophic + +1:10:05.061 --> 1:10:06.179 +forgetting. + +1:10:07.247 --> 1:10:15.866 +And of course open questions would include +how can we do incremental learning with these + +1:10:15.866 --> 1:10:19.836 +multilingual machine translation models? + +1:10:20.860 --> 1:10:31.840 +So with this in mind would like to briefly +cover several engineering challenges when we + +1:10:31.840 --> 1:10:43.031 +talk about: Yeah, earlier we also briefly talked +about the motelingual means sometimes you have + +1:10:43.031 --> 1:10:51.384 +to scale up, you have to make your models bigger +just to have that capacity to deal with. + +1:10:52.472 --> 1:10:59.262 +This means the model sizes are getting bigger +and sometimes having one single is not enough + +1:10:59.262 --> 1:11:00.073 +to handle. + +1:11:00.400 --> 1:11:08.914 +Here wanted to introduce ideas of going parallel +and scaling up. + +1:11:08.914 --> 1:11:12.843 +The first is so called model. + +1:11:14.434 --> 1:11:18.859 +Don't know if you also had this in other like +maury cue related courses. + +1:11:20.220 --> 1:11:30.639 +Okay, so the idea of data parallel is basically +we train in parallel. + +1:11:30.790 --> 1:11:35.852 +We put our model onto several GPS. + +1:11:35.852 --> 1:11:47.131 +We send the same model there and then when +we get the training data we split. 
+ +1:11:48.108 --> 1:11:54.594 +So each on each of these we are doing the +forward and backward pass in parallel. + +1:11:55.355 --> 1:12:07.779 +Then after we get his gradient all these reviews +will be synchronized and the gradients will + +1:12:07.779 --> 1:12:09.783 +be aggregated. + +1:12:11.691 --> 1:12:27.127 +We are having a bigger batch size in effect, +so this would be much faster than, for example, + +1:12:27.127 --> 1:12:31.277 +doing all these smaller. + +1:12:32.772 --> 1:12:45.252 +That is, if your model itself is too big to +fit onto an energy group, so you cannot split + +1:12:45.252 --> 1:12:46.084 +this. + +1:12:46.486 --> 1:12:51.958 +And honestly, the model itself, unless you're +going for those. + +1:12:51.891 --> 1:12:55.500 +Huge models the industry made these days. + +1:12:55.500 --> 1:13:03.233 +I've never run into a situation where the +single model itself does not fit into one shape + +1:13:03.233 --> 1:13:03.748 +here. + +1:13:03.748 --> 1:13:08.474 +Realistically, it's more the what is memory +consuming. + +1:13:08.528 --> 1:13:14.871 +It is more of the backward cast and the Optimizer +states that led me to be stored. + +1:13:15.555 --> 1:13:22.193 +So but still there are people training gigantic +models where they have to go model parallel. + +1:13:22.602 --> 1:13:35.955 +This means you have a model consisting of +all those orange pets, but it doesn't fit to + +1:13:35.955 --> 1:13:40.714 +split the next several layers. + +1:13:41.581 --> 1:13:51.787 +So this means when you do the forward pass +you have to wait and to finish before doing. + +1:13:52.532 --> 1:14:11.193 +And this kind of implementation is sometimes +a bit architecture or specific. + +1:14:12.172 --> 1:14:17.177 +Right, so there's one more thing when scaling +up. + +1:14:17.177 --> 1:14:19.179 +Want it to mention. + +1:14:20.080 --> 1:14:25.687 +We also talked about it briefly earlier. + +1:14:25.687 --> 1:14:34.030 +We said that when we go to Linguo we need +a vocabulary that. + +1:14:34.614 --> 1:14:40.867 +And can give you some numbers. + +1:14:40.867 --> 1:14:53.575 +Most of the pre-trained modeling models here +use a vocabulary. + +1:14:53.933 --> 1:14:58.454 +Normally each vector is. + +1:14:58.454 --> 1:15:10.751 +This means just the word embedding table alone +is times parameters. + +1:15:11.011 --> 1:15:18.620 +This means just for the embedding table alone +it's already taking million parameters of the. + +1:15:19.859 --> 1:15:28.187 +And this is often one of the largest parts +of the machine. + +1:15:28.187 --> 1:15:31.292 +This also comes with. + +1:15:31.651 --> 1:15:43.891 +So one question is how can we efficiently +represent a multilingual vocabulary? + +1:15:43.891 --> 1:15:49.003 +Are there better ways than just? + +1:15:50.750 --> 1:16:00.526 +There are many out there people tread, maybe +not all targeted for mottling wool, but think. + +1:16:00.840 --> 1:16:03.635 +So when is bites level representation? + +1:16:03.743 --> 1:16:11.973 +So the idea there is if we train with data +they're all stored on computers, so all their + +1:16:11.973 --> 1:16:15.579 +characters must be reused in by bites. + +1:16:15.579 --> 1:16:23.716 +So they want to then not using subwords, not +using characters, but using bites instead. + +1:16:25.905 --> 1:16:27.693 +Do You See Some Downsides? + +1:16:31.791 --> 1:16:38.245 +There are some languages that are easier to +represent than others. + +1:16:38.245 --> 1:16:40.556 +That's definitely true. 
+ +1:16:41.081 --> 1:16:44.981 +So if you have a sentence normally of five +words,. + +1:16:46.246 --> 1:16:59.899 +You think about if we split it into characters, +how many characters we have, and each character + +1:16:59.899 --> 1:17:04.166 +that would be how many bites. + +1:17:04.424 --> 1:17:15.749 +And then it's more to model, it's more for +the model to learn, and it's also a bigger + +1:17:15.749 --> 1:17:19.831 +sequence to give to the model. + +1:17:20.260 --> 1:17:22.038 +Yeah. + +1:17:21.941 --> 1:17:31.232 +Visual representation is also quite interesting, +so some people argued that we don't want to + +1:17:31.232 --> 1:17:35.428 +have a fixed discrete vocabulary anymore. + +1:17:35.428 --> 1:17:41.921 +Instead, we want to do it like OCR, like reading +them as images. + +1:17:42.942 --> 1:17:54.016 +We'll look at one example for this next: Then +another idea is how if you can distill the + +1:17:54.016 --> 1:18:03.966 +vocabulary as in learning some more compact +representation,. + +1:18:04.284 --> 1:18:12.554 +But next wanted to show you an example of +pixel inputs for modeling war machine. + +1:18:12.852 --> 1:18:29.757 +If you look at the picture, all the characters +that are marked with red are actually not. + +1:18:32.772 --> 1:18:48.876 +They are actually from a different script +for the model and let it do the subword tokenization. + +1:18:52.852 --> 1:19:04.373 +You would get maybe mostly characters out +of it because I guess in the pre existing vocabulary + +1:19:04.373 --> 1:19:07.768 +there won't be Latin H and. + +1:19:07.707 --> 1:19:16.737 +So you'll get characters out of it, which +means it's probably going to be more difficult + +1:19:16.737 --> 1:19:18.259 +for the model. + +1:19:20.140 --> 1:19:28.502 +Yeah, so the motivation for pixel inputs is +that there is more sharing across languages. + +1:19:30.010 --> 1:19:37.773 +Here basically illustrates an embedding table +for subwords and saying if you have sentences + +1:19:37.773 --> 1:19:45.705 +in the letter scripts like French and the English +then it's going to take certain proportions + +1:19:45.705 --> 1:19:48.152 +of this big embetting table. + +1:19:48.328 --> 1:19:56.854 +While for Arabic and Chinese it's yet again +another,. + +1:19:56.796 --> 1:20:09.037 +That is not joined with the previous one if +we want to have shared representations for + +1:20:09.037 --> 1:20:11.992 +different languages. + +1:20:12.692 --> 1:20:18.531 +On the other hand, if we're going with pixels, +there's definitely more sharing. + +1:20:22.362 --> 1:20:30.911 +There's a difference though to a standard +kind of norm machine translation typeline. + +1:20:32.252 --> 1:20:47.581 +If you have this brace then how do we go with +images into a translation model? + +1:20:50.690 --> 1:20:58.684 +We still have to tokenize it somehow, so in +this case they do an overlapping sliding window. + +1:20:59.259 --> 1:21:13.636 +Since it's more visual, we're using some kind +of convolution blocks before going into these + +1:21:13.636 --> 1:21:14.730 +black. + +1:21:15.035 --> 1:21:25.514 +So here wanted to show that if you go with +these more specialist architectures we get + +1:21:25.514 --> 1:21:27.829 +pixels and that's. + +1:21:30.050 --> 1:21:31.310 +There's Also One Down the Side. + +1:21:31.431 --> 1:21:51.380 +If we go with pixels and present teachings, +what are our challenges? + +1:21:52.993 --> 1:22:00.001 +Exactly so as they beat us others here, also +pointing out here for their experiments. 
+ +1:22:01.061 --> 1:22:08.596 +They only consider a one target language, +and this is also on their target site. + +1:22:08.596 --> 1:22:10.643 +It's not pixel based. + +1:22:11.131 --> 1:22:31.033 +So this is definitely, in my opinion, very +interesting steps towards more shared representations. + +1:22:31.831 --> 1:22:40.574 +Yeah, so with this kind of out of the box +approach just wanted to summarize today's lecture. + +1:22:41.962 --> 1:22:53.158 +First think we saw why motelingue is cool, +why there are several open challenges out there + +1:22:53.158 --> 1:22:53.896 +that. + +1:22:55.355 --> 1:23:03.601 +We also saw, like several approaches, how +to realize implement a modern molecular translation + +1:23:03.601 --> 1:23:11.058 +system, and yeah, lastly, we've seen quite +some over challenges on what is unsolved. + +1:23:11.691 --> 1:23:22.403 +Yeah, so with this want to thank you for being +here today and I'm up there if you want. + +1:23:26.106 --> 1:23:29.727 +If you have questions, how will we also share +with the moment? + diff --git a/demo_data/lectures/Lecture-10-13.06.2023/video.mp4 b/demo_data/lectures/Lecture-10-13.06.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3331b31ff4594e61dc1428487c760f30f85d51f4 --- /dev/null +++ b/demo_data/lectures/Lecture-10-13.06.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8dc282db3512e8731326f1898c8dd757c40f33bd1468ffae249a9374f76fe28 +size 122197601 diff --git a/demo_data/lectures/Lecture-11-15.06.2023/English.vtt b/demo_data/lectures/Lecture-11-15.06.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..deef97df3a96141804a22e9708d9c3656351e9d5 --- /dev/null +++ b/demo_data/lectures/Lecture-11-15.06.2023/English.vtt @@ -0,0 +1,12362 @@ +WEBVTT + +0:00:00.981 --> 0:00:20.036 +Today about is how to use some type of additional +resources to improve the translation. + +0:00:20.300 --> 0:00:28.188 +We have in the first part of the semester +two thirds of the semester how to build some + +0:00:28.188 --> 0:00:31.361 +of your basic machine translation. + +0:00:31.571 --> 0:00:42.317 +Now the basic components are both for statistical +and for neural, with the encoded decoding. + +0:00:43.123 --> 0:00:46.000 +Now, of course, that's not where it stops. + +0:00:46.000 --> 0:00:51.286 +It's still what nearly every machine translation +system is currently in there. + +0:00:51.286 --> 0:00:57.308 +However, there's a lot of challenges which +you need to address in addition and which need + +0:00:57.308 --> 0:00:58.245 +to be solved. + +0:00:58.918 --> 0:01:09.858 +And there we want to start to tell you what +else can you do around this, and partly. + +0:01:10.030 --> 0:01:14.396 +And one important question there is on what +do you train your models? + +0:01:14.394 --> 0:01:32.003 +Because like this type of parallel data, it's +easier in machine translation than in other + +0:01:32.003 --> 0:01:33.569 +trusts. + +0:01:33.853 --> 0:01:41.178 +And therefore an important question is, can +we also learn from like other sources and through? + +0:01:41.701 --> 0:01:47.830 +Because if you remember strongly right at +the beginning of the election,. + +0:01:51.171 --> 0:01:53.801 +This Is How We Train All Our. + +0:01:54.194 --> 0:01:59.887 +Machine learning models from statistical to +neural. 
+ +0:01:59.887 --> 0:02:09.412 +This doesn't have changed so we need this +type of parallel data where we have a source + +0:02:09.412 --> 0:02:13.462 +sentence aligned with a target data. + +0:02:13.493 --> 0:02:19.135 +We have now a strong model here, a very good +model to do that. + +0:02:19.135 --> 0:02:22.091 +However, we always rely on this. + +0:02:22.522 --> 0:02:28.395 +For languages, high risk language pairs say +from German to English or other European languages, + +0:02:28.395 --> 0:02:31.332 +there is decent amount, at least for similarly. + +0:02:31.471 --> 0:02:37.630 +But even there if we are going to very specific +domains it might get difficult and then your + +0:02:37.630 --> 0:02:43.525 +system performance might drop because if you +want to translate now some medical text for + +0:02:43.525 --> 0:02:50.015 +example of course you need to also have peril +data in the medical domain to know how to translate + +0:02:50.015 --> 0:02:50.876 +these types. + +0:02:51.231 --> 0:02:55.264 +Phrases how to use the vocabulary and so on +in the style. + +0:02:55.915 --> 0:03:04.887 +And if you are going to other languages, there +is a lot bigger challenge and the question + +0:03:04.887 --> 0:03:05.585 +there. + +0:03:05.825 --> 0:03:09.649 +So is really this the only resource we can +use. + +0:03:09.889 --> 0:03:19.462 +Can be adapted or training phase in order +to also make use of other types of models that + +0:03:19.462 --> 0:03:27.314 +might enable us to build strong systems with +other types of information. + +0:03:27.707 --> 0:03:35.276 +And that we will look into now in the next +starting from from just saying the next election. + +0:03:35.515 --> 0:03:40.697 +So this idea we already have covered on Tuesday. + +0:03:40.697 --> 0:03:45.350 +One very successful idea for this is to do. + +0:03:45.645 --> 0:03:51.990 +So that we're no longer doing translation +between languages, but we can do translation + +0:03:51.990 --> 0:03:55.928 +between languages and share common knowledge +between. + +0:03:56.296 --> 0:04:04.703 +And you also learned about things like zero +shots machine translation so you can translate + +0:04:04.703 --> 0:04:06.458 +between languages. + +0:04:06.786 --> 0:04:09.790 +Which is the case for many, many language +pairs. + +0:04:10.030 --> 0:04:19.209 +Like even with German, you have not translation +parallel data to all languages around the world, + +0:04:19.209 --> 0:04:26.400 +or most of them you have it to the Europeans +once, maybe even for Japanese. + +0:04:26.746 --> 0:04:35.332 +There is quite a lot of data, for example +English to Japanese, but German to Japanese + +0:04:35.332 --> 0:04:37.827 +or German to Vietnamese. + +0:04:37.827 --> 0:04:41.621 +There is some data from Multilingual. + +0:04:42.042 --> 0:04:54.584 +So there is a very promising direction if +you want to build translation systems between + +0:04:54.584 --> 0:05:00.142 +language peers, typically not English. + +0:05:01.221 --> 0:05:05.887 +And the other ideas, of course, we don't have +to either just search for it. + +0:05:06.206 --> 0:05:12.505 +Some work on a data crawling so if I don't +have a corpus directly or I don't have an high + +0:05:12.505 --> 0:05:19.014 +quality corpus like from the European Parliament +for a TED corpus so maybe it makes sense to + +0:05:19.014 --> 0:05:23.913 +crawl more data and get additional sources +so you can build stronger. 
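One common way to realize the shared multilingual model and the zero-shot behaviour recapped above (not spelled out in this lecture) is to train a single system on all language pairs and mark the desired output language with a tag on the source side. A minimal sketch; the tag format and example sentences are illustrative assumptions.

```python
def add_target_tag(source_sentence: str, target_lang: str) -> str:
    """Prepend a target-language token so one shared model serves many directions."""
    return f"<2{target_lang}> {source_sentence}"

# Pairs from several directions are simply mixed into one training corpus.
training_data = [
    (add_target_tag("Das ist ein Test.", "en"), "This is a test."),
    (add_target_tag("This is a test.", "de"), "Das ist ein Test."),
]

# At inference time the same tag can request a direction that never appeared
# in training (e.g. German -> Japanese), which is the zero-shot case.
zero_shot_input = add_target_tag("Das ist ein Test.", "ja")
```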
+ +0:05:24.344 --> 0:05:35.485 +There has been quite a big effort in Europe +to collect really large data sets for parallel + +0:05:35.485 --> 0:05:36.220 +data. + +0:05:36.220 --> 0:05:40.382 +How can we do this data crawling? + +0:05:40.600 --> 0:05:46.103 +There the interesting thing from the machine +translation point is not just general data + +0:05:46.103 --> 0:05:46.729 +crawling. + +0:05:47.067 --> 0:05:50.037 +But how can we explicitly crawl data? + +0:05:50.037 --> 0:05:52.070 +Which is some of a peril? + +0:05:52.132 --> 0:05:58.461 +So there is in the Internet quite a lot of +data which has been company websites which + +0:05:58.461 --> 0:06:01.626 +have been translated and things like that. + +0:06:01.626 --> 0:06:05.158 +So how can you extract them parallel fragments? + +0:06:06.566 --> 0:06:13.404 +That is typically more noisy than where you +do more at hands where mean if you have Parliament. + +0:06:13.693 --> 0:06:17.680 +You can do some rules how to extract parallel +things. + +0:06:17.680 --> 0:06:24.176 +Here there is more to it, so the quality is +later maybe not as good, but normally scale + +0:06:24.176 --> 0:06:26.908 +is then a possibility to address it. + +0:06:26.908 --> 0:06:30.304 +So you just have so much more data that even. + +0:06:33.313 --> 0:06:40.295 +The other thing can be used monolingual data +and monolingual data has a big advantage that + +0:06:40.295 --> 0:06:46.664 +we can have a huge amount of that so that you +can be autocrawed from the Internet. + +0:06:46.664 --> 0:06:51.728 +The nice thing is you can also get it typically +for many domains. + +0:06:52.352 --> 0:06:59.558 +There is just so much more magnitude of monolingual +data so that it might be very helpful. + +0:06:59.559 --> 0:07:03.054 +We can do that in statistical machine translation. + +0:07:03.054 --> 0:07:06.755 +It was quite easy to integrate using language +models. + +0:07:08.508 --> 0:07:16.912 +In neural machine translation we have the +advantage that we have this overall architecture + +0:07:16.912 --> 0:07:22.915 +that does everything together, but it has also +the disadvantage. + +0:07:23.283 --> 0:07:25.675 +We'll look today at two things. + +0:07:25.675 --> 0:07:32.925 +On the one end you can still try to do a bit +of language modeling in there and add an additional + +0:07:32.925 --> 0:07:35.168 +language model into in there. + +0:07:35.168 --> 0:07:38.232 +There is some work, one very successful. + +0:07:38.178 --> 0:07:43.764 +A way in which I think is used in most systems +at the moment is to do some scientific data. + +0:07:43.763 --> 0:07:53.087 +Is a very easy thing, but you can just translate +there and use it as training gator, and normally. + +0:07:53.213 --> 0:07:59.185 +And thereby you are able to use like some +type of monolingual a day. + +0:08:00.380 --> 0:08:05.271 +Another way to do it is unsupervised and the +extreme case. + +0:08:05.271 --> 0:08:11.158 +If you have a scenario then you only have +data, only monolingual data. + +0:08:11.158 --> 0:08:13.976 +Can you still build translations? + +0:08:14.754 --> 0:08:27.675 +If you have large amounts of data and languages +are not too dissimilar, you can build translation + +0:08:27.675 --> 0:08:31.102 +systems without parallel. + +0:08:32.512 --> 0:08:36.267 +That we will see you then next Thursday. + +0:08:37.857 --> 0:08:50.512 +And then there is now a third type of pre-trained +model that recently became very successful + +0:08:50.512 --> 0:08:55.411 +and now with large language models. 
+ +0:08:55.715 --> 0:09:03.525 +So the idea is we are no longer sharing the +real data, but it can also help to train a + +0:09:03.525 --> 0:09:04.153 +model. + +0:09:04.364 --> 0:09:11.594 +And that is now a big advantage of deep learning +based approaches. + +0:09:11.594 --> 0:09:22.169 +There you have this ability that you can train +a model in some task and then apply it to another. + +0:09:22.722 --> 0:09:33.405 +And then, of course, the question is, can +I have an initial task where there's huge amounts + +0:09:33.405 --> 0:09:34.450 +of data? + +0:09:34.714 --> 0:09:40.251 +And the test that typically you pre train +on is more like similar to a language moral + +0:09:40.251 --> 0:09:45.852 +task either direct to a language moral task +or like a masking task which is related so + +0:09:45.852 --> 0:09:51.582 +the idea is oh I can train on this data and +the knowledge about words how they relate to + +0:09:51.582 --> 0:09:53.577 +each other I can use in there. + +0:09:53.753 --> 0:10:00.276 +So it's a different way of using language +models. + +0:10:00.276 --> 0:10:06.276 +There's more transfer learning at the end +of. + +0:10:09.029 --> 0:10:17.496 +So first we will start with how can we use +monolingual data to do a Yeah to do a machine + +0:10:17.496 --> 0:10:18.733 +translation? + +0:10:20.040 --> 0:10:27.499 +That: Big difference is you should remember +from what I mentioned before is. + +0:10:27.499 --> 0:10:32.783 +In statistical machine translation we directly +have the opportunity. + +0:10:32.783 --> 0:10:39.676 +There's peril data for the translation model +and monolingual data for the language model. + +0:10:39.679 --> 0:10:45.343 +And you combine your translation model and +language model, and then you can make use of + +0:10:45.343 --> 0:10:45.730 +both. + +0:10:46.726 --> 0:10:53.183 +That you can make use of these large large +amounts of monolingual data, but of course + +0:10:53.183 --> 0:10:55.510 +it has also some disadvantage. + +0:10:55.495 --> 0:11:01.156 +Because we say the problem is we are optimizing +both parts a bit independently to each other + +0:11:01.156 --> 0:11:06.757 +and we say oh yeah the big disadvantage of +newer machine translations now we are optimizing + +0:11:06.757 --> 0:11:10.531 +the overall architecture everything together +to perform best. + +0:11:10.890 --> 0:11:16.994 +And then, of course, we can't do there, so +Leo we can can only do a mural like use power + +0:11:16.994 --> 0:11:17.405 +data. + +0:11:17.897 --> 0:11:28.714 +So the question is, but this advantage is +not so important that we can train everything, + +0:11:28.714 --> 0:11:35.276 +but we have a moral legal data or even small +amounts. + +0:11:35.675 --> 0:11:43.102 +So in data we know it's not only important +the amount of data we have but also like how + +0:11:43.102 --> 0:11:50.529 +similar it is to your test data so it can be +that this modeling data is quite small but + +0:11:50.529 --> 0:11:55.339 +it's very well fitting and then it's still +very helpful. + +0:11:55.675 --> 0:12:02.691 +At the first year of surprisingness, if we +are here successful with integrating a language + +0:12:02.691 --> 0:12:09.631 +model into a translation system, maybe we can +also integrate some type of language models + +0:12:09.631 --> 0:12:14.411 +into our empty system in order to make it better +and perform. + +0:12:16.536 --> 0:12:23.298 +The first thing we can do is we know there +is language models, so let's try to integrate. 
+ +0:12:23.623 --> 0:12:31.096 +There was our language model because these +works were mainly done before transformer-based + +0:12:31.096 --> 0:12:31.753 +models. + +0:12:32.152 --> 0:12:38.764 +In general, of course, you can do the same +thing with transformer baseball. + +0:12:38.764 --> 0:12:50.929 +There is nothing about whether: It's just +that it has mainly been done before people + +0:12:50.929 --> 0:13:01.875 +started using R&S and they tried to do +this more in cases. + +0:13:07.087 --> 0:13:22.938 +So what we're happening here is in some of +this type of idea, and in key system you remember + +0:13:22.938 --> 0:13:25.495 +the attention. + +0:13:25.605 --> 0:13:29.465 +Gets it was your last in this day that you +calculate easy attention. + +0:13:29.729 --> 0:13:36.610 +We get the context back, then combine both +and then base the next in state and then predict. + +0:13:37.057 --> 0:13:42.424 +So this is our system, and the question is, +can we send our integrated language model? + +0:13:42.782 --> 0:13:49.890 +And somehow it makes sense to take out a neural +language model because we are anyway in the + +0:13:49.890 --> 0:13:50.971 +neural space. + +0:13:50.971 --> 0:13:58.465 +It's not surprising that it contrasts to statistical +work used and grants it might make sense to + +0:13:58.465 --> 0:14:01.478 +take a bit of a normal language model. + +0:14:01.621 --> 0:14:06.437 +And there would be something like on Tubbles +Air, a neural language model, and our man based + +0:14:06.437 --> 0:14:11.149 +is you have a target word, you put it in, you +get a new benchmark, and then you always put + +0:14:11.149 --> 0:14:15.757 +in the words and get new hidden states, and +you can do some predictions at the output to + +0:14:15.757 --> 0:14:16.948 +predict the next word. + +0:14:17.597 --> 0:14:26.977 +So if we're having this type of in language +model, there's like two main questions we have + +0:14:26.977 --> 0:14:34.769 +to answer: So how do we combine now on the +one hand our system and on the other hand our + +0:14:34.769 --> 0:14:35.358 +model? + +0:14:35.358 --> 0:14:42.004 +You see that was mentioned before when we +started talking about ENCODA models. + +0:14:42.004 --> 0:14:45.369 +They can be viewed as a language model. + +0:14:45.805 --> 0:14:47.710 +The wine is lengthened, unconditioned. + +0:14:47.710 --> 0:14:49.518 +It's just modeling the target sides. + +0:14:49.970 --> 0:14:56.963 +And the other one is a conditional language +one, which is a language one conditioned on + +0:14:56.963 --> 0:14:57.837 +the Sewer. + +0:14:58.238 --> 0:15:03.694 +So how can you combine to language models? + +0:15:03.694 --> 0:15:14.860 +Of course, it's like the translation model +will be more important because it has access + +0:15:14.860 --> 0:15:16.763 +to the source. + +0:15:18.778 --> 0:15:22.571 +If we have that, the other question is okay. + +0:15:22.571 --> 0:15:24.257 +Now we have models. + +0:15:24.257 --> 0:15:25.689 +How do we train? + +0:15:26.026 --> 0:15:30.005 +Pickers integrated them. + +0:15:30.005 --> 0:15:34.781 +We have now two sets of data. + +0:15:34.781 --> 0:15:42.741 +We have parallel data where you can do the +lower. + +0:15:44.644 --> 0:15:53.293 +So the first idea is we can do something more +like a parallel combination. + +0:15:53.293 --> 0:15:55.831 +We just keep running. + +0:15:56.036 --> 0:15:59.864 +So here you see your system that is running. 
+ +0:16:00.200 --> 0:16:09.649 +It's normally completely independent of your +language model, which is up there, so down + +0:16:09.649 --> 0:16:13.300 +here we have just our NMT system. + +0:16:13.313 --> 0:16:26.470 +The only thing which is used is we have the +words, and of course they are put into both + +0:16:26.470 --> 0:16:30.059 +systems, and out there. + +0:16:30.050 --> 0:16:42.221 +So we use them somehow for both, and then +we are doing our decision just by merging these + +0:16:42.221 --> 0:16:42.897 +two. + +0:16:43.343 --> 0:16:53.956 +So there can be, for example, we are doing +a probability distribution here, and then we + +0:16:53.956 --> 0:17:03.363 +are taking the average of post-perability distribution +to do our predictions. + +0:17:11.871 --> 0:17:18.923 +You could also take the output with Steve's +to be more in chore about the mixture. + +0:17:20.000 --> 0:17:32.896 +Yes, you could also do that, so it's more +like engaging mechanisms that you're not doing. + +0:17:32.993 --> 0:17:41.110 +Another one would be cochtrinate the hidden +states, and then you would have another layer + +0:17:41.110 --> 0:17:41.831 +on top. + +0:17:43.303 --> 0:17:56.889 +You think about if you do the conqueredination +instead of taking the instead and then merging + +0:17:56.889 --> 0:18:01.225 +the probability distribution. + +0:18:03.143 --> 0:18:16.610 +Introduce many new parameters, and these parameters +have somehow something special compared to + +0:18:16.610 --> 0:18:17.318 +the. + +0:18:23.603 --> 0:18:37.651 +So before all the error other parameters can +be trained independent, the language model + +0:18:37.651 --> 0:18:42.121 +can be trained independent. + +0:18:43.043 --> 0:18:51.749 +If you have a joint layer, of course you need +to train them because you have now inputs. + +0:18:54.794 --> 0:19:02.594 +Not surprisingly, if you have a parallel combination +of whether you could, the other way is to do + +0:19:02.594 --> 0:19:04.664 +more serial combinations. + +0:19:04.924 --> 0:19:10.101 +How can you do a similar combination? + +0:19:10.101 --> 0:19:18.274 +Your final decision makes sense to do a face +on the system. + +0:19:18.438 --> 0:19:20.996 +So you have on top of your normal and system. + +0:19:21.121 --> 0:19:30.678 +The only thing is now you're inputting into +your system. + +0:19:30.678 --> 0:19:38.726 +You're no longer inputting the word embeddings. + +0:19:38.918 --> 0:19:45.588 +So you're training your mainly what you have +your lower layers here which are trained more + +0:19:45.588 --> 0:19:52.183 +on the purely language model style and then +on top your putting into the NMT system where + +0:19:52.183 --> 0:19:55.408 +it now has already here the language model. + +0:19:55.815 --> 0:19:58.482 +So here you can also view it. + +0:19:58.482 --> 0:20:06.481 +Here you have more contextual embeddings which +no longer depend only on the word but they + +0:20:06.481 --> 0:20:10.659 +also depend on the context of the target site. + +0:20:11.051 --> 0:20:19.941 +But you have more understanding of the source +word, so you have a language in the current + +0:20:19.941 --> 0:20:21.620 +target sentence. + +0:20:21.881 --> 0:20:27.657 +So if it's like the word can, for example, +will be put in here always the same independent + +0:20:27.657 --> 0:20:31.147 +of its user can of beans, or if it's like I +can do it. 
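A minimal sketch of the parallel combination described above (often called shallow fusion elsewhere): the translation model and the language model stay separate and only their output distributions are merged at each decoding step. Both models are assumed to return probabilities over the same target vocabulary; the interpolation weight is a tunable assumption, with the translation model usually getting the larger share because it sees the source.

```python
import numpy as np

def combine_step(p_nmt: np.ndarray, p_lm: np.ndarray, lm_weight: float = 0.3) -> np.ndarray:
    """Interpolate the NMT and LM distributions for one decoding step.

    p_nmt, p_lm: probability vectors over the shared target vocabulary.
    lm_weight:   how much to trust the (unconditional) language model.
    """
    combined = (1.0 - lm_weight) * p_nmt + lm_weight * p_lm
    return combined / combined.sum()  # renormalize for numerical safety

# Toy vocabulary of five tokens.
p_nmt = np.array([0.10, 0.60, 0.10, 0.10, 0.10])
p_lm  = np.array([0.05, 0.30, 0.50, 0.10, 0.05])
next_token = int(np.argmax(combine_step(p_nmt, p_lm)))
```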
+ +0:20:31.147 --> 0:20:37.049 +However, because you are having your language +model style, you have maybe disintegrated this + +0:20:37.049 --> 0:20:40.984 +already a bit, and you give this information +directly to the. + +0:20:41.701 --> 0:20:43.095 +An empty cyst. + +0:20:44.364 --> 0:20:49.850 +You, if you're remembering more the transformer +based approach, you have some layers. + +0:20:49.850 --> 0:20:55.783 +The lower layers are purely languaged while +the other ones are with attention to the source. + +0:20:55.783 --> 0:21:01.525 +So you can view it also that you just have +lower layers which don't attend to the source. + +0:21:02.202 --> 0:21:07.227 +This is purely a language model, and then +at some point you're starting to attend to + +0:21:07.227 --> 0:21:08.587 +the source and use it. + +0:21:13.493 --> 0:21:20.781 +Yes, so this is how you combine them in peril +or first do the language model and then do. + +0:21:23.623 --> 0:21:26.147 +Questions for the integration. + +0:21:31.831 --> 0:21:35.034 +Not really sure about the input of the. + +0:21:35.475 --> 0:21:38.102 +Model, and in this case in the sequence. + +0:21:38.278 --> 0:21:53.199 +Case so the actual word that we transferred +into a numerical lecture, and this is an input + +0:21:53.199 --> 0:21:54.838 +into the. + +0:21:56.176 --> 0:22:03.568 +That depends on if you view the word embedding +as part of the language model. + +0:22:03.568 --> 0:22:10.865 +So if you first put the word target word then +you do the one hot end coding. + +0:22:11.691 --> 0:22:13.805 +And then the word embedding there is the r& + +0:22:13.805 --> 0:22:13.937 +n. + +0:22:14.314 --> 0:22:21.035 +So you can use this together as your language +model when you first do the word embedding. + +0:22:21.401 --> 0:22:24.346 +All you can say is like before. + +0:22:24.346 --> 0:22:28.212 +It's more a definition, but you're right. + +0:22:28.212 --> 0:22:30.513 +So what's the steps out? + +0:22:30.513 --> 0:22:36.128 +You take the word, the one hut encoding, the +word embedding. + +0:22:36.516 --> 0:22:46.214 +What one of these parrots, you know, called +a language model is definition wise and not + +0:22:46.214 --> 0:22:47.978 +that important. + +0:22:53.933 --> 0:23:02.264 +So the question is how can you then train +them and make this this one work? + +0:23:02.264 --> 0:23:02.812 +The. + +0:23:03.363 --> 0:23:15.201 +So in the case where you combine the language +one of the abilities you can train them independently + +0:23:15.201 --> 0:23:18.516 +and just put them together. + +0:23:18.918 --> 0:23:27.368 +Might not be the best because we have no longer +the stability that we had before that optimally + +0:23:27.368 --> 0:23:29.128 +performed together. + +0:23:29.128 --> 0:23:33.881 +It's not clear if they really work the best +together. + +0:23:34.514 --> 0:23:41.585 +At least you need to somehow find how much +do you trust the one model and how much. + +0:23:43.323 --> 0:23:45.058 +Still in some cases useful. + +0:23:45.058 --> 0:23:48.530 +It might be helpful if you have only data +and software. + +0:23:48.928 --> 0:23:59.064 +However, in MT we have one specific situation +that at least for the MT part parallel is also + +0:23:59.064 --> 0:24:07.456 +always monolingual data, so what we definitely +can do is train the language. + +0:24:08.588 --> 0:24:18.886 +So what we also can do is more like the pre-training +approach. + +0:24:18.886 --> 0:24:24.607 +We first train the language model. + +0:24:24.704 --> 0:24:27.334 +The pre-training approach. 
+ +0:24:27.334 --> 0:24:33.470 +You first train on the monolingual data and +then you join the. + +0:24:33.933 --> 0:24:41.143 +Of course, the model size is this way, but +the data size is too bigly the other way around. + +0:24:41.143 --> 0:24:47.883 +You often have a lot more monolingual data +than you have here parallel data, in which + +0:24:47.883 --> 0:24:52.350 +scenario can you imagine where this type of +pretraining? + +0:24:56.536 --> 0:24:57.901 +Any Ideas. + +0:25:04.064 --> 0:25:12.772 +One example where this might also be helpful +if you want to adapt to domains. + +0:25:12.772 --> 0:25:22.373 +So let's say you do medical sentences and +if you want to translate medical sentences. + +0:25:23.083 --> 0:25:26.706 +In this case it could be or its most probable +happen. + +0:25:26.706 --> 0:25:32.679 +You're learning here up there what medical +means, but in your fine tuning step the model + +0:25:32.679 --> 0:25:38.785 +is forgotten everything about Medicare, so +you may be losing all the information you gain. + +0:25:39.099 --> 0:25:42.366 +So this type of priest training step is good. + +0:25:42.366 --> 0:25:47.978 +If your pretraining data is more general, +very large and then you're adapting. + +0:25:48.428 --> 0:25:56.012 +But in the task with moral lingual data, which +should be used to adapt the system to some + +0:25:56.012 --> 0:25:57.781 +general topic style. + +0:25:57.817 --> 0:26:06.795 +Then, of course, this is not a good strategy +because you might forgot about everything up + +0:26:06.795 --> 0:26:09.389 +there and you don't have. + +0:26:09.649 --> 0:26:14.678 +So then you have to check what you can do +for them. + +0:26:14.678 --> 0:26:23.284 +You can freeze this part and change it any +more so you don't lose the ability or you can + +0:26:23.284 --> 0:26:25.702 +do a direct combination. + +0:26:25.945 --> 0:26:31.028 +Where you jointly train both of them, so you +train the NMT system on the, and then you train + +0:26:31.028 --> 0:26:34.909 +the language model always in parallels so that +you don't forget about. + +0:26:35.395 --> 0:26:37.684 +And what you learn of the length. + +0:26:37.937 --> 0:26:46.711 +Depends on what you want to combine because +it's large data and you have a good general + +0:26:46.711 --> 0:26:48.107 +knowledge in. + +0:26:48.548 --> 0:26:55.733 +Then you normally don't really forget it because +it's also in the or you use it to adapt to + +0:26:55.733 --> 0:26:57.295 +something specific. + +0:26:57.295 --> 0:26:58.075 +Then you. + +0:27:01.001 --> 0:27:06.676 +Then this is a way of how we can make use +of monolingual data. + +0:27:07.968 --> 0:27:12.116 +It seems to be the easiest one somehow. + +0:27:12.116 --> 0:27:20.103 +It's more similar to what we are doing with +statistical machine translation. + +0:27:21.181 --> 0:27:31.158 +Normally always beats this type of model, +which in some view can be like from the conceptual + +0:27:31.158 --> 0:27:31.909 +thing. + +0:27:31.909 --> 0:27:36.844 +It's even easier from the computational side. + +0:27:40.560 --> 0:27:42.078 +And the idea is OK. + +0:27:42.078 --> 0:27:49.136 +We have monolingual data that we just translate +and then generate some type of parallel data + +0:27:49.136 --> 0:27:50.806 +and use that then to. + +0:27:51.111 --> 0:28:00.017 +So if you want to build a German-to-English +system first, take the large amount of data + +0:28:00.017 --> 0:28:02.143 +you have translated. 
+ +0:28:02.402 --> 0:28:10.446 +Then you have more peril data and the interesting +thing is if you then train on the joint thing + +0:28:10.446 --> 0:28:18.742 +or on the original peril data and on what is +artificial where you have generated the translations. + +0:28:18.918 --> 0:28:26.487 +So you can because you are not doing the same +era all the times and you have some knowledge. + +0:28:28.028 --> 0:28:43.199 +With this first approach, however, there is +one issue why it might not work the best. + +0:28:49.409 --> 0:28:51.177 +Very a bit shown in the image to you. + +0:28:53.113 --> 0:28:58.153 +You trade on that quality data. + +0:28:58.153 --> 0:29:02.563 +Here is a bit of a problem. + +0:29:02.563 --> 0:29:08.706 +Your English style is not really good. + +0:29:08.828 --> 0:29:12.213 +And as you're saying, the system always mistranslates. + +0:29:13.493 --> 0:29:19.798 +Something then you will learn that this is +correct because now it's a training game and + +0:29:19.798 --> 0:29:23.022 +you will encourage it to make it more often. + +0:29:23.022 --> 0:29:29.614 +So the problem with training on your own areas +yeah you might prevent some areas you rarely + +0:29:29.614 --> 0:29:29.901 +do. + +0:29:30.150 --> 0:29:31.749 +But errors use systematically. + +0:29:31.749 --> 0:29:34.225 +Do you even enforce more and will even do +more? + +0:29:34.654 --> 0:29:40.145 +So that might not be the best solution to +have any idea how you could do it better. + +0:29:44.404 --> 0:29:57.754 +Is one way there is even a bit of more simple +idea. + +0:30:04.624 --> 0:30:10.975 +The problem is yeah, the translations are +not perfect, so the output and you're learning + +0:30:10.975 --> 0:30:12.188 +something wrong. + +0:30:12.188 --> 0:30:17.969 +Normally it's less bad if your inputs are +not bad, but your outputs are perfect. + +0:30:18.538 --> 0:30:24.284 +So if your inputs are wrong you may learn +that if you're doing this wrong input you're + +0:30:24.284 --> 0:30:30.162 +generating something correct, but you're not +learning to generate something which is not + +0:30:30.162 --> 0:30:30.756 +correct. + +0:30:31.511 --> 0:30:47.124 +So often the case it is that it is more important +than your target is correct. + +0:30:47.347 --> 0:30:52.182 +But you can assume in your application scenario +you hope that you may only get correct inputs. + +0:30:52.572 --> 0:31:02.535 +So that is not harming you, and in machine +translation we have one very nice advantage: + +0:31:02.762 --> 0:31:04.648 +And also the other way around. + +0:31:04.648 --> 0:31:10.062 +It's a very similar task, so there's a task +to translate from German to English, but the + +0:31:10.062 --> 0:31:13.894 +task to translate from English to German is +very similar, and. + +0:31:14.094 --> 0:31:19.309 +So what we can do is we can just switch it +initially and generate the data the other way + +0:31:19.309 --> 0:31:19.778 +around. + +0:31:20.120 --> 0:31:25.959 +So what we are doing here is we are starting +with an English to German system. + +0:31:25.959 --> 0:31:32.906 +Then we are translating the English data into +German where the German is maybe not very nice. + +0:31:33.293 --> 0:31:51.785 +And then we are training on our original data +and on the back translated data. + +0:31:52.632 --> 0:32:02.332 +So here we have the advantage that our target +side is human quality and only the input. + +0:32:03.583 --> 0:32:08.113 +Then this helps us to get really good. 
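A minimal sketch of the back-translation recipe just described: a reverse-direction system turns target-side monolingual text into synthetic source sentences, and those pairs are mixed with the real parallel data so that the human-written text always ends up on the target side. `reverse_model` and its `translate` method are placeholders for whatever English-to-German system is available.

```python
def back_translate(reverse_model, english_monolingual, real_parallel):
    """Build extra (german, english) training pairs for a German->English system.

    reverse_model:       an English->German system (its output may be imperfect).
    english_monolingual: target-side text that stays clean and human-written.
    real_parallel:       existing (german, english) pairs.
    """
    synthetic = []
    for english in english_monolingual:
        german = reverse_model.translate(english)   # noisy synthetic source side
        synthetic.append((german, english))         # human-quality target side
    return real_parallel + synthetic                # train on the mix of both
```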
+ +0:32:08.113 --> 0:32:15.431 +There is one difference if you think about +the data resources. + +0:32:21.341 --> 0:32:27.336 +Too obvious here we need a target site monolingual +layer. + +0:32:27.336 --> 0:32:31.574 +In the first example we had source site. + +0:32:31.931 --> 0:32:45.111 +So back translation is normally working if +you have target size peril later and not search + +0:32:45.111 --> 0:32:48.152 +side modeling later. + +0:32:48.448 --> 0:32:56.125 +Might be also, like if you think about it, +understand a little better to understand the + +0:32:56.125 --> 0:32:56.823 +target. + +0:32:57.117 --> 0:33:01.469 +On the source side you have to understand +the content. + +0:33:01.469 --> 0:33:08.749 +On the target side you have to generate really +sentences and somehow it's more difficult to + +0:33:08.749 --> 0:33:12.231 +generate something than to only understand. + +0:33:17.617 --> 0:33:30.734 +This works well if you have to select how +many back translated data do you use. + +0:33:31.051 --> 0:33:32.983 +Because only there's like a lot more. + +0:33:33.253 --> 0:33:42.136 +Question: Should take all of my data there +is two problems with it? + +0:33:42.136 --> 0:33:51.281 +Of course it's expensive because you have +to translate all this data. + +0:33:51.651 --> 0:34:00.946 +So if you don't know the normal good starting +point is to take equal amount of data as many + +0:34:00.946 --> 0:34:02.663 +back translated. + +0:34:02.963 --> 0:34:04.673 +It depends on the used case. + +0:34:04.673 --> 0:34:08.507 +If we have very few data here, it makes more +sense to have more. + +0:34:08.688 --> 0:34:15.224 +Depends on how good your quality is here, +so the better the more data you might use because + +0:34:15.224 --> 0:34:16.574 +quality is better. + +0:34:16.574 --> 0:34:22.755 +So it depends on a lot of things, but your +rule of sum is like which general way often + +0:34:22.755 --> 0:34:24.815 +is to have equal amounts of. + +0:34:26.646 --> 0:34:29.854 +And you can, of course, do that now. + +0:34:29.854 --> 0:34:34.449 +I said already that it's better to have the +quality. + +0:34:34.449 --> 0:34:38.523 +At the end, of course, depends on this system. + +0:34:38.523 --> 0:34:46.152 +Also, because the better this system is, the +better your synthetic data is, the better. + +0:34:47.207 --> 0:34:50.949 +That leads to what is referred to as iterated +back translation. + +0:34:51.291 --> 0:34:56.917 +So you play them on English to German, and +you translate the data on. + +0:34:56.957 --> 0:35:03.198 +Then you train a model on German to English +with the additional data. + +0:35:03.198 --> 0:35:09.796 +Then you translate German data and then you +train to gain your first one. + +0:35:09.796 --> 0:35:14.343 +So in the second iteration this quality is +better. + +0:35:14.334 --> 0:35:19.900 +System is better because it's not only trained +on the small data but additionally on back + +0:35:19.900 --> 0:35:22.003 +translated data with this system. + +0:35:22.442 --> 0:35:24.458 +And so you can get better. + +0:35:24.764 --> 0:35:28.053 +However, typically you can stop quite early. + +0:35:28.053 --> 0:35:35.068 +Maybe one iteration is good, but then you +have diminishing gains after two or three iterations. + +0:35:35.935 --> 0:35:46.140 +There is very slight difference because you +need a quite big difference in the quality + +0:35:46.140 --> 0:35:46.843 +here. + +0:35:47.207 --> 0:36:02.262 +Language is also good because it means you +can already train it with relatively bad profiles. 
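The iterated variant can be written as a loop that alternates directions, where each round's improved model back-translates for the other direction; as noted above, a common starting point is roughly equal amounts of real and synthetic data, and gains usually flatten after one or two rounds. A minimal sketch; `train` and the models' `translate` method are hypothetical stand-ins for a real training and decoding pipeline.

```python
def iterated_back_translation(train, parallel, mono_de, mono_en, rounds=2):
    """Alternate back-translation between German->English and English->German.

    train(pairs) returns a model with a .translate(sentence) method; both are
    stand-ins, only the data flow is shown. `parallel` holds (german, english) pairs.
    """
    de_en = train(parallel)                                   # German -> English
    en_de = train([(e, d) for d, e in parallel])              # English -> German
    for _ in range(rounds):
        synth = [(en_de.translate(e), e) for e in mono_en]    # synthetic German sources
        de_en = train(parallel + synth)                       # improved De->En model
        synth = [(de_en.translate(d), d) for d in mono_de]    # synthetic English sources
        en_de = train([(e, d) for d, e in parallel] + synth)  # improved En->De model
    return de_en, en_de
```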
+ +0:36:03.723 --> 0:36:10.339 +It's a design decision would advise so guess +because it's easy to get it. + +0:36:10.550 --> 0:36:20.802 +Replace that because you have a higher quality +real data, but then I think normally it's okay + +0:36:20.802 --> 0:36:22.438 +to replace it. + +0:36:22.438 --> 0:36:28.437 +I would assume it's not too much of a difference, +but. + +0:36:34.414 --> 0:36:42.014 +That's about like using monolingual data before +we go into the pre-train models to have any + +0:36:42.014 --> 0:36:43.005 +more crash. + +0:36:49.029 --> 0:36:55.740 +Yes, so the other thing which we can do and +which is recently more and more successful + +0:36:55.740 --> 0:37:02.451 +and even more successful since we have this +really large language models where you can + +0:37:02.451 --> 0:37:08.545 +even do the translation task with this is the +way of using pre-trained models. + +0:37:08.688 --> 0:37:16.135 +So you learn a representation of one task, +and then you use this representation from another. + +0:37:16.576 --> 0:37:26.862 +It was made maybe like one of the first words +where it really used largely is doing something + +0:37:26.862 --> 0:37:35.945 +like a bird which you pre trained on purely +text era and you take it in fine tune. + +0:37:36.496 --> 0:37:42.953 +And one big advantage, of course, is that +people can only share data but also pre-trained. + +0:37:43.423 --> 0:37:59.743 +The recent models and the large language ones +which are available. + +0:37:59.919 --> 0:38:09.145 +Where I think it costs several millions to +train them all, just if you would buy the GPUs + +0:38:09.145 --> 0:38:15.397 +from some cloud company and train that the +cost of training. + +0:38:15.475 --> 0:38:21.735 +And guess as a student project you won't have +the budget to like build these models. + +0:38:21.801 --> 0:38:24.598 +So another idea is what you can do is okay. + +0:38:24.598 --> 0:38:27.330 +Maybe if these months are once available,. + +0:38:27.467 --> 0:38:36.598 +Can take them and use them as an also resource +similar to pure text, and you can now build + +0:38:36.598 --> 0:38:44.524 +models which somehow learn not only from from +data but also from other models. + +0:38:44.844 --> 0:38:49.127 +So it's a quite new way of thinking of how +to train. + +0:38:49.127 --> 0:38:53.894 +We are not only learning from examples, but +we might also. + +0:38:54.534 --> 0:39:05.397 +The nice thing is that this type of training +where we are not learning directly from data + +0:39:05.397 --> 0:39:07.087 +but learning. + +0:39:07.427 --> 0:39:17.647 +So the main idea this go is you have a person +initial task. + +0:39:17.817 --> 0:39:26.369 +And if you're working with anLP, that means +you're training pure taxator because that's + +0:39:26.369 --> 0:39:30.547 +where you have the largest amount of data. + +0:39:30.951 --> 0:39:35.857 +And then you're defining some type of task +in order to do your creek training. + +0:39:36.176 --> 0:39:43.092 +And: The typical task you can train on on +that is like the language waddling task. + +0:39:43.092 --> 0:39:50.049 +So to predict the next word or we have a related +task to predict something in between, we'll + +0:39:50.049 --> 0:39:52.667 +see depending on the architecture. + +0:39:52.932 --> 0:39:58.278 +But somehow to predict something which you +have not in the input is a task which is easy + +0:39:58.278 --> 0:40:00.740 +to generate, so you just need your data. 
+ +0:40:00.740 --> 0:40:06.086 +That's why it's called self supervised, so +you're creating your supervised pending data. + +0:40:06.366 --> 0:40:07.646 +By yourself. + +0:40:07.646 --> 0:40:15.133 +On the other hand, you need a lot of knowledge +and that is the other thing. + +0:40:15.735 --> 0:40:24.703 +Because there is this idea that the meaning +of a word heavily depends on the context that. + +0:40:25.145 --> 0:40:36.846 +So can give you a sentence with some giverish +word and there's some name and although you've + +0:40:36.846 --> 0:40:41.627 +never heard the name you will assume. + +0:40:42.062 --> 0:40:44.149 +And exactly the same thing. + +0:40:44.149 --> 0:40:49.143 +The models can also learn something about +the world by just using. + +0:40:49.649 --> 0:40:53.651 +So that is typically the mule. + +0:40:53.651 --> 0:40:59.848 +Then we can use this model to train the system. + +0:41:00.800 --> 0:41:03.368 +Course we might need to adapt the system. + +0:41:03.368 --> 0:41:07.648 +To do that we have to change the architecture +we might use only some. + +0:41:07.627 --> 0:41:09.443 +Part of the pre-trained model. + +0:41:09.443 --> 0:41:14.773 +In there we have seen that a bit already in +the R&N case you can also see that we have + +0:41:14.773 --> 0:41:17.175 +also mentioned the pre-training already. + +0:41:17.437 --> 0:41:22.783 +So you can use the R&N as one of these +approaches. + +0:41:22.783 --> 0:41:28.712 +You train the R&M language more on large +pre-train data. + +0:41:28.712 --> 0:41:32.309 +Then you put it somewhere into your. + +0:41:33.653 --> 0:41:37.415 +So this gives you the ability to really do +these types of tests. + +0:41:37.877 --> 0:41:53.924 +So you can build a system which is knowledge, +which is just trained on large amounts of data. + +0:41:56.376 --> 0:42:01.564 +So the question is maybe what type of information +so what type of models can you? + +0:42:01.821 --> 0:42:05.277 +And we want today to look at briefly at swings. + +0:42:05.725 --> 0:42:08.704 +First, that was what was initially done. + +0:42:08.704 --> 0:42:15.314 +It wasn't as famous as in machine translation +as in other things, but it's also used there + +0:42:15.314 --> 0:42:21.053 +and that is to use static word embedding, so +just the first step we know here. + +0:42:21.221 --> 0:42:28.981 +So we have this mapping from the one hot to +a small continuous word representation. + +0:42:29.229 --> 0:42:38.276 +Using this one in your NG system, so you can, +for example, replace the embedding layer by + +0:42:38.276 --> 0:42:38.779 +the. + +0:42:39.139 --> 0:42:41.832 +That is helpful to be a really small amount +of data. + +0:42:42.922 --> 0:42:48.517 +And we're always in this pre-training phase +and have the thing the advantage is. + +0:42:48.468 --> 0:42:52.411 +More data than the trade off, so you can get +better. + +0:42:52.411 --> 0:42:59.107 +The disadvantage is, does anybody have an +idea of what might be the disadvantage of using + +0:42:59.107 --> 0:43:00.074 +things like. + +0:43:04.624 --> 0:43:12.175 +What was one mentioned today giving like big +advantage of the system compared to previous. + +0:43:20.660 --> 0:43:25.134 +Where one advantage was the enter end training, +so you have the enter end training so that + +0:43:25.134 --> 0:43:27.937 +all parameters and all components play optimal +together. + +0:43:28.208 --> 0:43:33.076 +If you know pre-train something on one fast, +it may be no longer optimal fitting to everything + +0:43:33.076 --> 0:43:33.384 +else. 
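Using static pretrained word embeddings in the translation model typically means initializing the embedding layer with the pretrained vectors and optionally freezing it, so the bulk of the parameters no longer has to be learned from the smaller parallel data. A minimal PyTorch sketch; the vocabulary size, dimension, and the random `pretrained_vectors` stand in for real word2vec-style vectors.

```python
import torch
import torch.nn as nn

vocab_size, emb_dim = 32000, 512                       # illustrative sizes
pretrained_vectors = torch.randn(vocab_size, emb_dim)  # stand-in for pretrained vectors

embedding = nn.Embedding(vocab_size, emb_dim)
embedding.weight.data.copy_(pretrained_vectors)        # initialize from pretraining
embedding.weight.requires_grad = False                 # optionally freeze to avoid forgetting

# The rest of the NMT encoder/decoder is then trained on the parallel data,
# either keeping this layer fixed or fine-tuning it together with everything else.
```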
+ +0:43:33.893 --> 0:43:37.862 +So what do pretending or not? + +0:43:37.862 --> 0:43:48.180 +It depends on how important everything is +optimal together and how important. + +0:43:48.388 --> 0:43:50.454 +Of large amount. + +0:43:50.454 --> 0:44:00.541 +The pre-change one is so much better that +it's helpful, and the advantage of that. + +0:44:00.600 --> 0:44:11.211 +Getting everything optimal together, yes, +we would use random instructions for raising. + +0:44:11.691 --> 0:44:26.437 +The problem is you might be already in some +area where it's not easy to get. + +0:44:26.766 --> 0:44:35.329 +But often in some way right, so often it's +not about your really worse pre trained monolepsy. + +0:44:35.329 --> 0:44:43.254 +If you're going already in some direction, +and if this is not really optimal for you,. + +0:44:43.603 --> 0:44:52.450 +But if you're not really getting better because +you have a decent amount of data, it's so different + +0:44:52.450 --> 0:44:52.981 +that. + +0:44:53.153 --> 0:44:59.505 +Initially it wasn't a machine translation +done so much because there are more data in + +0:44:59.505 --> 0:45:06.153 +MPs than in other tasks, but now with really +large amounts of monolingual data we do some + +0:45:06.153 --> 0:45:09.403 +type of pretraining in currently all state. + +0:45:12.632 --> 0:45:14.302 +The other one is okay now. + +0:45:14.302 --> 0:45:18.260 +It's always like how much of the model do +you plea track a bit? + +0:45:18.658 --> 0:45:22.386 +To the other one you can do contextural word +embedded. + +0:45:22.386 --> 0:45:28.351 +That is something like bird or Roberta where +you train already a sequence model and the + +0:45:28.351 --> 0:45:34.654 +embeddings you're using are no longer specific +for word but they are also taking the context + +0:45:34.654 --> 0:45:35.603 +into account. + +0:45:35.875 --> 0:45:50.088 +The embedding you're using is no longer depending +on the word itself but on the whole sentence, + +0:45:50.088 --> 0:45:54.382 +so you can use this context. + +0:45:55.415 --> 0:46:02.691 +You can use similar things also in the decoder +just by having layers which don't have access + +0:46:02.691 --> 0:46:12.430 +to the source, but there it still might have +and these are typically models like: And finally + +0:46:12.430 --> 0:46:14.634 +they will look at the end. + +0:46:14.634 --> 0:46:19.040 +You can also have models which are already +sequenced. + +0:46:19.419 --> 0:46:28.561 +So you may be training a sequence to sequence +models. + +0:46:28.561 --> 0:46:35.164 +You have to make it a bit challenging. + +0:46:36.156 --> 0:46:43.445 +But the idea is really you're pre-training +your whole model and then you'll find tuning. + +0:46:47.227 --> 0:46:59.614 +But let's first do a bit of step back and +look into what are the different things. + +0:46:59.614 --> 0:47:02.151 +The first thing. + +0:47:02.382 --> 0:47:11.063 +The wooden bettings are just this first layer +and you can train them with feedback annual + +0:47:11.063 --> 0:47:12.028 +networks. + +0:47:12.212 --> 0:47:22.761 +But you can also train them with an N language +model, and by now you hopefully have also seen + +0:47:22.761 --> 0:47:27.699 +that you cannot transform a language model. + +0:47:30.130 --> 0:47:37.875 +So this is how you can train them and you're +training them. + +0:47:37.875 --> 0:47:45.234 +For example, to speak the next word that is +the easiest. 
+ +0:47:45.525 --> 0:47:55.234 +And that is what is now referred to as South +Supervised Learning and, for example, all the + +0:47:55.234 --> 0:48:00.675 +big large language models like Chad GPT and +so on. + +0:48:00.675 --> 0:48:03.129 +They are trained with. + +0:48:03.823 --> 0:48:15.812 +So that is where you can hopefully learn how +a word is used because you always try to previct + +0:48:15.812 --> 0:48:17.725 +the next word. + +0:48:19.619 --> 0:48:27.281 +Word embedding: Why do you keep the first +look at the word embeddings and the use of + +0:48:27.281 --> 0:48:29.985 +word embeddings for our task? + +0:48:29.985 --> 0:48:38.007 +The main advantage was it might be only the +first layer where you typically have most of + +0:48:38.007 --> 0:48:39.449 +the parameters. + +0:48:39.879 --> 0:48:57.017 +Most of your parameters already on the large +data, then on your target data you have to + +0:48:57.017 --> 0:48:59.353 +train less. + +0:48:59.259 --> 0:49:06.527 +Big difference that your input size is so +much bigger than the size of the novel in size. + +0:49:06.626 --> 0:49:17.709 +So it's a normally sign, maybe like, but your +input and banning size is something like. + +0:49:17.709 --> 0:49:20.606 +Then here you have to. + +0:49:23.123 --> 0:49:30.160 +While here you see it's only like zero point +five times as much in the layer. + +0:49:30.750 --> 0:49:36.534 +So here is where most of your parameters are, +which means if you already replace the word + +0:49:36.534 --> 0:49:41.739 +embeddings, they might look a bit small in +your overall and in key architecture. + +0:49:41.739 --> 0:49:47.395 +It's where most of the things are, and if +you're doing that you already have really big + +0:49:47.395 --> 0:49:48.873 +games and can do that. + +0:49:57.637 --> 0:50:01.249 +The thing is we have seen these were the bettings. + +0:50:01.249 --> 0:50:04.295 +They can be very good use for other types. + +0:50:04.784 --> 0:50:08.994 +You learn some general relations between words. + +0:50:08.994 --> 0:50:17.454 +If you're doing this type of language modeling +cast, you predict: The one thing is you have + +0:50:17.454 --> 0:50:24.084 +a lot of data, so the one question is we want +to have data to trade a model. + +0:50:24.084 --> 0:50:28.734 +The other thing, the tasks need to be somehow +useful. + +0:50:29.169 --> 0:50:43.547 +If you would predict the first letter of the +word, then you wouldn't learn anything about + +0:50:43.547 --> 0:50:45.144 +the word. + +0:50:45.545 --> 0:50:53.683 +And the interesting thing is people have looked +at these wood embeddings. + +0:50:53.954 --> 0:50:58.550 +And looking at the word embeddings. + +0:50:58.550 --> 0:51:09.276 +You can ask yourself how they look and visualize +them by doing dimension reduction. + +0:51:09.489 --> 0:51:13.236 +Don't know if you and you are listening to +artificial intelligence. + +0:51:13.236 --> 0:51:15.110 +Advanced artificial intelligence. + +0:51:15.515 --> 0:51:23.217 +We had on yesterday there how to do this type +of representation, but you can do this time + +0:51:23.217 --> 0:51:29.635 +of representation, and now you're seeing interesting +things that normally. + +0:51:30.810 --> 0:51:41.027 +Now you can represent a here in a three dimensional +space with some dimension reduction. + +0:51:41.027 --> 0:51:46.881 +For example, the relation between male and +female. 
+ +0:51:47.447 --> 0:51:56.625 +So this vector between the male and female +version of something is always not the same, + +0:51:56.625 --> 0:51:58.502 +but it's related. + +0:51:58.718 --> 0:52:14.522 +So you can do a bit of maths, so you do take +king, you subtract this vector, add this vector. + +0:52:14.894 --> 0:52:17.591 +So that means okay, there is really something +stored. + +0:52:17.591 --> 0:52:19.689 +Some information are stored in that book. + +0:52:20.040 --> 0:52:22.621 +Similar, you can do it with Bob Hansen. + +0:52:22.621 --> 0:52:25.009 +See here swimming slam walking walk. + +0:52:25.265 --> 0:52:34.620 +So again these vectors are not the same, but +they are related. + +0:52:34.620 --> 0:52:42.490 +So you learn something from going from here +to here. + +0:52:43.623 --> 0:52:49.761 +Or semantically, the relations between city +and capital have exactly the same sense. + +0:52:51.191 --> 0:52:56.854 +And people had even done that question answering +about that if they showed the diembeddings + +0:52:56.854 --> 0:52:57.839 +and the end of. + +0:52:58.218 --> 0:53:06.711 +All you can also do is don't trust the dimensions +of the reaction because maybe there is something. + +0:53:06.967 --> 0:53:16.863 +You can also look into what happens really +in the individual space. + +0:53:16.863 --> 0:53:22.247 +What is the nearest neighbor of the. + +0:53:22.482 --> 0:53:29.608 +So you can take the relationship between France +and Paris and add it to Italy and you'll. + +0:53:30.010 --> 0:53:33.078 +You can do big and bigger and you have small +and smaller and stuff. + +0:53:33.593 --> 0:53:49.417 +Because it doesn't work everywhere, there +is also some typical dish here in German. + +0:53:51.491 --> 0:54:01.677 +You can do what the person is doing for famous +ones, of course only like Einstein scientists + +0:54:01.677 --> 0:54:06.716 +that find midfielders not completely correct. + +0:54:06.846 --> 0:54:10.134 +You see the examples are a bit old. + +0:54:10.134 --> 0:54:15.066 +The politicians are no longer they am, but +of course. + +0:54:16.957 --> 0:54:26.759 +What people have done there, especially at +the beginning training our end language model, + +0:54:26.759 --> 0:54:28.937 +was very expensive. + +0:54:29.309 --> 0:54:38.031 +So one famous model was, but we are not really +interested in the language model performance. + +0:54:38.338 --> 0:54:40.581 +Think something good to keep in mind. + +0:54:40.581 --> 0:54:42.587 +What are we really interested in? + +0:54:42.587 --> 0:54:45.007 +Do we really want to have an R&N no? + +0:54:45.007 --> 0:54:48.607 +In this case we are only interested in this +type of mapping. + +0:54:49.169 --> 0:54:55.500 +And so successful and very successful was +this word to vet. + +0:54:55.535 --> 0:54:56.865 +The idea is okay. + +0:54:56.865 --> 0:55:03.592 +We are not training real language one, making +it even simpler and doing this, for example, + +0:55:03.592 --> 0:55:05.513 +continuous peck of words. + +0:55:05.513 --> 0:55:12.313 +We're just having four input tokens and we're +predicting what is the word in the middle and + +0:55:12.313 --> 0:55:15.048 +this is just like two linear layers. + +0:55:15.615 --> 0:55:21.627 +So it's even simplifying things and making +the calculation faster because that is what + +0:55:21.627 --> 0:55:22.871 +we're interested. + +0:55:23.263 --> 0:55:32.897 +All this continuous skip ground models with +these other models which refer to as where + +0:55:32.897 --> 0:55:34.004 +to where. 
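The continuous bag-of-words model mentioned here really is just two linear maps: average the embeddings of the surrounding words and predict the word in the middle. A minimal PyTorch sketch of that idea with illustrative sizes, not the original word2vec implementation.

```python
import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Predict the middle word from a window of context words (e.g. 2 left + 2 right)."""
    def __init__(self, vocab_size: int = 10000, emb_dim: int = 100):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)   # first linear map (lookup)
        self.out = nn.Linear(emb_dim, vocab_size)        # second linear map

    def forward(self, context_ids: torch.Tensor) -> torch.Tensor:
        ctx = self.embed(context_ids).mean(dim=1)        # average the context embeddings
        return self.out(ctx)                             # scores over the vocabulary

model = CBOW()
context = torch.randint(0, 10000, (8, 4))                # batch of 4-word contexts
scores = model(context)                                  # (8, 10000), trained with cross-entropy
```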
+ +0:55:34.234 --> 0:55:42.394 +Where you have one equal word and the other +way around, you're predicting the four words + +0:55:42.394 --> 0:55:43.585 +around them. + +0:55:43.585 --> 0:55:45.327 +It's very similar. + +0:55:45.327 --> 0:55:48.720 +The task is in the end very similar. + +0:55:51.131 --> 0:56:01.407 +Before we are going to the next point, anything +about normal weight vectors or weight embedding. + +0:56:04.564 --> 0:56:07.794 +The next thing is contexture. + +0:56:07.794 --> 0:56:12.208 +Word embeddings and the idea is helpful. + +0:56:12.208 --> 0:56:19.206 +However, we might even be able to get more +from one lingo layer. + +0:56:19.419 --> 0:56:31.732 +And now in the word that is overlap of these +two meanings, so it represents both the meaning + +0:56:31.732 --> 0:56:33.585 +of can do it. + +0:56:34.834 --> 0:56:40.410 +But we might be able to in the pre-trained +model already disambiguate this because they + +0:56:40.410 --> 0:56:41.044 +are used. + +0:56:41.701 --> 0:56:53.331 +So if we can have a model which can not only +represent a word but can also represent the + +0:56:53.331 --> 0:56:58.689 +meaning of the word within the context,. + +0:56:59.139 --> 0:57:03.769 +So then we are going to context your word +embeddings. + +0:57:03.769 --> 0:57:07.713 +We are really having a representation in the. + +0:57:07.787 --> 0:57:11.519 +And we have a very good architecture for that +already. + +0:57:11.691 --> 0:57:23.791 +The hidden state represents what is currently +said, but it's focusing on what is the last + +0:57:23.791 --> 0:57:29.303 +one, so it's some of the representation. + +0:57:29.509 --> 0:57:43.758 +The first one doing that is something like +the Elmo paper where they instead of this is + +0:57:43.758 --> 0:57:48.129 +the normal language model. + +0:57:48.008 --> 0:57:50.714 +Within the third, predicting the fourth, and +so on. + +0:57:50.714 --> 0:57:53.004 +So you are always predicting the next work. + +0:57:53.193 --> 0:57:57.335 +The architecture is the heaven words embedding +layer and then layers. + +0:57:57.335 --> 0:58:03.901 +See you, for example: And now instead of using +this one in the end, you're using here this + +0:58:03.901 --> 0:58:04.254 +one. + +0:58:04.364 --> 0:58:11.245 +This represents the meaning of this word mainly +in the context of what we have seen before. + +0:58:11.871 --> 0:58:18.610 +We can train it in a language model style +always predicting the next word, but we have + +0:58:18.610 --> 0:58:21.088 +more information trained there. + +0:58:21.088 --> 0:58:26.123 +Therefore, in the system it has to learn less +additional things. + +0:58:27.167 --> 0:58:31.261 +And there is one Edendang which is done currently +in GPS. + +0:58:31.261 --> 0:58:38.319 +The only difference is that we have more layers, +bigger size, and we're using transformer neurocell + +0:58:38.319 --> 0:58:40.437 +potential instead of the RNA. + +0:58:40.437 --> 0:58:45.095 +But that is how you train like some large +language models at the. + +0:58:46.746 --> 0:58:55.044 +However, if you look at this contextual representation, +they might not be perfect. + +0:58:55.044 --> 0:59:02.942 +So if you think of this one as a contextual +representation of the third word,. + +0:59:07.587 --> 0:59:16.686 +Is representing a three in the context of +a sentence, however only in the context of + +0:59:16.686 --> 0:59:18.185 +the previous. 
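A minimal sketch of such contextual embeddings from a left-to-right recurrent language model: the hidden state at each position represents the word in the context of everything before it, which also shows the limitation noted above, that the right context is never seen. Sizes and the random input are placeholders.

```python
import torch
import torch.nn as nn

vocab_size, emb_dim, hidden_dim = 10000, 128, 256         # illustrative sizes
embed = nn.Embedding(vocab_size, emb_dim)
lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)     # unidirectional: left context only

token_ids = torch.randint(0, vocab_size, (1, 6))          # one sentence of six tokens
hidden_states, _ = lstm(embed(token_ids))                 # (1, 6, hidden_dim)

# hidden_states[0, i] is a contextual representation of token i: the same surface
# word gets different vectors in different left contexts, but it never sees the
# words to its right.
contextual_embedding_of_third_word = hidden_states[0, 2]
```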
+ +0:59:18.558 --> 0:59:27.413 +However, we have an architecture which can +also take both sides and we have used that + +0:59:27.413 --> 0:59:30.193 +already in the ink holder. + +0:59:30.630 --> 0:59:34.264 +So we could do the iron easily on your, also +in the backward direction. + +0:59:34.874 --> 0:59:42.826 +By just having the states the other way around +and then we couldn't combine the forward and + +0:59:42.826 --> 0:59:49.135 +the forward into a joint one where we are doing +this type of prediction. + +0:59:49.329 --> 0:59:50.858 +So you have the word embedding. + +0:59:51.011 --> 1:00:02.095 +Then you have two in the states, one on the +forward arm and one on the backward arm, and + +1:00:02.095 --> 1:00:10.314 +then you can, for example, take the cocagenation +of both of them. + +1:00:10.490 --> 1:00:23.257 +Now this same here represents mainly this +word because this is what both puts in it last + +1:00:23.257 --> 1:00:30.573 +and we know is focusing on what is happening +last. + +1:00:31.731 --> 1:00:40.469 +However, there is a bit of difference when +training that as a language model you already + +1:00:40.469 --> 1:00:41.059 +have. + +1:00:43.203 --> 1:00:44.956 +Maybe There's Again This Masking. + +1:00:46.546 --> 1:00:47.748 +That is one solution. + +1:00:47.748 --> 1:00:52.995 +First of all, why we can't do it is the information +you leak it, so you cannot just predict the + +1:00:52.995 --> 1:00:53.596 +next word. + +1:00:53.596 --> 1:00:58.132 +If we just predict the next word in this type +of model, that's a very simple task. + +1:00:58.738 --> 1:01:09.581 +You know the next word because it's influencing +this hidden state predicting something is not + +1:01:09.581 --> 1:01:11.081 +a good task. + +1:01:11.081 --> 1:01:18.455 +You have to define: Because in this case what +will end with the system will just ignore these + +1:01:18.455 --> 1:01:22.966 +estates and what will learn is copy this information +directly in here. + +1:01:23.343 --> 1:01:31.218 +So it would be representing this word and +you would have nearly a perfect model because + +1:01:31.218 --> 1:01:38.287 +you only need to find encoding where you can +encode all words somehow in this. + +1:01:38.458 --> 1:01:44.050 +The only thing can learn is that turn and +encode all my words in this upper hidden. + +1:01:44.985 --> 1:01:53.779 +Therefore, it's not really useful, so we need +to find a bit of different ways out. + +1:01:55.295 --> 1:01:57.090 +There is a masking one. + +1:01:57.090 --> 1:02:03.747 +I'll come to that shortly just a bit that +other things also have been done, so the other + +1:02:03.747 --> 1:02:06.664 +thing is not to directly combine them. + +1:02:06.664 --> 1:02:13.546 +That was in the animal paper, so you have +them forward R&M and you keep them completely + +1:02:13.546 --> 1:02:14.369 +separated. + +1:02:14.594 --> 1:02:20.458 +So you never merged to state. + +1:02:20.458 --> 1:02:33.749 +At the end, the representation of the word +is now from the forward. + +1:02:33.873 --> 1:02:35.953 +So it's always the hidden state before the +good thing. + +1:02:36.696 --> 1:02:41.286 +These two you join now to your to the representation. + +1:02:42.022 --> 1:02:48.685 +And then you have now a representation also +about like the whole sentence for the word, + +1:02:48.685 --> 1:02:51.486 +but there is no information leakage. 
+ +1:02:51.486 --> 1:02:58.149 +One way of doing this is instead of doing +a bidirection along you do a forward pass and + +1:02:58.149 --> 1:02:59.815 +then join the hidden. + +1:03:00.380 --> 1:03:05.960 +So you can do that in all layers. + +1:03:05.960 --> 1:03:16.300 +In the end you do the forwarded layers and +you get the hidden. + +1:03:16.596 --> 1:03:19.845 +However, it's a bit of a complicated. + +1:03:19.845 --> 1:03:25.230 +You have to keep both separate and merge things +so can you do. + +1:03:27.968 --> 1:03:33.030 +And that is the moment where like the big. + +1:03:34.894 --> 1:03:39.970 +The big success of the burnt model was used +where it okay. + +1:03:39.970 --> 1:03:47.281 +Maybe in bite and rich case it's not good +to do the next word prediction, but we can + +1:03:47.281 --> 1:03:48.314 +do masking. + +1:03:48.308 --> 1:03:56.019 +Masking mainly means we do a prediction of +something in the middle or some words. + +1:03:56.019 --> 1:04:04.388 +So the idea is if we have the input, we are +putting noise into the input, removing them, + +1:04:04.388 --> 1:04:07.961 +and then the model we are interested. + +1:04:08.048 --> 1:04:15.327 +Now there can be no information leakage because +this wasn't predicting that one is a big challenge. + +1:04:16.776 --> 1:04:19.957 +Do any assumption about our model? + +1:04:19.957 --> 1:04:26.410 +It doesn't need to be a forward model or a +backward model or anything. + +1:04:26.410 --> 1:04:29.500 +You can always predict the three. + +1:04:30.530 --> 1:04:34.844 +There's maybe one bit of a disadvantage. + +1:04:34.844 --> 1:04:40.105 +Do you see what could be a bit of a problem +this? + +1:05:00.000 --> 1:05:06.429 +Yes, so yeah, you can of course mask more, +but to see it more globally, just first assume + +1:05:06.429 --> 1:05:08.143 +you're only masked one. + +1:05:08.143 --> 1:05:13.930 +For the whole sentence, we get one feedback +signal, like what is the word three. + +1:05:13.930 --> 1:05:22.882 +So we have one training example: If you do +the language modeling taste, we predicted here, + +1:05:22.882 --> 1:05:24.679 +we predicted here. + +1:05:25.005 --> 1:05:26.735 +So we have number of tokens. + +1:05:26.735 --> 1:05:30.970 +For each token we have a feet pad and say +what is the best correction. + +1:05:31.211 --> 1:05:43.300 +So in this case this is less efficient because +we are getting less feedback signals on what + +1:05:43.300 --> 1:05:45.797 +we should predict. + +1:05:48.348 --> 1:05:56.373 +So and bird, the main ideas are that you're +doing this bidirectional model with masking. + +1:05:56.373 --> 1:05:59.709 +It's using transformer architecture. + +1:06:00.320 --> 1:06:06.326 +There are two more minor changes. + +1:06:06.326 --> 1:06:16.573 +We'll see that this next word prediction is +another task. + +1:06:16.957 --> 1:06:30.394 +You want to learn more about what language +is to really understand following a story or + +1:06:30.394 --> 1:06:35.127 +their independent tokens into. + +1:06:38.158 --> 1:06:42.723 +The input is using word units as we use it. + +1:06:42.723 --> 1:06:50.193 +It has some special token that is framing +for the next word prediction. + +1:06:50.470 --> 1:07:04.075 +It's more for classification task because +you may be learning a general representation + +1:07:04.075 --> 1:07:07.203 +as a full sentence. + +1:07:07.607 --> 1:07:19.290 +You're doing segment embedding, so you have +an embedding for it. + +1:07:19.290 --> 1:07:24.323 +This is the first sentence. 
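To make the input format concrete: two already tokenized segments are wrapped with special tokens and every position gets a segment id marking first versus second sentence. A minimal sketch; the token names follow the usual BERT convention ([CLS], [SEP]) rather than anything stated explicitly in the lecture.

```python
def build_bert_input(tokens_a, tokens_b):
    """Wrap two already-tokenized segments with [CLS]/[SEP] and build segment ids."""
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    return tokens, segment_ids

tokens, segments = build_bert_input(
    ["the", "man", "went", "to", "the", "store"],
    ["he", "bought", "a", "gallon", "of", "milk"],
)
# tokens:   [CLS] the man ... store [SEP] he bought ... milk [SEP]
# segments: 8 zeros (first segment incl. [CLS] and its [SEP]) followed by 7 ones
```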
+ +1:07:24.684 --> 1:07:29.099 +Now what is more challenging is this masking. + +1:07:29.099 --> 1:07:30.827 +What do you mask? + +1:07:30.827 --> 1:07:35.050 +We already have the crush enough or should. + +1:07:35.275 --> 1:07:42.836 +So there has been afterwards eating some work +like, for example, a bearer. + +1:07:42.836 --> 1:07:52.313 +It's not super sensitive, but if you do it +completely wrong then you're not letting anything. + +1:07:52.572 --> 1:07:54.590 +That's Then Another Question There. + +1:07:56.756 --> 1:08:04.594 +Should I mask all types of should I always +mask the footwork or if I have a subword to + +1:08:04.594 --> 1:08:10.630 +mask only like a subword and predict them based +on the other ones? + +1:08:10.630 --> 1:08:14.504 +Of course, it's a bit of a different task. + +1:08:14.894 --> 1:08:21.210 +If you know three parts of the words, it might +be easier to guess the last because they here + +1:08:21.210 --> 1:08:27.594 +took the easiest selection, so not considering +words anymore at all because you're doing that + +1:08:27.594 --> 1:08:32.280 +in the preprocessing and just taking always +words and like subwords. + +1:08:32.672 --> 1:08:36.089 +Think in group there is done differently. + +1:08:36.089 --> 1:08:40.401 +They mark always the full words, but guess +it's not. + +1:08:41.001 --> 1:08:46.044 +And then what to do with the mask word in +eighty percent of the cases. + +1:08:46.044 --> 1:08:50.803 +If the word is masked, they replace it with +a special token thing. + +1:08:50.803 --> 1:08:57.197 +This is a mask token in ten percent they put +in some random other token in there, and ten + +1:08:57.197 --> 1:08:59.470 +percent they keep it on change. + +1:09:02.202 --> 1:09:10.846 +And then what you can do is also this next +word prediction. + +1:09:10.846 --> 1:09:14.880 +The man went to Mass Store. + +1:09:14.880 --> 1:09:17.761 +He bought a gallon. + +1:09:18.418 --> 1:09:24.088 +So may you see you're joining them, you're +doing both masks and prediction that you're. + +1:09:24.564 --> 1:09:29.449 +Is a penguin mask or flyless birds. + +1:09:29.449 --> 1:09:41.390 +These two sentences have nothing to do with +each other, so you can do also this type of + +1:09:41.390 --> 1:09:43.018 +prediction. + +1:09:47.127 --> 1:09:57.043 +And then the whole bird model, so here you +have the input here to transform the layers, + +1:09:57.043 --> 1:09:58.170 +and then. + +1:09:58.598 --> 1:10:17.731 +And this model was quite successful in general +applications. + +1:10:17.937 --> 1:10:27.644 +However, there is like a huge thing of different +types of models coming from them. + +1:10:27.827 --> 1:10:38.709 +So based on others these supervised molds +like a whole setup came out of there and now + +1:10:38.709 --> 1:10:42.086 +this is getting even more. + +1:10:42.082 --> 1:10:46.640 +With availability of a large language model +than the success. + +1:10:47.007 --> 1:10:48.436 +We have now even larger ones. + +1:10:48.828 --> 1:10:50.961 +Interestingly, it goes a bit. + +1:10:50.910 --> 1:10:57.847 +Change the bit again from like more the spider +action model to uni directional models. + +1:10:57.847 --> 1:11:02.710 +Are at the moment maybe a bit more we're coming +to them now? + +1:11:02.710 --> 1:11:09.168 +Do you see one advantage while what is another +event and we have the efficiency? + +1:11:09.509 --> 1:11:15.901 +Is one other reason why you are sometimes +more interested in uni-direction models than + +1:11:15.901 --> 1:11:17.150 +in bi-direction. 
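The masking recipe described above (80% of the selected tokens replaced by a special mask token, 10% by a random token, 10% left unchanged) can be sketched in a few lines. The selection rate of 15% is BERT's published value rather than a number stated in the lecture, and the token IDs are placeholders.

```python
import random

MASK_ID = 0          # placeholder id for the special [MASK] token
VOCAB_SIZE = 30000   # placeholder vocabulary size

def bert_style_masking(token_ids, select_prob=0.15, seed=None):
    """Return (corrupted_ids, labels); labels are -1 except at selected positions."""
    rng = random.Random(seed)
    corrupted, labels = [], []
    for tok in token_ids:
        if rng.random() < select_prob:
            labels.append(tok)                            # this position gets a training signal
            r = rng.random()
            if r < 0.8:
                corrupted.append(MASK_ID)                 # 80%: replace with [MASK]
            elif r < 0.9:
                corrupted.append(rng.randrange(VOCAB_SIZE))  # 10%: random other token
            else:
                corrupted.append(tok)                     # 10%: keep the token unchanged
        else:
            labels.append(-1)                             # not selected: no prediction here
            corrupted.append(tok)
    return corrupted, labels

ids = [101, 2054, 2003, 1996, 3007, 1997, 2605, 102]
print(bert_style_masking(ids, seed=3))
```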
+ +1:11:22.882 --> 1:11:30.220 +It depends on the pass, but for example for +a language generation pass, the eccard is not + +1:11:30.220 --> 1:11:30.872 +really. + +1:11:32.192 --> 1:11:40.924 +It doesn't work so if you want to do a generation +like the decoder you don't know the future + +1:11:40.924 --> 1:11:42.896 +so you cannot apply. + +1:11:43.223 --> 1:11:53.870 +So this time of model can be used for the +encoder in an encoder model, but it cannot + +1:11:53.870 --> 1:11:57.002 +be used for the decoder. + +1:12:00.000 --> 1:12:05.012 +That's a good view to the next overall cast +of models. + +1:12:05.012 --> 1:12:08.839 +Perhaps if you view it from the sequence. + +1:12:09.009 --> 1:12:12.761 +We have the encoder base model. + +1:12:12.761 --> 1:12:16.161 +That's what we just look at. + +1:12:16.161 --> 1:12:20.617 +They are bidirectional and typically. + +1:12:20.981 --> 1:12:22.347 +That Is the One We Looked At. + +1:12:22.742 --> 1:12:34.634 +At the beginning is the decoder based model, +so see out in regressive models which are unidirective + +1:12:34.634 --> 1:12:42.601 +like an based model, and there we can do the +next word prediction. + +1:12:43.403 --> 1:12:52.439 +And what you can also do first, and there +you can also have a special things called prefix + +1:12:52.439 --> 1:12:53.432 +language. + +1:12:54.354 --> 1:13:05.039 +Because we are saying it might be helpful +that some of your input can also use bi-direction. + +1:13:05.285 --> 1:13:12.240 +And that is somehow doing what it is called +prefix length. + +1:13:12.240 --> 1:13:19.076 +On the first tokens you directly give your +bidirectional. + +1:13:19.219 --> 1:13:28.774 +So you somehow merge that and that mainly +works only in transformer based models because. + +1:13:29.629 --> 1:13:33.039 +There is no different number of parameters +in our end. + +1:13:33.039 --> 1:13:34.836 +We need a back foot our end. + +1:13:34.975 --> 1:13:38.533 +Transformer: The only difference is how you +mask your attention. + +1:13:38.878 --> 1:13:44.918 +We have seen that in the anchoder and decoder +the number of parameters is different because + +1:13:44.918 --> 1:13:50.235 +you do cross attention, but if you do forward +and backward or union directions,. + +1:13:50.650 --> 1:13:58.736 +It's only like you mask your attention to +only look at the bad past or to look into the + +1:13:58.736 --> 1:13:59.471 +future. + +1:14:00.680 --> 1:14:03.326 +And now you can of course also do mixing. + +1:14:03.563 --> 1:14:08.306 +So this is a bi-directional attention matrix +where you can attend to everything. + +1:14:08.588 --> 1:14:23.516 +There is a uni-direction or causal where you +can look at the past and you can do the first + +1:14:23.516 --> 1:14:25.649 +three words. + +1:14:29.149 --> 1:14:42.831 +That somehow clear based on that, then of +course you cannot do the other things. + +1:14:43.163 --> 1:14:50.623 +So the idea is we have our anchor to decoder +architecture. + +1:14:50.623 --> 1:14:57.704 +Can we also train them completely in a side +supervisor? + +1:14:58.238 --> 1:15:09.980 +And in this case we have the same input to +both, so in this case we need to do some type + +1:15:09.980 --> 1:15:12.224 +of masking here. + +1:15:12.912 --> 1:15:17.696 +Here we don't need to do the masking, but +here we need to masking that doesn't know ever + +1:15:17.696 --> 1:15:17.911 +so. + +1:15:20.440 --> 1:15:30.269 +And this type of model got quite successful +also, especially for pre-training machine translation. 
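As explained above, an encoder-style model, a decoder-style (autoregressive) model and a prefix language model can share the same transformer parameters; what differs is only the attention mask. A small NumPy sketch of the three masks for a 5-token sequence (1 = may attend, 0 = blocked); the prefix length k is an arbitrary illustrative choice.

```python
import numpy as np

n = 5  # sequence length

# Encoder-style: fully bidirectional, every position sees every other position.
bidirectional = np.ones((n, n), dtype=int)

# Decoder-style: causal / autoregressive, position i sees only positions <= i.
causal = np.tril(np.ones((n, n), dtype=int))

# Prefix LM: the first k tokens attend bidirectionally among themselves,
# the remaining tokens are causal (and may also look back at the prefix).
k = 3
prefix = np.tril(np.ones((n, n), dtype=int))
prefix[:k, :k] = 1   # open up full attention inside the prefix

for name, m in [("bidirectional", bidirectional), ("causal", causal), ("prefix", prefix)]:
    print(name)
    print(m)
```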
+ +1:15:30.330 --> 1:15:39.059 +The first model doing that is a Bart model, +which exactly does that, and yes, it's one + +1:15:39.059 --> 1:15:42.872 +successful way to pre train your one. + +1:15:42.872 --> 1:15:47.087 +It's pretraining your full encoder model. + +1:15:47.427 --> 1:15:54.365 +Where you put in contrast to machine translation, +where you put in source sentence, we can't + +1:15:54.365 --> 1:15:55.409 +do that here. + +1:15:55.715 --> 1:16:01.382 +But we can just put the second twice in there, +and then it's not a trivial task. + +1:16:01.382 --> 1:16:02.432 +We can change. + +1:16:03.003 --> 1:16:12.777 +And there is like they do different corruption +techniques so you can also do. + +1:16:13.233 --> 1:16:19.692 +That you couldn't do in an agricultural system +because then it wouldn't be there and you cannot + +1:16:19.692 --> 1:16:20.970 +predict somewhere. + +1:16:20.970 --> 1:16:26.353 +So the anchor, the number of input and output +tokens always has to be the same. + +1:16:26.906 --> 1:16:29.818 +You cannot do a prediction for something which +isn't in it. + +1:16:30.110 --> 1:16:38.268 +Here in the decoder side it's unidirection +so we can also delete the top and then try + +1:16:38.268 --> 1:16:40.355 +to generate the full. + +1:16:41.061 --> 1:16:45.250 +We can do sentence permutation. + +1:16:45.250 --> 1:16:54.285 +We can document rotation and text infilling +so there is quite a bit. + +1:16:55.615 --> 1:17:06.568 +So you see there's quite a lot of types of +models that you can use in order to pre-train. + +1:17:07.507 --> 1:17:14.985 +Then, of course, there is again for the language +one. + +1:17:14.985 --> 1:17:21.079 +The other question is how do you integrate? + +1:17:21.761 --> 1:17:26.636 +And there's also, like yeah, quite some different +ways of techniques. + +1:17:27.007 --> 1:17:28.684 +It's a Bit Similar to Before. + +1:17:28.928 --> 1:17:39.068 +So the easiest thing is you take your word +embeddings or your free trained model. + +1:17:39.068 --> 1:17:47.971 +You freeze them and stack your decoder layers +and keep these ones free. + +1:17:48.748 --> 1:17:54.495 +Can also be done if you have this type of +bark model. + +1:17:54.495 --> 1:18:03.329 +What you can do is you freeze your word embeddings, +for example some products and. + +1:18:05.865 --> 1:18:17.296 +The other thing is you initialize them so +you initialize your models but you train everything + +1:18:17.296 --> 1:18:19.120 +so you're not. + +1:18:22.562 --> 1:18:29.986 +Then one thing, if you think about Bart, you +want to have the Chinese language, the Italian + +1:18:29.986 --> 1:18:32.165 +language, and the deconer. + +1:18:32.165 --> 1:18:35.716 +However, in Bart we have the same language. + +1:18:36.516 --> 1:18:46.010 +The one you get is from English, so what you +can do there is so you cannot try to do some. + +1:18:46.366 --> 1:18:52.562 +Below the barge, in order to learn some language +specific stuff, or there's a masculine barge, + +1:18:52.562 --> 1:18:58.823 +which is trained on many languages, but it's +trained only on like the Old Coast Modern Language + +1:18:58.823 --> 1:19:03.388 +House, which may be trained in German and English, +but not on German. + +1:19:03.923 --> 1:19:08.779 +So then you would still need to find June +and the model needs to learn how to better + +1:19:08.779 --> 1:19:10.721 +do the attention cross lingually. 
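The corruption schemes listed above for BART-style pretraining (masking, deletion, sentence permutation, document rotation, text infilling) work because the decoder regenerates the full sequence, so the corrupted input and the target need not have the same length. A minimal sketch of two of them over word lists; the "<mask>" symbol and the span length are my own illustrative assumptions.

```python
import random

def sentence_permutation(sentences, rng):
    """Shuffle the sentence order; the target is the document in its original order."""
    permuted = sentences[:]
    rng.shuffle(permuted)
    return permuted

def text_infilling(tokens, rng, span_len=2):
    """Replace one contiguous span with a single <mask> symbol.
    The corrupted input is shorter than the target, which only works because
    the decoder generates the output and is not tied to the input length."""
    start = rng.randrange(0, max(1, len(tokens) - span_len))
    return tokens[:start] + ["<mask>"] + tokens[start + span_len:]

rng = random.Random(0)
doc = [["the", "cat", "sat"], ["on", "the", "mat", "today"]]
print(sentence_permutation(doc, rng))
print(text_infilling(doc[1], rng))
```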
+ +1:19:10.721 --> 1:19:15.748 +It's only on the same language but it mainly +only has to learn this mapping and not all + +1:19:15.748 --> 1:19:18.775 +the rest and that's why it's still quite successful. + +1:19:21.982 --> 1:19:27.492 +Now certain thing which is very commonly used +is what is required to it as adapters. + +1:19:27.607 --> 1:19:29.754 +So for example you take and buy. + +1:19:29.709 --> 1:19:35.218 +And you put some adapters on the inside of +the networks so that it's small new layers + +1:19:35.218 --> 1:19:40.790 +which are in between put in there and then +you only train these adapters or also train + +1:19:40.790 --> 1:19:41.815 +these adapters. + +1:19:41.815 --> 1:19:47.900 +For example, an embryo you could see that +this learns to map the Sears language representation + +1:19:47.900 --> 1:19:50.334 +to the Tiger language representation. + +1:19:50.470 --> 1:19:52.395 +And then you don't have to change that luck. + +1:19:52.792 --> 1:19:59.793 +You give it extra ability to really perform +well on that. + +1:19:59.793 --> 1:20:05.225 +These are quite small and so very efficient. + +1:20:05.905 --> 1:20:12.632 +That is also very commonly used, for example +in modular systems where you have some adaptors + +1:20:12.632 --> 1:20:16.248 +in between here which might be language specific. + +1:20:16.916 --> 1:20:22.247 +So they are trained only for one language. + +1:20:22.247 --> 1:20:33.777 +The model has some or both and once has the +ability to do multilingually to share knowledge. + +1:20:34.914 --> 1:20:39.058 +But there's one chance in general in the multilingual +systems. + +1:20:39.058 --> 1:20:40.439 +It works quite well. + +1:20:40.439 --> 1:20:46.161 +There's one case or one specific use case +for multilingual where this normally doesn't + +1:20:46.161 --> 1:20:47.344 +really work well. + +1:20:47.344 --> 1:20:49.975 +Do you have an idea what that could be? + +1:20:55.996 --> 1:20:57.536 +It's for Zero Shot Cases. + +1:20:57.998 --> 1:21:03.660 +Because having here some situation with this +might be very language specific and zero shot, + +1:21:03.660 --> 1:21:09.015 +the idea is always to learn representations +view which are more language dependent and + +1:21:09.015 --> 1:21:10.184 +with the adaptors. + +1:21:10.184 --> 1:21:15.601 +Of course you get in representations again +which are more language specific and then it + +1:21:15.601 --> 1:21:17.078 +doesn't work that well. + +1:21:20.260 --> 1:21:37.730 +And there is also the idea of doing more knowledge +pistolation. + +1:21:39.179 --> 1:21:42.923 +And now the idea is okay. + +1:21:42.923 --> 1:21:54.157 +We are training it the same, but what we want +to achieve is that the encoder. + +1:21:54.414 --> 1:22:03.095 +So you should learn faster by trying to make +these states as similar as possible. + +1:22:03.095 --> 1:22:11.777 +So you compare the first-hit state of the +pre-trained model and try to make them. + +1:22:12.192 --> 1:22:18.144 +For example, by using the out two norms, so +by just making these two representations the + +1:22:18.144 --> 1:22:26.373 +same: The same vocabulary: Why does it need +the same vocabulary with any idea? + +1:22:34.754 --> 1:22:46.137 +If you have different vocabulary, it's typical +you also have different sequenced lengths here. + +1:22:46.137 --> 1:22:50.690 +The number of sequences is different. + +1:22:51.231 --> 1:22:58.888 +If you now have pipe stains and four states +here, it's no longer straightforward which + +1:22:58.888 --> 1:23:01.089 +states compare to which. 
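Adapters, as described above, are small extra layers inserted between the (frozen) layers of a pretrained model, so that only a few new parameters are trained, for example per language in a multilingual system. A minimal PyTorch sketch of a bottleneck adapter with a residual connection; the dimensions and the stand-in "pretrained layer" are illustrative assumptions.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter: project down, non-linearity, project up, add residual."""
    def __init__(self, d_model=512, d_bottleneck=64):
        super().__init__()
        self.down = nn.Linear(d_model, d_bottleneck)
        self.up = nn.Linear(d_bottleneck, d_model)
        self.act = nn.ReLU()

    def forward(self, hidden):
        return hidden + self.up(self.act(self.down(hidden)))

# Usage sketch: freeze the pretrained layer, train only the adapter parameters.
pretrained_layer = nn.Linear(512, 512)       # stand-in for a frozen transformer layer
for p in pretrained_layer.parameters():
    p.requires_grad = False

adapter = Adapter()
x = torch.randn(2, 10, 512)                   # (batch, length, d_model)
out = adapter(pretrained_layer(x))
trainable = sum(p.numel() for p in adapter.parameters() if p.requires_grad)
print(out.shape, "trainable adapter parameters:", trainable)
```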
+ +1:23:02.322 --> 1:23:05.246 +And that's just easier if you have like the +same number. + +1:23:05.246 --> 1:23:08.940 +You can always compare the first to the first +and second to the second. + +1:23:09.709 --> 1:23:16.836 +So therefore at least the very easy way of +knowledge destination only works if you have. + +1:23:17.177 --> 1:23:30.030 +Course: You could do things like yeah, the +average should be the same, but of course there's + +1:23:30.030 --> 1:23:33.071 +a less strong signal. + +1:23:34.314 --> 1:23:42.979 +But the advantage here is that you have a +diameter training signal here on the handquarter + +1:23:42.979 --> 1:23:51.455 +so you can directly make some of the encoder +already giving a good signal while normally + +1:23:51.455 --> 1:23:52.407 +an empty. + +1:23:56.936 --> 1:24:13.197 +Yes, think this is most things for today, +so what you should keep in mind is remind me. + +1:24:13.393 --> 1:24:18.400 +The one is a back translation idea. + +1:24:18.400 --> 1:24:29.561 +If you have monolingual and use that, the +other one is to: And mentally it is often helpful + +1:24:29.561 --> 1:24:33.614 +to combine them so you can even use both of +that. + +1:24:33.853 --> 1:24:38.908 +So you can use pre-trained walls, but then +you can even still do back translation where + +1:24:38.908 --> 1:24:40.057 +it's still helpful. + +1:24:40.160 --> 1:24:45.502 +We have the advantage we are training like +everything working together on the task so + +1:24:45.502 --> 1:24:51.093 +it might be helpful even to backtranslate some +data and then use it in a real translation + +1:24:51.093 --> 1:24:56.683 +setup because in pretraining of course the +beach challenge is always that you're training + +1:24:56.683 --> 1:24:57.739 +it on different. + +1:24:58.058 --> 1:25:03.327 +Different ways of how you integrate this knowledge. + +1:25:03.327 --> 1:25:08.089 +Even if you just use a full model, so in this. + +1:25:08.748 --> 1:25:11.128 +This is the most similar you can get. + +1:25:11.128 --> 1:25:13.945 +You're doing no changes to the architecture. + +1:25:13.945 --> 1:25:19.643 +You're really taking the model and just fine +tuning them on the new task, but it still has + +1:25:19.643 --> 1:25:24.026 +to completely newly learn how to do the attention +and how to do that. + +1:25:24.464 --> 1:25:29.971 +And that might be, for example, helpful to +have more back-translated data to learn them. + +1:25:32.192 --> 1:25:34.251 +That's for today. + +1:25:34.251 --> 1:25:44.661 +There's one important thing that next Tuesday +there is a conference or a workshop or so in + +1:25:44.661 --> 1:25:45.920 +this room. + +1:25:47.127 --> 1:25:56.769 +You should get an e-mail if you're in Elias +that there's a room change for Tuesdays and + +1:25:56.769 --> 1:25:57.426 +it's. + +1:25:57.637 --> 1:26:03.890 +There are more questions, yeah, have a more +general position, especially: In computer vision + +1:26:03.890 --> 1:26:07.347 +you can enlarge your data center data orientation. + +1:26:07.347 --> 1:26:08.295 +Is there any? + +1:26:08.388 --> 1:26:15.301 +It's similar to a large speech for text for +the data of an edge. + +1:26:15.755 --> 1:26:29.176 +And you can use this back translation and +also masking, but back translation is some + +1:26:29.176 --> 1:26:31.228 +way of data. + +1:26:31.371 --> 1:26:35.629 +So it has also been, for example, even its +used not only for monolingual data. + +1:26:36.216 --> 1:26:54.060 +If you have good MP system, it can also be +used for parallel data. 
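The representation-level distillation discussed just above compares the encoder's hidden states with those of a pretrained model using an L2 penalty, which is only straightforward when both models use the same vocabulary and therefore produce the same number of states. A small sketch of such a loss; shapes and variable names are assumptions for illustration.

```python
import torch

def hidden_state_distillation_loss(student_states, teacher_states):
    """Mean squared (L2) distance between corresponding hidden states.
    Assumes both models used the same tokenization, so the sequences have the
    same length and state i can be compared directly with state i."""
    assert student_states.shape == teacher_states.shape
    return torch.mean((student_states - teacher_states) ** 2)

# Illustrative tensors: batch of 2 sentences, 7 subwords, 512-dim states.
student = torch.randn(2, 7, 512, requires_grad=True)   # MT encoder states
teacher = torch.randn(2, 7, 512)                        # frozen pretrained model states
loss = hidden_state_distillation_loss(student, teacher)
loss.backward()                                         # extra training signal on the encoder
print(float(loss))
```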
+ +1:26:54.834 --> 1:26:59.139 +So would say this is the most similar one. + +1:26:59.139 --> 1:27:03.143 +There's ways you can do power phrasing. + +1:27:05.025 --> 1:27:12.057 +But for example there is very hard to do this +by rules like which words to replace because + +1:27:12.057 --> 1:27:18.936 +there is not a coup like you cannot always +say this word can always be replaced by that. + +1:27:19.139 --> 1:27:27.225 +Mean, although they are many perfect synonyms, +normally they are good in some cases, but not + +1:27:27.225 --> 1:27:29.399 +in all cases, and so on. + +1:27:29.399 --> 1:27:36.963 +And if you don't do a rule based, you have +to train your model and then the freshness. + +1:27:38.058 --> 1:27:57.236 +The same architecture as the pre-trained mount. + +1:27:57.457 --> 1:27:59.810 +Should be of the same dimension, so it's easiest +to have the same dimension. + +1:28:00.000 --> 1:28:01.590 +Architecture. + +1:28:01.590 --> 1:28:05.452 +We later will learn inefficiency. + +1:28:05.452 --> 1:28:12.948 +You can also do knowledge cessulation with, +for example, smaller. + +1:28:12.948 --> 1:28:16.469 +You can learn the same within. + +1:28:17.477 --> 1:28:22.949 +Eight layers for it so that is possible, but +yeah agree it should be of the same. + +1:28:23.623 --> 1:28:32.486 +Yeah yeah you need the question then of course +you can do it like it's an initialization or + +1:28:32.486 --> 1:28:41.157 +you can do it doing training but normally it +most makes sense during the normal training. + +1:28:45.865 --> 1:28:53.963 +Do it, then thanks a lot, and then we'll see +each other again on Tuesday. + +0:00:00.981 --> 0:00:20.036 +Today about is how to use some type of additional +resources to improve the translation. + +0:00:20.300 --> 0:00:28.188 +We have in the first part of the semester +two thirds of the semester how to build some + +0:00:28.188 --> 0:00:31.361 +of your basic machine translation. + +0:00:31.571 --> 0:00:42.317 +Now the basic components are both for statistical +and for neural, with the encoded decoding. + +0:00:43.123 --> 0:00:46.000 +Now, of course, that's not where it stops. + +0:00:46.000 --> 0:00:51.286 +It's still what nearly every machine translation +system is currently in there. + +0:00:51.286 --> 0:00:57.308 +However, there's a lot of challenges which +you need to address in addition and which need + +0:00:57.308 --> 0:00:58.245 +to be solved. + +0:00:58.918 --> 0:01:09.858 +And there we want to start to tell you what +else can you do around this, and partly. + +0:01:10.030 --> 0:01:14.396 +And one important question there is on what +do you train your models? + +0:01:14.394 --> 0:01:32.003 +Because like this type of parallel data, it's +easier in machine translation than in other + +0:01:32.003 --> 0:01:33.569 +trusts. + +0:01:33.853 --> 0:01:41.178 +And therefore an important question is, can +we also learn from like other sources and through? + +0:01:41.701 --> 0:01:47.830 +Because if you remember strongly right at +the beginning of the election,. + +0:01:51.171 --> 0:01:53.801 +This Is How We Train All Our. + +0:01:54.194 --> 0:01:59.887 +Machine learning models from statistical to +neural. + +0:01:59.887 --> 0:02:09.412 +This doesn't have changed so we need this +type of parallel data where we have a source + +0:02:09.412 --> 0:02:13.462 +sentence aligned with a target data. + +0:02:13.493 --> 0:02:19.135 +We have now a strong model here, a very good +model to do that. + +0:02:19.135 --> 0:02:22.091 +However, we always rely on this. 
+ +0:02:22.522 --> 0:02:28.437 +For languages, high risk language pairs say +from German to English or other European languages, + +0:02:28.437 --> 0:02:31.332 +there is decent amount at least for similarly. + +0:02:31.471 --> 0:02:37.630 +But even there if we are going to very specific +domains it might get difficult and then your + +0:02:37.630 --> 0:02:43.525 +system performance might drop because if you +want to translate now some medical text for + +0:02:43.525 --> 0:02:50.015 +example of course you need to also have peril +data in the medical domain to know how to translate + +0:02:50.015 --> 0:02:50.876 +these types. + +0:02:51.231 --> 0:02:55.264 +Phrases how to use the vocabulary and so on +in the style. + +0:02:55.915 --> 0:03:04.887 +And if you are going to other languages, there +is a lot bigger challenge and the question + +0:03:04.887 --> 0:03:05.585 +there. + +0:03:05.825 --> 0:03:09.649 +So is really this the only resource we can +use. + +0:03:09.889 --> 0:03:19.462 +Can be adapted or training phase in order +to also make use of other types of models that + +0:03:19.462 --> 0:03:27.314 +might enable us to build strong systems with +other types of information. + +0:03:27.707 --> 0:03:35.276 +And that we will look into now in the next +starting from from just saying the next election. + +0:03:35.515 --> 0:03:40.697 +So this idea we already have covered on Tuesday. + +0:03:40.697 --> 0:03:45.350 +One very successful idea for this is to do. + +0:03:45.645 --> 0:03:51.990 +So that we're no longer doing translation +between languages, but we can do translation + +0:03:51.990 --> 0:03:55.928 +between languages and share common knowledge +between. + +0:03:56.296 --> 0:04:03.888 +You also learned about things like zero shots +machine translation so you can translate between + +0:04:03.888 --> 0:04:06.446 +languages where you don't have. + +0:04:06.786 --> 0:04:09.790 +Which is the case for many, many language +pairs. + +0:04:10.030 --> 0:04:16.954 +Like even with German, you have not translation +parallel data to all languages around the world, + +0:04:16.954 --> 0:04:23.450 +or most of them you have it to the Europeans +once, maybe even for Japanese, so it will get + +0:04:23.450 --> 0:04:26.377 +difficult to get a really decent amount. + +0:04:26.746 --> 0:04:35.332 +There is quite a lot of data, for example +English to Japanese, but German to Japanese + +0:04:35.332 --> 0:04:37.827 +or German to Vietnamese. + +0:04:37.827 --> 0:04:41.621 +There is some data from Multilingual. + +0:04:42.042 --> 0:04:54.584 +So there is a very promising direction if +you want to build translation systems between + +0:04:54.584 --> 0:05:00.142 +language peers, typically not English. + +0:05:01.221 --> 0:05:05.887 +And the other ideas, of course, we don't have +to either just search for it. + +0:05:06.206 --> 0:05:12.505 +Some work on a data crawling so if I don't +have a corpus directly or I don't have an high + +0:05:12.505 --> 0:05:19.014 +quality corpus like from the European Parliament +for a TED corpus so maybe it makes sense to + +0:05:19.014 --> 0:05:23.913 +crawl more data and get additional sources +so you can build stronger. + +0:05:24.344 --> 0:05:35.485 +There has been quite a big effort in Europe +to collect really large data sets for parallel + +0:05:35.485 --> 0:05:36.220 +data. + +0:05:36.220 --> 0:05:40.382 +How can we do this data crawling? 
+ +0:05:40.600 --> 0:05:46.103 +There the interesting thing from the machine +translation point is not just general data + +0:05:46.103 --> 0:05:46.729 +crawling. + +0:05:47.067 --> 0:05:50.037 +But how can we explicitly crawl data? + +0:05:50.037 --> 0:05:52.070 +Which is some of a peril? + +0:05:52.132 --> 0:05:58.461 +So there is in the Internet quite a lot of +data which has been company websites which + +0:05:58.461 --> 0:06:01.626 +have been translated and things like that. + +0:06:01.626 --> 0:06:05.158 +So how can you extract them parallel fragments? + +0:06:06.566 --> 0:06:13.404 +That is typically more noisy than where you +do more at hands where mean if you have Parliament. + +0:06:13.693 --> 0:06:17.680 +You can do some rules how to extract parallel +things. + +0:06:17.680 --> 0:06:24.176 +Here there is more to it, so the quality is +later maybe not as good, but normally scale + +0:06:24.176 --> 0:06:26.908 +is then a possibility to address it. + +0:06:26.908 --> 0:06:30.304 +So you just have so much more data that even. + +0:06:33.313 --> 0:06:40.295 +The other thing can be used monolingual data +and monolingual data has a big advantage that + +0:06:40.295 --> 0:06:46.664 +we can have a huge amount of that so that you +can be autocrawed from the Internet. + +0:06:46.664 --> 0:06:51.728 +The nice thing is you can also get it typically +for many domains. + +0:06:52.352 --> 0:06:59.558 +There is just so much more magnitude of monolingual +data so that it might be very helpful. + +0:06:59.559 --> 0:07:03.054 +We can do that in statistical machine translation. + +0:07:03.054 --> 0:07:06.755 +It was quite easy to integrate using language +models. + +0:07:08.508 --> 0:07:16.912 +In neural machine translation we have the +advantage that we have this overall architecture + +0:07:16.912 --> 0:07:22.915 +that does everything together, but it has also +the disadvantage. + +0:07:23.283 --> 0:07:25.675 +We'll look today at two things. + +0:07:25.675 --> 0:07:32.925 +On the one end you can still try to do a bit +of language modeling in there and add an additional + +0:07:32.925 --> 0:07:35.168 +language model into in there. + +0:07:35.168 --> 0:07:38.232 +There is some work, one very successful. + +0:07:38.178 --> 0:07:43.764 +A way in which I think is used in most systems +at the moment is to do some scientific data. + +0:07:43.763 --> 0:07:53.087 +Is a very easy thing, but you can just translate +there and use it as training gator, and normally. + +0:07:53.213 --> 0:07:59.185 +And thereby you are able to use like some +type of monolingual a day. + +0:08:00.380 --> 0:08:05.271 +Another way to do it is unsupervised and the +extreme case. + +0:08:05.271 --> 0:08:11.158 +If you have a scenario then you only have +data, only monolingual data. + +0:08:11.158 --> 0:08:13.976 +Can you still build translations? + +0:08:14.754 --> 0:08:27.675 +If you have large amounts of data and languages +are not too dissimilar, you can build translation + +0:08:27.675 --> 0:08:31.102 +systems without parallel. + +0:08:32.512 --> 0:08:36.267 +That we will see you then next Thursday. + +0:08:37.857 --> 0:08:50.512 +And then there is now a third type of pre-trained +model that recently became very successful + +0:08:50.512 --> 0:08:55.411 +and now with large language models. + +0:08:55.715 --> 0:09:03.525 +So the idea is we are no longer sharing the +real data, but it can also help to train a + +0:09:03.525 --> 0:09:04.153 +model. 
+ +0:09:04.364 --> 0:09:11.594 +And that is now a big advantage of deep learning +based approaches. + +0:09:11.594 --> 0:09:22.169 +There you have this ability that you can train +a model in some task and then apply it to another. + +0:09:22.722 --> 0:09:33.405 +And then, of course, the question is, can +I have an initial task where there's huge amounts + +0:09:33.405 --> 0:09:34.450 +of data? + +0:09:34.714 --> 0:09:40.251 +And the test that typically you pre train +on is more like similar to a language moral + +0:09:40.251 --> 0:09:45.852 +task either direct to a language moral task +or like a masking task which is related so + +0:09:45.852 --> 0:09:51.582 +the idea is oh I can train on this data and +the knowledge about words how they relate to + +0:09:51.582 --> 0:09:53.577 +each other I can use in there. + +0:09:53.753 --> 0:10:00.276 +So it's a different way of using language +models. + +0:10:00.276 --> 0:10:06.276 +There's more transfer learning at the end +of. + +0:10:09.029 --> 0:10:17.496 +So first we will start with how can we use +monolingual data to do a Yeah to do a machine + +0:10:17.496 --> 0:10:18.733 +translation? + +0:10:20.040 --> 0:10:27.499 +That: Big difference is you should remember +from what I mentioned before is. + +0:10:27.499 --> 0:10:32.783 +In statistical machine translation we directly +have the opportunity. + +0:10:32.783 --> 0:10:39.676 +There's peril data for the translation model +and monolingual data for the language model. + +0:10:39.679 --> 0:10:45.343 +And you combine your translation model and +language model, and then you can make use of + +0:10:45.343 --> 0:10:45.730 +both. + +0:10:46.726 --> 0:10:53.183 +That you can make use of these large large +amounts of monolingual data, but of course + +0:10:53.183 --> 0:10:55.510 +it has also some disadvantage. + +0:10:55.495 --> 0:11:01.156 +Because we say the problem is we are optimizing +both parts a bit independently to each other + +0:11:01.156 --> 0:11:06.757 +and we say oh yeah the big disadvantage of +newer machine translations now we are optimizing + +0:11:06.757 --> 0:11:10.531 +the overall architecture everything together +to perform best. + +0:11:10.890 --> 0:11:16.994 +And then, of course, we can't do there, so +Leo we can can only do a mural like use power + +0:11:16.994 --> 0:11:17.405 +data. + +0:11:17.897 --> 0:11:28.714 +So the question is, but this advantage is +not so important that we can train everything, + +0:11:28.714 --> 0:11:35.276 +but we have a moral legal data or even small +amounts. + +0:11:35.675 --> 0:11:43.102 +So in data we know it's not only important +the amount of data we have but also like how + +0:11:43.102 --> 0:11:50.529 +similar it is to your test data so it can be +that this modeling data is quite small but + +0:11:50.529 --> 0:11:55.339 +it's very well fitting and then it's still +very helpful. + +0:11:55.675 --> 0:12:02.691 +At the first year of surprisingness, if we +are here successful with integrating a language + +0:12:02.691 --> 0:12:09.631 +model into a translation system, maybe we can +also integrate some type of language models + +0:12:09.631 --> 0:12:14.411 +into our empty system in order to make it better +and perform. + +0:12:16.536 --> 0:12:23.298 +The first thing we can do is we know there +is language models, so let's try to integrate. + +0:12:23.623 --> 0:12:31.096 +There was our language model because these +works were mainly done before transformer-based + +0:12:31.096 --> 0:12:31.753 +models. 
+ +0:12:32.152 --> 0:12:38.764 +In general, of course, you can do the same +thing with transformer baseball. + +0:12:38.764 --> 0:12:50.929 +There is nothing about whether: It's just +that it has mainly been done before people + +0:12:50.929 --> 0:13:01.875 +started using R&S and they tried to do +this more in cases. + +0:13:07.087 --> 0:13:22.938 +So what we're happening here is in some of +this type of idea, and in key system you remember + +0:13:22.938 --> 0:13:25.495 +the attention. + +0:13:25.605 --> 0:13:29.465 +Gets it was your last in this day that you +calculate easy attention. + +0:13:29.729 --> 0:13:36.610 +We get the context back, then combine both +and then base the next in state and then predict. + +0:13:37.057 --> 0:13:42.424 +So this is our system, and the question is, +can we send our integrated language model? + +0:13:42.782 --> 0:13:49.890 +And somehow it makes sense to take out a neural +language model because we are anyway in the + +0:13:49.890 --> 0:13:50.971 +neural space. + +0:13:50.971 --> 0:13:58.465 +It's not surprising that it contrasts to statistical +work used and grants it might make sense to + +0:13:58.465 --> 0:14:01.478 +take a bit of a normal language model. + +0:14:01.621 --> 0:14:06.437 +And there would be something like on Tubbles +Air, a neural language model, and our man based + +0:14:06.437 --> 0:14:11.149 +is you have a target word, you put it in, you +get a new benchmark, and then you always put + +0:14:11.149 --> 0:14:15.757 +in the words and get new hidden states, and +you can do some predictions at the output to + +0:14:15.757 --> 0:14:16.948 +predict the next word. + +0:14:17.597 --> 0:14:26.977 +So if we're having this type of in language +model, there's like two main questions we have + +0:14:26.977 --> 0:14:34.769 +to answer: So how do we combine now on the +one hand our system and on the other hand our + +0:14:34.769 --> 0:14:35.358 +model? + +0:14:35.358 --> 0:14:42.004 +You see that was mentioned before when we +started talking about ENCODA models. + +0:14:42.004 --> 0:14:45.369 +They can be viewed as a language model. + +0:14:45.805 --> 0:14:47.710 +The wine is lengthened, unconditioned. + +0:14:47.710 --> 0:14:49.518 +It's just modeling the target sides. + +0:14:49.970 --> 0:14:56.963 +And the other one is a conditional language +one, which is a language one conditioned on + +0:14:56.963 --> 0:14:57.837 +the Sewer. + +0:14:58.238 --> 0:15:03.694 +So how can you combine to language models? + +0:15:03.694 --> 0:15:14.860 +Of course, it's like the translation model +will be more important because it has access + +0:15:14.860 --> 0:15:16.763 +to the source. + +0:15:18.778 --> 0:15:22.571 +If we have that, the other question is okay. + +0:15:22.571 --> 0:15:24.257 +Now we have models. + +0:15:24.257 --> 0:15:25.689 +How do we train? + +0:15:26.026 --> 0:15:30.005 +Pickers integrated them. + +0:15:30.005 --> 0:15:34.781 +We have now two sets of data. + +0:15:34.781 --> 0:15:42.741 +We have parallel data where you can do the +lower. + +0:15:44.644 --> 0:15:53.293 +So the first idea is we can do something more +like a parallel combination. + +0:15:53.293 --> 0:15:55.831 +We just keep running. + +0:15:56.036 --> 0:15:59.864 +So here you see your system that is running. + +0:16:00.200 --> 0:16:09.649 +It's normally completely independent of your +language model, which is up there, so down + +0:16:09.649 --> 0:16:13.300 +here we have just our NMT system. 
+ +0:16:13.313 --> 0:16:26.470 +The only thing which is used is we have the +words, and of course they are put into both + +0:16:26.470 --> 0:16:30.059 +systems, and out there. + +0:16:30.050 --> 0:16:42.221 +So we use them somehow for both, and then +we are doing our decision just by merging these + +0:16:42.221 --> 0:16:42.897 +two. + +0:16:43.343 --> 0:16:53.956 +So there can be, for example, we are doing +a probability distribution here, and then we + +0:16:53.956 --> 0:17:03.363 +are taking the average of post-perability distribution +to do our predictions. + +0:17:11.871 --> 0:17:18.923 +You could also take the output with Steve's +to be more in chore about the mixture. + +0:17:20.000 --> 0:17:32.896 +Yes, you could also do that, so it's more +like engaging mechanisms that you're not doing. + +0:17:32.993 --> 0:17:41.110 +Another one would be cochtrinate the hidden +states, and then you would have another layer + +0:17:41.110 --> 0:17:41.831 +on top. + +0:17:43.303 --> 0:17:56.889 +You think about if you do the conqueredination +instead of taking the instead and then merging + +0:17:56.889 --> 0:18:01.225 +the probability distribution. + +0:18:03.143 --> 0:18:16.610 +Introduce many new parameters, and these parameters +have somehow something special compared to + +0:18:16.610 --> 0:18:17.318 +the. + +0:18:23.603 --> 0:18:37.651 +So before all the error other parameters can +be trained independent, the language model + +0:18:37.651 --> 0:18:42.121 +can be trained independent. + +0:18:43.043 --> 0:18:51.749 +If you have a joint layer, of course you need +to train them because you have now inputs. + +0:18:54.794 --> 0:19:02.594 +Not surprisingly, if you have a parallel combination +of whether you could, the other way is to do + +0:19:02.594 --> 0:19:04.664 +more serial combinations. + +0:19:04.924 --> 0:19:10.101 +How can you do a similar combination? + +0:19:10.101 --> 0:19:18.274 +Your final decision makes sense to do a face +on the system. + +0:19:18.438 --> 0:19:20.996 +So you have on top of your normal and system. + +0:19:21.121 --> 0:19:30.678 +The only thing is now you're inputting into +your system. + +0:19:30.678 --> 0:19:38.726 +You're no longer inputting the word embeddings. + +0:19:38.918 --> 0:19:45.588 +So you're training your mainly what you have +your lower layers here which are trained more + +0:19:45.588 --> 0:19:52.183 +on the purely language model style and then +on top your putting into the NMT system where + +0:19:52.183 --> 0:19:55.408 +it now has already here the language model. + +0:19:55.815 --> 0:19:58.482 +So here you can also view it. + +0:19:58.482 --> 0:20:06.481 +Here you have more contextual embeddings which +no longer depend only on the word but they + +0:20:06.481 --> 0:20:10.659 +also depend on the context of the target site. + +0:20:11.051 --> 0:20:19.941 +But you have more understanding of the source +word, so you have a language in the current + +0:20:19.941 --> 0:20:21.620 +target sentence. + +0:20:21.881 --> 0:20:27.657 +So if it's like the word can, for example, +will be put in here always the same independent + +0:20:27.657 --> 0:20:31.147 +of its user can of beans, or if it's like I +can do it. + +0:20:31.147 --> 0:20:37.049 +However, because you are having your language +model style, you have maybe disintegrated this + +0:20:37.049 --> 0:20:40.984 +already a bit, and you give this information +directly to the. + +0:20:41.701 --> 0:20:43.095 +An empty cyst. 
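The "parallel combination" described above runs the NMT system and a language model side by side and merges their next-word distributions, for example by averaging them. A toy sketch of that merge, together with the weighted log-linear variant often called shallow fusion in the literature; the weight lambda and the tensor sizes are my own illustrative choices.

```python
import torch
import torch.nn.functional as F

# Toy next-word scores from the two independently running models.
vocab_size = 8
nmt_logits = torch.randn(vocab_size)   # conditioned on source + target history
lm_logits = torch.randn(vocab_size)    # conditioned on target history only

p_nmt = F.softmax(nmt_logits, dim=-1)
p_lm = F.softmax(lm_logits, dim=-1)

# Variant 1 (as in the lecture): average the two probability distributions.
p_avg = 0.5 * (p_nmt + p_lm)

# Variant 2 (shallow fusion): weighted sum in log space; lambda controls how
# much the language model is trusted relative to the translation model.
lam = 0.3
log_p_fused = torch.log(p_nmt) + lam * torch.log(p_lm)

print("averaged pick:", int(torch.argmax(p_avg)))
print("fused pick:   ", int(torch.argmax(log_p_fused)))
```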
+ +0:20:44.364 --> 0:20:49.850 +You, if you're remembering more the transformer +based approach, you have some layers. + +0:20:49.850 --> 0:20:55.783 +The lower layers are purely languaged while +the other ones are with attention to the source. + +0:20:55.783 --> 0:21:01.525 +So you can view it also that you just have +lower layers which don't attend to the source. + +0:21:02.202 --> 0:21:07.227 +This is purely a language model, and then +at some point you're starting to attend to + +0:21:07.227 --> 0:21:08.587 +the source and use it. + +0:21:13.493 --> 0:21:20.781 +Yes, so this is how you combine them in peril +or first do the language model and then do. + +0:21:23.623 --> 0:21:26.147 +Questions for the integration. + +0:21:31.831 --> 0:21:35.034 +Not really sure about the input of the. + +0:21:35.475 --> 0:21:38.102 +Model, and in this case in the sequence. + +0:21:38.278 --> 0:21:54.854 +Case so the actual word that we transferred +into a numerical lecture, and this is an input. + +0:21:56.176 --> 0:22:03.568 +That depends on if you view the word embedding +as part of the language model. + +0:22:03.568 --> 0:22:10.865 +So if you first put the word target word then +you do the one hot end coding. + +0:22:11.691 --> 0:22:13.805 +And then the word embedding there is the r& + +0:22:13.805 --> 0:22:13.937 +n. + +0:22:14.314 --> 0:22:21.035 +So you can use this together as your language +model when you first do the word embedding. + +0:22:21.401 --> 0:22:24.346 +All you can say is like before. + +0:22:24.346 --> 0:22:28.212 +It's more a definition, but you're right. + +0:22:28.212 --> 0:22:30.513 +So what's the steps out? + +0:22:30.513 --> 0:22:36.128 +You take the word, the one hut encoding, the +word embedding. + +0:22:36.516 --> 0:22:46.214 +What one of these parrots, you know, called +a language model is definition wise and not + +0:22:46.214 --> 0:22:47.978 +that important. + +0:22:53.933 --> 0:23:02.264 +So the question is how can you then train +them and make this this one work? + +0:23:02.264 --> 0:23:02.812 +The. + +0:23:03.363 --> 0:23:15.201 +So in the case where you combine the language +one of the abilities you can train them independently + +0:23:15.201 --> 0:23:18.516 +and just put them together. + +0:23:18.918 --> 0:23:27.368 +Might not be the best because we have no longer +the stability that we had before that optimally + +0:23:27.368 --> 0:23:29.128 +performed together. + +0:23:29.128 --> 0:23:33.881 +It's not clear if they really work the best +together. + +0:23:34.514 --> 0:23:41.585 +At least you need to somehow find how much +do you trust the one model and how much. + +0:23:43.323 --> 0:23:45.058 +Still in some cases useful. + +0:23:45.058 --> 0:23:48.530 +It might be helpful if you have only data +and software. + +0:23:48.928 --> 0:23:59.064 +However, in MT we have one specific situation +that at least for the MT part parallel is also + +0:23:59.064 --> 0:24:07.456 +always monolingual data, so what we definitely +can do is train the language. + +0:24:08.588 --> 0:24:18.886 +So what we also can do is more like the pre-training +approach. + +0:24:18.886 --> 0:24:24.607 +We first train the language model. + +0:24:24.704 --> 0:24:27.334 +The pre-training approach. + +0:24:27.334 --> 0:24:33.470 +You first train on the monolingual data and +then you join the. + +0:24:33.933 --> 0:24:41.143 +Of course, the model size is this way, but +the data size is too bigly the other way around. 
+ +0:24:41.143 --> 0:24:47.883 +You often have a lot more monolingual data +than you have here parallel data, in which + +0:24:47.883 --> 0:24:52.350 +scenario can you imagine where this type of +pretraining? + +0:24:56.536 --> 0:24:57.901 +Any Ideas. + +0:25:04.064 --> 0:25:12.772 +One example where this might also be helpful +if you want to adapt to domains. + +0:25:12.772 --> 0:25:22.373 +So let's say you do medical sentences and +if you want to translate medical sentences. + +0:25:23.083 --> 0:25:26.706 +In this case it could be or its most probable +happen. + +0:25:26.706 --> 0:25:32.679 +You're learning here up there what medical +means, but in your fine tuning step the model + +0:25:32.679 --> 0:25:38.785 +is forgotten everything about Medicare, so +you may be losing all the information you gain. + +0:25:39.099 --> 0:25:42.366 +So this type of priest training step is good. + +0:25:42.366 --> 0:25:47.978 +If your pretraining data is more general, +very large and then you're adapting. + +0:25:48.428 --> 0:25:56.012 +But in the task with moral lingual data, which +should be used to adapt the system to some + +0:25:56.012 --> 0:25:57.781 +general topic style. + +0:25:57.817 --> 0:26:06.795 +Then, of course, this is not a good strategy +because you might forgot about everything up + +0:26:06.795 --> 0:26:09.389 +there and you don't have. + +0:26:09.649 --> 0:26:14.678 +So then you have to check what you can do +for them. + +0:26:14.678 --> 0:26:23.284 +You can freeze this part and change it any +more so you don't lose the ability or you can + +0:26:23.284 --> 0:26:25.702 +do a direct combination. + +0:26:25.945 --> 0:26:31.028 +Where you jointly train both of them, so you +train the NMT system on the, and then you train + +0:26:31.028 --> 0:26:34.909 +the language model always in parallels so that +you don't forget about. + +0:26:35.395 --> 0:26:37.684 +And what you learn of the length. + +0:26:37.937 --> 0:26:46.711 +Depends on what you want to combine because +it's large data and you have a good general + +0:26:46.711 --> 0:26:48.107 +knowledge in. + +0:26:48.548 --> 0:26:55.733 +Then you normally don't really forget it because +it's also in the or you use it to adapt to + +0:26:55.733 --> 0:26:57.295 +something specific. + +0:26:57.295 --> 0:26:58.075 +Then you. + +0:27:01.001 --> 0:27:06.676 +Then this is a way of how we can make use +of monolingual data. + +0:27:07.968 --> 0:27:12.116 +It seems to be the easiest one somehow. + +0:27:12.116 --> 0:27:20.103 +It's more similar to what we are doing with +statistical machine translation. + +0:27:21.181 --> 0:27:31.158 +Normally always beats this type of model, +which in some view can be like from the conceptual + +0:27:31.158 --> 0:27:31.909 +thing. + +0:27:31.909 --> 0:27:36.844 +It's even easier from the computational side. + +0:27:40.560 --> 0:27:42.078 +And the idea is OK. + +0:27:42.078 --> 0:27:49.136 +We have monolingual data that we just translate +and then generate some type of parallel data + +0:27:49.136 --> 0:27:50.806 +and use that then to. + +0:27:51.111 --> 0:28:00.017 +So if you want to build a German-to-English +system first, take the large amount of data + +0:28:00.017 --> 0:28:02.143 +you have translated. + +0:28:02.402 --> 0:28:10.446 +Then you have more peril data and the interesting +thing is if you then train on the joint thing + +0:28:10.446 --> 0:28:18.742 +or on the original peril data and on what is +artificial where you have generated the translations. 
+ +0:28:18.918 --> 0:28:26.487 +So you can because you are not doing the same +era all the times and you have some knowledge. + +0:28:28.028 --> 0:28:43.199 +With this first approach, however, there is +one issue why it might not work the best. + +0:28:49.409 --> 0:28:51.177 +Very a bit shown in the image to you. + +0:28:53.113 --> 0:28:58.153 +You trade on that quality data. + +0:28:58.153 --> 0:29:02.563 +Here is a bit of a problem. + +0:29:02.563 --> 0:29:08.706 +Your English style is not really good. + +0:29:08.828 --> 0:29:12.213 +And as you're saying, the system always mistranslates. + +0:29:13.493 --> 0:29:19.798 +Something then you will learn that this is +correct because now it's a training game and + +0:29:19.798 --> 0:29:23.022 +you will encourage it to make it more often. + +0:29:23.022 --> 0:29:29.614 +So the problem with training on your own areas +yeah you might prevent some areas you rarely + +0:29:29.614 --> 0:29:29.901 +do. + +0:29:30.150 --> 0:29:31.749 +But errors use systematically. + +0:29:31.749 --> 0:29:34.225 +Do you even enforce more and will even do +more? + +0:29:34.654 --> 0:29:40.145 +So that might not be the best solution to +have any idea how you could do it better. + +0:29:44.404 --> 0:29:57.754 +Is one way there is even a bit of more simple +idea. + +0:30:04.624 --> 0:30:10.975 +The problem is yeah, the translations are +not perfect, so the output and you're learning + +0:30:10.975 --> 0:30:12.188 +something wrong. + +0:30:12.188 --> 0:30:17.969 +Normally it's less bad if your inputs are +not bad, but your outputs are perfect. + +0:30:18.538 --> 0:30:24.284 +So if your inputs are wrong you may learn +that if you're doing this wrong input you're + +0:30:24.284 --> 0:30:30.162 +generating something correct, but you're not +learning to generate something which is not + +0:30:30.162 --> 0:30:30.756 +correct. + +0:30:31.511 --> 0:30:47.124 +So often the case it is that it is more important +than your target is correct. + +0:30:47.347 --> 0:30:52.182 +But you can assume in your application scenario +you hope that you may only get correct inputs. + +0:30:52.572 --> 0:31:02.535 +So that is not harming you, and in machine +translation we have one very nice advantage: + +0:31:02.762 --> 0:31:04.648 +And also the other way around. + +0:31:04.648 --> 0:31:10.062 +It's a very similar task, so there's a task +to translate from German to English, but the + +0:31:10.062 --> 0:31:13.894 +task to translate from English to German is +very similar, and. + +0:31:14.094 --> 0:31:19.309 +So what we can do is we can just switch it +initially and generate the data the other way + +0:31:19.309 --> 0:31:19.778 +around. + +0:31:20.120 --> 0:31:25.959 +So what we are doing here is we are starting +with an English to German system. + +0:31:25.959 --> 0:31:32.906 +Then we are translating the English data into +German where the German is maybe not very nice. + +0:31:33.293 --> 0:31:51.785 +And then we are training on our original data +and on the back translated data. + +0:31:52.632 --> 0:32:02.332 +So here we have the advantage that our target +side is human quality and only the input. + +0:32:03.583 --> 0:32:08.113 +Then this helps us to get really good. + +0:32:08.113 --> 0:32:15.431 +There is one difference if you think about +the data resources. + +0:32:21.341 --> 0:32:27.336 +Too obvious here we need a target site monolingual +layer. + +0:32:27.336 --> 0:32:31.574 +In the first example we had source site. 
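The back-translation procedure described above, for improving a German-to-English system, uses an existing English-to-German system to turn English monolingual text into synthetic (possibly noisy) German sources, and then trains on the original parallel data plus the synthetic pairs, so that the target side stays human quality. A pseudocode-style sketch; `translate_en_de` and the tiny data lists are placeholders, not real corpora or APIs.

```python
def translate_en_de(sentence):
    """Placeholder for an already trained English->German model."""
    return "<synthetic German for: " + sentence + ">"

parallel_de_en = [("Das ist ein Test.", "This is a test.")]
monolingual_en = ["The weather is nice today.", "I like machine translation."]

# 1) Back-translate the target-side monolingual data into the source language.
synthetic_de_en = [(translate_en_de(en), en) for en in monolingual_en]

# 2) Train German->English on real plus synthetic pairs. The synthetic source
#    side may be noisy, but every target sentence is genuine English, which is
#    the side the model learns to generate.
training_data = parallel_de_en + synthetic_de_en
for de, en in training_data:
    print(de, "->", en)
```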
+ +0:32:31.931 --> 0:32:45.111 +So back translation is normally working if +you have target size peril later and not search + +0:32:45.111 --> 0:32:48.152 +side modeling later. + +0:32:48.448 --> 0:32:56.125 +Might be also, like if you think about it, +understand a little better to understand the + +0:32:56.125 --> 0:32:56.823 +target. + +0:32:57.117 --> 0:33:01.469 +On the source side you have to understand +the content. + +0:33:01.469 --> 0:33:08.749 +On the target side you have to generate really +sentences and somehow it's more difficult to + +0:33:08.749 --> 0:33:12.231 +generate something than to only understand. + +0:33:17.617 --> 0:33:30.734 +This works well if you have to select how +many back translated data do you use. + +0:33:31.051 --> 0:33:32.983 +Because only there's like a lot more. + +0:33:33.253 --> 0:33:42.136 +Question: Should take all of my data there +is two problems with it? + +0:33:42.136 --> 0:33:51.281 +Of course it's expensive because you have +to translate all this data. + +0:33:51.651 --> 0:34:00.946 +So if you don't know the normal good starting +point is to take equal amount of data as many + +0:34:00.946 --> 0:34:02.663 +back translated. + +0:34:02.963 --> 0:34:04.673 +It depends on the used case. + +0:34:04.673 --> 0:34:08.507 +If we have very few data here, it makes more +sense to have more. + +0:34:08.688 --> 0:34:15.224 +Depends on how good your quality is here, +so the better the more data you might use because + +0:34:15.224 --> 0:34:16.574 +quality is better. + +0:34:16.574 --> 0:34:22.755 +So it depends on a lot of things, but your +rule of sum is like which general way often + +0:34:22.755 --> 0:34:24.815 +is to have equal amounts of. + +0:34:26.646 --> 0:34:29.854 +And you can, of course, do that now. + +0:34:29.854 --> 0:34:34.449 +I said already that it's better to have the +quality. + +0:34:34.449 --> 0:34:38.523 +At the end, of course, depends on this system. + +0:34:38.523 --> 0:34:46.152 +Also, because the better this system is, the +better your synthetic data is, the better. + +0:34:47.207 --> 0:34:50.949 +That leads to what is referred to as iterated +back translation. + +0:34:51.291 --> 0:34:56.917 +So you play them on English to German, and +you translate the data on. + +0:34:56.957 --> 0:35:03.198 +Then you train a model on German to English +with the additional data. + +0:35:03.198 --> 0:35:09.796 +Then you translate German data and then you +train to gain your first one. + +0:35:09.796 --> 0:35:14.343 +So in the second iteration this quality is +better. + +0:35:14.334 --> 0:35:19.900 +System is better because it's not only trained +on the small data but additionally on back + +0:35:19.900 --> 0:35:22.003 +translated data with this system. + +0:35:22.442 --> 0:35:24.458 +And so you can get better. + +0:35:24.764 --> 0:35:28.053 +However, typically you can stop quite early. + +0:35:28.053 --> 0:35:35.068 +Maybe one iteration is good, but then you +have diminishing gains after two or three iterations. + +0:35:35.935 --> 0:35:46.140 +There is very slight difference because you +need a quite big difference in the quality + +0:35:46.140 --> 0:35:46.843 +here. + +0:35:47.207 --> 0:36:02.262 +Language is also good because it means you +can already train it with relatively bad profiles. + +0:36:03.723 --> 0:36:10.339 +It's a design decision would advise so guess +because it's easy to get it. 
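Iterated back-translation, as sketched above, alternately retrains the two directions and regenerates the synthetic data with the improved reverse model, usually with diminishing returns after one or two rounds. A schematic loop; `train` and `translate_corpus` are placeholders standing in for full NMT training and decoding pipelines.

```python
def train(pairs):
    """Placeholder: pretend to train an MT model on (source, target) pairs."""
    return {"size": len(pairs)}

def translate_corpus(model, sentences):
    """Placeholder: pretend to decode a monolingual corpus with the given model."""
    return [f"<hyp:{s}>" for s in sentences]

parallel = [("Das ist ein Test.", "This is a test.")]     # (German, English)
mono_de = ["Heute scheint die Sonne."]
mono_en = ["The talk starts at noon."]

en_de = train([(en, de) for de, en in parallel])           # reverse direction first
for round_ in range(2):                                    # gains usually flatten out quickly
    # Back-translate English monolingual data into synthetic German sources.
    syn_de = translate_corpus(en_de, mono_en)
    de_en = train(parallel + list(zip(syn_de, mono_en)))   # genuine English targets
    # Mirror step: improve en->de with synthetic English sources, genuine German targets.
    syn_en = translate_corpus(de_en, mono_de)
    en_de = train([(en, de) for de, en in parallel] + list(zip(syn_en, mono_de)))
```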
+ +0:36:10.550 --> 0:36:20.802 +Replace that because you have a higher quality +real data, but then I think normally it's okay + +0:36:20.802 --> 0:36:22.438 +to replace it. + +0:36:22.438 --> 0:36:28.437 +I would assume it's not too much of a difference, +but. + +0:36:34.414 --> 0:36:42.014 +That's about like using monolingual data before +we go into the pre-train models to have any + +0:36:42.014 --> 0:36:43.005 +more crash. + +0:36:49.029 --> 0:36:55.740 +Yes, so the other thing which we can do and +which is recently more and more successful + +0:36:55.740 --> 0:37:02.451 +and even more successful since we have this +really large language models where you can + +0:37:02.451 --> 0:37:08.545 +even do the translation task with this is the +way of using pre-trained models. + +0:37:08.688 --> 0:37:16.135 +So you learn a representation of one task, +and then you use this representation from another. + +0:37:16.576 --> 0:37:26.862 +It was made maybe like one of the first words +where it really used largely is doing something + +0:37:26.862 --> 0:37:35.945 +like a bird which you pre trained on purely +text era and you take it in fine tune. + +0:37:36.496 --> 0:37:42.953 +And one big advantage, of course, is that +people can only share data but also pre-trained. + +0:37:43.423 --> 0:37:59.743 +The recent models and the large language ones +which are available. + +0:37:59.919 --> 0:38:09.145 +Where I think it costs several millions to +train them all, just if you would buy the GPUs + +0:38:09.145 --> 0:38:15.397 +from some cloud company and train that the +cost of training. + +0:38:15.475 --> 0:38:21.735 +And guess as a student project you won't have +the budget to like build these models. + +0:38:21.801 --> 0:38:24.598 +So another idea is what you can do is okay. + +0:38:24.598 --> 0:38:27.330 +Maybe if these months are once available,. + +0:38:27.467 --> 0:38:36.598 +Can take them and use them as an also resource +similar to pure text, and you can now build + +0:38:36.598 --> 0:38:44.524 +models which somehow learn not only from from +data but also from other models. + +0:38:44.844 --> 0:38:49.127 +So it's a quite new way of thinking of how +to train. + +0:38:49.127 --> 0:38:53.894 +We are not only learning from examples, but +we might also. + +0:38:54.534 --> 0:39:05.397 +The nice thing is that this type of training +where we are not learning directly from data + +0:39:05.397 --> 0:39:07.087 +but learning. + +0:39:07.427 --> 0:39:17.647 +So the main idea this go is you have a person +initial task. + +0:39:17.817 --> 0:39:26.369 +And if you're working with anLP, that means +you're training pure taxator because that's + +0:39:26.369 --> 0:39:30.547 +where you have the largest amount of data. + +0:39:30.951 --> 0:39:35.854 +And then you're defining some type of task +in order to you do your creek training. + +0:39:36.176 --> 0:39:43.092 +And: The typical task you can train on on +that is like the language waddling task. + +0:39:43.092 --> 0:39:50.049 +So to predict the next word or we have a related +task to predict something in between, we'll + +0:39:50.049 --> 0:39:52.667 +see depending on the architecture. + +0:39:52.932 --> 0:39:58.278 +But somehow to predict something which you +have not in the input is a task which is easy + +0:39:58.278 --> 0:40:00.740 +to generate, so you just need your data. + +0:40:00.740 --> 0:40:06.086 +That's why it's called self supervised, so +you're creating your supervised pending data. + +0:40:06.366 --> 0:40:07.646 +By yourself. 
+ +0:40:07.646 --> 0:40:15.133 +On the other hand, you need a lot of knowledge +and that is the other thing. + +0:40:15.735 --> 0:40:24.703 +Because there is this idea that the meaning +of a word heavily depends on the context that. + +0:40:25.145 --> 0:40:36.846 +So can give you a sentence with some giverish +word and there's some name and although you've + +0:40:36.846 --> 0:40:41.627 +never heard the name you will assume. + +0:40:42.062 --> 0:40:44.149 +And exactly the same thing. + +0:40:44.149 --> 0:40:49.143 +The models can also learn something about +the world by just using. + +0:40:49.649 --> 0:40:53.651 +So that is typically the mule. + +0:40:53.651 --> 0:40:59.848 +Then we can use this model to train the system. + +0:41:00.800 --> 0:41:03.368 +Course we might need to adapt the system. + +0:41:03.368 --> 0:41:07.648 +To do that we have to change the architecture +we might use only some. + +0:41:07.627 --> 0:41:09.443 +Part of the pre-trained model. + +0:41:09.443 --> 0:41:14.773 +In there we have seen that a bit already in +the R&N case you can also see that we have + +0:41:14.773 --> 0:41:17.175 +also mentioned the pre-training already. + +0:41:17.437 --> 0:41:22.783 +So you can use the R&N as one of these +approaches. + +0:41:22.783 --> 0:41:28.712 +You train the R&M language more on large +pre-train data. + +0:41:28.712 --> 0:41:32.309 +Then you put it somewhere into your. + +0:41:33.653 --> 0:41:37.415 +So this gives you the ability to really do +these types of tests. + +0:41:37.877 --> 0:41:53.924 +So you can build a system which is knowledge, +which is just trained on large amounts of data. + +0:41:56.376 --> 0:42:01.564 +So the question is maybe what type of information +so what type of models can you? + +0:42:01.821 --> 0:42:05.277 +And we want today to look at briefly at swings. + +0:42:05.725 --> 0:42:08.850 +That was what was initially done. + +0:42:08.850 --> 0:42:17.213 +It wasn't as famous as in machine translation +as in other things, but it's also used there + +0:42:17.213 --> 0:42:21.072 +and that is to use static word embedding. + +0:42:21.221 --> 0:42:28.981 +So we have this mapping from the one hot to +a small continuous word representation. + +0:42:29.229 --> 0:42:38.276 +Using this one in your NG system, so you can, +for example, replace the embedding layer by + +0:42:38.276 --> 0:42:38.779 +the. + +0:42:39.139 --> 0:42:41.832 +That is helpful to be a really small amount +of data. + +0:42:42.922 --> 0:42:48.517 +And we're always in this pre-training phase +and have the thing the advantage is. + +0:42:48.468 --> 0:42:52.411 +More data than the trade off, so you can get +better. + +0:42:52.411 --> 0:42:59.107 +The disadvantage is, does anybody have an +idea of what might be the disadvantage of using + +0:42:59.107 --> 0:43:00.074 +things like. + +0:43:04.624 --> 0:43:12.175 +What was one mentioned today giving like big +advantage of the system compared to previous. + +0:43:20.660 --> 0:43:25.134 +Where one advantage was the enter end training, +so you have the enter end training so that + +0:43:25.134 --> 0:43:27.937 +all parameters and all components play optimal +together. + +0:43:28.208 --> 0:43:33.076 +If you know pre-train something on one fast, +it may be no longer optimal fitting to everything + +0:43:33.076 --> 0:43:33.384 +else. + +0:43:33.893 --> 0:43:37.862 +So what do pretending or not? + +0:43:37.862 --> 0:43:48.180 +It depends on how important everything is +optimal together and how important. 
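For the option discussed above, plugging pretrained static word embeddings into the translation model's embedding layer, the practical choice is between freezing them (keeping the pretrained knowledge but giving up full end-to-end training) and fine-tuning them with the rest of the system. A PyTorch sketch with a random matrix standing in for vectors that would in practice be loaded from a pretrained model.

```python
import torch
import torch.nn as nn

vocab_size, emb_dim = 10000, 300
# Stand-in for pretrained vectors (e.g. from a word2vec-style model); here
# random, in practice read from a file.
pretrained_vectors = torch.randn(vocab_size, emb_dim)

# Option A: initialize the embedding layer and keep it frozen.
frozen_emb = nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)

# Option B: initialize with the same vectors but fine-tune them together with
# the rest of the NMT system (end-to-end, at the risk of drifting away from
# the pretrained solution when the parallel data is small).
tuned_emb = nn.Embedding.from_pretrained(pretrained_vectors.clone(), freeze=False)

ids = torch.tensor([[1, 5, 42]])
print(frozen_emb(ids).shape, frozen_emb.weight.requires_grad)
print(tuned_emb(ids).shape, tuned_emb.weight.requires_grad)
```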
+ +0:43:48.388 --> 0:43:51.874 +Is a iquality of large amount. + +0:43:51.874 --> 0:44:00.532 +The pre-change one is so much better that +it's helpful and the advantage of. + +0:44:00.600 --> 0:44:11.211 +Getting everything optimal together, yes, +we would use random instructions for raising. + +0:44:11.691 --> 0:44:26.437 +The problem is you might be already in some +area where it's not easy to get. + +0:44:26.766 --> 0:44:35.329 +But often in some way right, so often it's +not about your really worse pre trained monolepsy. + +0:44:35.329 --> 0:44:43.254 +If you're going already in some direction, +and if this is not really optimal for you,. + +0:44:43.603 --> 0:44:52.450 +But if you're not really getting better because +you have a decent amount of data, it's so different + +0:44:52.450 --> 0:44:52.981 +that. + +0:44:53.153 --> 0:44:59.505 +Initially it wasn't a machine translation +done so much because there are more data in + +0:44:59.505 --> 0:45:06.153 +MPs than in other tasks, but now with really +large amounts of monolingual data we do some + +0:45:06.153 --> 0:45:09.403 +type of pretraining in currently all state. + +0:45:12.632 --> 0:45:14.302 +The other one is okay now. + +0:45:14.302 --> 0:45:18.260 +It's always like how much of the model do +you plea track a bit? + +0:45:18.658 --> 0:45:22.386 +To the other one you can do contextural word +embedded. + +0:45:22.386 --> 0:45:28.351 +That is something like bird or Roberta where +you train already a sequence model and the + +0:45:28.351 --> 0:45:34.654 +embeddings you're using are no longer specific +for word but they are also taking the context + +0:45:34.654 --> 0:45:35.603 +into account. + +0:45:35.875 --> 0:45:50.088 +The embedding you're using is no longer depending +on the word itself but on the whole sentence, + +0:45:50.088 --> 0:45:54.382 +so you can use this context. + +0:45:55.415 --> 0:46:02.691 +You can use similar things also in the decoder +just by having layers which don't have access + +0:46:02.691 --> 0:46:12.430 +to the source, but there it still might have +and these are typically models like: And finally + +0:46:12.430 --> 0:46:14.634 +they will look at the end. + +0:46:14.634 --> 0:46:19.040 +You can also have models which are already +sequenced. + +0:46:19.419 --> 0:46:28.561 +So you may be training a sequence to sequence +models. + +0:46:28.561 --> 0:46:35.164 +You have to make it a bit challenging. + +0:46:36.156 --> 0:46:43.445 +But the idea is really you're pre-training +your whole model and then you'll find tuning. + +0:46:47.227 --> 0:46:59.614 +But let's first do a bit of step back and +look into what are the different things. + +0:46:59.614 --> 0:47:02.151 +The first thing. + +0:47:02.382 --> 0:47:11.063 +The wooden bettings are just this first layer +and you can train them with feedback annual + +0:47:11.063 --> 0:47:12.028 +networks. + +0:47:12.212 --> 0:47:22.761 +But you can also train them with an N language +model, and by now you hopefully have also seen + +0:47:22.761 --> 0:47:27.699 +that you cannot transform a language model. + +0:47:30.130 --> 0:47:37.875 +So this is how you can train them and you're +training them. + +0:47:37.875 --> 0:47:45.234 +For example, to speak the next word that is +the easiest. + +0:47:45.525 --> 0:47:55.234 +And that is what is now referred to as South +Supervised Learning and, for example, all the + +0:47:55.234 --> 0:48:00.675 +big large language models like Chad GPT and +so on. + +0:48:00.675 --> 0:48:03.129 +They are trained with. 
+ +0:48:03.823 --> 0:48:15.812 +So that is where you can hopefully learn how +a word is used because you always try to previct + +0:48:15.812 --> 0:48:17.725 +the next word. + +0:48:19.619 --> 0:48:27.281 +Word embedding: Why do you keep the first +look at the word embeddings and the use of + +0:48:27.281 --> 0:48:29.985 +word embeddings for our task? + +0:48:29.985 --> 0:48:38.007 +The main advantage was it might be only the +first layer where you typically have most of + +0:48:38.007 --> 0:48:39.449 +the parameters. + +0:48:39.879 --> 0:48:57.017 +Most of your parameters already on the large +data, then on your target data you have to + +0:48:57.017 --> 0:48:59.353 +train less. + +0:48:59.259 --> 0:49:06.527 +Big difference that your input size is so +much bigger than the size of the novel in size. + +0:49:06.626 --> 0:49:17.709 +So it's a normally sign, maybe like, but your +input and banning size is something like. + +0:49:17.709 --> 0:49:20.606 +Then here you have to. + +0:49:23.123 --> 0:49:30.160 +While here you see it's only like zero point +five times as much in the layer. + +0:49:30.750 --> 0:49:40.367 +So here is where most of your parameters are, +which means if you already replace the word + +0:49:40.367 --> 0:49:48.915 +embeddings they might look a bit small in your +overall and in key architecture. + +0:49:57.637 --> 0:50:01.249 +The thing is we have seen these were the bettings. + +0:50:01.249 --> 0:50:04.295 +They can be very good use for other types. + +0:50:04.784 --> 0:50:08.994 +You learn some general relations between words. + +0:50:08.994 --> 0:50:17.454 +If you're doing this type of language modeling +cast, you predict: The one thing is you have + +0:50:17.454 --> 0:50:24.084 +a lot of data, so the one question is we want +to have data to trade a model. + +0:50:24.084 --> 0:50:28.734 +The other thing, the tasks need to be somehow +useful. + +0:50:29.169 --> 0:50:43.547 +If you would predict the first letter of the +word, then you wouldn't learn anything about + +0:50:43.547 --> 0:50:45.144 +the word. + +0:50:45.545 --> 0:50:53.683 +And the interesting thing is people have looked +at these wood embeddings. + +0:50:53.954 --> 0:50:58.550 +And looking at the word embeddings. + +0:50:58.550 --> 0:51:09.276 +You can ask yourself how they look and visualize +them by doing dimension reduction. + +0:51:09.489 --> 0:51:13.236 +Don't know if you and you are listening to +artificial intelligence. + +0:51:13.236 --> 0:51:15.110 +Advanced artificial intelligence. + +0:51:15.515 --> 0:51:23.217 +We had on yesterday there how to do this type +of representation, but you can do this time + +0:51:23.217 --> 0:51:29.635 +of representation, and now you're seeing interesting +things that normally. + +0:51:30.810 --> 0:51:41.027 +Now you can represent a here in a three dimensional +space with some dimension reduction. + +0:51:41.027 --> 0:51:46.881 +For example, the relation between male and +female. + +0:51:47.447 --> 0:51:56.625 +So this vector between the male and female +version of something is always not the same, + +0:51:56.625 --> 0:51:58.502 +but it's related. + +0:51:58.718 --> 0:52:14.522 +So you can do a bit of maths, so you do take +king, you subtract this vector, add this vector. + +0:52:14.894 --> 0:52:17.591 +So that means okay, there is really something +stored. + +0:52:17.591 --> 0:52:19.689 +Some information are stored in that book. + +0:52:20.040 --> 0:52:22.492 +Similar, you can do it with bug answers. 
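[Editor's illustration] The king/queen analogy mentioned above can be reproduced with simple vector arithmetic plus cosine similarity. The sketch below uses tiny toy vectors purely for illustration; real experiments would use embeddings from a trained model such as word2vec.

```python
import numpy as np

# Toy embeddings; in practice these come from a trained model such as word2vec.
emb = {
    "king":  np.array([0.8, 0.65, 0.1]),
    "queen": np.array([0.8, 0.05, 0.7]),
    "man":   np.array([0.1, 0.6, 0.05]),
    "woman": np.array([0.1, 0.0, 0.65]),
}

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# king - man + woman should land closest to queen.
query = emb["king"] - emb["man"] + emb["woman"]
best = max((w for w in emb if w != "king"), key=lambda w: cosine(query, emb[w]))
print(best)  # expected: queen
```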
+ +0:52:22.492 --> 0:52:25.004 +You see here swimming slang walking walk. + +0:52:25.265 --> 0:52:34.620 +So again these vectors are not the same, but +they are related. + +0:52:34.620 --> 0:52:42.490 +So you learn something from going from here +to here. + +0:52:43.623 --> 0:52:49.761 +Or semantically, the relations between city +and capital have exactly the same sense. + +0:52:51.191 --> 0:52:56.854 +And people had even done that question answering +about that if they showed the diembeddings + +0:52:56.854 --> 0:52:57.839 +and the end of. + +0:52:58.218 --> 0:53:06.711 +All you can also do is don't trust the dimensions +of the reaction because maybe there is something. + +0:53:06.967 --> 0:53:16.863 +You can also look into what happens really +in the individual space. + +0:53:16.863 --> 0:53:22.247 +What is the nearest neighbor of the. + +0:53:22.482 --> 0:53:29.608 +So you can take the relationship between France +and Paris and add it to Italy and you'll. + +0:53:30.010 --> 0:53:33.078 +You can do big and bigger and you have small +and smaller and stuff. + +0:53:33.593 --> 0:53:49.417 +Because it doesn't work everywhere, there +is also some typical dish here in German. + +0:53:51.491 --> 0:54:01.677 +You can do what the person is doing for famous +ones, of course only like Einstein scientists + +0:54:01.677 --> 0:54:06.716 +that find midfielders not completely correct. + +0:54:06.846 --> 0:54:10.134 +You see the examples are a bit old. + +0:54:10.134 --> 0:54:15.066 +The politicians are no longer they am, but +of course. + +0:54:16.957 --> 0:54:26.759 +What people have done there, especially at +the beginning training our end language model, + +0:54:26.759 --> 0:54:28.937 +was very expensive. + +0:54:29.309 --> 0:54:38.031 +So one famous model was, but we are not really +interested in the language model performance. + +0:54:38.338 --> 0:54:40.581 +Think something good to keep in mind. + +0:54:40.581 --> 0:54:42.587 +What are we really interested in? + +0:54:42.587 --> 0:54:45.007 +Do we really want to have an R&N no? + +0:54:45.007 --> 0:54:48.607 +In this case we are only interested in this +type of mapping. + +0:54:49.169 --> 0:54:55.500 +And so successful and very successful was +this word to vet. + +0:54:55.535 --> 0:54:56.865 +The idea is okay. + +0:54:56.865 --> 0:55:03.592 +We are not training real language one, making +it even simpler and doing this, for example, + +0:55:03.592 --> 0:55:05.513 +continuous peck of words. + +0:55:05.513 --> 0:55:12.313 +We're just having four input tokens and we're +predicting what is the word in the middle and + +0:55:12.313 --> 0:55:15.048 +this is just like two linear layers. + +0:55:15.615 --> 0:55:21.627 +So it's even simplifying things and making +the calculation faster because that is what + +0:55:21.627 --> 0:55:22.871 +we're interested. + +0:55:23.263 --> 0:55:32.897 +All this continuous skip ground models with +these other models which refer to as where + +0:55:32.897 --> 0:55:34.004 +to where. + +0:55:34.234 --> 0:55:42.394 +Where you have one equal word and the other +way around, you're predicting the four words + +0:55:42.394 --> 0:55:43.585 +around them. + +0:55:43.585 --> 0:55:45.327 +It's very similar. + +0:55:45.327 --> 0:55:48.720 +The task is in the end very similar. + +0:55:51.131 --> 0:56:01.407 +Before we are going to the next point, anything +about normal weight vectors or weight embedding. + +0:56:04.564 --> 0:56:07.794 +The next thing is contexture. + +0:56:07.794 --> 0:56:12.208 +Word embeddings and the idea is helpful. 
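[Editor's illustration] The continuous bag-of-words model described above (a few context words in, the centre word out, essentially two linear layers) might look roughly like this in PyTorch; sizes and names are illustrative, not the original word2vec implementation.

```python
import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Continuous bag-of-words: average the context embeddings,
    then predict the centre word with a single output projection."""
    def __init__(self, vocab_size, emb_dim=100):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)   # first linear map
        self.out = nn.Linear(emb_dim, vocab_size)        # second linear map

    def forward(self, context_ids):                      # (batch, window), e.g. 4 words
        ctx = self.embed(context_ids).mean(dim=1)        # average the context
        return self.out(ctx)                             # scores over the vocabulary

model = CBOW(vocab_size=5000)
context = torch.randint(0, 5000, (8, 4))                 # 8 examples, 4 context words each
loss = nn.functional.cross_entropy(model(context), torch.randint(0, 5000, (8,)))
loss.backward()
```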
+ +0:56:12.208 --> 0:56:19.206 +However, we might even be able to get more +from one lingo layer. + +0:56:19.419 --> 0:56:31.732 +And now in the word that is overlap of these +two meanings, so it represents both the meaning + +0:56:31.732 --> 0:56:33.585 +of can do it. + +0:56:34.834 --> 0:56:40.410 +But we might be able to in the pre-trained +model already disambiguate this because they + +0:56:40.410 --> 0:56:41.044 +are used. + +0:56:41.701 --> 0:56:53.331 +So if we can have a model which can not only +represent a word but can also represent the + +0:56:53.331 --> 0:56:58.689 +meaning of the word within the context,. + +0:56:59.139 --> 0:57:03.769 +So then we are going to context your word +embeddings. + +0:57:03.769 --> 0:57:07.713 +We are really having a representation in the. + +0:57:07.787 --> 0:57:11.519 +And we have a very good architecture for that +already. + +0:57:11.691 --> 0:57:23.791 +The hidden state represents what is currently +said, but it's focusing on what is the last + +0:57:23.791 --> 0:57:29.303 +one, so it's some of the representation. + +0:57:29.509 --> 0:57:43.758 +The first one doing that is something like +the Elmo paper where they instead of this is + +0:57:43.758 --> 0:57:48.129 +the normal language model. + +0:57:48.008 --> 0:57:50.714 +Within the third, predicting the fourth, and +so on. + +0:57:50.714 --> 0:57:53.004 +So you are always predicting the next work. + +0:57:53.193 --> 0:57:57.335 +The architecture is the heaven words embedding +layer and then layers. + +0:57:57.335 --> 0:58:03.901 +See you, for example: And now instead of using +this one in the end, you're using here this + +0:58:03.901 --> 0:58:04.254 +one. + +0:58:04.364 --> 0:58:11.245 +This represents the meaning of this word mainly +in the context of what we have seen before. + +0:58:11.871 --> 0:58:18.610 +We can train it in a language model style +always predicting the next word, but we have + +0:58:18.610 --> 0:58:21.088 +more information trained there. + +0:58:21.088 --> 0:58:26.123 +Therefore, in the system it has to learn less +additional things. + +0:58:27.167 --> 0:58:31.261 +And there is one Edendang which is done currently +in GPS. + +0:58:31.261 --> 0:58:38.319 +The only difference is that we have more layers, +bigger size, and we're using transformer neurocell + +0:58:38.319 --> 0:58:40.437 +potential instead of the RNA. + +0:58:40.437 --> 0:58:45.095 +But that is how you train like some large +language models at the. + +0:58:46.746 --> 0:58:55.044 +However, if you look at this contextual representation, +they might not be perfect. + +0:58:55.044 --> 0:59:02.942 +So if you think of this one as a contextual +representation of the third word,. + +0:59:07.587 --> 0:59:16.686 +Is representing a three in the context of +a sentence, however only in the context of + +0:59:16.686 --> 0:59:18.185 +the previous. + +0:59:18.558 --> 0:59:27.413 +However, we have an architecture which can +also take both sides and we have used that + +0:59:27.413 --> 0:59:30.193 +already in the ink holder. + +0:59:30.630 --> 0:59:34.264 +So we could do the iron easily on your, also +in the backward direction. + +0:59:34.874 --> 0:59:42.826 +By just having the states the other way around +and then we couldn't combine the forward and + +0:59:42.826 --> 0:59:49.135 +the forward into a joint one where we are doing +this type of prediction. + +0:59:49.329 --> 0:59:50.858 +So you have the word embedding. 
+ +0:59:51.011 --> 1:00:02.095 +Then you have two in the states, one on the +forward arm and one on the backward arm, and + +1:00:02.095 --> 1:00:10.314 +then you can, for example, take the cocagenation +of both of them. + +1:00:10.490 --> 1:00:23.257 +Now this same here represents mainly this +word because this is what both puts in it last + +1:00:23.257 --> 1:00:30.573 +and we know is focusing on what is happening +last. + +1:00:31.731 --> 1:00:40.469 +However, there is a bit of difference when +training that as a language model you already + +1:00:40.469 --> 1:00:41.059 +have. + +1:00:43.203 --> 1:00:44.956 +Maybe There's Again This Masking. + +1:00:46.546 --> 1:00:47.748 +That is one solution. + +1:00:47.748 --> 1:00:52.995 +First of all, why we can't do it is the information +you leak it, so you cannot just predict the + +1:00:52.995 --> 1:00:53.596 +next word. + +1:00:53.596 --> 1:00:58.132 +If we just predict the next word in this type +of model, that's a very simple task. + +1:00:58.738 --> 1:01:09.581 +You know the next word because it's influencing +this hidden state predicting something is not + +1:01:09.581 --> 1:01:11.081 +a good task. + +1:01:11.081 --> 1:01:18.455 +You have to define: Because in this case what +will end with the system will just ignore these + +1:01:18.455 --> 1:01:22.966 +estates and what will learn is copy this information +directly in here. + +1:01:23.343 --> 1:01:31.218 +So it would be representing this word and +you would have nearly a perfect model because + +1:01:31.218 --> 1:01:38.287 +you only need to find encoding where you can +encode all words somehow in this. + +1:01:38.458 --> 1:01:44.050 +The only thing can learn is that turn and +encode all my words in this upper hidden. + +1:01:44.985 --> 1:01:53.779 +Therefore, it's not really useful, so we need +to find a bit of different ways out. + +1:01:55.295 --> 1:01:57.090 +There is a masking one. + +1:01:57.090 --> 1:02:03.747 +I'll come to that shortly just a bit that +other things also have been done, so the other + +1:02:03.747 --> 1:02:06.664 +thing is not to directly combine them. + +1:02:06.664 --> 1:02:13.546 +That was in the animal paper, so you have +them forward R&M and you keep them completely + +1:02:13.546 --> 1:02:14.369 +separated. + +1:02:14.594 --> 1:02:20.458 +So you never merged to state. + +1:02:20.458 --> 1:02:33.749 +At the end, the representation of the word +is now from the forward. + +1:02:33.873 --> 1:02:35.953 +So it's always the hidden state before the +good thing. + +1:02:36.696 --> 1:02:41.286 +These two you join now to your to the representation. + +1:02:42.022 --> 1:02:48.685 +And then you have now a representation also +about like the whole sentence for the word, + +1:02:48.685 --> 1:02:51.486 +but there is no information leakage. + +1:02:51.486 --> 1:02:58.149 +One way of doing this is instead of doing +a bidirection along you do a forward pass and + +1:02:58.149 --> 1:02:59.815 +then join the hidden. + +1:03:00.380 --> 1:03:05.960 +So you can do that in all layers. + +1:03:05.960 --> 1:03:16.300 +In the end you do the forwarded layers and +you get the hidden. + +1:03:16.596 --> 1:03:19.845 +However, it's a bit of a complicated. + +1:03:19.845 --> 1:03:25.230 +You have to keep both separate and merge things +so can you do. + +1:03:27.968 --> 1:03:33.030 +And that is the moment where like the big. + +1:03:34.894 --> 1:03:39.970 +The big success of the burnt model was used +where it okay. 
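[Editor's illustration] A minimal sketch of the leakage-free forward/backward combination described above: two separate RNNs, where the representation used to predict position i joins the forward state that has only seen earlier tokens with the backward state that has only seen later tokens. This illustrates the idea, not the exact ELMo implementation.

```python
import torch
import torch.nn as nn

emb_dim, hidden, T, vocab = 64, 128, 6, 1000
embed = nn.Embedding(vocab, emb_dim)
fwd_lstm = nn.LSTM(emb_dim, hidden, batch_first=True)   # reads left-to-right
bwd_lstm = nn.LSTM(emb_dim, hidden, batch_first=True)   # reads right-to-left

tokens = torch.randint(0, vocab, (1, T))
x = embed(tokens)

h_fwd, _ = fwd_lstm(x)                        # h_fwd[:, i] has seen tokens 0..i
h_bwd_rev, _ = bwd_lstm(torch.flip(x, [1]))
h_bwd = torch.flip(h_bwd_rev, [1])            # h_bwd[:, i] has seen tokens i..T-1

# Leakage-free representation for predicting token i: forward state up to i-1
# concatenated with backward state down to i+1 (interior positions only).
i = 3
rep_i = torch.cat([h_fwd[:, i - 1], h_bwd[:, i + 1]], dim=-1)
print(rep_i.shape)  # (1, 256)
```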
+ +1:03:39.970 --> 1:03:47.281 +Maybe in bite and rich case it's not good +to do the next word prediction, but we can + +1:03:47.281 --> 1:03:48.314 +do masking. + +1:03:48.308 --> 1:03:56.019 +Masking mainly means we do a prediction of +something in the middle or some words. + +1:03:56.019 --> 1:04:04.388 +So the idea is if we have the input, we are +putting noise into the input, removing them, + +1:04:04.388 --> 1:04:07.961 +and then the model we are interested. + +1:04:08.048 --> 1:04:15.327 +Now there can be no information leakage because +this wasn't predicting that one is a big challenge. + +1:04:16.776 --> 1:04:19.957 +Do any assumption about our model? + +1:04:19.957 --> 1:04:26.410 +It doesn't need to be a forward model or a +backward model or anything. + +1:04:26.410 --> 1:04:29.500 +You can always predict the three. + +1:04:30.530 --> 1:04:34.844 +There's maybe one bit of a disadvantage. + +1:04:34.844 --> 1:04:40.105 +Do you see what could be a bit of a problem +this? + +1:05:00.000 --> 1:05:06.429 +Yes, so yeah, you can of course mask more, +but to see it more globally, just first assume + +1:05:06.429 --> 1:05:08.143 +you're only masked one. + +1:05:08.143 --> 1:05:13.930 +For the whole sentence, we get one feedback +signal, like what is the word three. + +1:05:13.930 --> 1:05:22.882 +So we have one training example: If you do +the language modeling taste, we predicted here, + +1:05:22.882 --> 1:05:24.679 +we predicted here. + +1:05:25.005 --> 1:05:26.735 +So we have number of tokens. + +1:05:26.735 --> 1:05:30.970 +For each token we have a feet pad and say +what is the best correction. + +1:05:31.211 --> 1:05:43.300 +So in this case this is less efficient because +we are getting less feedback signals on what + +1:05:43.300 --> 1:05:45.797 +we should predict. + +1:05:48.348 --> 1:05:56.373 +So and bird, the main ideas are that you're +doing this bidirectional model with masking. + +1:05:56.373 --> 1:05:59.709 +It's using transformer architecture. + +1:06:00.320 --> 1:06:06.326 +There are two more minor changes. + +1:06:06.326 --> 1:06:16.573 +We'll see that this next word prediction is +another task. + +1:06:16.957 --> 1:06:30.394 +You want to learn more about what language +is to really understand following a story or + +1:06:30.394 --> 1:06:35.127 +their independent tokens into. + +1:06:38.158 --> 1:06:42.723 +The input is using word units as we use it. + +1:06:42.723 --> 1:06:50.193 +It has some special token that is framing +for the next word prediction. + +1:06:50.470 --> 1:07:04.075 +It's more for classification task because +you may be learning a general representation + +1:07:04.075 --> 1:07:07.203 +as a full sentence. + +1:07:07.607 --> 1:07:19.290 +You're doing segment embedding, so you have +an embedding for it. + +1:07:19.290 --> 1:07:24.323 +This is the first sentence. + +1:07:24.684 --> 1:07:29.099 +Now what is more challenging is this masking. + +1:07:29.099 --> 1:07:30.827 +What do you mask? + +1:07:30.827 --> 1:07:35.050 +We already have the crush enough or should. + +1:07:35.275 --> 1:07:42.836 +So there has been afterwards eating some work +like, for example, a bearer. + +1:07:42.836 --> 1:07:52.313 +It's not super sensitive, but if you do it +completely wrong then you're not letting anything. + +1:07:52.572 --> 1:07:54.590 +That's Then Another Question There. 
+ +1:07:56.756 --> 1:08:04.594 +Should I mask all types of should I always +mask the footwork or if I have a subword to + +1:08:04.594 --> 1:08:10.630 +mask only like a subword and predict them based +on the other ones? + +1:08:10.630 --> 1:08:14.504 +Of course, it's a bit of a different task. + +1:08:14.894 --> 1:08:21.210 +If you know three parts of the words, it might +be easier to guess the last because they here + +1:08:21.210 --> 1:08:27.594 +took the easiest selection, so not considering +words anymore at all because you're doing that + +1:08:27.594 --> 1:08:32.280 +in the preprocessing and just taking always +words and like subwords. + +1:08:32.672 --> 1:08:36.089 +Think in group there is done differently. + +1:08:36.089 --> 1:08:40.401 +They mark always the full words, but guess +it's not. + +1:08:41.001 --> 1:08:46.044 +And then what to do with the mask word in +eighty percent of the cases. + +1:08:46.044 --> 1:08:50.803 +If the word is masked, they replace it with +a special token thing. + +1:08:50.803 --> 1:08:57.197 +This is a mask token in ten percent they put +in some random other token in there, and ten + +1:08:57.197 --> 1:08:59.470 +percent they keep it on change. + +1:09:02.202 --> 1:09:10.846 +And then what you can do is also this next +word prediction. + +1:09:10.846 --> 1:09:14.880 +The man went to Mass Store. + +1:09:14.880 --> 1:09:17.761 +He bought a gallon. + +1:09:18.418 --> 1:09:24.088 +So may you see you're joining them, you're +doing both masks and prediction that you're. + +1:09:24.564 --> 1:09:29.449 +Is a penguin mask or flyless birds. + +1:09:29.449 --> 1:09:41.390 +These two sentences have nothing to do with +each other, so you can do also this type of + +1:09:41.390 --> 1:09:43.018 +prediction. + +1:09:47.127 --> 1:09:56.572 +And then the whole bird model, so here you +have the in-foot to transform the layers, and + +1:09:56.572 --> 1:09:58.164 +you can train. + +1:09:58.598 --> 1:10:17.731 +And this model was quite successful in general +applications. + +1:10:17.937 --> 1:10:27.644 +However, there is like a huge thing of different +types of models coming from them. + +1:10:27.827 --> 1:10:38.709 +So based on others these supervised molds +like a whole setup came out of there and now + +1:10:38.709 --> 1:10:42.086 +this is getting even more. + +1:10:42.082 --> 1:10:46.640 +With availability of a large language model +than the success. + +1:10:47.007 --> 1:10:48.436 +We have now even larger ones. + +1:10:48.828 --> 1:10:50.961 +Interestingly, it goes a bit. + +1:10:50.910 --> 1:10:57.847 +Change the bit again from like more the spider +action model to uni directional models. + +1:10:57.847 --> 1:11:02.710 +Are at the moment maybe a bit more we're coming +to them now? + +1:11:02.710 --> 1:11:09.168 +Do you see one advantage while what is another +event and we have the efficiency? + +1:11:09.509 --> 1:11:15.901 +Is one other reason why you are sometimes +more interested in uni-direction models than + +1:11:15.901 --> 1:11:17.150 +in bi-direction. + +1:11:22.882 --> 1:11:30.220 +It depends on the pass, but for example for +a language generation pass, the eccard is not + +1:11:30.220 --> 1:11:30.872 +really. + +1:11:32.192 --> 1:11:40.924 +It doesn't work so if you want to do a generation +like the decoder you don't know the future + +1:11:40.924 --> 1:11:42.896 +so you cannot apply. + +1:11:43.223 --> 1:11:53.870 +So this time of model can be used for the +encoder in an encoder model, but it cannot + +1:11:53.870 --> 1:11:57.002 +be used for the decoder. 
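[Editor's illustration] The masking recipe described above (select roughly 15% of the positions; of those, 80% become a mask token, 10% a random token, 10% stay unchanged) can be sketched as a small corruption function. The probabilities follow the description; the toy vocabulary and names are illustrative.

```python
import random

MASK = "[MASK]"
VOCAB = ["cat", "dog", "sat", "ran", "the", "a"]   # toy vocabulary for random replacement

def mask_tokens(tokens, select_prob=0.15):
    """BERT-style corruption: of the selected positions, 80% become [MASK],
    10% a random token, 10% stay unchanged; all selected positions are targets."""
    corrupted, targets = list(tokens), {}
    for i, tok in enumerate(tokens):
        if random.random() < select_prob:
            targets[i] = tok                      # the model must predict the original
            r = random.random()
            if r < 0.8:
                corrupted[i] = MASK
            elif r < 0.9:
                corrupted[i] = random.choice(VOCAB)
            # else: keep the original token unchanged
    return corrupted, targets

print(mask_tokens("the cat sat on the mat".split()))
```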
+ +1:12:00.000 --> 1:12:05.012 +That's a good view to the next overall cast +of models. + +1:12:05.012 --> 1:12:08.839 +Perhaps if you view it from the sequence. + +1:12:09.009 --> 1:12:12.761 +We have the encoder base model. + +1:12:12.761 --> 1:12:16.161 +That's what we just look at. + +1:12:16.161 --> 1:12:20.617 +They are bidirectional and typically. + +1:12:20.981 --> 1:12:22.347 +That Is the One We Looked At. + +1:12:22.742 --> 1:12:34.634 +At the beginning is the decoder based model, +so see out in regressive models which are unidirective + +1:12:34.634 --> 1:12:42.601 +like an based model, and there we can do the +next word prediction. + +1:12:43.403 --> 1:12:52.439 +And what you can also do first, and there +you can also have a special things called prefix + +1:12:52.439 --> 1:12:53.432 +language. + +1:12:54.354 --> 1:13:05.039 +Because we are saying it might be helpful +that some of your input can also use bi-direction. + +1:13:05.285 --> 1:13:12.240 +And that is somehow doing what it is called +prefix length. + +1:13:12.240 --> 1:13:19.076 +On the first tokens you directly give your +bidirectional. + +1:13:19.219 --> 1:13:28.774 +So you somehow merge that and that mainly +works only in transformer based models because. + +1:13:29.629 --> 1:13:33.039 +There is no different number of parameters +in our end. + +1:13:33.039 --> 1:13:34.836 +We need a back foot our end. + +1:13:34.975 --> 1:13:38.533 +Transformer: The only difference is how you +mask your attention. + +1:13:38.878 --> 1:13:44.918 +We have seen that in the anchoder and decoder +the number of parameters is different because + +1:13:44.918 --> 1:13:50.235 +you do cross attention, but if you do forward +and backward or union directions,. + +1:13:50.650 --> 1:13:58.419 +It's only like that you mask your attention +to only look at the bad past or to look into + +1:13:58.419 --> 1:13:59.466 +the future. + +1:14:00.680 --> 1:14:03.326 +And now you can of course also do mixing. + +1:14:03.563 --> 1:14:08.306 +So this is a bi-directional attention matrix +where you can attend to everything. + +1:14:08.588 --> 1:14:23.516 +There is a uni-direction or causal where you +can look at the past and you can do the first + +1:14:23.516 --> 1:14:25.649 +three words. + +1:14:29.149 --> 1:14:42.831 +That somehow clear based on that, then of +course you cannot do the other things. + +1:14:43.163 --> 1:14:50.623 +So the idea is we have our anchor to decoder +architecture. + +1:14:50.623 --> 1:14:57.704 +Can we also train them completely in a side +supervisor? + +1:14:58.238 --> 1:15:09.980 +And in this case we have the same input to +both, so in this case we need to do some type + +1:15:09.980 --> 1:15:12.224 +of masking here. + +1:15:12.912 --> 1:15:17.591 +Here we don't need to do the masking, but +here we need to the masking that doesn't know + +1:15:17.591 --> 1:15:17.910 +ever. + +1:15:20.440 --> 1:15:30.269 +And this type of model got quite successful +also, especially for pre-training machine translation. + +1:15:30.330 --> 1:15:39.059 +The first model doing that is a Bart model, +which exactly does that, and yes, it's one + +1:15:39.059 --> 1:15:42.872 +successful way to pre train your one. + +1:15:42.872 --> 1:15:47.087 +It's pretraining your full encoder model. + +1:15:47.427 --> 1:15:54.365 +Where you put in contrast to machine translation, +where you put in source sentence, we can't + +1:15:54.365 --> 1:15:55.409 +do that here. 
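[Editor's illustration] Since the passage stresses that encoder-style, decoder-style and prefix language models differ mainly in how the attention is masked, here is a small sketch constructing the three boolean attention masks (rows are query positions, columns are key positions); the sequence length and prefix length are arbitrary.

```python
import torch

T, prefix_len = 6, 3

bidirectional = torch.ones(T, T, dtype=torch.bool)          # attend to everything
causal = torch.tril(torch.ones(T, T)).bool()                # attend only to the past

# Prefix LM: the first `prefix_len` positions attend bidirectionally among
# themselves; the remaining positions stay causal.
prefix = causal.clone()
prefix[:prefix_len, :prefix_len] = True

print(causal.int())
print(prefix.int())
```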
+ +1:15:55.715 --> 1:16:01.382 +But we can just put the second twice in there, +and then it's not a trivial task. + +1:16:01.382 --> 1:16:02.432 +We can change. + +1:16:03.003 --> 1:16:12.777 +And there is like they do different corruption +techniques so you can also do. + +1:16:13.233 --> 1:16:19.692 +That you couldn't do in an agricultural system +because then it wouldn't be there and you cannot + +1:16:19.692 --> 1:16:20.970 +predict somewhere. + +1:16:20.970 --> 1:16:26.353 +So the anchor, the number of input and output +tokens always has to be the same. + +1:16:26.906 --> 1:16:29.818 +You cannot do a prediction for something which +isn't in it. + +1:16:30.110 --> 1:16:38.268 +Here in the decoder side it's unidirection +so we can also delete the top and then try + +1:16:38.268 --> 1:16:40.355 +to generate the full. + +1:16:41.061 --> 1:16:45.250 +We can do sentence permutation. + +1:16:45.250 --> 1:16:54.285 +We can document rotation and text infilling +so there is quite a bit. + +1:16:55.615 --> 1:17:06.568 +So you see there's quite a lot of types of +models that you can use in order to pre-train. + +1:17:07.507 --> 1:17:14.985 +Then, of course, there is again for the language +one. + +1:17:14.985 --> 1:17:21.079 +The other question is how do you integrate? + +1:17:21.761 --> 1:17:26.636 +And there's also, like yeah, quite some different +ways of techniques. + +1:17:27.007 --> 1:17:28.684 +It's a Bit Similar to Before. + +1:17:28.928 --> 1:17:39.068 +So the easiest thing is you take your word +embeddings or your free trained model. + +1:17:39.068 --> 1:17:47.971 +You freeze them and stack your decoder layers +and keep these ones free. + +1:17:48.748 --> 1:17:54.495 +Can also be done if you have this type of +bark model. + +1:17:54.495 --> 1:18:03.329 +What you can do is you freeze your word embeddings, +for example some products and. + +1:18:05.865 --> 1:18:17.296 +The other thing is you initialize them so +you initialize your models but you train everything + +1:18:17.296 --> 1:18:19.120 +so you're not. + +1:18:22.562 --> 1:18:29.986 +Then one thing, if you think about Bart, you +want to have the Chinese language, the Italian + +1:18:29.986 --> 1:18:32.165 +language, and the deconer. + +1:18:32.165 --> 1:18:35.716 +However, in Bart we have the same language. + +1:18:36.516 --> 1:18:46.010 +The one you get is from English, so what you +can do there is so you cannot try to do some. + +1:18:46.366 --> 1:18:52.562 +Below the barge, in order to learn some language +specific stuff, or there's a masculine barge, + +1:18:52.562 --> 1:18:58.823 +which is trained on many languages, but it's +trained only on like the Old Coast Modern Language + +1:18:58.823 --> 1:19:03.388 +House, which may be trained in German and English, +but not on German. + +1:19:03.923 --> 1:19:08.779 +So then you would still need to find June +and the model needs to learn how to better + +1:19:08.779 --> 1:19:10.721 +do the attention cross lingually. + +1:19:10.721 --> 1:19:15.748 +It's only on the same language but it mainly +only has to learn this mapping and not all + +1:19:15.748 --> 1:19:18.775 +the rest and that's why it's still quite successful. + +1:19:21.982 --> 1:19:27.492 +Now certain thing which is very commonly used +is what is required to it as adapters. + +1:19:27.607 --> 1:19:29.754 +So for example you take and buy. 
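[Editor's illustration] A rough sketch of the adapter idea introduced above: a small bottleneck layer with a residual connection, placed after a frozen pre-trained sub-layer, so that only a few new parameters are trained. The dimensions and the stand-in "pre-trained layer" are illustrative.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Small bottleneck inserted after a (frozen) pre-trained sub-layer;
    only these few parameters are trained for the new task or language."""
    def __init__(self, d_model=512, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(d_model, bottleneck)
        self.up = nn.Linear(bottleneck, d_model)
        self.act = nn.ReLU()

    def forward(self, hidden):
        return hidden + self.up(self.act(self.down(hidden)))  # residual connection

# Freeze the pre-trained layer, train only the adapter.
pretrained_layer = nn.Linear(512, 512)       # stand-in for a transformer sub-layer
for p in pretrained_layer.parameters():
    p.requires_grad = False
adapter = Adapter()

x = torch.randn(2, 10, 512)
print(adapter(pretrained_layer(x)).shape)    # (2, 10, 512)
```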
+ +1:19:29.709 --> 1:19:35.218 +And you put some adapters on the inside of +the networks so that it's small new layers + +1:19:35.218 --> 1:19:40.790 +which are in between put in there and then +you only train these adapters or also train + +1:19:40.790 --> 1:19:41.815 +these adapters. + +1:19:41.815 --> 1:19:47.900 +For example, an embryo you could see that +this learns to map the Sears language representation + +1:19:47.900 --> 1:19:50.334 +to the Tiger language representation. + +1:19:50.470 --> 1:19:52.395 +And then you don't have to change that luck. + +1:19:52.792 --> 1:19:59.793 +You give it extra ability to really perform +well on that. + +1:19:59.793 --> 1:20:05.225 +These are quite small and so very efficient. + +1:20:05.905 --> 1:20:12.632 +That is also very commonly used, for example +in modular systems where you have some adaptors + +1:20:12.632 --> 1:20:16.248 +in between here which might be language specific. + +1:20:16.916 --> 1:20:22.247 +So they are trained only for one language. + +1:20:22.247 --> 1:20:33.777 +The model has some or both and once has the +ability to do multilingually to share knowledge. + +1:20:34.914 --> 1:20:39.058 +But there's one chance in general in the multilingual +systems. + +1:20:39.058 --> 1:20:40.439 +It works quite well. + +1:20:40.439 --> 1:20:46.161 +There's one case or one specific use case +for multilingual where this normally doesn't + +1:20:46.161 --> 1:20:47.344 +really work well. + +1:20:47.344 --> 1:20:49.975 +Do you have an idea what that could be? + +1:20:55.996 --> 1:20:57.536 +It's for Zero Shot Cases. + +1:20:57.998 --> 1:21:03.660 +Because having here some situation with this +might be very language specific and zero shot, + +1:21:03.660 --> 1:21:09.015 +the idea is always to learn representations +view which are more language dependent and + +1:21:09.015 --> 1:21:10.184 +with the adaptors. + +1:21:10.184 --> 1:21:15.601 +Of course you get in representations again +which are more language specific and then it + +1:21:15.601 --> 1:21:17.078 +doesn't work that well. + +1:21:20.260 --> 1:21:37.730 +And there is also the idea of doing more knowledge +pistolation. + +1:21:39.179 --> 1:21:42.923 +And now the idea is okay. + +1:21:42.923 --> 1:21:54.157 +We are training it the same, but what we want +to achieve is that the encoder. + +1:21:54.414 --> 1:22:03.095 +So you should learn faster by trying to make +these states as similar as possible. + +1:22:03.095 --> 1:22:11.777 +So you compare the first-hit state of the +pre-trained model and try to make them. + +1:22:12.192 --> 1:22:18.144 +For example, by using the out two norms, so +by just making these two representations the + +1:22:18.144 --> 1:22:26.373 +same: The same vocabulary: Why does it need +the same vocabulary with any idea? + +1:22:34.754 --> 1:22:46.137 +If you have different vocabulary, it's typical +you also have different sequenced lengths here. + +1:22:46.137 --> 1:22:50.690 +The number of sequences is different. + +1:22:51.231 --> 1:22:58.888 +If you now have pipe stains and four states +here, it's no longer straightforward which + +1:22:58.888 --> 1:23:01.089 +states compare to which. + +1:23:02.322 --> 1:23:05.246 +And that's just easier if you have like the +same number. + +1:23:05.246 --> 1:23:08.940 +You can always compare the first to the first +and second to the second. + +1:23:09.709 --> 1:23:16.836 +So therefore at least the very easy way of +knowledge destination only works if you have. 
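[Editor's illustration] The knowledge-distillation idea described above, where the NMT encoder is pushed towards the hidden states of a frozen pre-trained encoder position by position (hence the same-vocabulary, same-sequence-length assumption), can be sketched with an L2 loss like this; all tensors are random stand-ins for the two encoders' states.

```python
import torch
import torch.nn.functional as F

# Hypothetical hidden states for the same tokenisation (same length assumed):
# the NMT ("student") encoder and a frozen pre-trained ("teacher") encoder.
student_states = torch.randn(2, 7, 512, requires_grad=True)
with torch.no_grad():
    teacher_states = torch.randn(2, 7, 512)

# Position-wise L2 distillation loss: push each student state towards the
# corresponding teacher state; in practice added to the usual translation loss.
distill_loss = F.mse_loss(student_states, teacher_states)
distill_loss.backward()
print(float(distill_loss))
```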
+ +1:23:17.177 --> 1:23:30.030 +Course: You could do things like yeah, the +average should be the same, but of course there's + +1:23:30.030 --> 1:23:33.071 +a less strong signal. + +1:23:34.314 --> 1:23:42.979 +But the advantage here is that you have a +diameter training signal here on the handquarter + +1:23:42.979 --> 1:23:51.455 +so you can directly make some of the encoder +already giving a good signal while normally + +1:23:51.455 --> 1:23:52.407 +an empty. + +1:23:56.936 --> 1:24:13.197 +Yes, think this is most things for today, +so what you should keep in mind is remind me. + +1:24:13.393 --> 1:24:18.400 +The one is a back translation idea. + +1:24:18.400 --> 1:24:29.561 +If you have monolingual and use that, the +other one is to: And mentally it is often helpful + +1:24:29.561 --> 1:24:33.614 +to combine them so you can even use both of +that. + +1:24:33.853 --> 1:24:38.908 +So you can use pre-trained walls, but then +you can even still do back translation where + +1:24:38.908 --> 1:24:40.057 +it's still helpful. + +1:24:40.160 --> 1:24:45.502 +We have the advantage we are training like +everything working together on the task so + +1:24:45.502 --> 1:24:51.093 +it might be helpful even to backtranslate some +data and then use it in a real translation + +1:24:51.093 --> 1:24:56.683 +setup because in pretraining of course the +beach challenge is always that you're training + +1:24:56.683 --> 1:24:57.739 +it on different. + +1:24:58.058 --> 1:25:03.327 +Different ways of how you integrate this knowledge. + +1:25:03.327 --> 1:25:08.089 +Even if you just use a full model, so in this. + +1:25:08.748 --> 1:25:11.128 +This is the most similar you can get. + +1:25:11.128 --> 1:25:13.945 +You're doing no changes to the architecture. + +1:25:13.945 --> 1:25:19.643 +You're really taking the model and just fine +tuning them on the new task, but it still has + +1:25:19.643 --> 1:25:24.026 +to completely newly learn how to do the attention +and how to do that. + +1:25:24.464 --> 1:25:29.971 +And that might be, for example, helpful to +have more back-translated data to learn them. + +1:25:32.192 --> 1:25:34.251 +That's for today. + +1:25:34.251 --> 1:25:44.661 +There's one important thing that next Tuesday +there is a conference or a workshop or so in + +1:25:44.661 --> 1:25:45.920 +this room. + +1:25:47.127 --> 1:25:56.769 +You should get an e-mail if you're in Elias +that there's a room change for Tuesdays and + +1:25:56.769 --> 1:25:57.426 +it's. + +1:25:57.637 --> 1:26:03.890 +There are more questions, yeah, have a more +general position, especially: In computer vision + +1:26:03.890 --> 1:26:07.347 +you can enlarge your data center data orientation. + +1:26:07.347 --> 1:26:08.295 +Is there any? + +1:26:08.388 --> 1:26:15.301 +It's similar to a large speech for text for +the data of an edge. + +1:26:15.755 --> 1:26:29.176 +And you can use this back translation and +also masking, but back translation is some + +1:26:29.176 --> 1:26:31.228 +way of data. + +1:26:31.371 --> 1:26:35.629 +So it has also been, for example, even its +used not only for monolingual data. + +1:26:36.216 --> 1:26:54.060 +If you have good MP system, it can also be +used for parallel data. + +1:26:54.834 --> 1:26:59.139 +So would say this is the most similar one. + +1:26:59.139 --> 1:27:03.143 +There's ways you can do power phrasing. 
+ +1:27:05.025 --> 1:27:12.057 +But for example there is very hard to do this +by rules like which words to replace because + +1:27:12.057 --> 1:27:18.936 +there is not a coup like you cannot always +say this word can always be replaced by that. + +1:27:19.139 --> 1:27:27.225 +Mean, although they are many perfect synonyms, +normally they are good in some cases, but not + +1:27:27.225 --> 1:27:29.399 +in all cases, and so on. + +1:27:29.399 --> 1:27:36.963 +And if you don't do a rule based, you have +to train your model and then the freshness. + +1:27:38.058 --> 1:27:57.236 +The same architecture as the pre-trained mount. + +1:27:57.457 --> 1:27:59.810 +Should be of the same dimension, so it's easiest +to have the same dimension. + +1:28:00.000 --> 1:28:01.590 +Architecture. + +1:28:01.590 --> 1:28:05.452 +We later will learn inefficiency. + +1:28:05.452 --> 1:28:12.948 +You can also do knowledge cessulation with, +for example, smaller. + +1:28:12.948 --> 1:28:16.469 +You can learn the same within. + +1:28:17.477 --> 1:28:22.949 +Eight layers for it so that is possible, but +yeah agree it should be of the same. + +1:28:23.623 --> 1:28:32.486 +Yeah yeah you need the question then of course +you can do it like it's an initialization or + +1:28:32.486 --> 1:28:41.157 +you can do it doing training but normally it +most makes sense during the normal training. + +1:28:45.865 --> 1:28:53.963 +Do it, then thanks a lot, and then we'll see +each other again on Tuesday. + +0:00:00.981 --> 0:00:20.036 +Today about is how to use some type of additional +resources to improve the translation. + +0:00:20.300 --> 0:00:28.188 +We have in the first part of the semester +two thirds of the semester how to build some + +0:00:28.188 --> 0:00:31.361 +of your basic machine translation. + +0:00:31.571 --> 0:00:42.317 +Now the basic components are both for statistical +and for neural, with the encoded decoding. + +0:00:43.123 --> 0:00:46.000 +Now, of course, that's not where it stops. + +0:00:46.000 --> 0:00:51.286 +It's still what nearly every machine translation +system is currently in there. + +0:00:51.286 --> 0:00:57.308 +However, there's a lot of challenges which +you need to address in addition and which need + +0:00:57.308 --> 0:00:58.245 +to be solved. + +0:00:58.918 --> 0:01:09.858 +And there we want to start to tell you what +else can you do around this, and partly. + +0:01:10.030 --> 0:01:14.396 +And one important question there is on what +do you train your models? + +0:01:14.394 --> 0:01:32.003 +Because like this type of parallel data, it's +easier in machine translation than in other + +0:01:32.003 --> 0:01:33.569 +trusts. + +0:01:33.853 --> 0:01:41.178 +And therefore an important question is, can +we also learn from like other sources and through? + +0:01:41.701 --> 0:01:47.830 +Because if you remember strongly right at +the beginning of the election,. + +0:01:51.171 --> 0:01:53.801 +This Is How We Train All Our. + +0:01:54.194 --> 0:01:59.887 +Machine learning models from statistical to +neural. + +0:01:59.887 --> 0:02:09.412 +This doesn't have changed so we need this +type of parallel data where we have a source + +0:02:09.412 --> 0:02:13.462 +sentence aligned with a target data. + +0:02:13.493 --> 0:02:19.135 +We have now a strong model here, a very good +model to do that. + +0:02:19.135 --> 0:02:22.091 +However, we always rely on this. 
+ +0:02:22.522 --> 0:02:28.395 +For languages, high risk language pairs say +from German to English or other European languages, + +0:02:28.395 --> 0:02:31.332 +there is decent amount, at least for similarly. + +0:02:31.471 --> 0:02:37.630 +But even there if we are going to very specific +domains it might get difficult and then your + +0:02:37.630 --> 0:02:43.525 +system performance might drop because if you +want to translate now some medical text for + +0:02:43.525 --> 0:02:50.015 +example of course you need to also have peril +data in the medical domain to know how to translate + +0:02:50.015 --> 0:02:50.876 +these types. + +0:02:51.231 --> 0:02:55.264 +Phrases how to use the vocabulary and so on +in the style. + +0:02:55.915 --> 0:03:04.887 +And if you are going to other languages, there +is a lot bigger challenge and the question + +0:03:04.887 --> 0:03:05.585 +there. + +0:03:05.825 --> 0:03:09.649 +So is really this the only resource we can +use. + +0:03:09.889 --> 0:03:19.462 +Can be adapted or training phase in order +to also make use of other types of models that + +0:03:19.462 --> 0:03:27.314 +might enable us to build strong systems with +other types of information. + +0:03:27.707 --> 0:03:35.276 +And that we will look into now in the next +starting from from just saying the next election. + +0:03:35.515 --> 0:03:40.697 +So this idea we already have covered on Tuesday. + +0:03:40.697 --> 0:03:45.350 +One very successful idea for this is to do. + +0:03:45.645 --> 0:03:51.990 +So that we're no longer doing translation +between languages, but we can do translation + +0:03:51.990 --> 0:03:55.928 +between languages and share common knowledge +between. + +0:03:56.296 --> 0:04:03.888 +You also learned about things like zero shots +machine translation so you can translate between + +0:04:03.888 --> 0:04:06.446 +languages where you don't have. + +0:04:06.786 --> 0:04:09.790 +Which is the case for many, many language +pairs. + +0:04:10.030 --> 0:04:19.209 +Like even with German, you have not translation +parallel data to all languages around the world, + +0:04:19.209 --> 0:04:26.400 +or most of them you have it to the Europeans +once, maybe even for Japanese. + +0:04:26.746 --> 0:04:35.332 +There is quite a lot of data, for example +English to Japanese, but German to Japanese + +0:04:35.332 --> 0:04:37.827 +or German to Vietnamese. + +0:04:37.827 --> 0:04:41.621 +There is some data from Multilingual. + +0:04:42.042 --> 0:04:54.584 +So there is a very promising direction if +you want to build translation systems between + +0:04:54.584 --> 0:05:00.142 +language peers, typically not English. + +0:05:01.221 --> 0:05:05.887 +And the other ideas, of course, we don't have +to either just search for it. + +0:05:06.206 --> 0:05:12.505 +Some work on a data crawling so if I don't +have a corpus directly or I don't have an high + +0:05:12.505 --> 0:05:19.014 +quality corpus like from the European Parliament +for a TED corpus so maybe it makes sense to + +0:05:19.014 --> 0:05:23.913 +crawl more data and get additional sources +so you can build stronger. + +0:05:24.344 --> 0:05:35.485 +There has been quite a big effort in Europe +to collect really large data sets for parallel + +0:05:35.485 --> 0:05:36.220 +data. + +0:05:36.220 --> 0:05:40.382 +How can we do this data crawling? + +0:05:40.600 --> 0:05:46.103 +There the interesting thing from the machine +translation point is not just general data + +0:05:46.103 --> 0:05:46.729 +crawling. 
+ +0:05:47.067 --> 0:05:50.037 +But how can we explicitly crawl data? + +0:05:50.037 --> 0:05:52.070 +Which is some of a peril? + +0:05:52.132 --> 0:05:58.461 +So there is in the Internet quite a lot of +data which has been company websites which + +0:05:58.461 --> 0:06:01.626 +have been translated and things like that. + +0:06:01.626 --> 0:06:05.158 +So how can you extract them parallel fragments? + +0:06:06.566 --> 0:06:13.404 +That is typically more noisy than where you +do more at hands where mean if you have Parliament. + +0:06:13.693 --> 0:06:17.680 +You can do some rules how to extract parallel +things. + +0:06:17.680 --> 0:06:24.176 +Here there is more to it, so the quality is +later maybe not as good, but normally scale + +0:06:24.176 --> 0:06:26.908 +is then a possibility to address it. + +0:06:26.908 --> 0:06:30.304 +So you just have so much more data that even. + +0:06:33.313 --> 0:06:40.295 +The other thing can be used monolingual data +and monolingual data has a big advantage that + +0:06:40.295 --> 0:06:46.664 +we can have a huge amount of that so that you +can be autocrawed from the Internet. + +0:06:46.664 --> 0:06:51.728 +The nice thing is you can also get it typically +for many domains. + +0:06:52.352 --> 0:06:59.558 +There is just so much more magnitude of monolingual +data so that it might be very helpful. + +0:06:59.559 --> 0:07:03.054 +We can do that in statistical machine translation. + +0:07:03.054 --> 0:07:06.755 +It was quite easy to integrate using language +models. + +0:07:08.508 --> 0:07:16.912 +In neural machine translation we have the +advantage that we have this overall architecture + +0:07:16.912 --> 0:07:22.915 +that does everything together, but it has also +the disadvantage. + +0:07:23.283 --> 0:07:25.675 +We'll look today at two things. + +0:07:25.675 --> 0:07:32.925 +On the one end you can still try to do a bit +of language modeling in there and add an additional + +0:07:32.925 --> 0:07:35.168 +language model into in there. + +0:07:35.168 --> 0:07:38.232 +There is some work, one very successful. + +0:07:38.178 --> 0:07:43.764 +A way in which I think is used in most systems +at the moment is to do some scientific data. + +0:07:43.763 --> 0:07:53.087 +Is a very easy thing, but you can just translate +there and use it as training gator, and normally. + +0:07:53.213 --> 0:07:59.185 +And thereby you are able to use like some +type of monolingual a day. + +0:08:00.380 --> 0:08:05.271 +Another way to do it is unsupervised and the +extreme case. + +0:08:05.271 --> 0:08:11.158 +If you have a scenario then you only have +data, only monolingual data. + +0:08:11.158 --> 0:08:13.976 +Can you still build translations? + +0:08:14.754 --> 0:08:27.675 +If you have large amounts of data and languages +are not too dissimilar, you can build translation + +0:08:27.675 --> 0:08:31.102 +systems without parallel. + +0:08:32.512 --> 0:08:36.267 +That we will see you then next Thursday. + +0:08:37.857 --> 0:08:50.512 +And then there is now a third type of pre-trained +model that recently became very successful + +0:08:50.512 --> 0:08:55.411 +and now with large language models. + +0:08:55.715 --> 0:09:03.525 +So the idea is we are no longer sharing the +real data, but it can also help to train a + +0:09:03.525 --> 0:09:04.153 +model. + +0:09:04.364 --> 0:09:11.594 +And that is now a big advantage of deep learning +based approaches. + +0:09:11.594 --> 0:09:22.169 +There you have this ability that you can train +a model in some task and then apply it to another. 
+ +0:09:22.722 --> 0:09:33.405 +And then, of course, the question is, can +I have an initial task where there's huge amounts + +0:09:33.405 --> 0:09:34.450 +of data? + +0:09:34.714 --> 0:09:40.251 +And the test that typically you pre train +on is more like similar to a language moral + +0:09:40.251 --> 0:09:45.852 +task either direct to a language moral task +or like a masking task which is related so + +0:09:45.852 --> 0:09:51.582 +the idea is oh I can train on this data and +the knowledge about words how they relate to + +0:09:51.582 --> 0:09:53.577 +each other I can use in there. + +0:09:53.753 --> 0:10:00.276 +So it's a different way of using language +models. + +0:10:00.276 --> 0:10:06.276 +There's more transfer learning at the end +of. + +0:10:09.029 --> 0:10:17.496 +So first we will start with how can we use +monolingual data to do a Yeah to do a machine + +0:10:17.496 --> 0:10:18.733 +translation? + +0:10:20.040 --> 0:10:27.499 +That: Big difference is you should remember +from what I mentioned before is. + +0:10:27.499 --> 0:10:32.783 +In statistical machine translation we directly +have the opportunity. + +0:10:32.783 --> 0:10:39.676 +There's peril data for the translation model +and monolingual data for the language model. + +0:10:39.679 --> 0:10:45.343 +And you combine your translation model and +language model, and then you can make use of + +0:10:45.343 --> 0:10:45.730 +both. + +0:10:46.726 --> 0:10:53.183 +That you can make use of these large large +amounts of monolingual data, but of course + +0:10:53.183 --> 0:10:55.510 +it has also some disadvantage. + +0:10:55.495 --> 0:11:01.156 +Because we say the problem is we are optimizing +both parts a bit independently to each other + +0:11:01.156 --> 0:11:06.757 +and we say oh yeah the big disadvantage of +newer machine translations now we are optimizing + +0:11:06.757 --> 0:11:10.531 +the overall architecture everything together +to perform best. + +0:11:10.890 --> 0:11:16.994 +And then, of course, we can't do there, so +Leo we can can only do a mural like use power + +0:11:16.994 --> 0:11:17.405 +data. + +0:11:17.897 --> 0:11:28.714 +So the question is, but this advantage is +not so important that we can train everything, + +0:11:28.714 --> 0:11:35.276 +but we have a moral legal data or even small +amounts. + +0:11:35.675 --> 0:11:43.102 +So in data we know it's not only important +the amount of data we have but also like how + +0:11:43.102 --> 0:11:50.529 +similar it is to your test data so it can be +that this modeling data is quite small but + +0:11:50.529 --> 0:11:55.339 +it's very well fitting and then it's still +very helpful. + +0:11:55.675 --> 0:12:02.691 +At the first year of surprisingness, if we +are here successful with integrating a language + +0:12:02.691 --> 0:12:09.631 +model into a translation system, maybe we can +also integrate some type of language models + +0:12:09.631 --> 0:12:14.411 +into our empty system in order to make it better +and perform. + +0:12:16.536 --> 0:12:23.298 +The first thing we can do is we know there +is language models, so let's try to integrate. + +0:12:23.623 --> 0:12:31.096 +There was our language model because these +works were mainly done before transformer-based + +0:12:31.096 --> 0:12:31.753 +models. + +0:12:32.152 --> 0:12:38.764 +In general, of course, you can do the same +thing with transformer baseball. 
+ +0:12:38.764 --> 0:12:50.929 +There is nothing about whether: It's just +that it has mainly been done before people + +0:12:50.929 --> 0:13:01.875 +started using R&S and they tried to do +this more in cases. + +0:13:07.087 --> 0:13:22.938 +So what we're happening here is in some of +this type of idea, and in key system you remember + +0:13:22.938 --> 0:13:25.495 +the attention. + +0:13:25.605 --> 0:13:29.465 +Gets it was your last in this day that you +calculate easy attention. + +0:13:29.729 --> 0:13:36.610 +We get the context back, then combine both +and then base the next in state and then predict. + +0:13:37.057 --> 0:13:42.424 +So this is our system, and the question is, +can we send our integrated language model? + +0:13:42.782 --> 0:13:49.890 +And somehow it makes sense to take out a neural +language model because we are anyway in the + +0:13:49.890 --> 0:13:50.971 +neural space. + +0:13:50.971 --> 0:13:58.465 +It's not surprising that it contrasts to statistical +work used and grants it might make sense to + +0:13:58.465 --> 0:14:01.478 +take a bit of a normal language model. + +0:14:01.621 --> 0:14:06.437 +And there would be something like on Tubbles +Air, a neural language model, and our man based + +0:14:06.437 --> 0:14:11.149 +is you have a target word, you put it in, you +get a new benchmark, and then you always put + +0:14:11.149 --> 0:14:15.757 +in the words and get new hidden states, and +you can do some predictions at the output to + +0:14:15.757 --> 0:14:16.948 +predict the next word. + +0:14:17.597 --> 0:14:26.977 +So if we're having this type of in language +model, there's like two main questions we have + +0:14:26.977 --> 0:14:34.769 +to answer: So how do we combine now on the +one hand our system and on the other hand our + +0:14:34.769 --> 0:14:35.358 +model? + +0:14:35.358 --> 0:14:42.004 +You see that was mentioned before when we +started talking about ENCODA models. + +0:14:42.004 --> 0:14:45.369 +They can be viewed as a language model. + +0:14:45.805 --> 0:14:47.710 +The wine is lengthened, unconditioned. + +0:14:47.710 --> 0:14:49.518 +It's just modeling the target sides. + +0:14:49.970 --> 0:14:56.963 +And the other one is a conditional language +one, which is a language one conditioned on + +0:14:56.963 --> 0:14:57.837 +the Sewer. + +0:14:58.238 --> 0:15:03.694 +So how can you combine to language models? + +0:15:03.694 --> 0:15:14.860 +Of course, it's like the translation model +will be more important because it has access + +0:15:14.860 --> 0:15:16.763 +to the source. + +0:15:18.778 --> 0:15:22.571 +If we have that, the other question is okay. + +0:15:22.571 --> 0:15:24.257 +Now we have models. + +0:15:24.257 --> 0:15:25.689 +How do we train? + +0:15:26.026 --> 0:15:30.005 +Pickers integrated them. + +0:15:30.005 --> 0:15:34.781 +We have now two sets of data. + +0:15:34.781 --> 0:15:42.741 +We have parallel data where you can do the +lower. + +0:15:44.644 --> 0:15:53.293 +So the first idea is we can do something more +like a parallel combination. + +0:15:53.293 --> 0:15:55.831 +We just keep running. + +0:15:56.036 --> 0:15:59.864 +So here you see your system that is running. + +0:16:00.200 --> 0:16:09.649 +It's normally completely independent of your +language model, which is up there, so down + +0:16:09.649 --> 0:16:13.300 +here we have just our NMT system. + +0:16:13.313 --> 0:16:26.470 +The only thing which is used is we have the +words, and of course they are put into both + +0:16:26.470 --> 0:16:30.059 +systems, and out there. 
+ +0:16:30.050 --> 0:16:42.221 +So we use them somehow for both, and then +we are doing our decision just by merging these + +0:16:42.221 --> 0:16:42.897 +two. + +0:16:43.343 --> 0:16:53.956 +So there can be, for example, we are doing +a probability distribution here, and then we + +0:16:53.956 --> 0:17:03.363 +are taking the average of post-perability distribution +to do our predictions. + +0:17:11.871 --> 0:17:18.923 +You could also take the output with Steve's +to be more in chore about the mixture. + +0:17:20.000 --> 0:17:32.896 +Yes, you could also do that, so it's more +like engaging mechanisms that you're not doing. + +0:17:32.993 --> 0:17:41.110 +Another one would be cochtrinate the hidden +states, and then you would have another layer + +0:17:41.110 --> 0:17:41.831 +on top. + +0:17:43.303 --> 0:17:56.889 +You think about if you do the conqueredination +instead of taking the instead and then merging + +0:17:56.889 --> 0:18:01.225 +the probability distribution. + +0:18:03.143 --> 0:18:16.610 +Introduce many new parameters, and these parameters +have somehow something special compared to + +0:18:16.610 --> 0:18:17.318 +the. + +0:18:23.603 --> 0:18:37.651 +So before all the error other parameters can +be trained independent, the language model + +0:18:37.651 --> 0:18:42.121 +can be trained independent. + +0:18:43.043 --> 0:18:51.749 +If you have a joint layer, of course you need +to train them because you have now inputs. + +0:18:54.794 --> 0:19:02.594 +Not surprisingly, if you have a parallel combination +of whether you could, the other way is to do + +0:19:02.594 --> 0:19:04.664 +more serial combinations. + +0:19:04.924 --> 0:19:10.101 +How can you do a similar combination? + +0:19:10.101 --> 0:19:18.274 +Your final decision makes sense to do a face +on the system. + +0:19:18.438 --> 0:19:20.996 +So you have on top of your normal and system. + +0:19:21.121 --> 0:19:30.678 +The only thing is now you're inputting into +your system. + +0:19:30.678 --> 0:19:38.726 +You're no longer inputting the word embeddings. + +0:19:38.918 --> 0:19:45.588 +So you're training your mainly what you have +your lower layers here which are trained more + +0:19:45.588 --> 0:19:52.183 +on the purely language model style and then +on top your putting into the NMT system where + +0:19:52.183 --> 0:19:55.408 +it now has already here the language model. + +0:19:55.815 --> 0:19:58.482 +So here you can also view it. + +0:19:58.482 --> 0:20:06.481 +Here you have more contextual embeddings which +no longer depend only on the word but they + +0:20:06.481 --> 0:20:10.659 +also depend on the context of the target site. + +0:20:11.051 --> 0:20:19.941 +But you have more understanding of the source +word, so you have a language in the current + +0:20:19.941 --> 0:20:21.620 +target sentence. + +0:20:21.881 --> 0:20:27.657 +So if it's like the word can, for example, +will be put in here always the same independent + +0:20:27.657 --> 0:20:31.147 +of its user can of beans, or if it's like I +can do it. + +0:20:31.147 --> 0:20:37.049 +However, because you are having your language +model style, you have maybe disintegrated this + +0:20:37.049 --> 0:20:40.984 +already a bit, and you give this information +directly to the. + +0:20:41.701 --> 0:20:43.095 +An empty cyst. + +0:20:44.364 --> 0:20:49.850 +You, if you're remembering more the transformer +based approach, you have some layers. + +0:20:49.850 --> 0:20:55.783 +The lower layers are purely languaged while +the other ones are with attention to the source. 
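[Editor's illustration] The parallel combination just described, where the NMT model and the language model each produce a next-word distribution and the two are merged, can be sketched as follows; the random logits and the fixed interpolation weight are stand-ins (a learned weight or gating network could replace the constant).

```python
import torch
import torch.nn.functional as F

vocab = 100
nmt_logits = torch.randn(1, vocab)   # conditional model (has seen the source)
lm_logits = torch.randn(1, vocab)    # unconditional target-side language model

p_nmt = F.softmax(nmt_logits, dim=-1)
p_lm = F.softmax(lm_logits, dim=-1)

# Simple parallel combination: interpolate the two distributions.
lam = 0.5
p_combined = lam * p_nmt + (1 - lam) * p_lm
print(p_combined.argmax(dim=-1))     # index of the predicted next word
```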
+ +0:20:55.783 --> 0:21:01.525 +So you can view it also that you just have +lower layers which don't attend to the source. + +0:21:02.202 --> 0:21:07.227 +This is purely a language model, and then +at some point you're starting to attend to + +0:21:07.227 --> 0:21:08.587 +the source and use it. + +0:21:13.493 --> 0:21:20.781 +Yes, so this is how you combine them in peril +or first do the language model and then do. + +0:21:23.623 --> 0:21:26.147 +Questions for the integration. + +0:21:31.831 --> 0:21:35.034 +Not really sure about the input of the. + +0:21:35.475 --> 0:21:38.102 +Model, and in this case in the sequence. + +0:21:38.278 --> 0:21:53.199 +Case so the actual word that we transferred +into a numerical lecture, and this is an input + +0:21:53.199 --> 0:21:54.838 +into the. + +0:21:56.176 --> 0:22:03.568 +That depends on if you view the word embedding +as part of the language model. + +0:22:03.568 --> 0:22:10.865 +So if you first put the word target word then +you do the one hot end coding. + +0:22:11.691 --> 0:22:13.805 +And then the word embedding there is the r& + +0:22:13.805 --> 0:22:13.937 +n. + +0:22:14.314 --> 0:22:21.035 +So you can use this together as your language +model when you first do the word embedding. + +0:22:21.401 --> 0:22:24.346 +All you can say is like before. + +0:22:24.346 --> 0:22:28.212 +It's more a definition, but you're right. + +0:22:28.212 --> 0:22:30.513 +So what's the steps out? + +0:22:30.513 --> 0:22:36.128 +You take the word, the one hut encoding, the +word embedding. + +0:22:36.516 --> 0:22:46.214 +What one of these parrots, you know, called +a language model is definition wise and not + +0:22:46.214 --> 0:22:47.978 +that important. + +0:22:53.933 --> 0:23:02.264 +So the question is how can you then train +them and make this this one work? + +0:23:02.264 --> 0:23:02.812 +The. + +0:23:03.363 --> 0:23:15.201 +So in the case where you combine the language +one of the abilities you can train them independently + +0:23:15.201 --> 0:23:18.516 +and just put them together. + +0:23:18.918 --> 0:23:27.368 +Might not be the best because we have no longer +the stability that we had before that optimally + +0:23:27.368 --> 0:23:29.128 +performed together. + +0:23:29.128 --> 0:23:33.881 +It's not clear if they really work the best +together. + +0:23:34.514 --> 0:23:41.585 +At least you need to somehow find how much +do you trust the one model and how much. + +0:23:43.323 --> 0:23:45.058 +Still in some cases useful. + +0:23:45.058 --> 0:23:48.530 +It might be helpful if you have only data +and software. + +0:23:48.928 --> 0:23:59.064 +However, in MT we have one specific situation +that at least for the MT part parallel is also + +0:23:59.064 --> 0:24:07.456 +always monolingual data, so what we definitely +can do is train the language. + +0:24:08.588 --> 0:24:18.886 +So what we also can do is more like the pre-training +approach. + +0:24:18.886 --> 0:24:24.607 +We first train the language model. + +0:24:24.704 --> 0:24:27.334 +The pre-training approach. + +0:24:27.334 --> 0:24:33.470 +You first train on the monolingual data and +then you join the. + +0:24:33.933 --> 0:24:41.143 +Of course, the model size is this way, but +the data size is too bigly the other way around. + +0:24:41.143 --> 0:24:47.883 +You often have a lot more monolingual data +than you have here parallel data, in which + +0:24:47.883 --> 0:24:52.350 +scenario can you imagine where this type of +pretraining? + +0:24:56.536 --> 0:24:57.901 +Any Ideas. 
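[Editor's illustration] The question earlier in this passage was how the target word enters the model: word, one-hot encoding, embedding, then the RNN. A toy sketch showing that multiplying a one-hot vector by the embedding matrix is nothing more than selecting one row of it; all sizes are made up.

```python
import numpy as np

vocab_size, emb_dim = 10, 4                     # toy sizes, not the lecture's numbers
rng = np.random.default_rng(0)
E = rng.normal(size=(vocab_size, emb_dim))      # embedding matrix (first layer)

word_id = 3
one_hot = np.zeros(vocab_size)
one_hot[word_id] = 1.0

emb_via_matmul = one_hot @ E                    # one-hot times embedding matrix
emb_via_lookup = E[word_id]                     # ... is just a row lookup
assert np.allclose(emb_via_matmul, emb_via_lookup)
```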
+ +0:25:04.064 --> 0:25:12.772 +One example where this might also be helpful +if you want to adapt to domains. + +0:25:12.772 --> 0:25:22.373 +So let's say you do medical sentences and +if you want to translate medical sentences. + +0:25:23.083 --> 0:25:26.706 +In this case it could be or its most probable +happen. + +0:25:26.706 --> 0:25:32.679 +You're learning here up there what medical +means, but in your fine tuning step the model + +0:25:32.679 --> 0:25:38.785 +is forgotten everything about Medicare, so +you may be losing all the information you gain. + +0:25:39.099 --> 0:25:42.366 +So this type of priest training step is good. + +0:25:42.366 --> 0:25:47.978 +If your pretraining data is more general, +very large and then you're adapting. + +0:25:48.428 --> 0:25:56.012 +But in the task with moral lingual data, which +should be used to adapt the system to some + +0:25:56.012 --> 0:25:57.781 +general topic style. + +0:25:57.817 --> 0:26:06.795 +Then, of course, this is not a good strategy +because you might forgot about everything up + +0:26:06.795 --> 0:26:09.389 +there and you don't have. + +0:26:09.649 --> 0:26:14.678 +So then you have to check what you can do +for them. + +0:26:14.678 --> 0:26:23.284 +You can freeze this part and change it any +more so you don't lose the ability or you can + +0:26:23.284 --> 0:26:25.702 +do a direct combination. + +0:26:25.945 --> 0:26:31.028 +Where you jointly train both of them, so you +train the NMT system on the, and then you train + +0:26:31.028 --> 0:26:34.909 +the language model always in parallels so that +you don't forget about. + +0:26:35.395 --> 0:26:37.684 +And what you learn of the length. + +0:26:37.937 --> 0:26:46.711 +Depends on what you want to combine because +it's large data and you have a good general + +0:26:46.711 --> 0:26:48.107 +knowledge in. + +0:26:48.548 --> 0:26:55.733 +Then you normally don't really forget it because +it's also in the or you use it to adapt to + +0:26:55.733 --> 0:26:57.295 +something specific. + +0:26:57.295 --> 0:26:58.075 +Then you. + +0:27:01.001 --> 0:27:06.676 +Then this is a way of how we can make use +of monolingual data. + +0:27:07.968 --> 0:27:12.116 +It seems to be the easiest one somehow. + +0:27:12.116 --> 0:27:20.103 +It's more similar to what we are doing with +statistical machine translation. + +0:27:21.181 --> 0:27:31.158 +Normally always beats this type of model, +which in some view can be like from the conceptual + +0:27:31.158 --> 0:27:31.909 +thing. + +0:27:31.909 --> 0:27:36.844 +It's even easier from the computational side. + +0:27:40.560 --> 0:27:42.078 +And the idea is OK. + +0:27:42.078 --> 0:27:49.136 +We have monolingual data that we just translate +and then generate some type of parallel data + +0:27:49.136 --> 0:27:50.806 +and use that then to. + +0:27:51.111 --> 0:28:00.017 +So if you want to build a German-to-English +system first, take the large amount of data + +0:28:00.017 --> 0:28:02.143 +you have translated. + +0:28:02.402 --> 0:28:10.446 +Then you have more peril data and the interesting +thing is if you then train on the joint thing + +0:28:10.446 --> 0:28:18.742 +or on the original peril data and on what is +artificial where you have generated the translations. + +0:28:18.918 --> 0:28:26.487 +So you can because you are not doing the same +era all the times and you have some knowledge. + +0:28:28.028 --> 0:28:43.199 +With this first approach, however, there is +one issue why it might not work the best. 
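[Editor's illustration] One of the remedies mentioned above for losing the pre-trained knowledge during fine-tuning is to freeze the pre-trained part so it cannot be overwritten. A hedged PyTorch sketch; `pretrained_lm` is a stand-in for whatever pre-trained module was plugged into the system.

```python
import torch.nn as nn

def freeze(module: nn.Module) -> None:
    """Exclude a pre-trained sub-module from further gradient updates."""
    for p in module.parameters():
        p.requires_grad = False

# Example: a stand-in for a pre-trained language-model block.
pretrained_lm = nn.LSTM(input_size=256, hidden_size=256, num_layers=2)
freeze(pretrained_lm)

# During fine-tuning, only still-trainable parameters would go to the optimiser, e.g.
# torch.optim.Adam(p for p in full_model.parameters() if p.requires_grad)
```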
+ 

0:28:49.409 --> 0:28:51.177
It's a bit shown in the image here.

0:28:53.113 --> 0:28:58.153
You train on bad quality data.

0:28:58.153 --> 0:29:02.563
Here this is a bit of a problem.

0:29:02.563 --> 0:29:08.706
Your English side is not really good.

0:29:08.828 --> 0:29:12.213
And let's say the system always mistranslates

0:29:13.493 --> 0:29:19.798
something; then you will learn that this is
correct because now it's a training example, and

0:29:19.798 --> 0:29:23.022
you will encourage it to make this error more often.

0:29:23.022 --> 0:29:29.614
So the problem with training on your own errors:
yeah, you might prevent some errors you rarely

0:29:29.614 --> 0:29:29.901
do.

0:29:30.150 --> 0:29:31.749
But errors you do systematically

0:29:31.749 --> 0:29:34.225
you will enforce even more and will do even more.

0:29:34.654 --> 0:29:40.145
So that might not be the best solution. Do you
have any idea how you could do it better?

0:29:44.404 --> 0:29:57.754
That is one way; there is even a bit of a simpler
idea.

0:30:04.624 --> 0:30:10.975
The problem is, yeah, the translations are
not perfect, so the output is wrong and you're learning

0:30:10.975 --> 0:30:12.188
something wrong.

0:30:12.188 --> 0:30:17.969
Normally it's less bad if your inputs are
bad but your outputs are perfect.

0:30:18.538 --> 0:30:24.284
So if your inputs are wrong you may learn
that for this wrong input you're

0:30:24.284 --> 0:30:30.162
generating something correct, but you're not
learning to generate something which is not

0:30:30.162 --> 0:30:30.756
correct.

0:30:31.511 --> 0:30:47.124
So often the case is that it is more important
that your target is correct.

0:30:47.347 --> 0:30:52.182
And you can assume in your application scenario
you hope that you mainly get correct inputs.

0:30:52.572 --> 0:31:02.535
So that is not harming you, and in machine
translation we have one very nice advantage:

0:31:02.762 --> 0:31:04.648
We can also go the other way around.

0:31:04.648 --> 0:31:10.062
It's a very similar task: there's the task
to translate from German to English, but the

0:31:10.062 --> 0:31:13.894
task to translate from English to German is
very similar, too.

0:31:14.094 --> 0:31:19.309
So what we can do is we can just switch the
direction and generate the data the other way

0:31:19.309 --> 0:31:19.778
around.

0:31:20.120 --> 0:31:25.959
So what we are doing here is we are starting
with an English to German system.

0:31:25.959 --> 0:31:32.906
Then we are translating the English data into
German where the German is maybe not very nice.

0:31:33.293 --> 0:31:51.785
And then we are training on our original data
and on the back translated data.

0:31:52.632 --> 0:32:02.332
So here we have the advantage that our target
side is human quality and only the input is synthetic.

0:32:03.583 --> 0:32:08.113
Then this helps us to get really good results.

0:32:08.113 --> 0:32:15.431
There is one difference if you think about
the data resources.

0:32:21.341 --> 0:32:27.336
It is obvious that here we need target-side
monolingual data.

0:32:27.336 --> 0:32:31.574
In the first example we had source-side data.

0:32:31.931 --> 0:32:45.111
So back translation normally works if you
have target-side monolingual data and not

0:32:45.111 --> 0:32:48.152
source-side monolingual data.
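[Editor's illustration] A minimal sketch of the back-translation recipe just described: translate target-side monolingual text with a reverse (target→source) system, pair the synthetic source with the clean human-written target, and mix the result with the real parallel data. `translate_reverse` is a hypothetical stand-in for the reverse-direction model (English→German in the lecture's example); `ratio=1.0` mirrors the "roughly equal amounts" rule of thumb discussed next.

```python
from typing import Callable, List, Tuple

def back_translate(
    mono_target: List[str],                   # human-written target-side sentences
    translate_reverse: Callable[[str], str],  # hypothetical target->source model
) -> List[Tuple[str, str]]:
    """Create synthetic (source, target) pairs: source is machine-made, target is clean."""
    return [(translate_reverse(tgt), tgt) for tgt in mono_target]

def build_training_data(parallel, mono_target, translate_reverse, ratio=1.0):
    """Mix real parallel data with roughly `ratio` times as much back-translated data."""
    synthetic = back_translate(mono_target, translate_reverse)
    n_syn = int(len(parallel) * ratio)
    return parallel + synthetic[:n_syn]
```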
+ +0:32:48.448 --> 0:32:56.125 +Might be also, like if you think about it, +understand a little better to understand the + +0:32:56.125 --> 0:32:56.823 +target. + +0:32:57.117 --> 0:33:01.469 +On the source side you have to understand +the content. + +0:33:01.469 --> 0:33:08.749 +On the target side you have to generate really +sentences and somehow it's more difficult to + +0:33:08.749 --> 0:33:12.231 +generate something than to only understand. + +0:33:17.617 --> 0:33:30.734 +This works well if you have to select how +many back translated data do you use. + +0:33:31.051 --> 0:33:32.983 +Because only there's like a lot more. + +0:33:33.253 --> 0:33:42.136 +Question: Should take all of my data there +is two problems with it? + +0:33:42.136 --> 0:33:51.281 +Of course it's expensive because you have +to translate all this data. + +0:33:51.651 --> 0:34:00.946 +So if you don't know the normal good starting +point is to take equal amount of data as many + +0:34:00.946 --> 0:34:02.663 +back translated. + +0:34:02.963 --> 0:34:04.673 +It depends on the used case. + +0:34:04.673 --> 0:34:08.507 +If we have very few data here, it makes more +sense to have more. + +0:34:08.688 --> 0:34:15.224 +Depends on how good your quality is here, +so the better the more data you might use because + +0:34:15.224 --> 0:34:16.574 +quality is better. + +0:34:16.574 --> 0:34:22.755 +So it depends on a lot of things, but your +rule of sum is like which general way often + +0:34:22.755 --> 0:34:24.815 +is to have equal amounts of. + +0:34:26.646 --> 0:34:29.854 +And you can, of course, do that now. + +0:34:29.854 --> 0:34:34.449 +I said already that it's better to have the +quality. + +0:34:34.449 --> 0:34:38.523 +At the end, of course, depends on this system. + +0:34:38.523 --> 0:34:46.152 +Also, because the better this system is, the +better your synthetic data is, the better. + +0:34:47.207 --> 0:34:50.949 +That leads to what is referred to as iterated +back translation. + +0:34:51.291 --> 0:34:56.917 +So you play them on English to German, and +you translate the data on. + +0:34:56.957 --> 0:35:03.198 +Then you train a model on German to English +with the additional data. + +0:35:03.198 --> 0:35:09.796 +Then you translate German data and then you +train to gain your first one. + +0:35:09.796 --> 0:35:14.343 +So in the second iteration this quality is +better. + +0:35:14.334 --> 0:35:19.900 +System is better because it's not only trained +on the small data but additionally on back + +0:35:19.900 --> 0:35:22.003 +translated data with this system. + +0:35:22.442 --> 0:35:24.458 +And so you can get better. + +0:35:24.764 --> 0:35:28.053 +However, typically you can stop quite early. + +0:35:28.053 --> 0:35:35.068 +Maybe one iteration is good, but then you +have diminishing gains after two or three iterations. + +0:35:35.935 --> 0:35:46.140 +There is very slight difference because you +need a quite big difference in the quality + +0:35:46.140 --> 0:35:46.843 +here. + +0:35:47.207 --> 0:36:02.262 +Language is also good because it means you +can already train it with relatively bad profiles. + +0:36:03.723 --> 0:36:10.339 +It's a design decision would advise so guess +because it's easy to get it. + +0:36:10.550 --> 0:36:20.802 +Replace that because you have a higher quality +real data, but then I think normally it's okay + +0:36:20.802 --> 0:36:22.438 +to replace it. + +0:36:22.438 --> 0:36:28.437 +I would assume it's not too much of a difference, +but. 
+ +0:36:34.414 --> 0:36:42.014 +That's about like using monolingual data before +we go into the pre-train models to have any + +0:36:42.014 --> 0:36:43.005 +more crash. + +0:36:49.029 --> 0:36:55.740 +Yes, so the other thing which we can do and +which is recently more and more successful + +0:36:55.740 --> 0:37:02.451 +and even more successful since we have this +really large language models where you can + +0:37:02.451 --> 0:37:08.545 +even do the translation task with this is the +way of using pre-trained models. + +0:37:08.688 --> 0:37:16.135 +So you learn a representation of one task, +and then you use this representation from another. + +0:37:16.576 --> 0:37:26.862 +It was made maybe like one of the first words +where it really used largely is doing something + +0:37:26.862 --> 0:37:35.945 +like a bird which you pre trained on purely +text era and you take it in fine tune. + +0:37:36.496 --> 0:37:42.953 +And one big advantage, of course, is that +people can only share data but also pre-trained. + +0:37:43.423 --> 0:37:59.743 +The recent models and the large language ones +which are available. + +0:37:59.919 --> 0:38:09.145 +Where I think it costs several millions to +train them all, just if you would buy the GPUs + +0:38:09.145 --> 0:38:15.397 +from some cloud company and train that the +cost of training. + +0:38:15.475 --> 0:38:21.735 +And guess as a student project you won't have +the budget to like build these models. + +0:38:21.801 --> 0:38:24.598 +So another idea is what you can do is okay. + +0:38:24.598 --> 0:38:27.330 +Maybe if these months are once available,. + +0:38:27.467 --> 0:38:36.598 +Can take them and use them as an also resource +similar to pure text, and you can now build + +0:38:36.598 --> 0:38:44.524 +models which somehow learn not only from from +data but also from other models. + +0:38:44.844 --> 0:38:49.127 +So it's a quite new way of thinking of how +to train. + +0:38:49.127 --> 0:38:53.894 +We are not only learning from examples, but +we might also. + +0:38:54.534 --> 0:39:05.397 +The nice thing is that this type of training +where we are not learning directly from data + +0:39:05.397 --> 0:39:07.087 +but learning. + +0:39:07.427 --> 0:39:17.647 +So the main idea this go is you have a person +initial task. + +0:39:17.817 --> 0:39:26.369 +And if you're working with anLP, that means +you're training pure taxator because that's + +0:39:26.369 --> 0:39:30.547 +where you have the largest amount of data. + +0:39:30.951 --> 0:39:35.857 +And then you're defining some type of task +in order to do your creek training. + +0:39:36.176 --> 0:39:43.092 +And: The typical task you can train on on +that is like the language waddling task. + +0:39:43.092 --> 0:39:50.049 +So to predict the next word or we have a related +task to predict something in between, we'll + +0:39:50.049 --> 0:39:52.667 +see depending on the architecture. + +0:39:52.932 --> 0:39:58.278 +But somehow to predict something which you +have not in the input is a task which is easy + +0:39:58.278 --> 0:40:00.740 +to generate, so you just need your data. + +0:40:00.740 --> 0:40:06.086 +That's why it's called self supervised, so +you're creating your supervised pending data. + +0:40:06.366 --> 0:40:07.646 +By yourself. + +0:40:07.646 --> 0:40:15.133 +On the other hand, you need a lot of knowledge +and that is the other thing. + +0:40:15.735 --> 0:40:24.703 +Because there is this idea that the meaning +of a word heavily depends on the context that. 
+ +0:40:25.145 --> 0:40:36.846 +So can give you a sentence with some giverish +word and there's some name and although you've + +0:40:36.846 --> 0:40:41.627 +never heard the name you will assume. + +0:40:42.062 --> 0:40:44.149 +And exactly the same thing. + +0:40:44.149 --> 0:40:49.143 +The models can also learn something about +the world by just using. + +0:40:49.649 --> 0:40:53.651 +So that is typically the mule. + +0:40:53.651 --> 0:40:59.848 +Then we can use this model to train the system. + +0:41:00.800 --> 0:41:03.368 +Course we might need to adapt the system. + +0:41:03.368 --> 0:41:07.648 +To do that we have to change the architecture +we might use only some. + +0:41:07.627 --> 0:41:09.443 +Part of the pre-trained model. + +0:41:09.443 --> 0:41:14.773 +In there we have seen that a bit already in +the R&N case you can also see that we have + +0:41:14.773 --> 0:41:17.175 +also mentioned the pre-training already. + +0:41:17.437 --> 0:41:22.783 +So you can use the R&N as one of these +approaches. + +0:41:22.783 --> 0:41:28.712 +You train the R&M language more on large +pre-train data. + +0:41:28.712 --> 0:41:32.309 +Then you put it somewhere into your. + +0:41:33.653 --> 0:41:37.415 +So this gives you the ability to really do +these types of tests. + +0:41:37.877 --> 0:41:53.924 +So you can build a system which is knowledge, +which is just trained on large amounts of data. + +0:41:56.376 --> 0:42:01.564 +So the question is maybe what type of information +so what type of models can you? + +0:42:01.821 --> 0:42:05.277 +And we want today to look at briefly at swings. + +0:42:05.725 --> 0:42:08.704 +First, that was what was initially done. + +0:42:08.704 --> 0:42:15.314 +It wasn't as famous as in machine translation +as in other things, but it's also used there + +0:42:15.314 --> 0:42:21.053 +and that is to use static word embedding, so +just the first step we know here. + +0:42:21.221 --> 0:42:28.981 +So we have this mapping from the one hot to +a small continuous word representation. + +0:42:29.229 --> 0:42:38.276 +Using this one in your NG system, so you can, +for example, replace the embedding layer by + +0:42:38.276 --> 0:42:38.779 +the. + +0:42:39.139 --> 0:42:41.832 +That is helpful to be a really small amount +of data. + +0:42:42.922 --> 0:42:48.517 +And we're always in this pre-training phase +and have the thing the advantage is. + +0:42:48.468 --> 0:42:52.411 +More data than the trade off, so you can get +better. + +0:42:52.411 --> 0:42:59.107 +The disadvantage is, does anybody have an +idea of what might be the disadvantage of using + +0:42:59.107 --> 0:43:00.074 +things like. + +0:43:04.624 --> 0:43:12.175 +What was one mentioned today giving like big +advantage of the system compared to previous. + +0:43:20.660 --> 0:43:25.134 +Where one advantage was the enter end training, +so you have the enter end training so that + +0:43:25.134 --> 0:43:27.937 +all parameters and all components play optimal +together. + +0:43:28.208 --> 0:43:33.076 +If you know pre-train something on one fast, +it may be no longer optimal fitting to everything + +0:43:33.076 --> 0:43:33.384 +else. + +0:43:33.893 --> 0:43:37.862 +So what do pretending or not? + +0:43:37.862 --> 0:43:48.180 +It depends on how important everything is +optimal together and how important. + +0:43:48.388 --> 0:43:50.454 +Of large amount. + +0:43:50.454 --> 0:44:00.541 +The pre-change one is so much better that +it's helpful, and the advantage of that. 
+ +0:44:00.600 --> 0:44:11.211 +Getting everything optimal together, yes, +we would use random instructions for raising. + +0:44:11.691 --> 0:44:26.437 +The problem is you might be already in some +area where it's not easy to get. + +0:44:26.766 --> 0:44:35.329 +But often in some way right, so often it's +not about your really worse pre trained monolepsy. + +0:44:35.329 --> 0:44:43.254 +If you're going already in some direction, +and if this is not really optimal for you,. + +0:44:43.603 --> 0:44:52.450 +But if you're not really getting better because +you have a decent amount of data, it's so different + +0:44:52.450 --> 0:44:52.981 +that. + +0:44:53.153 --> 0:44:59.505 +Initially it wasn't a machine translation +done so much because there are more data in + +0:44:59.505 --> 0:45:06.153 +MPs than in other tasks, but now with really +large amounts of monolingual data we do some + +0:45:06.153 --> 0:45:09.403 +type of pretraining in currently all state. + +0:45:12.632 --> 0:45:14.302 +The other one is okay now. + +0:45:14.302 --> 0:45:18.260 +It's always like how much of the model do +you plea track a bit? + +0:45:18.658 --> 0:45:22.386 +To the other one you can do contextural word +embedded. + +0:45:22.386 --> 0:45:28.351 +That is something like bird or Roberta where +you train already a sequence model and the + +0:45:28.351 --> 0:45:34.654 +embeddings you're using are no longer specific +for word but they are also taking the context + +0:45:34.654 --> 0:45:35.603 +into account. + +0:45:35.875 --> 0:45:50.088 +The embedding you're using is no longer depending +on the word itself but on the whole sentence, + +0:45:50.088 --> 0:45:54.382 +so you can use this context. + +0:45:55.415 --> 0:46:02.691 +You can use similar things also in the decoder +just by having layers which don't have access + +0:46:02.691 --> 0:46:12.430 +to the source, but there it still might have +and these are typically models like: And finally + +0:46:12.430 --> 0:46:14.634 +they will look at the end. + +0:46:14.634 --> 0:46:19.040 +You can also have models which are already +sequenced. + +0:46:19.419 --> 0:46:28.561 +So you may be training a sequence to sequence +models. + +0:46:28.561 --> 0:46:35.164 +You have to make it a bit challenging. + +0:46:36.156 --> 0:46:43.445 +But the idea is really you're pre-training +your whole model and then you'll find tuning. + +0:46:47.227 --> 0:46:59.614 +But let's first do a bit of step back and +look into what are the different things. + +0:46:59.614 --> 0:47:02.151 +The first thing. + +0:47:02.382 --> 0:47:11.063 +The wooden bettings are just this first layer +and you can train them with feedback annual + +0:47:11.063 --> 0:47:12.028 +networks. + +0:47:12.212 --> 0:47:22.761 +But you can also train them with an N language +model, and by now you hopefully have also seen + +0:47:22.761 --> 0:47:27.699 +that you cannot transform a language model. + +0:47:30.130 --> 0:47:37.875 +So this is how you can train them and you're +training them. + +0:47:37.875 --> 0:47:45.234 +For example, to speak the next word that is +the easiest. + +0:47:45.525 --> 0:47:55.234 +And that is what is now referred to as South +Supervised Learning and, for example, all the + +0:47:55.234 --> 0:48:00.675 +big large language models like Chad GPT and +so on. + +0:48:00.675 --> 0:48:03.129 +They are trained with. + +0:48:03.823 --> 0:48:15.812 +So that is where you can hopefully learn how +a word is used because you always try to previct + +0:48:15.812 --> 0:48:17.725 +the next word. 
+ +0:48:19.619 --> 0:48:27.281 +Word embedding: Why do you keep the first +look at the word embeddings and the use of + +0:48:27.281 --> 0:48:29.985 +word embeddings for our task? + +0:48:29.985 --> 0:48:38.007 +The main advantage was it might be only the +first layer where you typically have most of + +0:48:38.007 --> 0:48:39.449 +the parameters. + +0:48:39.879 --> 0:48:57.017 +Most of your parameters already on the large +data, then on your target data you have to + +0:48:57.017 --> 0:48:59.353 +train less. + +0:48:59.259 --> 0:49:06.527 +Big difference that your input size is so +much bigger than the size of the novel in size. + +0:49:06.626 --> 0:49:17.709 +So it's a normally sign, maybe like, but your +input and banning size is something like. + +0:49:17.709 --> 0:49:20.606 +Then here you have to. + +0:49:23.123 --> 0:49:30.160 +While here you see it's only like zero point +five times as much in the layer. + +0:49:30.750 --> 0:49:36.534 +So here is where most of your parameters are, +which means if you already replace the word + +0:49:36.534 --> 0:49:41.739 +embeddings, they might look a bit small in +your overall and in key architecture. + +0:49:41.739 --> 0:49:47.395 +It's where most of the things are, and if +you're doing that you already have really big + +0:49:47.395 --> 0:49:48.873 +games and can do that. + +0:49:57.637 --> 0:50:01.249 +The thing is we have seen these were the bettings. + +0:50:01.249 --> 0:50:04.295 +They can be very good use for other types. + +0:50:04.784 --> 0:50:08.994 +You learn some general relations between words. + +0:50:08.994 --> 0:50:17.454 +If you're doing this type of language modeling +cast, you predict: The one thing is you have + +0:50:17.454 --> 0:50:24.084 +a lot of data, so the one question is we want +to have data to trade a model. + +0:50:24.084 --> 0:50:28.734 +The other thing, the tasks need to be somehow +useful. + +0:50:29.169 --> 0:50:43.547 +If you would predict the first letter of the +word, then you wouldn't learn anything about + +0:50:43.547 --> 0:50:45.144 +the word. + +0:50:45.545 --> 0:50:53.683 +And the interesting thing is people have looked +at these wood embeddings. + +0:50:53.954 --> 0:50:58.550 +And looking at the word embeddings. + +0:50:58.550 --> 0:51:09.276 +You can ask yourself how they look and visualize +them by doing dimension reduction. + +0:51:09.489 --> 0:51:13.236 +Don't know if you and you are listening to +artificial intelligence. + +0:51:13.236 --> 0:51:15.110 +Advanced artificial intelligence. + +0:51:15.515 --> 0:51:23.217 +We had on yesterday there how to do this type +of representation, but you can do this time + +0:51:23.217 --> 0:51:29.635 +of representation, and now you're seeing interesting +things that normally. + +0:51:30.810 --> 0:51:41.027 +Now you can represent a here in a three dimensional +space with some dimension reduction. + +0:51:41.027 --> 0:51:46.881 +For example, the relation between male and +female. + +0:51:47.447 --> 0:51:56.625 +So this vector between the male and female +version of something is always not the same, + +0:51:56.625 --> 0:51:58.502 +but it's related. + +0:51:58.718 --> 0:52:14.522 +So you can do a bit of maths, so you do take +king, you subtract this vector, add this vector. + +0:52:14.894 --> 0:52:17.591 +So that means okay, there is really something +stored. + +0:52:17.591 --> 0:52:19.689 +Some information are stored in that book. + +0:52:20.040 --> 0:52:22.621 +Similar, you can do it with Bob Hansen. 
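[Editor's illustration] The parameter comparison earlier in this passage (the concrete numbers in the recording are garbled) can be made tangible with a quick back-of-the-envelope calculation; the vocabulary size and model dimension below are assumed, purely for illustration.

```python
vocab_size = 50_000      # assumed vocabulary size
hidden_dim = 512         # assumed embedding / model dimension

embedding_params = vocab_size * hidden_dim   # 25,600,000
one_hidden_layer = hidden_dim * hidden_dim   # 262,144

print(f"embedding layer : {embedding_params:,} parameters")
print(f"one d x d layer : {one_hidden_layer:,} parameters")
print(f"ratio           : {embedding_params / one_hidden_layer:.0f}x")
# With these sizes the embedding matrix alone holds roughly 100x more parameters
# than a single hidden projection, so pre-training it on large data already
# covers most of the model's weights.
```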
+ +0:52:22.621 --> 0:52:25.009 +See here swimming slam walking walk. + +0:52:25.265 --> 0:52:34.620 +So again these vectors are not the same, but +they are related. + +0:52:34.620 --> 0:52:42.490 +So you learn something from going from here +to here. + +0:52:43.623 --> 0:52:49.761 +Or semantically, the relations between city +and capital have exactly the same sense. + +0:52:51.191 --> 0:52:56.854 +And people had even done that question answering +about that if they showed the diembeddings + +0:52:56.854 --> 0:52:57.839 +and the end of. + +0:52:58.218 --> 0:53:06.711 +All you can also do is don't trust the dimensions +of the reaction because maybe there is something. + +0:53:06.967 --> 0:53:16.863 +You can also look into what happens really +in the individual space. + +0:53:16.863 --> 0:53:22.247 +What is the nearest neighbor of the. + +0:53:22.482 --> 0:53:29.608 +So you can take the relationship between France +and Paris and add it to Italy and you'll. + +0:53:30.010 --> 0:53:33.078 +You can do big and bigger and you have small +and smaller and stuff. + +0:53:33.593 --> 0:53:49.417 +Because it doesn't work everywhere, there +is also some typical dish here in German. + +0:53:51.491 --> 0:54:01.677 +You can do what the person is doing for famous +ones, of course only like Einstein scientists + +0:54:01.677 --> 0:54:06.716 +that find midfielders not completely correct. + +0:54:06.846 --> 0:54:10.134 +You see the examples are a bit old. + +0:54:10.134 --> 0:54:15.066 +The politicians are no longer they am, but +of course. + +0:54:16.957 --> 0:54:26.759 +What people have done there, especially at +the beginning training our end language model, + +0:54:26.759 --> 0:54:28.937 +was very expensive. + +0:54:29.309 --> 0:54:38.031 +So one famous model was, but we are not really +interested in the language model performance. + +0:54:38.338 --> 0:54:40.581 +Think something good to keep in mind. + +0:54:40.581 --> 0:54:42.587 +What are we really interested in? + +0:54:42.587 --> 0:54:45.007 +Do we really want to have an R&N no? + +0:54:45.007 --> 0:54:48.607 +In this case we are only interested in this +type of mapping. + +0:54:49.169 --> 0:54:55.500 +And so successful and very successful was +this word to vet. + +0:54:55.535 --> 0:54:56.865 +The idea is okay. + +0:54:56.865 --> 0:55:03.592 +We are not training real language one, making +it even simpler and doing this, for example, + +0:55:03.592 --> 0:55:05.513 +continuous peck of words. + +0:55:05.513 --> 0:55:12.313 +We're just having four input tokens and we're +predicting what is the word in the middle and + +0:55:12.313 --> 0:55:15.048 +this is just like two linear layers. + +0:55:15.615 --> 0:55:21.627 +So it's even simplifying things and making +the calculation faster because that is what + +0:55:21.627 --> 0:55:22.871 +we're interested. + +0:55:23.263 --> 0:55:32.897 +All this continuous skip ground models with +these other models which refer to as where + +0:55:32.897 --> 0:55:34.004 +to where. + +0:55:34.234 --> 0:55:42.394 +Where you have one equal word and the other +way around, you're predicting the four words + +0:55:42.394 --> 0:55:43.585 +around them. + +0:55:43.585 --> 0:55:45.327 +It's very similar. + +0:55:45.327 --> 0:55:48.720 +The task is in the end very similar. + +0:55:51.131 --> 0:56:01.407 +Before we are going to the next point, anything +about normal weight vectors or weight embedding. + +0:56:04.564 --> 0:56:07.794 +The next thing is contexture. + +0:56:07.794 --> 0:56:12.208 +Word embeddings and the idea is helpful. 
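[Editor's illustration] A minimal sketch of the continuous-bag-of-words idea just described: average the embeddings of the surrounding context words (just two linear maps, no recurrence) and predict the centre word. Sizes, initialisation and the example token ids are made up; the skip-gram variant would simply swap input and outputs.

```python
import numpy as np

rng = np.random.default_rng(0)
V, d = 1000, 64                               # toy vocabulary and embedding size
W_in = rng.normal(scale=0.1, size=(V, d))     # first linear layer: word -> embedding
W_out = rng.normal(scale=0.1, size=(d, V))    # second linear layer: hidden -> vocabulary scores

def cbow_predict(context_ids):
    """Distribution over the centre word given e.g. four context word ids."""
    h = W_in[context_ids].mean(axis=0)        # average the context embeddings
    scores = h @ W_out
    scores -= scores.max()                    # numerical stability for the softmax
    p = np.exp(scores)
    return p / p.sum()

p_center = cbow_predict([12, 7, 512, 33])     # four arbitrary context token ids
```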
+ +0:56:12.208 --> 0:56:19.206 +However, we might even be able to get more +from one lingo layer. + +0:56:19.419 --> 0:56:31.732 +And now in the word that is overlap of these +two meanings, so it represents both the meaning + +0:56:31.732 --> 0:56:33.585 +of can do it. + +0:56:34.834 --> 0:56:40.410 +But we might be able to in the pre-trained +model already disambiguate this because they + +0:56:40.410 --> 0:56:41.044 +are used. + +0:56:41.701 --> 0:56:53.331 +So if we can have a model which can not only +represent a word but can also represent the + +0:56:53.331 --> 0:56:58.689 +meaning of the word within the context,. + +0:56:59.139 --> 0:57:03.769 +So then we are going to context your word +embeddings. + +0:57:03.769 --> 0:57:07.713 +We are really having a representation in the. + +0:57:07.787 --> 0:57:11.519 +And we have a very good architecture for that +already. + +0:57:11.691 --> 0:57:23.791 +The hidden state represents what is currently +said, but it's focusing on what is the last + +0:57:23.791 --> 0:57:29.303 +one, so it's some of the representation. + +0:57:29.509 --> 0:57:43.758 +The first one doing that is something like +the Elmo paper where they instead of this is + +0:57:43.758 --> 0:57:48.129 +the normal language model. + +0:57:48.008 --> 0:57:50.714 +Within the third, predicting the fourth, and +so on. + +0:57:50.714 --> 0:57:53.004 +So you are always predicting the next work. + +0:57:53.193 --> 0:57:57.335 +The architecture is the heaven words embedding +layer and then layers. + +0:57:57.335 --> 0:58:03.901 +See you, for example: And now instead of using +this one in the end, you're using here this + +0:58:03.901 --> 0:58:04.254 +one. + +0:58:04.364 --> 0:58:11.245 +This represents the meaning of this word mainly +in the context of what we have seen before. + +0:58:11.871 --> 0:58:18.610 +We can train it in a language model style +always predicting the next word, but we have + +0:58:18.610 --> 0:58:21.088 +more information trained there. + +0:58:21.088 --> 0:58:26.123 +Therefore, in the system it has to learn less +additional things. + +0:58:27.167 --> 0:58:31.261 +And there is one Edendang which is done currently +in GPS. + +0:58:31.261 --> 0:58:38.319 +The only difference is that we have more layers, +bigger size, and we're using transformer neurocell + +0:58:38.319 --> 0:58:40.437 +potential instead of the RNA. + +0:58:40.437 --> 0:58:45.095 +But that is how you train like some large +language models at the. + +0:58:46.746 --> 0:58:55.044 +However, if you look at this contextual representation, +they might not be perfect. + +0:58:55.044 --> 0:59:02.942 +So if you think of this one as a contextual +representation of the third word,. + +0:59:07.587 --> 0:59:16.686 +Is representing a three in the context of +a sentence, however only in the context of + +0:59:16.686 --> 0:59:18.185 +the previous. + +0:59:18.558 --> 0:59:27.413 +However, we have an architecture which can +also take both sides and we have used that + +0:59:27.413 --> 0:59:30.193 +already in the ink holder. + +0:59:30.630 --> 0:59:34.264 +So we could do the iron easily on your, also +in the backward direction. + +0:59:34.874 --> 0:59:42.826 +By just having the states the other way around +and then we couldn't combine the forward and + +0:59:42.826 --> 0:59:49.135 +the forward into a joint one where we are doing +this type of prediction. + +0:59:49.329 --> 0:59:50.858 +So you have the word embedding. 
+ +0:59:51.011 --> 1:00:02.095 +Then you have two in the states, one on the +forward arm and one on the backward arm, and + +1:00:02.095 --> 1:00:10.314 +then you can, for example, take the cocagenation +of both of them. + +1:00:10.490 --> 1:00:23.257 +Now this same here represents mainly this +word because this is what both puts in it last + +1:00:23.257 --> 1:00:30.573 +and we know is focusing on what is happening +last. + +1:00:31.731 --> 1:00:40.469 +However, there is a bit of difference when +training that as a language model you already + +1:00:40.469 --> 1:00:41.059 +have. + +1:00:43.203 --> 1:00:44.956 +Maybe There's Again This Masking. + +1:00:46.546 --> 1:00:47.748 +That is one solution. + +1:00:47.748 --> 1:00:52.995 +First of all, why we can't do it is the information +you leak it, so you cannot just predict the + +1:00:52.995 --> 1:00:53.596 +next word. + +1:00:53.596 --> 1:00:58.132 +If we just predict the next word in this type +of model, that's a very simple task. + +1:00:58.738 --> 1:01:09.581 +You know the next word because it's influencing +this hidden state predicting something is not + +1:01:09.581 --> 1:01:11.081 +a good task. + +1:01:11.081 --> 1:01:18.455 +You have to define: Because in this case what +will end with the system will just ignore these + +1:01:18.455 --> 1:01:22.966 +estates and what will learn is copy this information +directly in here. + +1:01:23.343 --> 1:01:31.218 +So it would be representing this word and +you would have nearly a perfect model because + +1:01:31.218 --> 1:01:38.287 +you only need to find encoding where you can +encode all words somehow in this. + +1:01:38.458 --> 1:01:44.050 +The only thing can learn is that turn and +encode all my words in this upper hidden. + +1:01:44.985 --> 1:01:53.779 +Therefore, it's not really useful, so we need +to find a bit of different ways out. + +1:01:55.295 --> 1:01:57.090 +There is a masking one. + +1:01:57.090 --> 1:02:03.747 +I'll come to that shortly just a bit that +other things also have been done, so the other + +1:02:03.747 --> 1:02:06.664 +thing is not to directly combine them. + +1:02:06.664 --> 1:02:13.546 +That was in the animal paper, so you have +them forward R&M and you keep them completely + +1:02:13.546 --> 1:02:14.369 +separated. + +1:02:14.594 --> 1:02:20.458 +So you never merged to state. + +1:02:20.458 --> 1:02:33.749 +At the end, the representation of the word +is now from the forward. + +1:02:33.873 --> 1:02:35.953 +So it's always the hidden state before the +good thing. + +1:02:36.696 --> 1:02:41.286 +These two you join now to your to the representation. + +1:02:42.022 --> 1:02:48.685 +And then you have now a representation also +about like the whole sentence for the word, + +1:02:48.685 --> 1:02:51.486 +but there is no information leakage. + +1:02:51.486 --> 1:02:58.149 +One way of doing this is instead of doing +a bidirection along you do a forward pass and + +1:02:58.149 --> 1:02:59.815 +then join the hidden. + +1:03:00.380 --> 1:03:05.960 +So you can do that in all layers. + +1:03:05.960 --> 1:03:16.300 +In the end you do the forwarded layers and +you get the hidden. + +1:03:16.596 --> 1:03:19.845 +However, it's a bit of a complicated. + +1:03:19.845 --> 1:03:25.230 +You have to keep both separate and merge things +so can you do. + +1:03:27.968 --> 1:03:33.030 +And that is the moment where like the big. + +1:03:34.894 --> 1:03:39.970 +The big success of the burnt model was used +where it okay. 
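[Editor's illustration] A hedged PyTorch sketch of the leakage-free setup just described: keep the forward and backward RNNs completely separate and represent word i by the forward state from just before it and the backward state from just after it, so the state never contains the word it is supposed to predict. Sizes and the toy input are assumptions.

```python
import torch
import torch.nn as nn

d = 32                                         # toy embedding / hidden size
fwd = nn.LSTM(input_size=d, hidden_size=d)     # reads the sentence left to right
bwd = nn.LSTM(input_size=d, hidden_size=d)     # reads the sentence right to left

def contextual_embeddings(emb: torch.Tensor) -> torch.Tensor:
    """emb: (seq_len, 1, d) word embeddings of one sentence.
    Returns (seq_len, 1, 2d) where position i only sees the words before it
    (forward state at i-1) and after it (backward state at i+1)."""
    h_fwd, _ = fwd(emb)                              # (seq_len, 1, d)
    h_bwd, _ = bwd(torch.flip(emb, dims=[0]))        # run right-to-left
    h_bwd = torch.flip(h_bwd, dims=[0])              # re-align to left-to-right order

    pad = torch.zeros(1, 1, d)
    left_ctx = torch.cat([pad, h_fwd[:-1]], dim=0)   # state *before* each word
    right_ctx = torch.cat([h_bwd[1:], pad], dim=0)   # state *after* each word
    return torch.cat([left_ctx, right_ctx], dim=-1)

reprs = contextual_embeddings(torch.randn(6, 1, d))  # six-token toy sentence
```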
+ +1:03:39.970 --> 1:03:47.281 +Maybe in bite and rich case it's not good +to do the next word prediction, but we can + +1:03:47.281 --> 1:03:48.314 +do masking. + +1:03:48.308 --> 1:03:56.019 +Masking mainly means we do a prediction of +something in the middle or some words. + +1:03:56.019 --> 1:04:04.388 +So the idea is if we have the input, we are +putting noise into the input, removing them, + +1:04:04.388 --> 1:04:07.961 +and then the model we are interested. + +1:04:08.048 --> 1:04:15.327 +Now there can be no information leakage because +this wasn't predicting that one is a big challenge. + +1:04:16.776 --> 1:04:19.957 +Do any assumption about our model? + +1:04:19.957 --> 1:04:26.410 +It doesn't need to be a forward model or a +backward model or anything. + +1:04:26.410 --> 1:04:29.500 +You can always predict the three. + +1:04:30.530 --> 1:04:34.844 +There's maybe one bit of a disadvantage. + +1:04:34.844 --> 1:04:40.105 +Do you see what could be a bit of a problem +this? + +1:05:00.000 --> 1:05:06.429 +Yes, so yeah, you can of course mask more, +but to see it more globally, just first assume + +1:05:06.429 --> 1:05:08.143 +you're only masked one. + +1:05:08.143 --> 1:05:13.930 +For the whole sentence, we get one feedback +signal, like what is the word three. + +1:05:13.930 --> 1:05:22.882 +So we have one training example: If you do +the language modeling taste, we predicted here, + +1:05:22.882 --> 1:05:24.679 +we predicted here. + +1:05:25.005 --> 1:05:26.735 +So we have number of tokens. + +1:05:26.735 --> 1:05:30.970 +For each token we have a feet pad and say +what is the best correction. + +1:05:31.211 --> 1:05:43.300 +So in this case this is less efficient because +we are getting less feedback signals on what + +1:05:43.300 --> 1:05:45.797 +we should predict. + +1:05:48.348 --> 1:05:56.373 +So and bird, the main ideas are that you're +doing this bidirectional model with masking. + +1:05:56.373 --> 1:05:59.709 +It's using transformer architecture. + +1:06:00.320 --> 1:06:06.326 +There are two more minor changes. + +1:06:06.326 --> 1:06:16.573 +We'll see that this next word prediction is +another task. + +1:06:16.957 --> 1:06:30.394 +You want to learn more about what language +is to really understand following a story or + +1:06:30.394 --> 1:06:35.127 +their independent tokens into. + +1:06:38.158 --> 1:06:42.723 +The input is using word units as we use it. + +1:06:42.723 --> 1:06:50.193 +It has some special token that is framing +for the next word prediction. + +1:06:50.470 --> 1:07:04.075 +It's more for classification task because +you may be learning a general representation + +1:07:04.075 --> 1:07:07.203 +as a full sentence. + +1:07:07.607 --> 1:07:19.290 +You're doing segment embedding, so you have +an embedding for it. + +1:07:19.290 --> 1:07:24.323 +This is the first sentence. + +1:07:24.684 --> 1:07:29.099 +Now what is more challenging is this masking. + +1:07:29.099 --> 1:07:30.827 +What do you mask? + +1:07:30.827 --> 1:07:35.050 +We already have the crush enough or should. + +1:07:35.275 --> 1:07:42.836 +So there has been afterwards eating some work +like, for example, a bearer. + +1:07:42.836 --> 1:07:52.313 +It's not super sensitive, but if you do it +completely wrong then you're not letting anything. + +1:07:52.572 --> 1:07:54.590 +That's Then Another Question There. 
+ +1:07:56.756 --> 1:08:04.594 +Should I mask all types of should I always +mask the footwork or if I have a subword to + +1:08:04.594 --> 1:08:10.630 +mask only like a subword and predict them based +on the other ones? + +1:08:10.630 --> 1:08:14.504 +Of course, it's a bit of a different task. + +1:08:14.894 --> 1:08:21.210 +If you know three parts of the words, it might +be easier to guess the last because they here + +1:08:21.210 --> 1:08:27.594 +took the easiest selection, so not considering +words anymore at all because you're doing that + +1:08:27.594 --> 1:08:32.280 +in the preprocessing and just taking always +words and like subwords. + +1:08:32.672 --> 1:08:36.089 +Think in group there is done differently. + +1:08:36.089 --> 1:08:40.401 +They mark always the full words, but guess +it's not. + +1:08:41.001 --> 1:08:46.044 +And then what to do with the mask word in +eighty percent of the cases. + +1:08:46.044 --> 1:08:50.803 +If the word is masked, they replace it with +a special token thing. + +1:08:50.803 --> 1:08:57.197 +This is a mask token in ten percent they put +in some random other token in there, and ten + +1:08:57.197 --> 1:08:59.470 +percent they keep it on change. + +1:09:02.202 --> 1:09:10.846 +And then what you can do is also this next +word prediction. + +1:09:10.846 --> 1:09:14.880 +The man went to Mass Store. + +1:09:14.880 --> 1:09:17.761 +He bought a gallon. + +1:09:18.418 --> 1:09:24.088 +So may you see you're joining them, you're +doing both masks and prediction that you're. + +1:09:24.564 --> 1:09:29.449 +Is a penguin mask or flyless birds. + +1:09:29.449 --> 1:09:41.390 +These two sentences have nothing to do with +each other, so you can do also this type of + +1:09:41.390 --> 1:09:43.018 +prediction. + +1:09:47.127 --> 1:09:57.043 +And then the whole bird model, so here you +have the input here to transform the layers, + +1:09:57.043 --> 1:09:58.170 +and then. + +1:09:58.598 --> 1:10:17.731 +And this model was quite successful in general +applications. + +1:10:17.937 --> 1:10:27.644 +However, there is like a huge thing of different +types of models coming from them. + +1:10:27.827 --> 1:10:38.709 +So based on others these supervised molds +like a whole setup came out of there and now + +1:10:38.709 --> 1:10:42.086 +this is getting even more. + +1:10:42.082 --> 1:10:46.640 +With availability of a large language model +than the success. + +1:10:47.007 --> 1:10:48.436 +We have now even larger ones. + +1:10:48.828 --> 1:10:50.961 +Interestingly, it goes a bit. + +1:10:50.910 --> 1:10:57.847 +Change the bit again from like more the spider +action model to uni directional models. + +1:10:57.847 --> 1:11:02.710 +Are at the moment maybe a bit more we're coming +to them now? + +1:11:02.710 --> 1:11:09.168 +Do you see one advantage while what is another +event and we have the efficiency? + +1:11:09.509 --> 1:11:15.901 +Is one other reason why you are sometimes +more interested in uni-direction models than + +1:11:15.901 --> 1:11:17.150 +in bi-direction. + +1:11:22.882 --> 1:11:30.220 +It depends on the pass, but for example for +a language generation pass, the eccard is not + +1:11:30.220 --> 1:11:30.872 +really. + +1:11:32.192 --> 1:11:40.924 +It doesn't work so if you want to do a generation +like the decoder you don't know the future + +1:11:40.924 --> 1:11:42.896 +so you cannot apply. + +1:11:43.223 --> 1:11:53.870 +So this time of model can be used for the +encoder in an encoder model, but it cannot + +1:11:53.870 --> 1:11:57.002 +be used for the decoder. 
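[Editor's illustration] A minimal sketch of the masking scheme described above: some positions are chosen as prediction targets, and of those 80% are replaced with a mask token, 10% with a random token, and 10% are left unchanged. The 15% selection rate is the value commonly used with BERT (the recording only specifies the 80/10/10 split); the toy vocabulary is made up.

```python
import random

MASK = "[MASK]"
VOCAB = ["the", "cat", "sat", "on", "mat", "store", "milk"]   # toy vocabulary

def mask_tokens(tokens, mask_prob=0.15, seed=0):
    """BERT-style corruption of a token sequence; returns corrupted tokens and targets."""
    rng = random.Random(seed)
    corrupted, targets = list(tokens), {}
    for i, tok in enumerate(tokens):
        if rng.random() >= mask_prob:
            continue
        targets[i] = tok                        # the model must recover the original
        r = rng.random()
        if r < 0.8:
            corrupted[i] = MASK                 # 80%: mask token
        elif r < 0.9:
            corrupted[i] = rng.choice(VOCAB)    # 10%: random replacement
        # else: 10% keep the original token unchanged
    return corrupted, targets

corrupted, targets = mask_tokens("the man went to the store".split())
```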
+ +1:12:00.000 --> 1:12:05.012 +That's a good view to the next overall cast +of models. + +1:12:05.012 --> 1:12:08.839 +Perhaps if you view it from the sequence. + +1:12:09.009 --> 1:12:12.761 +We have the encoder base model. + +1:12:12.761 --> 1:12:16.161 +That's what we just look at. + +1:12:16.161 --> 1:12:20.617 +They are bidirectional and typically. + +1:12:20.981 --> 1:12:22.347 +That Is the One We Looked At. + +1:12:22.742 --> 1:12:34.634 +At the beginning is the decoder based model, +so see out in regressive models which are unidirective + +1:12:34.634 --> 1:12:42.601 +like an based model, and there we can do the +next word prediction. + +1:12:43.403 --> 1:12:52.439 +And what you can also do first, and there +you can also have a special things called prefix + +1:12:52.439 --> 1:12:53.432 +language. + +1:12:54.354 --> 1:13:05.039 +Because we are saying it might be helpful +that some of your input can also use bi-direction. + +1:13:05.285 --> 1:13:12.240 +And that is somehow doing what it is called +prefix length. + +1:13:12.240 --> 1:13:19.076 +On the first tokens you directly give your +bidirectional. + +1:13:19.219 --> 1:13:28.774 +So you somehow merge that and that mainly +works only in transformer based models because. + +1:13:29.629 --> 1:13:33.039 +There is no different number of parameters +in our end. + +1:13:33.039 --> 1:13:34.836 +We need a back foot our end. + +1:13:34.975 --> 1:13:38.533 +Transformer: The only difference is how you +mask your attention. + +1:13:38.878 --> 1:13:44.918 +We have seen that in the anchoder and decoder +the number of parameters is different because + +1:13:44.918 --> 1:13:50.235 +you do cross attention, but if you do forward +and backward or union directions,. + +1:13:50.650 --> 1:13:58.736 +It's only like you mask your attention to +only look at the bad past or to look into the + +1:13:58.736 --> 1:13:59.471 +future. + +1:14:00.680 --> 1:14:03.326 +And now you can of course also do mixing. + +1:14:03.563 --> 1:14:08.306 +So this is a bi-directional attention matrix +where you can attend to everything. + +1:14:08.588 --> 1:14:23.516 +There is a uni-direction or causal where you +can look at the past and you can do the first + +1:14:23.516 --> 1:14:25.649 +three words. + +1:14:29.149 --> 1:14:42.831 +That somehow clear based on that, then of +course you cannot do the other things. + +1:14:43.163 --> 1:14:50.623 +So the idea is we have our anchor to decoder +architecture. + +1:14:50.623 --> 1:14:57.704 +Can we also train them completely in a side +supervisor? + +1:14:58.238 --> 1:15:09.980 +And in this case we have the same input to +both, so in this case we need to do some type + +1:15:09.980 --> 1:15:12.224 +of masking here. + +1:15:12.912 --> 1:15:17.696 +Here we don't need to do the masking, but +here we need to masking that doesn't know ever + +1:15:17.696 --> 1:15:17.911 +so. + +1:15:20.440 --> 1:15:30.269 +And this type of model got quite successful +also, especially for pre-training machine translation. + +1:15:30.330 --> 1:15:39.059 +The first model doing that is a Bart model, +which exactly does that, and yes, it's one + +1:15:39.059 --> 1:15:42.872 +successful way to pre train your one. + +1:15:42.872 --> 1:15:47.087 +It's pretraining your full encoder model. + +1:15:47.427 --> 1:15:54.365 +Where you put in contrast to machine translation, +where you put in source sentence, we can't + +1:15:54.365 --> 1:15:55.409 +do that here. 
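[Editor's illustration] The three attention patterns just discussed differ only in which positions may attend to which, as the lecture notes. A small NumPy sketch building the corresponding masks; rows are query positions, columns are key positions, and True means attention is allowed. The prefix length is an arbitrary example value.

```python
import numpy as np

def bidirectional_mask(n: int) -> np.ndarray:
    """Encoder-style: every position may attend to every other position."""
    return np.ones((n, n), dtype=bool)

def causal_mask(n: int) -> np.ndarray:
    """Decoder-style / autoregressive: position i may only attend to positions <= i."""
    return np.tril(np.ones((n, n))).astype(bool)

def prefix_lm_mask(n: int, prefix_len: int) -> np.ndarray:
    """First `prefix_len` tokens attend bidirectionally; the rest stay causal."""
    mask = causal_mask(n)
    mask[:prefix_len, :prefix_len] = True
    return mask

print(prefix_lm_mask(5, prefix_len=3).astype(int))
```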
+ +1:15:55.715 --> 1:16:01.382 +But we can just put the second twice in there, +and then it's not a trivial task. + +1:16:01.382 --> 1:16:02.432 +We can change. + +1:16:03.003 --> 1:16:12.777 +And there is like they do different corruption +techniques so you can also do. + +1:16:13.233 --> 1:16:19.692 +That you couldn't do in an agricultural system +because then it wouldn't be there and you cannot + +1:16:19.692 --> 1:16:20.970 +predict somewhere. + +1:16:20.970 --> 1:16:26.353 +So the anchor, the number of input and output +tokens always has to be the same. + +1:16:26.906 --> 1:16:29.818 +You cannot do a prediction for something which +isn't in it. + +1:16:30.110 --> 1:16:38.268 +Here in the decoder side it's unidirection +so we can also delete the top and then try + +1:16:38.268 --> 1:16:40.355 +to generate the full. + +1:16:41.061 --> 1:16:45.250 +We can do sentence permutation. + +1:16:45.250 --> 1:16:54.285 +We can document rotation and text infilling +so there is quite a bit. + +1:16:55.615 --> 1:17:06.568 +So you see there's quite a lot of types of +models that you can use in order to pre-train. + +1:17:07.507 --> 1:17:14.985 +Then, of course, there is again for the language +one. + +1:17:14.985 --> 1:17:21.079 +The other question is how do you integrate? + +1:17:21.761 --> 1:17:26.636 +And there's also, like yeah, quite some different +ways of techniques. + +1:17:27.007 --> 1:17:28.684 +It's a Bit Similar to Before. + +1:17:28.928 --> 1:17:39.068 +So the easiest thing is you take your word +embeddings or your free trained model. + +1:17:39.068 --> 1:17:47.971 +You freeze them and stack your decoder layers +and keep these ones free. + +1:17:48.748 --> 1:17:54.495 +Can also be done if you have this type of +bark model. + +1:17:54.495 --> 1:18:03.329 +What you can do is you freeze your word embeddings, +for example some products and. + +1:18:05.865 --> 1:18:17.296 +The other thing is you initialize them so +you initialize your models but you train everything + +1:18:17.296 --> 1:18:19.120 +so you're not. + +1:18:22.562 --> 1:18:29.986 +Then one thing, if you think about Bart, you +want to have the Chinese language, the Italian + +1:18:29.986 --> 1:18:32.165 +language, and the deconer. + +1:18:32.165 --> 1:18:35.716 +However, in Bart we have the same language. + +1:18:36.516 --> 1:18:46.010 +The one you get is from English, so what you +can do there is so you cannot try to do some. + +1:18:46.366 --> 1:18:52.562 +Below the barge, in order to learn some language +specific stuff, or there's a masculine barge, + +1:18:52.562 --> 1:18:58.823 +which is trained on many languages, but it's +trained only on like the Old Coast Modern Language + +1:18:58.823 --> 1:19:03.388 +House, which may be trained in German and English, +but not on German. + +1:19:03.923 --> 1:19:08.779 +So then you would still need to find June +and the model needs to learn how to better + +1:19:08.779 --> 1:19:10.721 +do the attention cross lingually. + +1:19:10.721 --> 1:19:15.748 +It's only on the same language but it mainly +only has to learn this mapping and not all + +1:19:15.748 --> 1:19:18.775 +the rest and that's why it's still quite successful. + +1:19:21.982 --> 1:19:27.492 +Now certain thing which is very commonly used +is what is required to it as adapters. + +1:19:27.607 --> 1:19:29.754 +So for example you take and buy. 
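[Editor's illustration] Simplified versions of the corruption techniques listed at the start of this passage (token deletion, text infilling, sentence permutation). Real BART-style implementations sample span lengths and corruption rates; the fixed values here just keep the sketch short, and the training target is always the uncorrupted text.

```python
import random

rng = random.Random(0)

def token_deletion(tokens, p=0.15):
    """Drop random tokens; the decoder must still reproduce the full sentence."""
    kept = [t for t in tokens if rng.random() >= p]
    return kept if kept else tokens[:1]

def text_infilling(tokens, span_len=3, mask="[MASK]"):
    """Replace one contiguous span with a single mask token (span length is hidden)."""
    if len(tokens) <= span_len:
        return [mask]
    start = rng.randrange(len(tokens) - span_len)
    return tokens[:start] + [mask] + tokens[start + span_len:]

def sentence_permutation(sentences):
    """Shuffle the order of the sentences in a document."""
    shuffled = sentences[:]
    rng.shuffle(shuffled)
    return shuffled

clean = "the quick brown fox jumps over the lazy dog".split()
src = text_infilling(clean)     # corrupted encoder input
tgt = clean                     # decoder target: the original text
```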
+ +1:19:29.709 --> 1:19:35.218 +And you put some adapters on the inside of +the networks so that it's small new layers + +1:19:35.218 --> 1:19:40.790 +which are in between put in there and then +you only train these adapters or also train + +1:19:40.790 --> 1:19:41.815 +these adapters. + +1:19:41.815 --> 1:19:47.900 +For example, an embryo you could see that +this learns to map the Sears language representation + +1:19:47.900 --> 1:19:50.334 +to the Tiger language representation. + +1:19:50.470 --> 1:19:52.395 +And then you don't have to change that luck. + +1:19:52.792 --> 1:19:59.793 +You give it extra ability to really perform +well on that. + +1:19:59.793 --> 1:20:05.225 +These are quite small and so very efficient. + +1:20:05.905 --> 1:20:12.632 +That is also very commonly used, for example +in modular systems where you have some adaptors + +1:20:12.632 --> 1:20:16.248 +in between here which might be language specific. + +1:20:16.916 --> 1:20:22.247 +So they are trained only for one language. + +1:20:22.247 --> 1:20:33.777 +The model has some or both and once has the +ability to do multilingually to share knowledge. + +1:20:34.914 --> 1:20:39.058 +But there's one chance in general in the multilingual +systems. + +1:20:39.058 --> 1:20:40.439 +It works quite well. + +1:20:40.439 --> 1:20:46.161 +There's one case or one specific use case +for multilingual where this normally doesn't + +1:20:46.161 --> 1:20:47.344 +really work well. + +1:20:47.344 --> 1:20:49.975 +Do you have an idea what that could be? + +1:20:55.996 --> 1:20:57.536 +It's for Zero Shot Cases. + +1:20:57.998 --> 1:21:03.660 +Because having here some situation with this +might be very language specific and zero shot, + +1:21:03.660 --> 1:21:09.015 +the idea is always to learn representations +view which are more language dependent and + +1:21:09.015 --> 1:21:10.184 +with the adaptors. + +1:21:10.184 --> 1:21:15.601 +Of course you get in representations again +which are more language specific and then it + +1:21:15.601 --> 1:21:17.078 +doesn't work that well. + +1:21:20.260 --> 1:21:37.730 +And there is also the idea of doing more knowledge +pistolation. + +1:21:39.179 --> 1:21:42.923 +And now the idea is okay. + +1:21:42.923 --> 1:21:54.157 +We are training it the same, but what we want +to achieve is that the encoder. + +1:21:54.414 --> 1:22:03.095 +So you should learn faster by trying to make +these states as similar as possible. + +1:22:03.095 --> 1:22:11.777 +So you compare the first-hit state of the +pre-trained model and try to make them. + +1:22:12.192 --> 1:22:18.144 +For example, by using the out two norms, so +by just making these two representations the + +1:22:18.144 --> 1:22:26.373 +same: The same vocabulary: Why does it need +the same vocabulary with any idea? + +1:22:34.754 --> 1:22:46.137 +If you have different vocabulary, it's typical +you also have different sequenced lengths here. + +1:22:46.137 --> 1:22:50.690 +The number of sequences is different. + +1:22:51.231 --> 1:22:58.888 +If you now have pipe stains and four states +here, it's no longer straightforward which + +1:22:58.888 --> 1:23:01.089 +states compare to which. + +1:23:02.322 --> 1:23:05.246 +And that's just easier if you have like the +same number. + +1:23:05.246 --> 1:23:08.940 +You can always compare the first to the first +and second to the second. + +1:23:09.709 --> 1:23:16.836 +So therefore at least the very easy way of +knowledge destination only works if you have. 
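[Editor's illustration] A hedged PyTorch sketch of the adapters described at the start of this passage: small bottleneck layers inserted between the frozen pre-trained layers, so that only these few parameters (possibly one set per language) are trained. The bottleneck width and the residual form are common design choices, not details fixed by the lecture.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Small bottleneck module inserted inside a frozen pre-trained network;
    during fine-tuning only these parameters are updated."""

    def __init__(self, d_model: int, d_bottleneck: int = 64):
        super().__init__()
        self.down = nn.Linear(d_model, d_bottleneck)
        self.up = nn.Linear(d_bottleneck, d_model)
        self.act = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual connection: the adapter starts out close to the identity,
        # so it only nudges the frozen model's representations.
        return x + self.up(self.act(self.down(x)))

adapter = Adapter(d_model=512)          # e.g. one adapter per frozen layer / language
out = adapter(torch.randn(10, 512))
```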
+ +1:23:17.177 --> 1:23:30.030 +Course: You could do things like yeah, the +average should be the same, but of course there's + +1:23:30.030 --> 1:23:33.071 +a less strong signal. + +1:23:34.314 --> 1:23:42.979 +But the advantage here is that you have a +diameter training signal here on the handquarter + +1:23:42.979 --> 1:23:51.455 +so you can directly make some of the encoder +already giving a good signal while normally + +1:23:51.455 --> 1:23:52.407 +an empty. + +1:23:56.936 --> 1:24:13.197 +Yes, think this is most things for today, +so what you should keep in mind is remind me. + +1:24:13.393 --> 1:24:18.400 +The one is a back translation idea. + +1:24:18.400 --> 1:24:29.561 +If you have monolingual and use that, the +other one is to: And mentally it is often helpful + +1:24:29.561 --> 1:24:33.614 +to combine them so you can even use both of +that. + +1:24:33.853 --> 1:24:38.908 +So you can use pre-trained walls, but then +you can even still do back translation where + +1:24:38.908 --> 1:24:40.057 +it's still helpful. + +1:24:40.160 --> 1:24:45.502 +We have the advantage we are training like +everything working together on the task so + +1:24:45.502 --> 1:24:51.093 +it might be helpful even to backtranslate some +data and then use it in a real translation + +1:24:51.093 --> 1:24:56.683 +setup because in pretraining of course the +beach challenge is always that you're training + +1:24:56.683 --> 1:24:57.739 +it on different. + +1:24:58.058 --> 1:25:03.327 +Different ways of how you integrate this knowledge. + +1:25:03.327 --> 1:25:08.089 +Even if you just use a full model, so in this. + +1:25:08.748 --> 1:25:11.128 +This is the most similar you can get. + +1:25:11.128 --> 1:25:13.945 +You're doing no changes to the architecture. + +1:25:13.945 --> 1:25:19.643 +You're really taking the model and just fine +tuning them on the new task, but it still has + +1:25:19.643 --> 1:25:24.026 +to completely newly learn how to do the attention +and how to do that. + +1:25:24.464 --> 1:25:29.971 +And that might be, for example, helpful to +have more back-translated data to learn them. + +1:25:32.192 --> 1:25:34.251 +That's for today. + +1:25:34.251 --> 1:25:44.661 +There's one important thing that next Tuesday +there is a conference or a workshop or so in + +1:25:44.661 --> 1:25:45.920 +this room. + +1:25:47.127 --> 1:25:56.769 +You should get an e-mail if you're in Elias +that there's a room change for Tuesdays and + +1:25:56.769 --> 1:25:57.426 +it's. + +1:25:57.637 --> 1:26:03.890 +There are more questions, yeah, have a more +general position, especially: In computer vision + +1:26:03.890 --> 1:26:07.347 +you can enlarge your data center data orientation. + +1:26:07.347 --> 1:26:08.295 +Is there any? + +1:26:08.388 --> 1:26:15.301 +It's similar to a large speech for text for +the data of an edge. + +1:26:15.755 --> 1:26:29.176 +And you can use this back translation and +also masking, but back translation is some + +1:26:29.176 --> 1:26:31.228 +way of data. + +1:26:31.371 --> 1:26:35.629 +So it has also been, for example, even its +used not only for monolingual data. + +1:26:36.216 --> 1:26:54.060 +If you have good MP system, it can also be +used for parallel data. + +1:26:54.834 --> 1:26:59.139 +So would say this is the most similar one. + +1:26:59.139 --> 1:27:03.143 +There's ways you can do power phrasing. 
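[Editor's illustration] To make the hidden-state knowledge-distillation objective discussed a little earlier concrete: the simple variant compares the NMT encoder's states position by position with those of a pre-trained model, which is why the same vocabulary (and hence the same sequence length) is needed. A minimal sketch using an L2 loss; shapes are illustrative.

```python
import torch
import torch.nn.functional as F

def hidden_state_distillation_loss(student_states: torch.Tensor,
                                   teacher_states: torch.Tensor) -> torch.Tensor:
    """L2 objective pushing the encoder's hidden states towards those of a
    pre-trained model; both tensors must share the shape (seq_len, d),
    i.e. the same tokenisation of the same sentence."""
    return F.mse_loss(student_states, teacher_states)

# Toy check with matching shapes (7 tokens, dimension 512).
loss = hidden_state_distillation_loss(torch.randn(7, 512), torch.randn(7, 512))
```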
+ +1:27:05.025 --> 1:27:12.057 +But for example there is very hard to do this +by rules like which words to replace because + +1:27:12.057 --> 1:27:18.936 +there is not a coup like you cannot always +say this word can always be replaced by that. + +1:27:19.139 --> 1:27:27.225 +Mean, although they are many perfect synonyms, +normally they are good in some cases, but not + +1:27:27.225 --> 1:27:29.399 +in all cases, and so on. + +1:27:29.399 --> 1:27:36.963 +And if you don't do a rule based, you have +to train your model and then the freshness. + +1:27:38.058 --> 1:27:57.236 +The same architecture as the pre-trained mount. + +1:27:57.457 --> 1:27:59.810 +Should be of the same dimension, so it's easiest +to have the same dimension. + +1:28:00.000 --> 1:28:01.590 +Architecture. + +1:28:01.590 --> 1:28:05.452 +We later will learn inefficiency. + +1:28:05.452 --> 1:28:12.948 +You can also do knowledge cessulation with, +for example, smaller. + +1:28:12.948 --> 1:28:16.469 +You can learn the same within. + +1:28:17.477 --> 1:28:22.949 +Eight layers for it so that is possible, but +yeah agree it should be of the same. + +1:28:23.623 --> 1:28:32.486 +Yeah yeah you need the question then of course +you can do it like it's an initialization or + +1:28:32.486 --> 1:28:41.157 +you can do it doing training but normally it +most makes sense during the normal training. + +1:28:45.865 --> 1:28:53.963 +Do it, then thanks a lot, and then we'll see +each other again on Tuesday. + +0:00:00.981 --> 0:00:17.559 +What we want today about is how to use some +type of additional resources to improve the + +0:00:17.559 --> 0:00:20.008 +translation. + +0:00:20.300 --> 0:00:31.387 +We have in the first part of the semester +how to build some of your basic machine translation. + +0:00:31.571 --> 0:00:40.743 +You know now the basic components both for +statistical and for neural, with the encoder + +0:00:40.743 --> 0:00:42.306 +decoder model. + +0:00:43.123 --> 0:00:45.950 +Now, of course, that's not where it stops. + +0:00:45.950 --> 0:00:51.340 +It's still what in nearly every machine translation +system is currently in India. + +0:00:51.340 --> 0:00:57.323 +However, there is a lot of challenges which +you need to address in addition and which need + +0:00:57.323 --> 0:00:58.243 +to be solved. + +0:00:58.918 --> 0:01:03.031 +We want to start with these parts. + +0:01:03.031 --> 0:01:07.614 +What else can you do around this part? + +0:01:07.614 --> 0:01:09.847 +You can be honest. + +0:01:10.030 --> 0:01:14.396 +And one important question there is on what +do you train your models? + +0:01:14.394 --> 0:01:27.237 +Because this type of parallel data is easier +in machine translation than many other tasks + +0:01:27.237 --> 0:01:33.516 +where you have a decent amount of training. + +0:01:33.853 --> 0:01:40.789 +And therefore an important question is: Can +we also learn from other sources and improve + +0:01:40.789 --> 0:01:41.178 +our. + +0:01:41.701 --> 0:01:47.840 +Because if you remember from quite the beginning +of the lecture,. + +0:01:51.171 --> 0:01:53.801 +This is how we train all our. + +0:01:54.194 --> 0:02:01.318 +Machine learning models, all the corpus bases +from statistical to neural. + +0:02:01.318 --> 0:02:09.694 +This doesn't have change, so we need this +type of parallel data where we have a source + +0:02:09.694 --> 0:02:13.449 +sentence aligned with the target data. + +0:02:13.493 --> 0:02:19.654 +We have now a strong model here, a very good +model to do that. 
+ +0:02:19.654 --> 0:02:22.099 +However, we always rely. + +0:02:22.522 --> 0:02:27.376 +More languages, higher resource languages, +prayers that say from German to English or + +0:02:27.376 --> 0:02:31.327 +other European languages, there is a decent +amount at least for some. + +0:02:31.471 --> 0:02:46.131 +But even there, if we're going to very specific +domains, it might get difficult and then your + +0:02:46.131 --> 0:02:50.966 +system performance might drop. + +0:02:51.231 --> 0:02:55.261 +Phrases how to use the vocabulary, and so +on, and the style. + +0:02:55.915 --> 0:03:04.104 +And if you're going to other languages, there +is of course a lot bigger challenge. + +0:03:04.104 --> 0:03:05.584 +Why can't you? + +0:03:05.825 --> 0:03:09.647 +So is really this the only resource you can +use. + +0:03:09.889 --> 0:03:20.667 +Or can we adapt our models in order to also +make use of other types of models that might + +0:03:20.667 --> 0:03:27.328 +enable us to build strong systems with other +types of. + +0:03:27.707 --> 0:03:35.283 +And that's what we will look into now in the +next, starting from Tuesday in the next. + +0:03:35.515 --> 0:03:43.437 +So this idea we already have covered on Tuesday, +so one very successful idea for this is to + +0:03:43.437 --> 0:03:45.331 +do more multilingual. + +0:03:45.645 --> 0:03:52.010 +So that we're no longer only doing translation +between two languages, but we can do translation + +0:03:52.010 --> 0:03:55.922 +between many languages and share common knowledge +between. + +0:03:56.296 --> 0:04:06.477 +And you also learned about that you can even +do things like zero shot machine translations. + +0:04:06.786 --> 0:04:09.792 +Which is the case for many many language pairs. + +0:04:10.030 --> 0:04:17.406 +Even with German, you have not translation +parallel data to all languages around the world, + +0:04:17.406 --> 0:04:22.698 +or most of them you have it to the Europeans, +maybe for Japanese. + +0:04:22.698 --> 0:04:26.386 +But even for Japanese, it will get difficult. + +0:04:26.746 --> 0:04:32.862 +There is quite a lot of data, for example +English to Japanese, but German to Vietnamese. + +0:04:32.862 --> 0:04:39.253 +There is some data from Multilingual Corpora +where you can extract the name, but your amount + +0:04:39.253 --> 0:04:41.590 +really is dropping significantly. + +0:04:42.042 --> 0:04:54.907 +So that is a very promising direction if you +want to build translation systems between language + +0:04:54.907 --> 0:05:00.134 +pairs, typically not English, because. + +0:05:01.221 --> 0:05:05.888 +And the other ideas, of course, we don't have +data, just search for data. + +0:05:06.206 --> 0:05:15.755 +There is some work on data crawling so if +don't have a corpus directly or don't have + +0:05:15.755 --> 0:05:23.956 +a high quality corpus from the European Parliament +for TED corpus maybe. + +0:05:24.344 --> 0:05:35.528 +There has been a big effort in Europe to collect +data sets for parallel data. + +0:05:35.528 --> 0:05:40.403 +How can we do this data crawling? + +0:05:40.600 --> 0:05:46.103 +There the interesting thing from the machine +translation point is not just general data + +0:05:46.103 --> 0:05:46.729 +crawling. + +0:05:47.067 --> 0:05:52.067 +But how can we explicitly crawl data, which +is somewhat parallel? + +0:05:52.132 --> 0:05:58.538 +So there is in the Internet quite a lot of +data which has been like company websites which + +0:05:58.538 --> 0:06:01.565 +have been translated and things like that. 
+ +0:06:01.565 --> 0:06:05.155 +So how can you extract them and then extract +them? + +0:06:06.566 --> 0:06:13.406 +There is typically more noisy than where you +do more, hence mean if you have your Parliament. + +0:06:13.693 --> 0:06:21.305 +You can do some rules how to extract the parallel +things. + +0:06:21.305 --> 0:06:30.361 +Here there is more to it, so the quality is +later maybe not as good. + +0:06:33.313 --> 0:06:39.927 +The other thing is can we use monolingual +data and monolingual data has a big advantage + +0:06:39.927 --> 0:06:46.766 +that we can have a huge amount of that so that +you can be able to crawl from the internet. + +0:06:46.766 --> 0:06:51.726 +The nice thing is you can also get it typically +for many domains. + +0:06:52.352 --> 0:06:58.879 +There is just so much more magnitude more +of monolingual data so that it might be very + +0:06:58.879 --> 0:06:59.554 +helpful. + +0:06:59.559 --> 0:07:06.187 +We can do that in statistical machine translation +was quite easy to integrate using language + +0:07:06.187 --> 0:07:06.757 +models. + +0:07:08.508 --> 0:07:14.499 +In neural machine translation we have the +advantage that we have this overall and architecture + +0:07:14.499 --> 0:07:18.850 +that does everything together, but it has also +the disadvantage now. + +0:07:18.850 --> 0:07:22.885 +It's more difficult to put in this type of +information or make. + +0:07:23.283 --> 0:07:26.427 +We'll look to two things. + +0:07:26.427 --> 0:07:37.432 +You can still try to do a bit of language +modeling in there and add an additional language + +0:07:37.432 --> 0:07:38.279 +model. + +0:07:38.178 --> 0:07:43.771 +A way which I think is used in most systems +at the moment is to do synthetic data. + +0:07:43.763 --> 0:07:53.095 +It's a very easy thing, but you can just translate +there and then use it as training data. + +0:07:53.213 --> 0:07:59.192 +And thereby you are able to use like some +type of moonlighting. + +0:08:00.380 --> 0:08:09.521 +Another way to do it is to ensure that some +are in the extreme case. + +0:08:09.521 --> 0:08:14.026 +If you have a scenario that only. + +0:08:14.754 --> 0:08:24.081 +The impressive thing is if you have large +amounts of data and the languages are not too + +0:08:24.081 --> 0:08:31.076 +dissimilar, you can even in this case build +a translation system. + +0:08:32.512 --> 0:08:36.277 +That we will see then next Thursday. + +0:08:37.857 --> 0:08:55.462 +And then there is now a fourth type of restorer +that recently became very successful and now. + +0:08:55.715 --> 0:09:02.409 +So the idea is we are no longer sharing the +real data such as text data, but it can also + +0:09:02.409 --> 0:09:04.139 +help to train a model. + +0:09:04.364 --> 0:09:08.599 +And that is now a big advantage of deep learning +based approaches. + +0:09:08.599 --> 0:09:14.414 +There you have this ability that you can train +a model on some task and then you can modify + +0:09:14.414 --> 0:09:19.913 +it maybe and then apply it to another task +and you can somewhat transfer the knowledge + +0:09:19.913 --> 0:09:22.125 +from the first task to the second. + +0:09:22.722 --> 0:09:31.906 +And then, of course, the question is, can +it have an initial task where it's very easy + +0:09:31.906 --> 0:09:34.439 +to train on the second? + +0:09:34.714 --> 0:09:53.821 +The task that you pre-train on is more similar +to a language. + +0:09:53.753 --> 0:10:06.293 +A bit of a different way of using language +malls in this more transfer learning set. 
+ +0:10:09.029 --> 0:10:18.747 +So first we will start with how can we use +monolingual data to do a machine translation? + +0:10:20.040 --> 0:10:22.542 +The. + +0:10:22.062 --> 0:10:28.924 +This big difference is you should remember +from what I mentioned before is in statistical + +0:10:28.924 --> 0:10:30.525 +machine translation. + +0:10:30.525 --> 0:10:33.118 +We directly have the opportunity. + +0:10:33.118 --> 0:10:39.675 +There's peril data for a translation model +and monolingual data for a language model. + +0:10:39.679 --> 0:10:45.735 +And you combine your translation model and +your language model, and then you can make. + +0:10:46.726 --> 0:10:54.263 +That has big advantages that you can make +use of these large amounts of monolingual data, + +0:10:54.263 --> 0:10:55.519 +but of course. + +0:10:55.495 --> 0:11:02.198 +Because we said the problem is, we are optimizing +both parts independently to each other, and + +0:11:02.198 --> 0:11:09.329 +we say the big advantage of newer machine translation +is we are optimizing the overall architecture + +0:11:09.329 --> 0:11:10.541 +to perform best. + +0:11:10.890 --> 0:11:17.423 +And then, of course, we can't do that, so +here we can only use power there. + +0:11:17.897 --> 0:11:25.567 +So the question is, but if this advantage +is not so important, we can train everything, + +0:11:25.567 --> 0:11:33.499 +but we have large amounts of monolingual data +or small amounts, but they fit perfectly, so + +0:11:33.499 --> 0:11:35.242 +they are very good. + +0:11:35.675 --> 0:11:41.438 +So in data we know it's not only important +the amount of data we have but also like how + +0:11:41.438 --> 0:11:43.599 +similar it is to your test data. + +0:11:43.599 --> 0:11:49.230 +So it can be that this volume is even only +quite small but it's very well fitting and + +0:11:49.230 --> 0:11:51.195 +then it's still very helpful. + +0:11:51.195 --> 0:11:55.320 +So the question is if this is the case how +can we make use of? + +0:11:55.675 --> 0:12:03.171 +And the first year of surprisingness, if we +are here successful with integrating a language + +0:12:03.171 --> 0:12:10.586 +model into a translation system, maybe we can +also integrate some types of language models + +0:12:10.586 --> 0:12:14.415 +into our MT system in order to make it better. + +0:12:16.536 --> 0:12:19.000 +The first thing we can do is okay. + +0:12:19.000 --> 0:12:23.293 +We know there is language models, so let's +try to integrate. + +0:12:23.623 --> 0:12:30.693 +There was mainly used language models because +these works were mainly done before transformer + +0:12:30.693 --> 0:12:31.746 +based models. + +0:12:32.152 --> 0:12:41.567 +And generally, of course, you can do the same +thing with all the Transformers baseballs. + +0:12:41.721 --> 0:12:58.900 +It has mainly been done before people started +using R&S, and they tried to do this more + +0:12:58.900 --> 0:13:01.888 +in cases where. + +0:13:07.087 --> 0:13:17.508 +So what we're having here is some of this +type of idea. + +0:13:17.508 --> 0:13:25.511 +This is a key system here as you remember. + +0:13:25.605 --> 0:13:29.470 +Gets in with your last instinct and calculates +your attention. + +0:13:29.729 --> 0:13:36.614 +We get the context and combine both and then +based on that and then predict the target. + +0:13:37.057 --> 0:13:42.423 +So this is our anti-system, and the question +is, can we somehow integrate the language? 
+ +0:13:42.782 --> 0:13:55.788 +And of course, if someone makes sense to take +out a neural language model because we're anyway + +0:13:55.788 --> 0:14:01.538 +in the neural space, it's not surprising. + +0:14:01.621 --> 0:14:15.522 +And there would be something like on top of +there and you're a language model and you have + +0:14:15.522 --> 0:14:17.049 +a target. + +0:14:17.597 --> 0:14:27.007 +So if we're having this type of language model, +there's two main questions we have to answer. + +0:14:27.007 --> 0:14:28.108 +How do we? + +0:14:28.208 --> 0:14:37.935 +So how do we combine now on the one hand our +NMT system and on the other hand our RNA you + +0:14:37.935 --> 0:14:45.393 +see that was mentioned before when we started +talking about encoder. + +0:14:45.805 --> 0:14:49.523 +The wild is like unconditioned, it's just +modeling the targets side. + +0:14:49.970 --> 0:14:57.183 +And the other one is a conditional language, +which is a language condition on the sewer + +0:14:57.183 --> 0:14:57.839 +center. + +0:14:58.238 --> 0:15:03.144 +So the question is how can you not combine +two language models? + +0:15:03.144 --> 0:15:09.813 +Of course, it's like the translation model +will some will be more important because it + +0:15:09.813 --> 0:15:11.806 +has access to the source. + +0:15:11.806 --> 0:15:16.713 +We want to generate something which corresponds +to your source. + +0:15:18.778 --> 0:15:20.918 +If we had that, the other question is OK. + +0:15:20.918 --> 0:15:22.141 +Now we have two models. + +0:15:22.141 --> 0:15:25.656 +If we even have integrated them, the answer +is how do we train them? + +0:15:26.026 --> 0:15:39.212 +Because we have integrated them, we have no +two sets of data with parallel data where you + +0:15:39.212 --> 0:15:42.729 +can do the lower thing. + +0:15:44.644 --> 0:15:47.575 +So the first idea is okay. + +0:15:47.575 --> 0:15:53.436 +We can do something more like a parallel combination. + +0:15:53.436 --> 0:15:55.824 +We just keep running. + +0:15:56.036 --> 0:15:59.854 +So a year you see your NMT system that is +running. + +0:16:00.200 --> 0:16:08.182 +First of all, it's normally completely independent +of your language model, which is up there. + +0:16:08.182 --> 0:16:13.278 +So down here we have just our NMT system, +which is running. + +0:16:13.313 --> 0:16:26.439 +The only thing which is used is we have the +words inputted, and of course they are put + +0:16:26.439 --> 0:16:28.099 +into both. + +0:16:28.099 --> 0:16:41.334 +We also put: So we use them in parallel, and +then we are doing our decision just by merging + +0:16:41.334 --> 0:16:42.905 +these two. + +0:16:43.343 --> 0:16:52.288 +So there can be, for example, we are doing +a probability distribution here, we are doing + +0:16:52.288 --> 0:17:01.032 +a purability distribution here, and then we +are taking the average of both per ability + +0:17:01.032 --> 0:17:03.343 +to do our predictions. + +0:17:11.871 --> 0:17:18.929 +You could also take the output which seems +to be more short about the answer. + +0:17:20.000 --> 0:17:23.272 +Yes, you could also do that. + +0:17:23.272 --> 0:17:27.222 +It's more like a gating mechanism. + +0:17:27.222 --> 0:17:32.865 +You're not doing everything, but you're focusing. + +0:17:32.993 --> 0:17:38.927 +Another one would be you could also just concatenate +the hidden states and then you have another + +0:17:38.927 --> 0:17:41.802 +layer on top which based on the concatenation. 
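A minimal sketch of the parallel combination described above: the NMT model and the external language model each produce a distribution over the next target word, and the two are merged at prediction time. The passage mentions simply averaging the probabilities; the weighted log-linear variant shown here is the form most decoders implement, with `lm_weight` as a tunable trust factor.

```python
import torch


def fuse_predictions(nmt_logprobs: torch.Tensor,
                     lm_logprobs: torch.Tensor,
                     lm_weight: float = 0.5) -> torch.Tensor:
    """Parallel ("shallow fusion") combination of an NMT model and an
    external language model at decoding time.

    Both inputs hold log-probabilities over the same target vocabulary,
    shape (batch, vocab). The LM score is added with a weight and the
    result is renormalised."""
    scores = nmt_logprobs + lm_weight * lm_logprobs
    return torch.log_softmax(scores, dim=-1)
```

The gating and concatenation alternatives raised in the questions can be built analogously, only combining hidden states instead of output distributions.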
+ +0:17:43.303 --> 0:17:58.634 +If you think about it, you do the coordination +instead of taking the instead and then merging + +0:17:58.634 --> 0:18:01.244 +the perability. + +0:18:03.143 --> 0:18:15.027 +Yes, in the end you introduce many new parameters +and these parameters have somehow something + +0:18:15.027 --> 0:18:17.303 +special compared. + +0:18:23.603 --> 0:18:33.657 +So before all the other parameters can be +trained independently of each other, the language + +0:18:33.657 --> 0:18:42.071 +one can be trained independent and an antisystem +can be trained independent. + +0:18:43.043 --> 0:18:51.198 +If you have a joint layer of course you need +to train them because you have inputs so you + +0:18:51.198 --> 0:19:01.560 +need: Not surprisingly, if you have a parallel +combination or whether you could, the other + +0:19:01.560 --> 0:19:04.664 +way is to do more serial combinations. + +0:19:04.924 --> 0:19:10.382 +How can you do a similar combination? + +0:19:10.382 --> 0:19:18.281 +Your final decision makes sense to do it based +on the. + +0:19:18.438 --> 0:19:20.997 +So you have on top of your normal an system. + +0:19:21.121 --> 0:19:30.826 +The only thing is now your inputting into +your NIT system. + +0:19:30.826 --> 0:19:38.723 +You're no longer inputting the word embeddings. + +0:19:38.918 --> 0:19:47.819 +You're training the lower layers here which +are trained more on the purely language model + +0:19:47.819 --> 0:19:55.434 +and on top you're putting into the NMT system +where it now has the language. + +0:19:55.815 --> 0:19:59.003 +So here you can also view it here. + +0:19:59.003 --> 0:20:06.836 +You have more contextual embeddings which +no longer depend on the word, but they also + +0:20:06.836 --> 0:20:10.661 +depend on the context of the target site. + +0:20:11.051 --> 0:20:21.797 +More understanding of the source word. + +0:20:21.881 --> 0:20:34.761 +So if it's like the word can, for example, +will be put in here always the same, independent + +0:20:34.761 --> 0:20:41.060 +of its use of can of beans, or if can do it. + +0:20:41.701 --> 0:20:43.165 +Empties. + +0:20:44.364 --> 0:20:54.959 +So another view, if you're remembering more +the transformer based approach, is you have + +0:20:54.959 --> 0:21:01.581 +some layers, and the lower layers are purely +language. + +0:21:02.202 --> 0:21:08.052 +This is purely language model and then at +some point you're starting to attend to the + +0:21:08.052 --> 0:21:08.596 +source. + +0:21:13.493 --> 0:21:20.774 +Yes, so these are two ways of how you combine +it, so run them in peril, or first do the language. + +0:21:23.623 --> 0:21:26.147 +Questions for the integration. + +0:21:31.831 --> 0:21:35.034 +Not really sure about the input of the. + +0:21:35.475 --> 0:21:38.123 +And this case with a sequence. + +0:21:38.278 --> 0:21:50.721 +Is the input and bedding, the target word +embedding, or the actual word, and then we + +0:21:50.721 --> 0:21:54.821 +transfer it to a numerical. + +0:21:56.176 --> 0:22:08.824 +That depends on if you view the word embedding +as part of the language model, so of course + +0:22:08.824 --> 0:22:10.909 +you first put. + +0:22:11.691 --> 0:22:13.938 +And then the word embedding there is the r&n. + +0:22:14.314 --> 0:22:20.296 +So of course you can view this together as +your language model when you first do the word + +0:22:20.296 --> 0:22:21.027 +embedding. + +0:22:21.401 --> 0:22:28.098 +All you can say are the RNAs and this is like +before. 
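The serial combination just described (the pretrained language model sits below the NMT decoder, so instead of plain word embeddings the decoder receives contextual target-side representations) might look roughly like this; the `lm` interface and sizes are assumptions, not a fixed API.

```python
import torch
import torch.nn as nn


class SerialLMFrontend(nn.Module):
    """Serial combination: feed the hidden states of a pretrained target-side
    language model into the NMT decoder in place of plain word embeddings.
    `lm` is assumed to expose `hidden_size` and to return states of shape
    (batch, seq, hidden) -- an illustrative interface."""

    def __init__(self, lm: nn.Module, d_decoder: int = 512):
        super().__init__()
        self.lm = lm
        self.proj = nn.Linear(lm.hidden_size, d_decoder)

    def forward(self, prev_target_tokens: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():                # keep the pretrained LM frozen
            h = self.lm(prev_target_tokens)  # contextual target-side states
        return self.proj(h)                  # project to the decoder size
```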
+ +0:22:28.098 --> 0:22:36.160 +It's more a definition, but you're right, +so what are the steps? + +0:22:36.516 --> 0:22:46.655 +One of these parts, you know, called a language +model is definitionally not that important, + +0:22:46.655 --> 0:22:47.978 +but that's. + +0:22:53.933 --> 0:23:02.812 +So the question is how can you then train +them and make make this this one work? + +0:23:03.363 --> 0:23:15.492 +So in the case where you combine the language +of our abilities you can train them independently + +0:23:15.492 --> 0:23:18.524 +and then just put them. + +0:23:18.918 --> 0:23:29.623 +It might not be the best because we have no +longer this ability before that. + +0:23:29.623 --> 0:23:33.932 +They optimal perform together. + +0:23:34.514 --> 0:23:41.050 +At least you need to summarize how much do +you trust the one model and how much do you + +0:23:41.050 --> 0:23:41.576 +trust. + +0:23:43.323 --> 0:23:48.529 +But still in some cases usually it might be +helpful if you have only data and so on. + +0:23:48.928 --> 0:24:06.397 +However, we have one specific situation that +leads to the pearl leader is always mono legal + +0:24:06.397 --> 0:24:07.537 +data. + +0:24:08.588 --> 0:24:17.693 +So what we can also do is more the pre-training +approach. + +0:24:17.693 --> 0:24:24.601 +We first train the language model and then. + +0:24:24.704 --> 0:24:33.468 +So the pre-training approach you first train +on the monolingual data and then you join the. + +0:24:33.933 --> 0:24:45.077 +Of course, the model size is this way, but +the data size is of course too big. + +0:24:45.077 --> 0:24:52.413 +You often have more monolingual data than +parallel. + +0:24:56.536 --> 0:24:57.901 +Any ideas. + +0:25:04.064 --> 0:25:10.108 +Had one example where this might also be helpful +if you want to adapt to a domain so let's say + +0:25:10.108 --> 0:25:16.281 +you do medical sentences and if you want to +translate medical sentences and you have monolingual + +0:25:16.281 --> 0:25:22.007 +data on the target side for medical sentences +but you only have parallel data for general + +0:25:22.007 --> 0:25:22.325 +use. + +0:25:23.083 --> 0:25:30.601 +In this case it could be, or it's the most +probable happen if you're learning out there + +0:25:30.601 --> 0:25:38.804 +what medical means, but then in your fine tuning +step the model is forgetting everything about. + +0:25:39.099 --> 0:25:42.340 +So this type of priest training step is good. + +0:25:42.340 --> 0:25:47.978 +If your pretraining data is more general, +very large, and then you're adapting. + +0:25:48.428 --> 0:25:55.545 +But in the task we have monolingual data, +which should be used to adapt the system to + +0:25:55.545 --> 0:25:57.780 +some genre of topic style. + +0:25:57.817 --> 0:26:08.572 +Then, of course, this is not a good strategy +because you might forget about everything up + +0:26:08.572 --> 0:26:09.408 +there. + +0:26:09.649 --> 0:26:17.494 +So then you have to check what you can do +for them to see. + +0:26:17.494 --> 0:26:25.738 +You can freeze this part and you can do a +direct combination. + +0:26:25.945 --> 0:26:33.796 +Where you train both of them, and then you +train the language more and parallel on their + +0:26:33.796 --> 0:26:34.942 +one so that. + +0:26:35.395 --> 0:26:37.687 +Eh What You Learn in the Length. + +0:26:37.937 --> 0:26:48.116 +So the bit depends on what you want to combine +is that you use a language model because it's. 
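A rough sketch of the pre-training recipe mentioned above: first train the language model on the large monolingual corpus, then plug it into the translation system and fine-tune on the (smaller) parallel data. All model and data-loader interfaces here are hypothetical placeholders, not an actual toolkit API.

```python
import torch


def pretrain_then_finetune(lm, nmt, mono_loader, parallel_loader,
                           lm_epochs=1, mt_epochs=1, lr=1e-4):
    # Stage 1: train the language model on (large) monolingual data.
    opt = torch.optim.Adam(lm.parameters(), lr=lr)
    for _ in range(lm_epochs):
        for batch in mono_loader:
            opt.zero_grad()
            lm_loss = lm(batch)          # assumed to return a scalar LM loss
            lm_loss.backward()
            opt.step()

    # Stage 2: plug the pretrained LM into the NMT system and fine-tune the
    # combined model on the (smaller) parallel data.
    nmt.decoder_lm = lm                  # hypothetical attribute
    opt = torch.optim.Adam(nmt.parameters(), lr=lr)
    for _ in range(mt_epochs):
        for src, tgt in parallel_loader:
            opt.zero_grad()
            mt_loss = nmt(src, tgt)      # assumed to return a scalar MT loss
            mt_loss.backward()
            opt.step()
```

As discussed above, whether to keep the pretrained part frozen during stage 2 depends on whether the monolingual data is general (fine-tune everything) or exactly the domain you want to keep (freeze, or mix in the LM objective, to avoid forgetting).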
+ +0:26:48.548 --> 0:26:56.380 +Then you normally don't really forget it because +it's also in the or you use it to adapt to + +0:26:56.380 --> 0:26:58.083 +something specific. + +0:27:01.001 --> 0:27:06.662 +Then there is so this is a way of how we can +make use of monolingual data. + +0:27:07.968 --> 0:27:11.787 +It seems to be the easiest one somehow. + +0:27:11.787 --> 0:27:19.140 +It's more similar to what we are doing with +statistical machine translation. + +0:27:19.140 --> 0:27:20.095 +However,. + +0:27:21.181 --> 0:27:27.211 +Normally always beats this type of model, +which in some view can be from the conceptual + +0:27:27.211 --> 0:27:27.691 +thing. + +0:27:27.691 --> 0:27:31.460 +At least it's even easier from the computational +side. + +0:27:31.460 --> 0:27:36.805 +Sometimes it has a disadvantage that it's +more problematic or more difficult. + +0:27:40.560 --> 0:27:42.576 +And the idea is okay. + +0:27:42.576 --> 0:27:45.141 +We have a monolingual data. + +0:27:45.141 --> 0:27:50.822 +We just translate it and then generate some +type of parallel. + +0:27:51.111 --> 0:28:00.465 +So if you want to build a German to English +system, your first trained German to English + +0:28:00.465 --> 0:28:02.147 +system on your. + +0:28:02.402 --> 0:28:05.217 +Then you have more pearl data. + +0:28:05.217 --> 0:28:13.482 +The interesting thing is if you then train +on the joint thing, on the original pearl data, + +0:28:13.482 --> 0:28:18.749 +and on that one is artificial, it even normally +improves. + +0:28:18.918 --> 0:28:26.490 +You can because you're not doing the same +error all the time and you have some knowledge. + +0:28:28.028 --> 0:28:40.080 +With this first approach, however, there's +one issue: why it might not work the best, + +0:28:40.080 --> 0:28:43.163 +so could you imagine? + +0:28:49.409 --> 0:28:51.186 +Ready a bit shown in image two. + +0:28:53.113 --> 0:29:00.637 +Have a few trains on bad quality data. + +0:29:00.637 --> 0:29:08.741 +The system will learn also in the states. + +0:29:08.828 --> 0:29:12.210 +And as you're saying, it's a system always +mistranslates. + +0:29:13.493 --> 0:29:14.497 +Something. + +0:29:14.497 --> 0:29:23.623 +Then you will learn that this is correct because +now it's training data and you will even encourage + +0:29:23.623 --> 0:29:25.996 +it to make it more often. + +0:29:25.996 --> 0:29:29.921 +So the problem on training on your own is. + +0:29:30.150 --> 0:29:34.222 +But however, as you systematically do, you +even enforce more and will even do more. + +0:29:34.654 --> 0:29:37.401 +So that might not be the best solution. + +0:29:37.401 --> 0:29:40.148 +Do any idea how you could do it better? + +0:29:44.404 --> 0:29:57.653 +If you had something else to prevent some +systematic problems, yes, that is one way. + +0:30:04.624 --> 0:30:10.809 +The problem is yeah, the translations are +not perfect, so the output and you're learning + +0:30:10.809 --> 0:30:11.990 +something wrong. + +0:30:11.990 --> 0:30:17.967 +Normally it's less bad if your inputs are +somewhat bad, but your outputs are perfect. + +0:30:18.538 --> 0:30:26.670 +So if your inputs are wrong you maybe learn +that if you're doing this wrong input you're + +0:30:26.670 --> 0:30:30.782 +generating something correct but you're not. + +0:30:31.511 --> 0:30:40.911 +So often the case is that it's more important +that your target is correct. + +0:30:40.911 --> 0:30:47.052 +If on the source there is something crazy, +then. 
+ +0:30:47.347 --> 0:30:52.184 +But you can assume in your application scenario +you hope that you mainly get correct input. + +0:30:52.572 --> 0:31:02.126 +So that is not harming you as much, and in +machine translation we have some of these symmetries, + +0:31:02.126 --> 0:31:02.520 +so. + +0:31:02.762 --> 0:31:04.578 +And also the other way around. + +0:31:04.578 --> 0:31:09.792 +It's a very similar task, so there's a task +to translate from German to English, but the + +0:31:09.792 --> 0:31:13.892 +task to translate from English to German is +very similar and helpful. + +0:31:14.094 --> 0:31:19.313 +So what we can do is, we can just switch it +initially and generate the data the other way + +0:31:19.313 --> 0:31:19.777 +around. + +0:31:20.120 --> 0:31:25.699 +So what we are doing here is we are starting +with an English to German system. + +0:31:25.699 --> 0:31:32.126 +Then we are translating the English data into +German, where the German is maybe not really + +0:31:32.126 --> 0:31:32.903 +very nice. + +0:31:33.293 --> 0:31:46.045 +And then we're training on our original data +and on the back translated data where only + +0:31:46.045 --> 0:31:51.696 +the input is good and it's like human. + +0:31:52.632 --> 0:32:01.622 +So here we have now the advantage that always +our target site is of human quality and the + +0:32:01.622 --> 0:32:02.322 +input. + +0:32:03.583 --> 0:32:08.998 +And then this helps us to get really good +form. + +0:32:08.998 --> 0:32:15.428 +There's one important difference if you think +about the. + +0:32:21.341 --> 0:32:31.604 +It's too obvious here we need a target side +monolingual layer and the first. + +0:32:31.931 --> 0:32:47.143 +So back translation is normally working if +you have target size parallel and not search + +0:32:47.143 --> 0:32:48.180 +side. + +0:32:48.448 --> 0:32:55.493 +Might be also a bit if you think about it +understandable that it's more important to + +0:32:55.493 --> 0:32:56.819 +be like better. + +0:32:57.117 --> 0:33:04.472 +On the suicide you have to understand the +content, on the target side you have to generate + +0:33:04.472 --> 0:33:12.232 +really sentences and somehow it's more difficult +to generate something than to only understand. + +0:33:17.617 --> 0:33:29.916 +One other thing, so typically it's shown here +differently, but typically it's like this works + +0:33:29.916 --> 0:33:30.701 +well. + +0:33:31.051 --> 0:33:32.978 +Because normally there's like a lot more. + +0:33:33.253 --> 0:33:36.683 +So the question is, should really take all +of my data? + +0:33:36.683 --> 0:33:38.554 +There's two problems with it. + +0:33:38.554 --> 0:33:42.981 +Of course, it's expensive because you have +to translate all this data. + +0:33:42.981 --> 0:33:48.407 +And secondly, if you had, although now your +packet site is wrong, it might be that you + +0:33:48.407 --> 0:33:51.213 +still have your wrong correlations in there. + +0:33:51.651 --> 0:34:01.061 +So if you don't know the normally good starting +point is to take equal amount of data as many + +0:34:01.061 --> 0:34:02.662 +backtranslated. + +0:34:02.963 --> 0:34:05.366 +Of course, it depends on the use case. + +0:34:05.366 --> 0:34:07.215 +There are very few data here. + +0:34:07.215 --> 0:34:08.510 +It makes more sense. + +0:34:08.688 --> 0:34:14.273 +It depends on how good your quality is here, +so the better the model is observable, the + +0:34:14.273 --> 0:34:17.510 +more data you might use because quality is +better. 
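The back-translation setup described above, as a small sketch: a reverse (target-to-source) system translates target-side monolingual text, producing synthetic source sentences while the target side stays human-written. `reverse_model.translate` is a hypothetical interface.

```python
def back_translate(reverse_model, mono_target_sentences):
    """Create synthetic parallel data from target-side monolingual text.

    `reverse_model.translate` is assumed to translate a target-language
    sentence back into the source language. The synthetic source may be
    noisy, but the target side stays human quality, which is why this
    direction works better than forward translation."""
    synthetic_pairs = []
    for tgt in mono_target_sentences:
        synthetic_src = reverse_model.translate(tgt)
        synthetic_pairs.append((synthetic_src, tgt))
    return synthetic_pairs


# Typical usage for a German-to-English system: back-translate English
# monolingual data with an English-to-German model, then train on the
# concatenation of real and synthetic pairs.
# train_data = real_parallel + back_translate(en_de_model, english_mono)
```

As noted above, mixing roughly equal amounts of real and back-translated data is a common starting point; the best ratio depends on the quality of the reverse system and on how much real data exists.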
+ +0:34:17.510 --> 0:34:23.158 +So it depends on a lot of things, but yeah, +a rule of sample like good general way often + +0:34:23.158 --> 0:34:24.808 +is to have equal amounts. + +0:34:26.646 --> 0:34:31.233 +And you can of course do that now iteratively. + +0:34:31.233 --> 0:34:39.039 +It said already that the quality at the end, +of course, depends on this system. + +0:34:39.039 --> 0:34:46.163 +Also, because the better this system is, the +better your synthetic data. + +0:34:47.207 --> 0:34:50.949 +That leads to what is referred to as iterated +back translation. + +0:34:51.291 --> 0:34:56.911 +So you're playing a model on English to German +and you translate the data. + +0:34:56.957 --> 0:35:03.397 +Then you train a model on German to English +with the additional data. + +0:35:03.397 --> 0:35:11.954 +Then you translate German when you translate +German data and then you train again your first + +0:35:11.954 --> 0:35:12.414 +one. + +0:35:12.414 --> 0:35:14.346 +So you iterate that. + +0:35:14.334 --> 0:35:19.653 +Because now your system is better because +it's not only trained on the small data but + +0:35:19.653 --> 0:35:22.003 +additionally on back translated data. + +0:35:22.442 --> 0:35:24.458 +And so you can get better. + +0:35:24.764 --> 0:35:31.739 +However, typically you can stop quite early, +so maybe one iteration is good, but then you + +0:35:31.739 --> 0:35:35.072 +have diminishing gains after two or three. + +0:35:35.935 --> 0:35:44.094 +There's very slight difference and then yeah +because you need of course quite big difference + +0:35:44.094 --> 0:35:45.937 +in the quality here. + +0:35:45.937 --> 0:35:46.814 +In order. + +0:35:47.207 --> 0:35:59.810 +Which is not too good because it means you +can already have to train it with relatively + +0:35:59.810 --> 0:36:02.245 +bad performance. + +0:36:03.723 --> 0:36:10.323 +And they don't yeah, a design decision would +advise so guess because it's easy to get it. + +0:36:10.550 --> 0:36:16.617 +Better to replace that because you have a +higher quality, but you of course keep your + +0:36:16.617 --> 0:36:18.310 +high quality real data. + +0:36:18.310 --> 0:36:21.626 +Then I think normally it's okay to replace +it. + +0:36:21.626 --> 0:36:24.518 +Of course you can also try to append it. + +0:36:24.518 --> 0:36:28.398 +I would assume it's not too much of a difference, +but. + +0:36:34.414 --> 0:36:40.567 +That's about like using monolingual data before +we go into the pre-train models. + +0:36:40.567 --> 0:36:42.998 +Do you have any more questions? + +0:36:49.029 --> 0:36:57.521 +Yes, so the other thing we can do and which +is recently more and more successful and even + +0:36:57.521 --> 0:37:05.731 +more successful since we have these really +large language models where you can even do + +0:37:05.731 --> 0:37:08.562 +a translation task with this. + +0:37:08.688 --> 0:37:16.132 +So here the idea is you learn a representation +of one task and then you use this representation. + +0:37:16.576 --> 0:37:27.276 +It was made maybe like one of the first where +it's really used largely is doing something + +0:37:27.276 --> 0:37:35.954 +like a bird which you pre-train on purely text +editor and then you take. + +0:37:36.496 --> 0:37:42.952 +And the one big advantage, of course, is that +people can only share data but also pre-train. 
+ +0:37:43.423 --> 0:37:53.247 +So if you think of the recent models and the +large language models which are available, + +0:37:53.247 --> 0:37:59.611 +it is not possible for universities often to +train them. + +0:37:59.919 --> 0:38:09.413 +Think it costs several millions to train the +model just if you rent the GPS from some cloud + +0:38:09.413 --> 0:38:15.398 +company and train that the cost of training +these models. + +0:38:15.475 --> 0:38:21.735 +And guess as a student project you won't have +the budget to like build these models. + +0:38:21.801 --> 0:38:24.630 +So another idea is what you can do is okay. + +0:38:24.630 --> 0:38:27.331 +Maybe if these months are once available. + +0:38:27.467 --> 0:38:34.723 +You can take them and use them as a resource +similar to pure text, and you can now build + +0:38:34.723 --> 0:38:41.734 +models which some will learn not only from +from data but also from other models which + +0:38:41.734 --> 0:38:44.506 +are maybe trained on other tasks. + +0:38:44.844 --> 0:38:48.647 +So it's a quite new way of thinking of how +to train. + +0:38:48.647 --> 0:38:53.885 +So we are not only learning from examples, +but we might also learn from. + +0:38:54.534 --> 0:39:03.937 +The nice thing is that this type of training +where we are not learning directly from data + +0:39:03.937 --> 0:39:07.071 +by learning from other tasks. + +0:39:07.427 --> 0:39:15.581 +So the main idea to start with is to have +a personal initial task, and typically this + +0:39:15.581 --> 0:39:24.425 +initial task is for: And if you're working +with, that means you're training pure taxator + +0:39:24.425 --> 0:39:30.547 +because you have the largest amount of data +from the Internet. + +0:39:30.951 --> 0:39:35.857 +And then you're defining some type of task +in order to do your quick training. + +0:39:36.176 --> 0:39:42.056 +And: There's a typical task you can train +on. + +0:39:42.056 --> 0:39:52.709 +That is like the language modeling text, so +to predict the next word, all we have related. + +0:39:52.932 --> 0:40:04.654 +But to predict something which you have not +in the input is a task which is easy to generate. + +0:40:04.654 --> 0:40:06.150 +That's why. + +0:40:06.366 --> 0:40:14.005 +By yourself, on the other hand, you need a +lot of knowledge, and that is the other thing + +0:40:14.005 --> 0:40:15.120 +you need to. + +0:40:15.735 --> 0:40:23.690 +Because there is this idea that the meaning +of the word heavily depends on the context + +0:40:23.690 --> 0:40:24.695 +it's used. + +0:40:25.145 --> 0:40:36.087 +So can give you a sentence with some gibberish +word and there's some name, and although you've + +0:40:36.087 --> 0:40:41.616 +never read the name, you will just assume that. + +0:40:42.062 --> 0:40:48.290 +Exactly the same thing, the models can also +learn something about the words in there by + +0:40:48.290 --> 0:40:49.139 +just using. + +0:40:49.649 --> 0:40:53.246 +So that is typically the new. + +0:40:53.246 --> 0:40:59.839 +Then we can use this model, use our data to +train the. + +0:41:00.800 --> 0:41:04.703 +Of course, it might need to adapt the system. + +0:41:04.703 --> 0:41:07.672 +To do that we might use only some. + +0:41:07.627 --> 0:41:16.326 +Part of the pre-train model in there is that +we have seen that a bit already in the RNA + +0:41:16.326 --> 0:41:17.215 +case is. + +0:41:17.437 --> 0:41:22.670 +So you can view the RN as one of these approaches. + +0:41:22.670 --> 0:41:28.518 +You train the RN language while on large pre-train +data. 
+ +0:41:28.518 --> 0:41:32.314 +Then you put it somewhere into your. + +0:41:33.653 --> 0:41:37.415 +So this gives you the ability to really do +these types of tests. + +0:41:37.877 --> 0:41:49.027 +So that you can build a system which uses +knowledge, which is just trained on large amounts + +0:41:49.027 --> 0:41:52.299 +of data and extracting it. + +0:41:52.299 --> 0:41:53.874 +So it knows. + +0:41:56.376 --> 0:42:01.561 +So the question is that yeah, what type of +information so what type of models can you? + +0:42:01.821 --> 0:42:05.278 +And we want to today look at briefly at three. + +0:42:05.725 --> 0:42:08.474 +Was initially done. + +0:42:08.474 --> 0:42:21.118 +It wasn't as famous as in machine translation +as in other things, but it's also used there. + +0:42:21.221 --> 0:42:28.974 +So where you have this mapping from the one +hot to a small continuous word representation? + +0:42:29.229 --> 0:42:37.891 +Using this one in your anthrax you can, for +example, replace the embedding layer by the + +0:42:37.891 --> 0:42:38.776 +trained. + +0:42:39.139 --> 0:42:41.832 +That is helpful to be a really small amount +of data. + +0:42:42.922 --> 0:42:48.520 +You're always in this pre training phase and +have the thing the advantage is. + +0:42:48.468 --> 0:42:55.515 +More data, that's the trade off so you can +get better. + +0:42:55.515 --> 0:43:00.128 +Disadvantage is, does anybody have? + +0:43:04.624 --> 0:43:12.173 +Was one of the mentioned today, even like +big advantages of the system compared to previous. + +0:43:20.660 --> 0:43:26.781 +Where one advantage was the end to end training +so that all parameters and all components are + +0:43:26.781 --> 0:43:27.952 +optimal together. + +0:43:28.208 --> 0:43:33.386 +If you know pre-train something on one pass, +it's maybe no longer optimal fitting to everything. + +0:43:33.893 --> 0:43:40.338 +So that is similar to what should do pretaining +or not. + +0:43:40.338 --> 0:43:48.163 +It depends on how important everything is +optimal together and how. + +0:43:48.388 --> 0:44:00.552 +If the state is a high quality of large amount, +the pre trained one is just so much better. + +0:44:00.600 --> 0:44:11.215 +Standing everything optimal together, we would +use random actions for amazing vices. + +0:44:11.691 --> 0:44:18.791 +Mean, we assume some structures that are trained +basically. + +0:44:18.791 --> 0:44:26.364 +Yes, if you're fine tuning everything, it +might be the problem. + +0:44:26.766 --> 0:44:31.139 +But often yeah, in some way right, so often +it's not about. + +0:44:31.139 --> 0:44:37.624 +You're really worse with some pre-trained +molecules because you're going already in some + +0:44:37.624 --> 0:44:43.236 +direction, and if this is not really optimal +for you, it might be difficult. + +0:44:43.603 --> 0:44:51.774 +But the bigger is, if you're not getting better +because you have a decent amount of data, it's + +0:44:51.774 --> 0:44:52.978 +so different. + +0:44:53.153 --> 0:45:04.884 +But mean initially it wasn't a machine translation +done so much because there was more data in + +0:45:04.884 --> 0:45:09.452 +the task, but now it's really large. + +0:45:12.632 --> 0:45:14.188 +The other one is then OK. + +0:45:14.188 --> 0:45:18.258 +Now it's always like how much of the model +do your pre-track a bit? 
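Replacing the randomly initialised embedding layer with pretrained word vectors, as discussed above, is a one-liner in PyTorch; whether to keep the vectors frozen or fine-tune them together with the rest of the model is exactly the trade-off raised in the questions here.

```python
import torch
import torch.nn as nn


def load_pretrained_embeddings(weight_matrix: torch.Tensor,
                               freeze: bool = False) -> nn.Embedding:
    """Build an embedding layer from pretrained vectors.

    `weight_matrix` has shape (vocab_size, emb_dim), e.g. word2vec vectors
    looked up for the MT vocabulary. With freeze=True the embeddings stay
    fixed; otherwise they are fine-tuned end-to-end with the NMT model."""
    return nn.Embedding.from_pretrained(weight_matrix, freeze=freeze)


# emb = load_pretrained_embeddings(torch.randn(50_000, 256))  # toy example
```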
+ +0:45:18.658 --> 0:45:25.057 +The other one you can do is tack contextual +words and then something like bird or a robota + +0:45:25.057 --> 0:45:31.667 +where you train more already as sequence models +and the embeddings you're using are no longer + +0:45:31.667 --> 0:45:35.605 +specific for words but they're also taking +the context. + +0:45:35.875 --> 0:45:54.425 +Embedding you're using is no longer only depending +on the word itself but on the whole sentence. + +0:45:55.415 --> 0:46:03.714 +And of course you can use similar things also +in the decoder just by having layers which + +0:46:03.714 --> 0:46:09.122 +don't have access to the source but there it's +still not. + +0:46:11.451 --> 0:46:19.044 +And finally, and then we'll look at the end, +you can also have models which are already. + +0:46:19.419 --> 0:46:28.605 +So you may be training a sequence model, but +not a monolingual data. + +0:46:28.605 --> 0:46:35.128 +Of course you have to make it a bit challenging. + +0:46:36.156 --> 0:46:43.445 +But the idea is really you're pre-training +your whole model and then you're fine tuning. + +0:46:47.227 --> 0:46:59.487 +But let's first do a bit of step back and +look into what are the differences. + +0:46:59.487 --> 0:47:02.159 +The first thing. + +0:47:02.382 --> 0:47:06.870 +The word embeddings are just this first layer. + +0:47:06.870 --> 0:47:12.027 +You can train them with feed-forward neural +networks. + +0:47:12.212 --> 0:47:25.683 +But you can also train them in language model, +and by now you hopefully have also seen that + +0:47:25.683 --> 0:47:27.733 +you can also. + +0:47:30.130 --> 0:47:41.558 +So this is how you can train them, and you +are training them to predict the next word, + +0:47:41.558 --> 0:47:45.236 +the typical language model. + +0:47:45.525 --> 0:47:52.494 +And that is what is now referred to as a South +Supervised Learning, and for example all the + +0:47:52.494 --> 0:47:56.357 +big large language models like Chat, gp and +so on. + +0:47:56.357 --> 0:48:03.098 +They are trained at an end or feet, but exactly +with this objective to predict the next. + +0:48:03.823 --> 0:48:12.847 +So that is where you can hopefully learn what +a word is used because you always try to predict + +0:48:12.847 --> 0:48:17.692 +the next word and then you have a ready intuition. + +0:48:19.619 --> 0:48:25.374 +In the word embedding, why do people first +look at the word embeddings and the use of + +0:48:25.374 --> 0:48:27.582 +word embeddings for other tasks? + +0:48:27.582 --> 0:48:32.600 +The main advantage is it might be only the +first layer you would think of. + +0:48:32.600 --> 0:48:34.474 +What does it really matter? + +0:48:34.474 --> 0:48:39.426 +However, it is the layer where you typically +have most of the parameters. + +0:48:39.879 --> 0:48:52.201 +Of course, if you have trained on most of +your parameters already on the large data, + +0:48:52.201 --> 0:48:59.304 +then on your target data you have to train +less. + +0:48:59.259 --> 0:49:05.841 +This big difference that your input size is +so much bigger than the size of the normal + +0:49:05.841 --> 0:49:06.522 +in size. + +0:49:06.626 --> 0:49:16.551 +So it's a normal size, maybe two hundred and +fifty, but your input embedding besides vocabulary + +0:49:16.551 --> 0:49:20.583 +size is something like fifty thousand. + +0:49:23.123 --> 0:49:30.163 +And bending while here you see, it's only +like times as much in the layer. 
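A quick back-of-the-envelope calculation for the claim that the embedding layer holds most of the parameters, using the sizes mentioned above (a vocabulary of about 50,000 and a hidden size of about 250):

```python
# Rough parameter count: the input embedding matrix versus one hidden layer.
vocab_size, d_model = 50_000, 250

embedding_params = vocab_size * d_model    # 12,500,000
one_hidden_layer = d_model * d_model       #     62,500

print(embedding_params, one_hidden_layer)
print(embedding_params / one_hidden_layer) # the embeddings are ~200x larger
```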
+ +0:49:30.750 --> 0:49:36.747 +So here's where most of your parameters are, +which means if you already replace the word + +0:49:36.747 --> 0:49:41.329 +embeddings, it might look a bit small in your +overall architecture. + +0:49:41.329 --> 0:49:47.056 +It's where most of the things are, and if +you're doing that, you already have really + +0:49:47.056 --> 0:49:48.876 +big games and can do that. + +0:49:57.637 --> 0:50:04.301 +The thing is we have seen these wooden beddings +can be very good used for other taps. + +0:50:04.784 --> 0:50:08.921 +Now you learn some relation between words. + +0:50:08.921 --> 0:50:14.790 +If you're doing this type of language modeling, +you predict. + +0:50:15.215 --> 0:50:21.532 +The one thing is, of course, you have a lot +of data, so the one question is we want to + +0:50:21.532 --> 0:50:25.961 +have a lot of data to good training models, +the other thing. + +0:50:25.961 --> 0:50:28.721 +The tasks need to be somewhat useful. + +0:50:29.169 --> 0:50:41.905 +If you would predict the first letter of the +word, it has to be a task where you need some + +0:50:41.905 --> 0:50:45.124 +syntactic information. + +0:50:45.545 --> 0:50:53.066 +The interesting thing is people have looked +at these world embeddings here in a language + +0:50:53.066 --> 0:50:53.658 +model. + +0:50:53.954 --> 0:51:04.224 +And you're looking at the word embeddings, +which are these vectors here. + +0:51:04.224 --> 0:51:09.289 +You can ask yourself, do they look? + +0:51:09.489 --> 0:51:15.122 +Don't know if your view is listening to artificial +advance artificial intelligence. + +0:51:15.515 --> 0:51:23.994 +We had on yesterday how to do this type of +representation, but you can do this kind of + +0:51:23.994 --> 0:51:29.646 +representation, and now you're seeing interesting +things. + +0:51:30.810 --> 0:51:41.248 +Now you can represent it here in a three dimensional +space with a dimension reduction. + +0:51:41.248 --> 0:51:46.886 +Then you can look into it and the interesting. + +0:51:47.447 --> 0:51:57.539 +So this vector between the male and the female +version of something is not the same, but it's + +0:51:57.539 --> 0:51:58.505 +related. + +0:51:58.718 --> 0:52:11.256 +So you can do a bit of nuts, you subtract +this vector, add this vector, and then you + +0:52:11.256 --> 0:52:14.501 +look around this one. + +0:52:14.894 --> 0:52:19.691 +So that means okay, there is really something +stored, some information stored in that book. + +0:52:20.040 --> 0:52:25.003 +Similar you can do it with Buck and since +you see here swimming slam walk and walk. + +0:52:25.265 --> 0:52:42.534 +So again these vectors are not the same, but +they're related for going from here to here. + +0:52:43.623 --> 0:52:47.508 +Are semantically the relations between city +and capital? + +0:52:47.508 --> 0:52:49.757 +You have exactly the same thing. + +0:52:51.191 --> 0:52:57.857 +People having done question answering about +that if they show these embeddings and. + +0:52:58.218 --> 0:53:05.198 +Or you can also, if you don't trust the the +dimensional reduction because you say maybe + +0:53:05.198 --> 0:53:06.705 +there's something. + +0:53:06.967 --> 0:53:16.473 +Done you can also look into what happens really +in the indimensional space. + +0:53:16.473 --> 0:53:22.227 +You can look at what is the nearest neighbor. + +0:53:22.482 --> 0:53:29.605 +So you can take the relationship between France +and Paris and add it to Italy and nicely see. 
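The vector-arithmetic analogies described here (subtract one relation vector, add it to another word, then look at the nearest neighbour) can be tested with a few lines. The random toy vectors below only make the function runnable; with real word2vec embeddings the France : Paris :: Italy : ? query tends to come out near "Rome".

```python
import numpy as np


def analogy(emb: dict, a: str, b: str, c: str) -> str:
    """Return the nearest neighbour of b - a + c by cosine similarity,
    excluding the query words themselves."""
    query = emb[b] - emb[a] + emb[c]
    best, best_sim = None, -1.0
    for word, vec in emb.items():
        if word in (a, b, c):
            continue
        sim = np.dot(query, vec) / (np.linalg.norm(query) * np.linalg.norm(vec))
        if sim > best_sim:
            best, best_sim = word, sim
    return best


# Toy 2-d embeddings just so the function runs (result is arbitrary here):
toy = {w: np.random.randn(2) for w in ["france", "paris", "italy", "rome", "berlin"]}
print(analogy(toy, "france", "paris", "italy"))
```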
+ +0:53:30.010 --> 0:53:33.082 +You can do big and bigger and you have small +and small lines. + +0:53:33.593 --> 0:53:38.202 +It doesn't work everywhere. + +0:53:38.202 --> 0:53:49.393 +There are also some which sometimes work, +so if you have a typical. + +0:53:51.491 --> 0:53:56.832 +You can do what the person is doing for famous +ones. + +0:53:56.832 --> 0:54:05.800 +Of course, only like Einstein, scientist, +that Messier finds Midfield are not completely + +0:54:05.800 --> 0:54:06.707 +correct. + +0:54:06.846 --> 0:54:09.781 +You'll see the examples are a bit old. + +0:54:09.781 --> 0:54:15.050 +The politicians are no longer there, but the +first one doesn't learn. + +0:54:16.957 --> 0:54:29.003 +What people have done there of courses, especially +at the beginning. + +0:54:29.309 --> 0:54:36.272 +So one famous model was, but we're not really +interested in the language model performance. + +0:54:36.272 --> 0:54:38.013 +We're only interested. + +0:54:38.338 --> 0:54:40.634 +Think something good to keep in mind. + +0:54:40.634 --> 0:54:42.688 +What are we really interested in? + +0:54:42.688 --> 0:54:44.681 +Do we really want to have an RN? + +0:54:44.681 --> 0:54:44.923 +No. + +0:54:44.923 --> 0:54:48.608 +In this case we are only interested in this +type of mapping. + +0:54:49.169 --> 0:54:55.536 +And so very successful was this word to beg. + +0:54:55.535 --> 0:55:02.597 +We are not training real language when making +it even simpler and doing this for example + +0:55:02.597 --> 0:55:04.660 +continuous back of words. + +0:55:04.660 --> 0:55:11.801 +We are just having four input tokens and we +are predicting what is the word in the middle + +0:55:11.801 --> 0:55:15.054 +and this is just like two linear layers. + +0:55:15.615 --> 0:55:22.019 +It's even simplifying things and making the +calculation faster because that is what we're + +0:55:22.019 --> 0:55:22.873 +interested. + +0:55:23.263 --> 0:55:34.059 +All this continues skip ground models of these +other two models. + +0:55:34.234 --> 0:55:38.273 +You have one equal word and it's the other +way around. + +0:55:38.273 --> 0:55:41.651 +You're predicting the four words around them. + +0:55:41.651 --> 0:55:43.047 +It's very similar. + +0:55:43.047 --> 0:55:48.702 +The task is in the end very similar, but in +all of them it's about learning. + +0:55:51.131 --> 0:56:01.416 +Before we go into the next part, let's talk +about the normal white vector or white line. + +0:56:04.564 --> 0:56:07.562 +The next thing is contextual word embeddings. + +0:56:07.562 --> 0:56:08.670 +The idea is yes. + +0:56:08.670 --> 0:56:09.778 +This is helpful. + +0:56:09.778 --> 0:56:14.080 +However, we might be able to get more from +just only lingo later. + +0:56:14.080 --> 0:56:19.164 +For example, if you think about the word can, +it can have different meanings. + +0:56:19.419 --> 0:56:32.619 +And now in the word embeddings how you have +an overlap of these two meanings, so it represents + +0:56:32.619 --> 0:56:33.592 +those. + +0:56:34.834 --> 0:56:40.318 +But we might be able to in the pre-train model +already disambiguate these because they use + +0:56:40.318 --> 0:56:41.041 +completely. + +0:56:41.701 --> 0:56:50.998 +So if we can have a model which can not only +represent the word, but it can also represent + +0:56:50.998 --> 0:56:58.660 +the meaning of the word within the context, +it might be even more helpful. + +0:56:59.139 --> 0:57:03.342 +So then we're going to contextual word embeddings. 
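Before the transition to contextual embeddings, here is a minimal sketch of the continuous bag-of-words model just described: average the embeddings of the surrounding words and predict the word in the middle, essentially two linear layers. Vocabulary and embedding sizes are illustrative.

```python
import torch
import torch.nn as nn


class CBOW(nn.Module):
    """Continuous bag-of-words: predict the centre word from the averaged
    embeddings of its context words (skip-gram is the reverse direction)."""

    def __init__(self, vocab_size: int = 50_000, emb_dim: int = 300):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)  # "first linear layer"
        self.out = nn.Linear(emb_dim, vocab_size)     # "second linear layer"

    def forward(self, context_ids: torch.Tensor) -> torch.Tensor:
        # context_ids: (batch, 2 * window) indices of the surrounding words
        avg = self.emb(context_ids).mean(dim=1)
        return self.out(avg)                          # logits over the vocabulary
```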
+ +0:57:03.342 --> 0:57:07.709 +We're really having a representation of the +context. + +0:57:07.787 --> 0:57:11.519 +And we have a very good architecture for that +already. + +0:57:11.691 --> 0:57:20.551 +It's like our base language model where you +have to do the hidden state. + +0:57:20.551 --> 0:57:29.290 +The hidden state represents what is apparently +said, but it's focusing. + +0:57:29.509 --> 0:57:43.814 +The first one doing that is in something like +the Elmo paper where they instead of like this + +0:57:43.814 --> 0:57:48.121 +is a normal language model. + +0:57:48.008 --> 0:57:52.735 +Put in the third predicting the fourth and +so on, so you're always predicting the next + +0:57:52.735 --> 0:57:53.007 +one. + +0:57:53.193 --> 0:57:57.919 +The architecture of the heaven works embedding +layer, and then two are an layer here. + +0:57:57.919 --> 0:58:04.255 +For example: And now instead of using this +one in the end you're using here this one. + +0:58:04.364 --> 0:58:11.245 +This represents the meaning of this word mainly +in the context of what we have seen before. + +0:58:11.871 --> 0:58:22.909 +We can train it in a language model or predicting +the next word, but we have more information, + +0:58:22.909 --> 0:58:26.162 +train there, and therefore. + +0:58:27.167 --> 0:58:31.168 +And there is one even done currently in. + +0:58:31.168 --> 0:58:40.536 +The only difference is that we have more layers, +bigger size, and we're using transform on here + +0:58:40.536 --> 0:58:44.634 +or self-attention instead of the R&F. + +0:58:44.634 --> 0:58:45.122 +But. + +0:58:46.746 --> 0:58:52.737 +However, if you look at this contextual representation, +they might not be perfect. + +0:58:52.737 --> 0:58:58.584 +So what do you think of this one as contextual +representation of the third word? + +0:58:58.584 --> 0:59:02.914 +Do you see anything which is not really considered +in this? + +0:59:07.587 --> 0:59:11.492 +Only one way yes, so that is not a big issue +here. + +0:59:11.492 --> 0:59:18.154 +It's representing a string in the context +of a sentence, however, only in the context. + +0:59:18.558 --> 0:59:28.394 +However, we have an architecture which can +also take both sides and we have used it in + +0:59:28.394 --> 0:59:30.203 +the ink holder. + +0:59:30.630 --> 0:59:34.269 +So we could do the and easily only us in the +backboard direction. + +0:59:34.874 --> 0:59:46.889 +By just having the other way around, and then +we couldn't combine the forward and into a + +0:59:46.889 --> 0:59:49.184 +joint one where. + +0:59:49.329 --> 0:59:50.861 +So You Have a Word embedding. + +0:59:51.011 --> 1:00:03.910 +Then you have two states, one with a forward, +and then one with a backward. + +1:00:03.910 --> 1:00:10.359 +For example, take the representation. + +1:00:10.490 --> 1:00:21.903 +Now this same here represents mainly this +word because this is where what both focuses + +1:00:21.903 --> 1:00:30.561 +on is what is happening last but is also looking +at the previous. + +1:00:31.731 --> 1:00:41.063 +However, there is a bit different when training +that as a language model you already have. + +1:00:43.203 --> 1:00:44.956 +Maybe there's again this masking. + +1:00:46.546 --> 1:00:47.814 +That is one solution. + +1:00:47.814 --> 1:00:53.407 +First of all, why we can't do it is the information +you leave it, so you cannot just predict the + +1:00:53.407 --> 1:00:54.041 +next word. + +1:00:54.041 --> 1:00:58.135 +If we just predict the next word in this type +of model, that's a very. 
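An ELMo-style sketch of the forward/backward combination discussed here: two separate language models read the sentence left-to-right and right-to-left, and only their hidden states are concatenated to give one contextual embedding per position. Each direction is trained with its own next-word objective, which sidesteps the information-leakage problem raised next. Layer sizes are illustrative.

```python
import torch
import torch.nn as nn


class BiLMEmbedder(nn.Module):
    """Contextual embeddings from a forward and a backward language model
    whose states are only merged at the end (ELMo-style sketch)."""

    def __init__(self, vocab_size=50_000, emb_dim=256, hidden=512):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.fwd = nn.LSTM(emb_dim, hidden, batch_first=True)
        self.bwd = nn.LSTM(emb_dim, hidden, batch_first=True)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        x = self.emb(token_ids)                       # (batch, seq, emb)
        h_fwd, _ = self.fwd(x)                        # left-to-right states
        h_bwd, _ = self.bwd(torch.flip(x, dims=[1]))  # right-to-left states
        h_bwd = torch.flip(h_bwd, dims=[1])           # re-align to positions
        return torch.cat([h_fwd, h_bwd], dim=-1)      # (batch, seq, 2 * hidden)
```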
+ +1:00:58.738 --> 1:01:04.590 +You know the next word because it's influencing +this hidden stage and then it's very easy so + +1:01:04.590 --> 1:01:07.736 +predicting something you know is not a good +task. + +1:01:07.736 --> 1:01:09.812 +This is what I mentioned before. + +1:01:09.812 --> 1:01:13.336 +You have to define somehow a task which is +challenging. + +1:01:13.753 --> 1:01:19.007 +Because in this case one would, I mean, the +system would just ignore the states and what + +1:01:19.007 --> 1:01:22.961 +it would learn is that you copy this information +directly in here. + +1:01:23.343 --> 1:01:31.462 +So it would mainly be representing this word +and you would have a perfect model because + +1:01:31.462 --> 1:01:38.290 +you only need to find an encoding where you +can encode all words somehow. + +1:01:38.458 --> 1:01:44.046 +The only thing that will learn is that tenor +and coat all my words in this upper hidden. + +1:01:44.985 --> 1:01:49.584 +And then, of course, it's not really useful. + +1:01:49.584 --> 1:01:53.775 +We need to find a bit of different ways. + +1:01:55.295 --> 1:01:59.440 +There is a masking one. + +1:01:59.440 --> 1:02:06.003 +I'll come to that shortly just a bit. + +1:02:06.003 --> 1:02:14.466 +The other thing is not to directly combine +them. + +1:02:14.594 --> 1:02:22.276 +So you never merge the states only at the +end. + +1:02:22.276 --> 1:02:33.717 +The representation of the words is now from +the forward and the next. + +1:02:33.873 --> 1:02:35.964 +So it's always a hidden state before that. + +1:02:36.696 --> 1:02:41.273 +And these two you're joined now to your to +the representation. + +1:02:42.022 --> 1:02:50.730 +And then you have now a representation also +about the whole sentence for the word, but + +1:02:50.730 --> 1:02:53.933 +there's no information leakage. + +1:02:53.933 --> 1:02:59.839 +One way of doing this is instead of doing +a bidirectional. + +1:03:00.380 --> 1:03:08.079 +You can do that, of course, in all layers. + +1:03:08.079 --> 1:03:16.315 +In the end you have different bedding states. + +1:03:16.596 --> 1:03:20.246 +However, it's a bit of a complicated. + +1:03:20.246 --> 1:03:25.241 +You have to keep up separate and then merge +things. + +1:03:27.968 --> 1:03:33.007 +And that is is the moment where, like the, +the peak. + +1:03:34.894 --> 1:03:42.018 +Idea of the big success of the bird model +was used, maybe in bidirector case. + +1:03:42.018 --> 1:03:48.319 +It's not good to do the next word prediction, +but we can do masking. + +1:03:48.308 --> 1:03:59.618 +And masking maybe means we do a prediction +of something in the middle or some words. + +1:03:59.618 --> 1:04:08.000 +If we have the input, we're just putting noise +into the input. + +1:04:08.048 --> 1:04:14.040 +Now there can be no information leakage because +this wasn't in the input. + +1:04:14.040 --> 1:04:15.336 +Now predicting. + +1:04:16.776 --> 1:04:20.524 +So thereby we don't do any assumption again +about our models. + +1:04:20.524 --> 1:04:24.815 +It doesn't need to be a forward model or a +backward model or anything. + +1:04:24.815 --> 1:04:29.469 +You can have any type of architecture and +you can always predict the street. + +1:04:30.530 --> 1:04:39.112 +There is maybe one disadvantage: do you see +what could be a bit of a problem this type + +1:04:39.112 --> 1:04:40.098 +compared? + +1:05:00.000 --> 1:05:05.920 +Yes, so yeah mean you cannot cross mass more, +but to see it more globally just twist assume + +1:05:05.920 --> 1:05:07.142 +you only mask one. 
+ +1:05:07.142 --> 1:05:12.676 +For the whole sentence we get one feedback +signal like what is the word street, so we + +1:05:12.676 --> 1:05:16.280 +have one training sample, a model for the whole +center. + +1:05:17.397 --> 1:05:19.461 +The language modeling paste. + +1:05:19.461 --> 1:05:21.240 +We predicted here three. + +1:05:21.240 --> 1:05:22.947 +We predicted here four. + +1:05:22.947 --> 1:05:24.655 +We predicted here five. + +1:05:25.005 --> 1:05:26.973 +So we have a number of tokens. + +1:05:26.973 --> 1:05:30.974 +For each token we have a feet bed and saying +what is the best. + +1:05:31.211 --> 1:05:39.369 +So in this case of course this is a lot less +efficient because we are getting less feedback + +1:05:39.369 --> 1:05:45.754 +signals on what we should predict compared +to models where we're doing. + +1:05:48.348 --> 1:05:54.847 +So in birth the main idea this bidirectional +model was masking. + +1:05:54.847 --> 1:05:59.721 +It was the first large model using transformer. + +1:06:00.320 --> 1:06:06.326 +There are two more minor changes. + +1:06:06.326 --> 1:06:16.573 +We'll see that this next word prediction is +another task. + +1:06:16.957 --> 1:06:25.395 +Again you want to learn more about what language +is to really understand. + +1:06:25.395 --> 1:06:35.089 +Are these two sentences like following a story +or they're independent of each other? + +1:06:38.158 --> 1:06:43.026 +The input is using subword units as we're +using it and we're using it. + +1:06:43.026 --> 1:06:48.992 +It has some special token, the beginning, +the CLS token that is straining for the next + +1:06:48.992 --> 1:06:50.158 +word prediction. + +1:06:50.470 --> 1:06:57.296 +It's more for machine translation. + +1:06:57.296 --> 1:07:07.242 +It's more for classification tasks because +you're. + +1:07:07.607 --> 1:07:24.323 +You have two sentences, and then you have +a position of encoding as we know them in general. + +1:07:24.684 --> 1:07:28.812 +Now what is more challenging is masking. + +1:07:28.812 --> 1:07:30.927 +So what do you mask? + +1:07:30.927 --> 1:07:35.055 +We already have to question like should. + +1:07:35.275 --> 1:07:44.453 +So there has been afterwards eating some work +like, for example, Urbana, which tries to improve. + +1:07:44.453 --> 1:07:52.306 +It's not super sensitive, but of course if +you do it completely wrong then you're. + +1:07:52.572 --> 1:07:54.590 +That's then another question there. + +1:07:56.756 --> 1:08:03.285 +All types should always mask the poor word. + +1:08:03.285 --> 1:08:14.562 +If have a subword, it's good to mask only +like a subword and predict based. + +1:08:14.894 --> 1:08:20.755 +You know, like three parts of the words, it +might be easier to get the last because they + +1:08:20.755 --> 1:08:27.142 +here took the easiest selections, not considering +words anymore at all because you're doing that + +1:08:27.142 --> 1:08:32.278 +in the pre-processing and just taking always +words like subwords and masking. + +1:08:32.672 --> 1:08:36.286 +Their thinking will bear them differently. + +1:08:36.286 --> 1:08:40.404 +They mark always the full words, but guess +it's. + +1:08:41.001 --> 1:08:46.969 +And then what to do with the mask work in +eighty percent of the cases is the word is + +1:08:46.969 --> 1:08:47.391 +mask. + +1:08:47.391 --> 1:08:50.481 +They replace it with a special token thing. + +1:08:50.481 --> 1:08:52.166 +This is the mask token. 
+ +1:08:52.166 --> 1:08:58.486 +In ten percent they put in some random other +token in there, and in ten percent they keep + +1:08:58.486 --> 1:08:59.469 +it unchanged. + +1:09:02.202 --> 1:09:11.519 +And then what you can do is also this next +prediction. + +1:09:11.519 --> 1:09:17.786 +So if you have the man went to mass. + +1:09:18.418 --> 1:09:24.090 +So may you see you're joining that you're +doing both masks and next prediction that. + +1:09:24.564 --> 1:09:34.402 +And if the sentence is pinguine masks are +flyless birds, then these two sentences have + +1:09:34.402 --> 1:09:42.995 +nothing to do with each other, and so in this +case it's not the next token. + +1:09:47.127 --> 1:09:56.184 +And that is the whole bird model, so here +is the input, here the transformable layers, + +1:09:56.184 --> 1:09:58.162 +and you can train. + +1:09:58.598 --> 1:10:08.580 +And this model was quite successful in general +applications. + +1:10:08.580 --> 1:10:17.581 +It was not as successful as people are nowadays +using. + +1:10:17.937 --> 1:10:27.644 +However, there is like a huge thing of different +types of models coming from that. + +1:10:27.827 --> 1:10:39.109 +So based on bird and other semi-supervised +models like a whole setup came out of there + +1:10:39.109 --> 1:10:42.091 +and there's different. + +1:10:42.082 --> 1:10:46.637 +With the availability of large languages more +than the success. + +1:10:47.007 --> 1:10:48.436 +We have now even larger ones. + +1:10:48.828 --> 1:10:50.961 +Interestingly, it goes a bit. + +1:10:50.910 --> 1:10:59.321 +Change the bit again from like more this spider +action model to unidirectional models, or at + +1:10:59.321 --> 1:11:03.843 +the moment maybe a bit more we're coming to +them. + +1:11:03.843 --> 1:11:09.179 +Now do you see one advantage,, and we have +the efficiency. + +1:11:09.509 --> 1:11:16.670 +There's one other reason why you sometimes +are more interested in unidirectional models + +1:11:16.670 --> 1:11:17.158 +than. + +1:11:22.882 --> 1:11:30.882 +Mean it depends on the task, but for example +for a language generation task, the task. + +1:11:32.192 --> 1:11:34.574 +It's not only interesting, it doesn't work. + +1:11:34.574 --> 1:11:39.283 +So if you want to do a generation like the +decoder so you want to generate a sentence, + +1:11:39.283 --> 1:11:42.856 +you don't know the future so you cannot apply +this type of model. + +1:11:43.223 --> 1:11:49.498 +This time off model can be used for the encoder +in an encoder model but cannot be used for + +1:11:49.498 --> 1:11:55.497 +the decoder because it is trained that only +works and it has information on both sides + +1:11:55.497 --> 1:11:56.945 +and if you're doing. + +1:12:00.000 --> 1:12:05.559 +Yeah, that's a good view to the next overall +task of models. + +1:12:05.559 --> 1:12:08.839 +We have so if you view it from the. + +1:12:09.009 --> 1:12:13.137 +Of you we have the encoder baseball. + +1:12:13.137 --> 1:12:16.372 +That's what we just look at. + +1:12:16.372 --> 1:12:20.612 +They are bidirectional and typically. + +1:12:20.981 --> 1:12:22.347 +That is the one we looked at. + +1:12:22.742 --> 1:12:35.217 +At the beginning is the decoder-based model, +so the outer-regressive mounts which are unit + +1:12:35.217 --> 1:12:42.619 +based model, and there we can do the next prediction. + +1:12:43.403 --> 1:12:52.421 +And what you can also do first, and there +you can also have special things called prefix + +1:12:52.421 --> 1:12:53.434 +language. 
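The masking recipe described above (80% replaced by the mask token, 10% replaced by a random token, 10% left unchanged) is easy to write down as a sketch; the 15% masking rate and the toy vocabulary below are illustrative choices, not values stated here.

```python
import random

MASK = "[MASK]"
TOY_VOCAB = ["the", "man", "went", "to", "store", "penguins", "are", "birds"]

def mask_tokens(tokens, mask_prob=0.15, seed=None):
    """Pick ~mask_prob of the positions and corrupt them with the 80/10/10 rule;
    the model is trained to predict the original token at exactly these positions."""
    rng = random.Random(seed)
    corrupted, targets = list(tokens), [None] * len(tokens)
    for i, tok in enumerate(tokens):
        if rng.random() >= mask_prob:
            continue
        targets[i] = tok                      # prediction target: the original token
        r = rng.random()
        if r < 0.8:
            corrupted[i] = MASK               # 80%: replace with the mask token
        elif r < 0.9:
            corrupted[i] = rng.choice(TOY_VOCAB)  # 10%: replace with a random token
        # remaining 10%: keep the token unchanged
    return corrupted, targets

print(mask_tokens("the man went to the store".split(), mask_prob=0.3, seed=1))
```

Because the masked positions were corrupted in the input, there is no way for the model to simply copy the answer, which is exactly the point of the masking objective.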
+ +1:12:54.354 --> 1:13:04.079 +Because we are saying it might be helpful +that some of your inputs you can use by direction + +1:13:04.079 --> 1:13:17.334 +because: That is what is called a prefix where +you say on the first tokens you have bidirectional + +1:13:17.334 --> 1:13:19.094 +connections. + +1:13:19.219 --> 1:13:28.768 +You somehow merge that mainly works only in +transformer based models because the uni direction. + +1:13:29.629 --> 1:13:34.894 +There is no different number of parameters. + +1:13:34.975 --> 1:13:38.533 +Transformer: The only difference is how you +mask your attention. + +1:13:38.878 --> 1:13:47.691 +We have seen that in the encoder, in the decoder, +the number of parameters is different because + +1:13:47.691 --> 1:13:50.261 +you do the cross-attention. + +1:13:50.650 --> 1:13:58.389 +It's only like you mask your attention to +only look at the bad past or also look into + +1:13:58.389 --> 1:13:59.469 +the future. + +1:14:00.680 --> 1:14:03.323 +And now you can, of course, also do mixing. + +1:14:03.563 --> 1:14:08.307 +So this is a bidirectional attention metric +where you can attend to everything. + +1:14:08.588 --> 1:14:23.477 +That is a unidirection or causal where you +can only look at the past and you can do this + +1:14:23.477 --> 1:14:25.652 +with prefix. + +1:14:29.149 --> 1:14:42.829 +Some are all clear based on that, then of +course you can also do the other thing. + +1:14:43.163 --> 1:14:54.497 +So the idea is we have our encoder, decoder +architecture, can we also train them completely + +1:14:54.497 --> 1:14:57.700 +in a side supervised way? + +1:14:58.238 --> 1:15:06.206 +In this case we have the same input to both, +so in this case we would have the sentence + +1:15:06.206 --> 1:15:08.470 +as input in the decoder. + +1:15:08.470 --> 1:15:12.182 +Then we need to do some type of masking. + +1:15:12.912 --> 1:15:16.245 +Here we don't need to do the masking, but +here we need to do. + +1:15:16.245 --> 1:15:17.911 +The masking doesn't know ever. + +1:15:20.440 --> 1:15:30.269 +And this type of model got quite successful +also, especially for pre-training machine translation. + +1:15:30.330 --> 1:15:45.934 +This is the first model of the BART model, +which is one successful way to pre-train your + +1:15:45.934 --> 1:15:47.162 +model. + +1:15:47.427 --> 1:15:52.858 +Where you put in source sentence, we can't +do that here. + +1:15:52.858 --> 1:15:55.430 +We only have one language. + +1:15:55.715 --> 1:16:00.932 +But we can just put this twice in there, and +that is not a trivial task. + +1:16:00.932 --> 1:16:08.517 +We can change it in: They do quite a bit of +different corruption techniques. + +1:16:08.517 --> 1:16:12.751 +You can do token masking and you can also. + +1:16:13.233 --> 1:16:20.785 +That you couldn't do and go the only system +because then it wouldn't be there if you cannot + +1:16:20.785 --> 1:16:22.345 +predict somewhere. + +1:16:22.345 --> 1:16:26.368 +So the number of input and output tokens always. + +1:16:26.906 --> 1:16:29.820 +You cannot do a prediction for something which +isn't it? + +1:16:30.110 --> 1:16:39.714 +Here in the decoder side it's uni-direction +so we can also delete and then generate the + +1:16:39.714 --> 1:16:40.369 +full. + +1:16:41.061 --> 1:16:48.628 +We can do sentence per rotation where you +change the sentence. + +1:16:48.628 --> 1:16:54.274 +We can document rotation and text and filling. 
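A toy sketch of some of the corruption types just listed (sentence permutation, token deletion, token masking); the corruption rates are illustrative, not the ones from the original paper, and text infilling and document rotation are left out for brevity.

```python
import random

def bart_style_noise(sentences, seed=0):
    """Corrupt a list of tokenized sentences; the decoder is then trained to
    reproduce the original, uncorrupted document from this noisy input."""
    rng = random.Random(seed)
    sentences = sentences[:]
    rng.shuffle(sentences)                    # sentence permutation
    noised = []
    for sent in sentences:
        out = []
        for tok in sent:
            r = rng.random()
            if r < 0.10:
                continue                      # token deletion: input gets shorter
            out.append("[MASK]" if r < 0.25 else tok)   # token masking
        noised.append(out)
    return noised

doc = [["the", "cat", "sat"], ["it", "was", "warm"], ["then", "it", "left"]]
print(bart_style_noise(doc))
```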
+ +1:16:55.615 --> 1:17:05.870 +So you see there's quite a lot of types of +models that you can use in order to pre-train + +1:17:05.870 --> 1:17:06.561 +your. + +1:17:07.507 --> 1:17:12.512 +And these are the models you can use. + +1:17:12.512 --> 1:17:21.072 +Of course, the other question is how do you +integrate them into? + +1:17:21.761 --> 1:17:26.638 +And there's also like yeah quite some different +ways of techniques. + +1:17:27.007 --> 1:17:28.684 +It's a Bit Similar to Before. + +1:17:28.928 --> 1:17:39.307 +So the easiest thing is you take your word +embeddings or your pre-train model. + +1:17:39.307 --> 1:17:47.979 +If you're contextual embedding several layers +you freeze them in. + +1:17:48.748 --> 1:17:53.978 +Can also be done if you have a bark model. + +1:17:53.978 --> 1:18:03.344 +You freeze your wooden beddings, for example, +and only train the top layers. + +1:18:05.865 --> 1:18:14.965 +The other thing is you initialize them so +you initialize your models but then you train + +1:18:14.965 --> 1:18:19.102 +everything so you're not only training. + +1:18:22.562 --> 1:18:32.600 +When you have then one thing, if you think +about Bart, there's one thing, so you want + +1:18:32.600 --> 1:18:35.752 +to have the same language. + +1:18:36.516 --> 1:18:46.013 +Typically mean the one you get is from English, +so you can not try to do some language. + +1:18:46.366 --> 1:18:55.165 +Below the barge, in order to learn some language +specific stuff or there's a multilingual barge + +1:18:55.165 --> 1:19:03.415 +which is trained on many languages, it's trained +only on like it's more or less language. + +1:19:03.923 --> 1:19:09.745 +So then you would still need to find June +and the model needs to learn how to better + +1:19:09.745 --> 1:19:12.074 +do the attention cross lingually. + +1:19:12.074 --> 1:19:18.102 +It's only on the same language but it mainly +only has to learn this mapping and not all + +1:19:18.102 --> 1:19:18.787 +the rest. + +1:19:21.982 --> 1:19:27.492 +A third thing which is is very commonly used +is what is frequent to it as adapters. + +1:19:27.607 --> 1:19:29.749 +So, for example, you take and bark. + +1:19:29.709 --> 1:19:35.502 +And you put some adapters on the inside of +the network so that it's small new layers which + +1:19:35.502 --> 1:19:41.676 +are in between put in there and then you only +train these adapters or also train these adapters. + +1:19:41.676 --> 1:19:47.724 +So for example in Embry you could see that +this learns to map the Seus language representation + +1:19:47.724 --> 1:19:50.333 +to the targeted language representation. + +1:19:50.470 --> 1:19:52.395 +And then you don't have to change that luck. + +1:19:52.792 --> 1:20:04.197 +Ideas that you give it some extra ability +to really perform well on that, and then it's + +1:20:04.197 --> 1:20:05.234 +easier. + +1:20:05.905 --> 1:20:15.117 +Is also very commonly used, for example, in +multilingual systems where the idea is you + +1:20:15.117 --> 1:20:16.282 +have some. + +1:20:16.916 --> 1:20:23.505 +So they are trained only for one language +pair, so the model has some of those it once + +1:20:23.505 --> 1:20:27.973 +has the abilities to do multilingually to share +knowledge. + +1:20:27.973 --> 1:20:33.729 +But then there is some knowledge which is +very language specific, and then. + +1:20:34.914 --> 1:20:39.291 +But there's one chance in general, the multilingual +systems. + +1:20:39.291 --> 1:20:40.798 +It works quite well. 
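The adapter idea described above, as a minimal sketch assuming PyTorch; `pretrained_layers` stands for whatever stack of pre-trained encoder layers is being reused, and the bottleneck size is arbitrary.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Small bottleneck module inserted after a frozen pre-trained layer."""
    def __init__(self, dim, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))   # residual around the adapter

def freeze_and_add_adapters(pretrained_layers, dim):
    """Freeze every pre-trained parameter; only the new adapters get gradient updates."""
    for layer in pretrained_layers:
        for p in layer.parameters():
            p.requires_grad = False
    return nn.ModuleList(Adapter(dim) for _ in pretrained_layers)
```

During the forward pass each adapter is applied to the output of its frozen layer, which gives the model a small amount of language- or task-specific capacity without retraining, or risking to forget, everything it learned in pre-training.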
+ +1:20:40.798 --> 1:20:47.542 +There's one specific use case for multilingual, +where this normally doesn't really work well. + +1:20:47.542 --> 1:20:49.981 +Do you have an idea of what that? + +1:20:55.996 --> 1:20:57.534 +It's for Zero Short Cases. + +1:20:57.998 --> 1:21:06.051 +Because then you're having to hear some situation +which might be very language specific again + +1:21:06.051 --> 1:21:15.046 +in zero shot, the idea is always to learn representations +via which are more language dependent and with + +1:21:15.046 --> 1:21:17.102 +the adaptors of course. + +1:21:20.260 --> 1:21:37.655 +And there's also the idea of doing more like +a knowledge ventilation setup, so in this. + +1:21:39.179 --> 1:21:41.177 +And now the idea is okay. + +1:21:41.177 --> 1:21:48.095 +We are training it the same, but what we want +to achieve is that the hidden stages of the + +1:21:48.095 --> 1:21:54.090 +encoder are as similar to the one as the pre-train +model, just as additional. + +1:21:54.414 --> 1:22:07.569 +So you should learn faster by telling the +model to make these states as similar as possible. + +1:22:07.569 --> 1:22:11.813 +You compare the first hidden. + +1:22:12.192 --> 1:22:18.549 +For example, by using the L2 norm, so by just +making these two representations the same. + +1:22:20.020 --> 1:22:22.880 +Now here it requires the same vocabulary. + +1:22:22.880 --> 1:22:25.468 +Why does it need the same vocabulary? + +1:22:25.468 --> 1:22:26.354 +Give me the. + +1:22:34.754 --> 1:22:39.132 +You have different vocabulary. + +1:22:39.132 --> 1:22:50.711 +You also have different like sequence lengths +because if you use different these. + +1:22:51.231 --> 1:22:55.680 +Then what happens is now we have states here. + +1:22:55.680 --> 1:23:01.097 +It's no longer straightforward which states +to compare. + +1:23:02.322 --> 1:23:05.892 +And then it's just easier to have like the +same number. + +1:23:05.892 --> 1:23:08.952 +You can always compare the first to the second. + +1:23:09.709 --> 1:23:16.836 +So therefore at least the very easy way of +knowledge destination only works if you have. + +1:23:17.177 --> 1:23:30.871 +Of course you could do things like the average +should be the same, but of course that's less + +1:23:30.871 --> 1:23:33.080 +strong signal. + +1:23:34.314 --> 1:23:47.087 +But the advantage here is that you have a +direct training signal here on the ink corner + +1:23:47.087 --> 1:23:52.457 +so you can directly make the signal. + +1:23:56.936 --> 1:24:11.208 +Yes, think this is most things for today, +so what you should keep in mind today is two + +1:24:11.208 --> 1:24:18.147 +techniques: The one is a back translation idea. + +1:24:18.147 --> 1:24:26.598 +If you have monolingual letters, you back +translate it and use. + +1:24:26.886 --> 1:24:33.608 +And yeah, it is even often helpful to even +combine them so you can even use both of them. + +1:24:33.853 --> 1:24:39.669 +You can do use pre-trained walls, but then +you can even still do back translation where + +1:24:39.669 --> 1:24:40.066 +it's. + +1:24:40.160 --> 1:24:47.058 +We have the advantage that we are training +like everything working together on the tasks + +1:24:47.058 --> 1:24:54.422 +so it might be helpful even to backtranslate +some data and then use it in the real translation + +1:24:54.422 --> 1:24:57.755 +because in pre-training the big challenge. + +1:24:58.058 --> 1:25:07.392 +You can see there is different ways of integrating +this knowledge, but even if you use a full + +1:25:07.392 --> 1:25:08.087 +model. 
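As a reminder, the back-translation recipe summarized here fits in a few lines; `reverse_mt` is a placeholder for whatever target-to-source system is available.

```python
def back_translate(monolingual_target, reverse_mt):
    """Pair machine-translated (possibly noisy) synthetic source sentences with the
    clean, human-written target sentences; the noise only ends up on the input side."""
    return [(reverse_mt(tgt), tgt) for tgt in monolingual_target]

# The final training set is the real parallel data plus these synthetic pairs,
# often tagged or re-sampled so the model can tell the two kinds apart.
```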
+ +1:25:08.748 --> 1:25:11.713 +This is the most similar you can get. + +1:25:11.713 --> 1:25:15.224 +You're doing no changes to the architecture. + +1:25:15.224 --> 1:25:20.608 +You're really taking the model and just fine +tuning on the new task. + +1:25:20.608 --> 1:25:24.041 +But it still has to completely newly learn. + +1:25:24.464 --> 1:25:29.978 +Might be, for example, helpful to have more +back translated data to learn them. + +1:25:32.192 --> 1:25:45.096 +Good, that's important thing that next Tuesday +there is a conference or a workshop in this + +1:25:45.096 --> 1:25:45.947 +room. + +1:25:47.127 --> 1:25:54.405 +You should get an email if you're an alias +that there is a room change for Tuesdays, only + +1:25:54.405 --> 1:25:57.398 +for Tuesdays, and it's again normal. + +1:25:57.637 --> 1:26:03.714 +Some more questions again have a more general +perspective, especially: Computer vision. + +1:26:03.714 --> 1:26:07.246 +You can enlarge your data set with data augmentation. + +1:26:07.246 --> 1:26:08.293 +It's there and. + +1:26:08.388 --> 1:26:15.306 +Similarly to a large speech or text, so the +data orientation. + +1:26:15.755 --> 1:26:27.013 +You can use this back translation and also +the masking, but a bit like that would say + +1:26:27.013 --> 1:26:31.201 +that is the most similar thing. + +1:26:31.371 --> 1:26:35.632 +So it has also been, for example, it's used +not only for monolingual data. + +1:26:36.216 --> 1:26:40.958 +If you have good MP system, it can also be +used for parallel data by having like augmenting + +1:26:40.958 --> 1:26:46.061 +your data with more data because then you have +the human translation and the automatic translation + +1:26:46.061 --> 1:26:46.783 +is both good. + +1:26:46.783 --> 1:26:51.680 +You're just having more data and better feedback +signal and different ways because there's not + +1:26:51.680 --> 1:26:53.845 +only one correct translation but several. + +1:26:54.834 --> 1:26:58.327 +Would say this is the most similar one. + +1:26:58.327 --> 1:27:00.947 +Just rotate things and so on. + +1:27:00.947 --> 1:27:03.130 +There's ways you can do. + +1:27:05.025 --> 1:27:07.646 +But for example there's rarely use. + +1:27:07.646 --> 1:27:13.907 +It's very hard to do this by by rules like +which words to replace because there's not + +1:27:13.907 --> 1:27:14.490 +a cool. + +1:27:14.490 --> 1:27:18.931 +You cannot like always say this word can always +be replaced. + +1:27:19.139 --> 1:27:28.824 +Mean, although they are my perfect synonyms, +they are good in some cases, but not in all + +1:27:28.824 --> 1:27:29.585 +cases. + +1:27:29.585 --> 1:27:36.985 +And if you don't do a rule base, you have +to train the model again. + +1:27:38.058 --> 1:27:57.050 +Here we can compare the hidden stages to the +same architecture as the free train normal. + +1:27:57.457 --> 1:27:59.817 +Should be of the same dimension, so it's easiest +to have the. + +1:28:00.000 --> 1:28:03.780 +Architecture: We later will learn in efficiency. + +1:28:03.780 --> 1:28:08.949 +You can also do knowledge destillation with, +for example, smaller. + +1:28:08.949 --> 1:28:15.816 +So you can have twelve layers, only five, +and then you try to learn the same within five + +1:28:15.816 --> 1:28:16.433 +layers. + +1:28:17.477 --> 1:28:22.945 +Eight layers, so that is possible, but yeah +agree it should be of the same hidden size. 
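A minimal sketch of the hidden-state matching discussed above, assuming PyTorch; because student and teacher share the same vocabulary, their state sequences have the same length and can be compared position by position. The weight `lambda_distill` and the tensors `h_student` / `h_teacher` are placeholders.

```python
import torch.nn.functional as F

def state_matching_loss(student_states, teacher_states):
    """L2 / MSE between the MT encoder states and the frozen pre-trained encoder
    states, added as an auxiliary term next to the normal translation loss."""
    assert student_states.shape == teacher_states.shape
    return F.mse_loss(student_states, teacher_states.detach())

# total_loss = translation_loss + lambda_distill * state_matching_loss(h_student, h_teacher)
```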
+ +1:28:23.623 --> 1:28:35.963 +The question then, of course, is you can do +it as an initialization or you can do it during + +1:28:35.963 --> 1:28:37.305 +training? + +1:28:37.305 --> 1:28:41.195 +You have some main training. + +1:28:45.865 --> 1:28:53.964 +Good, then thanks a lot, and then we'll see +each other again on Tuesday. + diff --git a/demo_data/lectures/Lecture-11-15.06.2023/video.mp4 b/demo_data/lectures/Lecture-11-15.06.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..89042f361da5bdac35449f158e0e98df4aa8ba1f --- /dev/null +++ b/demo_data/lectures/Lecture-11-15.06.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018f7b42f2225e9ea6d68c39e22111b3d3e172c045fde57e3dfd6b2ca3df4198 +size 123175586 diff --git a/demo_data/lectures/Lecture-12-20.06.2023/English.vtt b/demo_data/lectures/Lecture-12-20.06.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..b142373c8cabe046f6dce73dee7676cb59c1154d --- /dev/null +++ b/demo_data/lectures/Lecture-12-20.06.2023/English.vtt @@ -0,0 +1,10713 @@ +WEBVTT + +0:00:03.243 --> 0:00:18.400 +Hey welcome to our video, small room today +and to the lecture machine translation. + +0:00:19.579 --> 0:00:32.295 +So the idea is we have like last time we started +addressing problems and building machine translation. + +0:00:32.772 --> 0:00:39.140 +And we looked into different ways of how we +can use other types of resources. + +0:00:39.379 --> 0:00:54.656 +Last time we looked into language models and +especially pre-trained models which are different + +0:00:54.656 --> 0:00:59.319 +paradigms and learning data. + +0:01:00.480 --> 0:01:07.606 +However, there is one other way of getting +data and that is just searching for more data. + +0:01:07.968 --> 0:01:14.637 +And the nice thing is it was a worldwide web. + +0:01:14.637 --> 0:01:27.832 +We have a very big data resource where there's +various types of data which we can all use. + +0:01:28.128 --> 0:01:38.902 +If you want to build a machine translation +for a specific language or specific to Maine, + +0:01:38.902 --> 0:01:41.202 +it might be worse. + +0:01:46.586 --> 0:01:55.399 +In general, the other year we had different +types of additional resources we can have. + +0:01:55.399 --> 0:01:59.654 +Today we look into the state of crawling. + +0:01:59.654 --> 0:02:05.226 +It always depends a bit on what type of task +you have. + +0:02:05.525 --> 0:02:08.571 +We're crawling, you point off no possibilities. + +0:02:08.828 --> 0:02:14.384 +We have seen some weeks ago that Maje Lingo +models another thing where you can try to share + +0:02:14.384 --> 0:02:16.136 +knowledge between languages. + +0:02:16.896 --> 0:02:26.774 +Last we looked into monolingual data and next +we also unsupervised them too which is purely + +0:02:26.774 --> 0:02:29.136 +based on monolingual. + +0:02:29.689 --> 0:02:35.918 +What we today will focus on is really web +crawling of parallel data. + +0:02:35.918 --> 0:02:40.070 +We will focus not on the crawling pad itself. + +0:02:41.541 --> 0:02:49.132 +Networking lecture is something about one +of the best techniques to do web trolleying + +0:02:49.132 --> 0:02:53.016 +and then we'll just rely on existing tools. + +0:02:53.016 --> 0:02:59.107 +But the challenge is normally if you have +web data that's pure text. + +0:03:00.920 --> 0:03:08.030 +And these are all different ways of how we +can do that, and today is focused on that. 
+ +0:03:08.508 --> 0:03:21.333 +So why would we be interested in that there +is quite different ways of collecting data? + +0:03:21.333 --> 0:03:28.473 +If you're currently when we talk about parallel. + +0:03:28.548 --> 0:03:36.780 +The big difference is that you focus on one +specific website so you can manually check + +0:03:36.780 --> 0:03:37.632 +how you. + +0:03:38.278 --> 0:03:49.480 +This you can do for dedicated resources where +you have high quality data. + +0:03:50.510 --> 0:03:56.493 +Another thing which has been developed or +has been done for several tasks is also is + +0:03:56.493 --> 0:03:59.732 +like you can do something like crowdsourcing. + +0:03:59.732 --> 0:04:05.856 +I don't know if you know about sites like +Amazon Mechanical Turing or things like that + +0:04:05.856 --> 0:04:08.038 +so you can there get a lot of. + +0:04:07.988 --> 0:04:11.544 +Writing between cheap labors would like easy +translations for you. + +0:04:12.532 --> 0:04:22.829 +Of course you can't collect millions of sentences, +but if it's like thousands of sentences that's + +0:04:22.829 --> 0:04:29.134 +also sourced, it's often interesting when you +have somehow. + +0:04:29.509 --> 0:04:36.446 +However, this is a field of itself, so crowdsourcing +is not that easy. + +0:04:36.446 --> 0:04:38.596 +It's not like upload. + +0:04:38.738 --> 0:04:50.806 +If you're doing that you will have very poor +quality, for example in the field of machine + +0:04:50.806 --> 0:04:52.549 +translation. + +0:04:52.549 --> 0:04:57.511 +Crowdsourcing is very commonly used. + +0:04:57.397 --> 0:05:00.123 +The problem there is. + +0:05:00.480 --> 0:05:08.181 +Since they are paid quite bad, of course, +a lot of people also try to make it put into + +0:05:08.181 --> 0:05:09.598 +it as possible. + +0:05:09.869 --> 0:05:21.076 +So if you're just using it without any control +mechanisms, the quality will be bad. + +0:05:21.076 --> 0:05:27.881 +What you can do is like doing additional checking. + +0:05:28.188 --> 0:05:39.084 +And think recently read a paper that now these +things can be worse because people don't do + +0:05:39.084 --> 0:05:40.880 +it themselves. + +0:05:41.281 --> 0:05:46.896 +So it's a very interesting topic. + +0:05:46.896 --> 0:05:55.320 +There has been a lot of resources created +by this. + +0:05:57.657 --> 0:06:09.796 +It's really about large scale data, then of +course doing some type of web crawling is the + +0:06:09.796 --> 0:06:10.605 +best. + +0:06:10.930 --> 0:06:17.296 +However, the biggest issue in this case is +in the quality. + +0:06:17.296 --> 0:06:22.690 +So how can we ensure that somehow the quality +of. + +0:06:23.003 --> 0:06:28.656 +Because if you just, we all know that in the +Internet there's also a lot of tools. + +0:06:29.149 --> 0:06:37.952 +Low quality staff, and especially now the +bigger question is how can we ensure that translations + +0:06:37.952 --> 0:06:41.492 +are really translations of each other? + +0:06:45.065 --> 0:06:58.673 +Why is this interesting so we had this number +before so there is some estimates that roughly + +0:06:58.673 --> 0:07:05.111 +a human reads around three hundred million. + +0:07:05.525 --> 0:07:16.006 +If you look into the web you will have millions +of words there so you can really get a large + +0:07:16.006 --> 0:07:21.754 +amount of data and if you think about monolingual. + +0:07:22.042 --> 0:07:32.702 +So at least for some language pairs there +is a large amount of data you can have. 
+ +0:07:32.852 --> 0:07:37.783 +Languages are official languages in one country. + +0:07:37.783 --> 0:07:46.537 +There's always a very great success because +a lot of websites from the government need + +0:07:46.537 --> 0:07:48.348 +to be translated. + +0:07:48.568 --> 0:07:58.777 +For example, a large purpose like in India, +which we have worked with in India, so you + +0:07:58.777 --> 0:08:00.537 +have parallel. + +0:08:01.201 --> 0:08:02.161 +Two questions. + +0:08:02.161 --> 0:08:08.438 +First of all, if jet GPS and machine translation +tools are more becoming ubiquitous and everybody + +0:08:08.438 --> 0:08:14.138 +uses them, don't we get a problem because we +want to crawl the web and use the data and. + +0:08:15.155 --> 0:08:18.553 +Yes, that is a severe problem. + +0:08:18.553 --> 0:08:26.556 +Of course, are we only training on training +data which is automatically? + +0:08:26.766 --> 0:08:41.182 +And if we are doing that, of course, we talked +about the synthetic data where we do back translation. + +0:08:41.341 --> 0:08:46.446 +But of course it gives you some aren't up +about norm, you cannot be much better than + +0:08:46.446 --> 0:08:46.806 +this. + +0:08:48.308 --> 0:08:57.194 +That is, we'll get more and more on issues, +so maybe at some point we won't look at the + +0:08:57.194 --> 0:09:06.687 +current Internet, but focus on oats like image +of the Internet, which are created by Archive. + +0:09:07.527 --> 0:09:18.611 +There's lots of classification algorithms +on how to classify automatic data they had + +0:09:18.611 --> 0:09:26.957 +a very interesting paper on how to watermark +their translation. + +0:09:27.107 --> 0:09:32.915 +So there's like two scenarios of course in +this program: The one thing you might want + +0:09:32.915 --> 0:09:42.244 +to find your own translation if you're a big +company and say do an antisystem that may be + +0:09:42.244 --> 0:09:42.866 +used. + +0:09:43.083 --> 0:09:49.832 +This problem might be that most of the translation +out there is created by you. + +0:09:49.832 --> 0:10:01.770 +You might be able: And there is a relatively +easy way of doing that so that there are other + +0:10:01.770 --> 0:10:09.948 +peoples' mainly that can do it like the search +or teacher. + +0:10:09.929 --> 0:10:12.878 +They are different, but there is not the one +correction station. + +0:10:13.153 --> 0:10:23.763 +So what you then can't do is you can't output +the best one to the user, but the highest value. + +0:10:23.763 --> 0:10:30.241 +For example, it's easy, but you can take the +translation. + +0:10:30.870 --> 0:10:40.713 +And if you always give the translation of +your investments, which are all good with the + +0:10:40.713 --> 0:10:42.614 +most ease, then. + +0:10:42.942 --> 0:10:55.503 +But of course this you can only do with most +of the data generated by your model. + +0:10:55.503 --> 0:11:02.855 +What we are now seeing is not only checks, +but. + +0:11:03.163 --> 0:11:13.295 +But it's definitely an additional research +question that might get more and more importance, + +0:11:13.295 --> 0:11:18.307 +and it might be an additional filtering step. + +0:11:18.838 --> 0:11:29.396 +There are other issues in data quality, so +in which direction wasn't translated, so that + +0:11:29.396 --> 0:11:31.650 +is not interested. + +0:11:31.891 --> 0:11:35.672 +But if you're now reaching better and better +quality, it makes a difference. + +0:11:35.672 --> 0:11:39.208 +The original data was from German to English +or from English to German. 
+ +0:11:39.499 --> 0:11:44.797 +Because translation, they call it translate +Chinese. + +0:11:44.797 --> 0:11:53.595 +So if you generate German from English, it +has a more similar structure as if you would + +0:11:53.595 --> 0:11:55.195 +directly speak. + +0:11:55.575 --> 0:11:57.187 +So um. + +0:11:57.457 --> 0:12:03.014 +These are all issues which you then might +do like do additional training to remove them + +0:12:03.014 --> 0:12:07.182 +or you first train on them and later train +on other quality data. + +0:12:07.182 --> 0:12:11.034 +But yet that's a general view on so it's an +important issue. + +0:12:11.034 --> 0:12:17.160 +But until now I think it hasn't been addressed +that much maybe because the quality was decently. + +0:12:18.858 --> 0:12:23.691 +Actually, I think we're sure if we have the +time we use the Internet. + +0:12:23.691 --> 0:12:29.075 +The problem is, it's a lot of English speaking +text, but most used languages. + +0:12:29.075 --> 0:12:34.460 +I don't know some language in Africa that's +spoken, but we do about that one. + +0:12:34.460 --> 0:12:37.566 +I mean, that's why most data is English too. + +0:12:38.418 --> 0:12:42.259 +Other languages, and then you get the best. + +0:12:42.259 --> 0:12:46.013 +If there is no data on the Internet, then. + +0:12:46.226 --> 0:12:48.255 +So there is still a lot of data collection. + +0:12:48.255 --> 0:12:50.976 +Also in the wild way you try to improve there +and collect. + +0:12:51.431 --> 0:12:57.406 +But English is the most in the world, but +you find surprisingly much data also for other + +0:12:57.406 --> 0:12:58.145 +languages. + +0:12:58.678 --> 0:13:04.227 +Of course, only if they're written remember. + +0:13:04.227 --> 0:13:15.077 +Most languages are not written at all, but +for them you might find some video, but it's + +0:13:15.077 --> 0:13:17.420 +difficult to find. + +0:13:17.697 --> 0:13:22.661 +So this is mainly done for the web trawling. + +0:13:22.661 --> 0:13:29.059 +It's mainly done for languages which are commonly +spoken. + +0:13:30.050 --> 0:13:38.773 +Is exactly the next point, so this is that +much data is only true for English and some + +0:13:38.773 --> 0:13:41.982 +other languages, but of course. + +0:13:41.982 --> 0:13:50.285 +And therefore a lot of research on how to +make things efficient and efficient and learn + +0:13:50.285 --> 0:13:54.248 +faster from pure data is still essential. + +0:13:59.939 --> 0:14:06.326 +So what we are interested in now on data is +parallel data. + +0:14:06.326 --> 0:14:10.656 +We assume always we have parallel data. + +0:14:10.656 --> 0:14:12.820 +That means we have. + +0:14:13.253 --> 0:14:20.988 +To be careful when you start crawling from +the web, we might get only related types of. + +0:14:21.421 --> 0:14:30.457 +So one comedy thing is what people refer as +noisy parallel data where there is documents + +0:14:30.457 --> 0:14:34.315 +which are translations of each other. + +0:14:34.434 --> 0:14:44.300 +So you have senses where there is no translation +on the other side because you have. + +0:14:44.484 --> 0:14:50.445 +So if you have these types of documents your +algorithm to extract parallel data might be + +0:14:50.445 --> 0:14:51.918 +a bit more difficult. + +0:14:52.352 --> 0:15:04.351 +Know if you can still remember in the beginning +of the lecture when we talked about different + +0:15:04.351 --> 0:15:06.393 +data resources. 
+ +0:15:06.286 --> 0:15:11.637 +But the first step is then approached to a +light source and target sentences, and it was + +0:15:11.637 --> 0:15:16.869 +about like a steep vocabulary, and then you +have some probabilities for one to one and + +0:15:16.869 --> 0:15:17.590 +one to one. + +0:15:17.590 --> 0:15:23.002 +It's very like simple algorithm, but yet it +works fine for really a high quality parallel + +0:15:23.002 --> 0:15:23.363 +data. + +0:15:23.623 --> 0:15:30.590 +But when we're talking about noisy data, we +might have to do additional steps and use more + +0:15:30.590 --> 0:15:35.872 +advanced models to extract what is parallel +and to get high quality. + +0:15:36.136 --> 0:15:44.682 +So if we just had no easy parallel data, the +document might not be as easy to extract. + +0:15:49.249 --> 0:15:54.877 +And then there is even the more extreme pains, +which has also been used to be honest. + +0:15:54.877 --> 0:15:58.214 +The use of this data is reasoning not that +common. + +0:15:58.214 --> 0:16:04.300 +It was more interested maybe like ten or fifteen +years ago, and that is what people referred + +0:16:04.300 --> 0:16:05.871 +to as comparative data. + +0:16:06.266 --> 0:16:17.167 +And then the idea is you even don't have translations +like sentences which are translations of each + +0:16:17.167 --> 0:16:25.234 +other, but you have more news documents or +articles about the same topic. + +0:16:25.205 --> 0:16:32.410 +But it's more that you find phrases which +are too big in the user, so even black fragments. + +0:16:32.852 --> 0:16:44.975 +So if you think about the pedia, for example, +these articles have to be written in like the + +0:16:44.975 --> 0:16:51.563 +Wikipedia general idea independent of each +other. + +0:16:51.791 --> 0:17:01.701 +They have different information in there, +and I mean, the German movie gets more detail + +0:17:01.701 --> 0:17:04.179 +than the English one. + +0:17:04.179 --> 0:17:07.219 +However, it might be that. + +0:17:07.807 --> 0:17:20.904 +And the same thing is that you think about +newspaper articles if they're at the same time. + +0:17:21.141 --> 0:17:24.740 +And so this is an ability to learn. + +0:17:24.740 --> 0:17:29.738 +For example, new phrases, vocabulary and stature. + +0:17:29.738 --> 0:17:36.736 +If you don't have parallel data, but you could +monitor all time long. + +0:17:37.717 --> 0:17:49.020 +And then not everything will be the same, +but there might be an overlap about events. + +0:17:54.174 --> 0:18:00.348 +So if we're talking about web trolling said +in the beginning it was really about specific. + +0:18:00.660 --> 0:18:18.878 +They do very good things by hand and really +focus on them and do a very specific way of + +0:18:18.878 --> 0:18:20.327 +doing. + +0:18:20.540 --> 0:18:23.464 +The European Parliament was very focused in +Ted. + +0:18:23.464 --> 0:18:26.686 +Maybe you even have looked in the particular +session. + +0:18:27.427 --> 0:18:40.076 +And these are still important, but they are +of course very specific in covering different + +0:18:40.076 --> 0:18:41.341 +pockets. + +0:18:42.002 --> 0:18:55.921 +Then there was a focus on language centering, +so there was a big drawer, for example, that + +0:18:55.921 --> 0:18:59.592 +you can check websites. + +0:19:00.320 --> 0:19:06.849 +Apparently what really people like is a more +general approach where you just have to specify. + +0:19:06.849 --> 0:19:13.239 +I'm interested in data from German to Lithuanian +and then you can as automatic as possible. 
+ +0:19:13.239 --> 0:19:15.392 +We see what's normally needed. + +0:19:15.392 --> 0:19:19.628 +You can collect as much data and extract codelaia +from this. + +0:19:21.661 --> 0:19:25.633 +So is this our interest? + +0:19:25.633 --> 0:19:36.435 +Of course, the question is how can we build +these types of systems? + +0:19:36.616 --> 0:19:52.913 +The first are more general web crawling base +systems, so there is nothing about. + +0:19:53.173 --> 0:19:57.337 +Based on the websites you have, you have to +do like text extraction. + +0:19:57.597 --> 0:20:06.503 +We are typically not that much interested +in text and images in there, so we try to extract + +0:20:06.503 --> 0:20:07.083 +text. + +0:20:07.227 --> 0:20:16.919 +This is also not specific to machine translation, +but it's a more traditional way of doing web + +0:20:16.919 --> 0:20:17.939 +trolling. + +0:20:18.478 --> 0:20:22.252 +And at the end you have mirror like some other +set of document collectors. + +0:20:22.842 --> 0:20:37.025 +Is the idea, so you have the text, and often +this is a document, and so in the end. + +0:20:37.077 --> 0:20:51.523 +And that is some of your starting point now +for doing the more machine translation. + +0:20:52.672 --> 0:21:05.929 +One way of doing that now is very similar +to what you might have think about the traditional + +0:21:05.929 --> 0:21:06.641 +one. + +0:21:06.641 --> 0:21:10.633 +The first thing is to do a. + +0:21:11.071 --> 0:21:22.579 +So you have this based on the initial fact +that you know this is a German website in the + +0:21:22.579 --> 0:21:25.294 +English translation. + +0:21:25.745 --> 0:21:31.037 +And based on this document alignment, then +you can do your sentence alignment. + +0:21:31.291 --> 0:21:39.072 +And this is similar to what we had before +with the church accordion. + +0:21:39.072 --> 0:21:43.696 +This is typically more noisy peril data. + +0:21:43.623 --> 0:21:52.662 +So that you are not assuming that everything +is on both sides, that the order is the same, + +0:21:52.662 --> 0:21:56.635 +so you should do more flexible systems. + +0:21:58.678 --> 0:22:14.894 +Then it depends if the documents you were +drawing were really some type of parallel data. + +0:22:15.115 --> 0:22:35.023 +Say then you should do what is referred to +as fragmented extraction. + +0:22:36.136 --> 0:22:47.972 +One problem with these types of models is +if you are doing errors in your document alignment,. + +0:22:48.128 --> 0:22:55.860 +It means that if you are saying these two +documents are align then you can only find + +0:22:55.860 --> 0:22:58.589 +sense and if you are missing. + +0:22:59.259 --> 0:23:15.284 +Is very different, only small parts of the +document are parallel, and most parts are independent + +0:23:15.284 --> 0:23:17.762 +of each other. + +0:23:19.459 --> 0:23:31.318 +Therefore, more recently, there is also the +idea of directly doing sentence aligned so + +0:23:31.318 --> 0:23:35.271 +that you're directly taking. + +0:23:36.036 --> 0:23:41.003 +Was already one challenge of this one, the +second approach. + +0:23:42.922 --> 0:23:50.300 +Yes, so one big challenge on here, beef, then +you have to do a lot of comparison. + +0:23:50.470 --> 0:23:59.270 +You have to cook out every source, every target +set and square. + +0:23:59.270 --> 0:24:06.283 +If you think of a million or trillion pairs, +then. + +0:24:07.947 --> 0:24:12.176 +And this also gives you a reason for a last +step in both cases. 
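The hierarchical variant just described, including that final filtering step, can be summarized as a rough skeleton; every callable below is a hypothetical placeholder for one of the components discussed in the rest of this lecture, and the language codes and threshold are arbitrary.

```python
def mine_parallel_corpus(pages, detect_lang, align_documents, align_sentences,
                         score_pair, src="de", tgt="en", threshold=0.7):
    """Hierarchical mining sketch: document alignment first, then sentence
    alignment inside aligned documents, then a final quality filter."""
    docs = {src: [], tgt: []}
    for text in pages:                               # text already extracted from HTML
        lang = detect_lang(text)
        if lang in docs:
            docs[lang].append(text)
    candidates = []
    for d_src, d_tgt in align_documents(docs[src], docs[tgt]):
        candidates.extend(align_sentences(d_src, d_tgt))         # only within aligned docs
    return [pair for pair in candidates if score_pair(*pair) >= threshold]
```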
+ +0:24:12.176 --> 0:24:18.320 +So in both of them you have to remember you're +typically eating here in this very large data + +0:24:18.320 --> 0:24:18.650 +set. + +0:24:18.650 --> 0:24:24.530 +So all of these and also the document alignment +here they should be done very efficient. + +0:24:24.965 --> 0:24:42.090 +And if you want to do it very efficiently, +that means your quality will go lower. + +0:24:41.982 --> 0:24:47.348 +Because you just have to ever see it fast, +and then yeah you can put less computation + +0:24:47.348 --> 0:24:47.910 +on each. + +0:24:48.688 --> 0:25:06.255 +Therefore, in a lot of scenarios it makes +sense to make an additional filtering step + +0:25:06.255 --> 0:25:08.735 +at the end. + +0:25:08.828 --> 0:25:13.370 +And then we do a second filtering step where +we now can put a lot more effort. + +0:25:13.433 --> 0:25:20.972 +Because now we don't have like any square +possible combinations anymore, we have already + +0:25:20.972 --> 0:25:26.054 +selected and maybe in dimension of maybe like +two or three. + +0:25:26.054 --> 0:25:29.273 +For each sentence we even don't have. + +0:25:29.429 --> 0:25:39.234 +And then we can put a lot more effort in each +individual example and build a high quality + +0:25:39.234 --> 0:25:42.611 +classic fire to really select. + +0:25:45.125 --> 0:26:00.506 +Two or one example for that, so one of the +biggest projects doing this is the so-called + +0:26:00.506 --> 0:26:03.478 +Paratrol Corpus. + +0:26:03.343 --> 0:26:11.846 +Typically it's like before the picturing so +there are a lot of challenges on how you can. + +0:26:12.272 --> 0:26:25.808 +And the steps they start to be with the seatbelt, +so what you should give at the beginning is: + +0:26:26.146 --> 0:26:36.908 +Then they do the problem, the text extraction, +the document alignment, the sentence alignment, + +0:26:36.908 --> 0:26:45.518 +and the sentence filter, and it swings down +to implementing the text store. + +0:26:46.366 --> 0:26:51.936 +We'll see later for a lot of language pairs +exist so it's easier to download them and then + +0:26:51.936 --> 0:26:52.793 +like improve. + +0:26:53.073 --> 0:27:08.270 +For example, the crawling one thing they often +do is even not throw the direct website because + +0:27:08.270 --> 0:27:10.510 +there's also. + +0:27:10.770 --> 0:27:14.540 +Black parts of the Internet that they can +work on today. + +0:27:14.854 --> 0:27:22.238 +In more detail, this is a bit shown here. + +0:27:22.238 --> 0:27:31.907 +All the steps you can see are different possibilities. + +0:27:32.072 --> 0:27:39.018 +You need a bit of knowledge to do that, or +you can build a machine translation system. + +0:27:39.239 --> 0:27:47.810 +There are two different ways of deduction +and alignment. + +0:27:47.810 --> 0:27:52.622 +You can use sentence alignment. + +0:27:53.333 --> 0:28:02.102 +And how you can do the flexigrade exam, for +example, the lexic graph, or you can chin. + +0:28:02.422 --> 0:28:05.826 +To the next step in a bit more detail. + +0:28:05.826 --> 0:28:13.680 +But before we're doing it, I need more questions +about the general overview of how these. + +0:28:22.042 --> 0:28:37.058 +Yeah, so two or three things to web-drawing, +so you normally start with the URLs. + +0:28:37.058 --> 0:28:40.903 +It's most promising. + +0:28:41.021 --> 0:28:46.674 +Found that if you're interested in German +to English, you would maybe move some data + +0:28:46.674 --> 0:28:47.073 +from. 
+ +0:28:47.407 --> 0:28:58.739 +Companies where you know they have a German +and an English website are from agencies which + +0:28:58.739 --> 0:29:08.359 +might be: And then we can use one of these +tools to start from there using standard web + +0:29:08.359 --> 0:29:10.328 +calling techniques. + +0:29:11.071 --> 0:29:23.942 +There are several challenges when doing that, +so if you request a website too often you can: + +0:29:25.305 --> 0:29:37.819 +You have to keep in history of the sites and +you click on all the links and then click on + +0:29:37.819 --> 0:29:40.739 +all the links again. + +0:29:41.721 --> 0:29:49.432 +To be very careful about legal issues starting +from this robotics day so get allowed to use. + +0:29:49.549 --> 0:29:58.941 +Mean, that's the one major thing about what +trolley general is. + +0:29:58.941 --> 0:30:05.251 +The problem is how you deal with property. + +0:30:05.685 --> 0:30:13.114 +That is why it is easier sometimes to start +with some quick fold data that you don't have. + +0:30:13.893 --> 0:30:22.526 +Of course, the network issues you retry, so +there's more technical things, but there's + +0:30:22.526 --> 0:30:23.122 +good. + +0:30:24.724 --> 0:30:35.806 +Another thing which is very helpful and is +often done is instead of doing the web trolling + +0:30:35.806 --> 0:30:38.119 +yourself, relying. + +0:30:38.258 --> 0:30:44.125 +And one thing is it's common crawl from the +web. + +0:30:44.125 --> 0:30:51.190 +Think on this common crawl a lot of these +language models. + +0:30:51.351 --> 0:30:59.763 +So think in American Company or organization +which really works on like writing. + +0:31:00.000 --> 0:31:01.111 +Possible. + +0:31:01.111 --> 0:31:10.341 +So the nice thing is if you start with this +you don't have to worry about network. + +0:31:10.250 --> 0:31:16.086 +I don't think you can do that because it's +too big, but you can do a pipeline on how to + +0:31:16.086 --> 0:31:16.683 +process. + +0:31:17.537 --> 0:31:28.874 +That is, of course, a general challenge in +all this web crawling and parallel web mining. + +0:31:28.989 --> 0:31:38.266 +That means you cannot just don't know the +data and study the processes. + +0:31:39.639 --> 0:31:45.593 +Here it might make sense to directly fields +of both domains that in some way bark just + +0:31:45.593 --> 0:31:46.414 +marginally. + +0:31:49.549 --> 0:31:59.381 +Then you can do the text extraction, which +means like converging two HTML and then splitting + +0:31:59.381 --> 0:32:01.707 +things from the HTML. + +0:32:01.841 --> 0:32:04.802 +Often very important is to do the language +I need. + +0:32:05.045 --> 0:32:16.728 +It's not that clear even if it's links which +language it is, but they are quite good tools + +0:32:16.728 --> 0:32:22.891 +like that can't identify from relatively short. + +0:32:23.623 --> 0:32:36.678 +And then you are now in the situation that +you have all your danger and that you can start. + +0:32:37.157 --> 0:32:43.651 +After the text extraction you have now a collection +or a large collection of of data where it's + +0:32:43.651 --> 0:32:49.469 +like text and maybe the document at use of +some meta information and now the question + +0:32:49.469 --> 0:32:55.963 +is based on this monolingual text or multilingual +text so text in many languages but not align. + +0:32:56.036 --> 0:32:59.863 +How can you now do a generate power? + +0:33:01.461 --> 0:33:06.289 +And UM. 
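For the text extraction and language identification step, a small sketch assuming the third-party packages beautifulsoup4 and langid are installed (any comparable HTML stripper and language identifier would work, and the minimum length is an arbitrary choice).

```python
from bs4 import BeautifulSoup   # pip install beautifulsoup4
import langid                   # pip install langid

def extract_text_and_language(html, wanted=("de", "en"), min_chars=200):
    """Strip the HTML down to text, then keep the page only if the detected
    language is one we are collecting and the text is long enough to be reliable."""
    text = BeautifulSoup(html, "html.parser").get_text(separator="\n")
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if len(text) < min_chars:
        return None
    lang, _score = langid.classify(text)
    return (lang, text) if lang in wanted else None
```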
+ +0:33:05.705 --> 0:33:13.322 +So if we're not seeing it as a task or if +we want to do it in a machine learning way, + +0:33:13.322 --> 0:33:20.940 +what we have is we have a set of sentences +and a suits language, and we have a set Of + +0:33:20.940 --> 0:33:23.331 +sentences from the target. + +0:33:23.823 --> 0:33:27.814 +This is the target language. + +0:33:27.814 --> 0:33:31.392 +This is the data we have. + +0:33:31.392 --> 0:33:37.034 +We kind of directly assume any ordering. + +0:33:38.018 --> 0:33:44.502 +More documents there are not really in line +or there is maybe a graph and what we are interested + +0:33:44.502 --> 0:33:50.518 +in is finding these alignments so which senses +are aligned to each other and which senses + +0:33:50.518 --> 0:33:53.860 +we can remove but we don't have translations +for. + +0:33:53.974 --> 0:34:00.339 +But exactly this mapping is what we are interested +in and what we need to find. + +0:34:01.901 --> 0:34:17.910 +And if we are modeling it more from the machine +translation point of view, what can model that + +0:34:17.910 --> 0:34:21.449 +as a classification? + +0:34:21.681 --> 0:34:36.655 +And so the main challenge of this is to build +this type of classifier and you want to decide. + +0:34:42.402 --> 0:34:50.912 +However, the biggest challenge has already +pointed out in the beginning is the sites if + +0:34:50.912 --> 0:34:53.329 +we have millions target. + +0:34:53.713 --> 0:35:05.194 +The number of comparison is n square, so this +very path is very inefficient, and we need + +0:35:05.194 --> 0:35:06.355 +to find. + +0:35:07.087 --> 0:35:16.914 +And traditionally there is the first one mentioned +before the local or the hierarchical meaning + +0:35:16.914 --> 0:35:20.292 +mining and there the idea is OK. + +0:35:20.292 --> 0:35:23.465 +First we are lining documents. + +0:35:23.964 --> 0:35:32.887 +Move back the things and align them, and once +you have the alignment you only need to remind. + +0:35:33.273 --> 0:35:51.709 +That of course makes anything more efficient +because we don't have to do all the comparison. + +0:35:53.253 --> 0:35:56.411 +Then it's, for example, in the before mentioned +apparel. + +0:35:57.217 --> 0:36:11.221 +But it has the issue that if this document +is bad you have error propagation and you can + +0:36:11.221 --> 0:36:14.211 +recover from that. + +0:36:14.494 --> 0:36:20.715 +Because then document that cannot say ever, +there are some sentences which are: Therefore, + +0:36:20.715 --> 0:36:24.973 +more recently there is also was referred to +as global mining. + +0:36:26.366 --> 0:36:31.693 +And there we really do this. + +0:36:31.693 --> 0:36:43.266 +Although it's in the square, we are doing +all the comparisons. + +0:36:43.523 --> 0:36:52.588 +So the idea is that you can do represent all +the sentences in a vector space. + +0:36:52.892 --> 0:37:06.654 +And then it's about nearest neighbor search +and there is a lot of very efficient algorithms. + +0:37:07.067 --> 0:37:20.591 +Then if you only compare them to your nearest +neighbors you don't have to do like a comparison + +0:37:20.591 --> 0:37:22.584 +but you have. + +0:37:26.186 --> 0:37:40.662 +So in the first step what we want to look +at is this: This document classification refers + +0:37:40.662 --> 0:37:49.584 +to the document alignment, and then we do the +sentence alignment. 
+ +0:37:51.111 --> 0:37:58.518 +And if we're talking about document alignment, +there's like typically two steps in that: We + +0:37:58.518 --> 0:38:01.935 +first do a candidate selection. + +0:38:01.935 --> 0:38:10.904 +Often we have several steps and that is again +to make more things more efficiently. + +0:38:10.904 --> 0:38:13.360 +We have the candidate. + +0:38:13.893 --> 0:38:18.402 +The candidate select means OK, which documents +do we want to compare? + +0:38:19.579 --> 0:38:35.364 +Then if we have initial candidates which might +be parallel, we can do a classification test. + +0:38:35.575 --> 0:38:37.240 +And there is different ways. + +0:38:37.240 --> 0:38:40.397 +We can use lexical similarity or we can use +ten basic. + +0:38:41.321 --> 0:38:48.272 +The first and easiest thing is to take off +possible candidates. + +0:38:48.272 --> 0:38:55.223 +There's one possibility, the other one, is +based on structural. + +0:38:55.235 --> 0:39:05.398 +So based on how your website looks like, you +might find that there are only translations. + +0:39:05.825 --> 0:39:14.789 +This is typically the only case where we try +to do some kind of major information, which + +0:39:14.789 --> 0:39:22.342 +can be very useful because we know that websites, +for example, are linked. + +0:39:22.722 --> 0:39:35.586 +We can try to use some URL patterns, so if +we have some website which ends with the. + +0:39:35.755 --> 0:39:43.932 +So that can be easily used in order to find +candidates. + +0:39:43.932 --> 0:39:49.335 +Then we only compare websites where. + +0:39:49.669 --> 0:40:05.633 +The language and the translation of each other, +but typically you hear several heuristics to + +0:40:05.633 --> 0:40:07.178 +do that. + +0:40:07.267 --> 0:40:16.606 +Then you don't have to compare all websites, +but you only have to compare web sites. + +0:40:17.277 --> 0:40:27.607 +Cruiser problems especially with an hour day's +content management system. + +0:40:27.607 --> 0:40:32.912 +Sometimes it's nice and easy to read. + +0:40:33.193 --> 0:40:44.452 +So on the one hand there typically leads from +the parent's side to different languages. + +0:40:44.764 --> 0:40:46.632 +Now I can look at the kit websites. + +0:40:46.632 --> 0:40:49.381 +It's the same thing you can check on the difference. + +0:40:49.609 --> 0:41:06.835 +Languages: You can either do that from the +parent website or you can click on the English. + +0:41:06.926 --> 0:41:10.674 +You can therefore either like prepare to all +the websites. + +0:41:10.971 --> 0:41:18.205 +Can be even more focused and checked if the +link is somehow either flexible or the language + +0:41:18.205 --> 0:41:18.677 +name. + +0:41:19.019 --> 0:41:24.413 +So there really depends on how much you want +to filter out. + +0:41:24.413 --> 0:41:29.178 +There is always a trade-off between being +efficient. + +0:41:33.913 --> 0:41:49.963 +Based on that we then have our candidate list, +so we now have two independent sets of German + +0:41:49.963 --> 0:41:52.725 +documents, but. + +0:41:53.233 --> 0:42:03.515 +And now the task is, we want to extract these, +which are really translations of each other. + +0:42:03.823 --> 0:42:10.201 +So the question of how can we measure the +document similarity? + +0:42:10.201 --> 0:42:14.655 +Because what we then do is, we measure the. + +0:42:14.955 --> 0:42:27.096 +And here you already see why this is also +that problematic from where it's partial or + +0:42:27.096 --> 0:42:28.649 +similarly. 
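The URL-pattern heuristic for candidate selection can be sketched in a few lines; the pattern below only knows a couple of language markers and is meant as an illustration, not a robust rule.

```python
import re
from collections import defaultdict

LANG_MARKERS = re.compile(r"/(de|en|fr)(/|$)|[._-](de|en|fr)\.html?$")

def candidate_pairs_by_url(urls, src="de", tgt="en"):
    """URLs that only differ in a language marker (e.g. /de/ vs /en/, or
    page_de.html vs page_en.html) become document-pair candidates."""
    buckets = defaultdict(dict)
    for url in urls:
        m = LANG_MARKERS.search(url)
        if not m:
            continue
        lang = m.group(1) or m.group(3)
        key = LANG_MARKERS.sub("/<LANG>/", url)      # language-independent key
        buckets[key][lang] = url
    return [(b[src], b[tgt]) for b in buckets.values() if src in b and tgt in b]

print(candidate_pairs_by_url(["https://x.org/de/info.html", "https://x.org/en/info.html"]))
```

Only the pairs returned here are passed on to the more expensive similarity classification, which is what keeps the document alignment step tractable.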
+ +0:42:30.330 --> 0:42:37.594 +All you can do that is again two folds. + +0:42:37.594 --> 0:42:48.309 +You can do it more content based or more structural +based. + +0:42:48.188 --> 0:42:53.740 +Calculating a lot of features and then maybe +training a classic pyramid small set which + +0:42:53.740 --> 0:42:57.084 +stands like based on the spesse feature is +the data. + +0:42:57.084 --> 0:42:58.661 +It is a corpus parallel. + +0:43:00.000 --> 0:43:10.955 +One way of doing that is to have traction +features, so the idea is the text length, so + +0:43:10.955 --> 0:43:12.718 +the document. + +0:43:13.213 --> 0:43:20.511 +Of course, text links will not be the same, +but if the one document has fifty words and + +0:43:20.511 --> 0:43:24.907 +the other five thousand words, it's quite realistic. + +0:43:25.305 --> 0:43:29.274 +So you can use the text length as one proxy +of. + +0:43:29.274 --> 0:43:32.334 +Is this might be a good translation? + +0:43:32.712 --> 0:43:41.316 +Now the thing is the alignment between the +structure. + +0:43:41.316 --> 0:43:52.151 +If you have here the website you can create +some type of structure. + +0:43:52.332 --> 0:44:04.958 +You can compare that to the French version +and then calculate some similarities because + +0:44:04.958 --> 0:44:07.971 +you see translation. + +0:44:08.969 --> 0:44:12.172 +Of course, it's getting more and more problematic. + +0:44:12.172 --> 0:44:16.318 +It does be a different structure than these +features are helpful. + +0:44:16.318 --> 0:44:22.097 +However, if you are doing it more in a trained +way, you can automatically learn how helpful + +0:44:22.097 --> 0:44:22.725 +they are. + +0:44:24.704 --> 0:44:37.516 +Then there are different ways of yeah: Content +based things: One easy thing, especially if + +0:44:37.516 --> 0:44:48.882 +you have systems that are using the same script +that you are looking for. + +0:44:48.888 --> 0:44:49.611 +The legs. + +0:44:49.611 --> 0:44:53.149 +We call them a beggar words and we'll look +into. + +0:44:53.149 --> 0:44:55.027 +You can use some type of. + +0:44:55.635 --> 0:44:58.418 +And neural embedding is also to abate him +at. + +0:45:02.742 --> 0:45:06.547 +And as then mean we have machine translation,. + +0:45:06.906 --> 0:45:14.640 +And one idea that you can also do is really +use the machine translation. + +0:45:14.874 --> 0:45:22.986 +Because this one is one which takes more effort, +so what you then have to do is put more effort. + +0:45:23.203 --> 0:45:37.526 +You wouldn't do this type of machine translation +based approach for a system which has product. + +0:45:38.018 --> 0:45:53.712 +But maybe your first of thinking why can't +do that because I'm collecting data to build + +0:45:53.712 --> 0:45:55.673 +an system. + +0:45:55.875 --> 0:46:01.628 +So you can use an initial system to translate +it, and then you can collect more data. + +0:46:01.901 --> 0:46:06.879 +And one way of doing that is, you're translating, +for example, all documents even to English. + +0:46:07.187 --> 0:46:25.789 +Then you only need two English data and you +do it in the example with three grams. + +0:46:25.825 --> 0:46:33.253 +For example, the current induction in 1 in +the Spanish, which is German induction in 1, + +0:46:33.253 --> 0:46:37.641 +which was Spanish induction in 2, which was +French. + +0:46:37.637 --> 0:46:52.225 +You're creating this index and then based +on that you can calculate how similar the documents. 
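A sketch of this translate-then-compare idea: once everything is machine-translated into English, each document is indexed by its word n-grams and documents are compared over TF-IDF-weighted counts with cosine similarity. The n-gram order and the TF-IDF weighting are illustrative choices.

```python
import math
from collections import Counter

def ngrams(tokens, n=3):
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def tfidf_vectors(docs, n=3):
    """docs: list of token lists, all already machine-translated into English."""
    counts = [Counter(ngrams(d, n)) for d in docs]
    df = Counter(g for c in counts for g in c)            # document frequency per n-gram
    N = len(docs)
    return [{g: tf * math.log(N / df[g]) for g, tf in c.items()} for c in counts]

def cosine(a, b):
    dot = sum(w * b.get(g, 0.0) for g, w in a.items())
    na = math.sqrt(sum(w * w for w in a.values()))
    nb = math.sqrt(sum(w * w for w in b.values()))
    return dot / (na * nb) if na and nb else 0.0
```

Document pairs whose similarity is high, and clearly higher than for competing candidates, are kept as aligned documents; exact matches are not expected, since even a genuine translation will only partially overlap after machine translation.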
+ +0:46:52.092 --> 0:46:58.190 +And then you can use the Cossack similarity +to really calculate which of the most similar + +0:46:58.190 --> 0:47:00.968 +document or how similar is the document. + +0:47:00.920 --> 0:47:04.615 +And then measure if this is a possible translation. + +0:47:05.285 --> 0:47:14.921 +Mean, of course, the document will not be +exactly the same, and even if you have a parallel + +0:47:14.921 --> 0:47:18.483 +document, French and German, and. + +0:47:18.898 --> 0:47:29.086 +You'll have not a perfect translation, therefore +it's looking into five front overlap since + +0:47:29.086 --> 0:47:31.522 +there should be last. + +0:47:34.074 --> 0:47:42.666 +Okay, before we take the next step and go +into the sentence alignment, there are more + +0:47:42.666 --> 0:47:44.764 +questions about the. + +0:47:51.131 --> 0:47:55.924 +Too Hot and. + +0:47:56.997 --> 0:47:59.384 +Well um. + +0:48:00.200 --> 0:48:05.751 +There is different ways of doing sentence +alignment. + +0:48:05.751 --> 0:48:12.036 +Here's one way to describe is to call the +other line again. + +0:48:12.172 --> 0:48:17.590 +Of course, we have the advantage that we have +only documents, so we might have like hundred + +0:48:17.590 --> 0:48:20.299 +sentences and hundred sentences in the tower. + +0:48:20.740 --> 0:48:31.909 +Although it still might be difficult to compare +all the things in parallel, and. + +0:48:31.791 --> 0:48:37.541 +And therefore typically these even assume +that we are only interested in a line character + +0:48:37.541 --> 0:48:40.800 +that can be identified on the sum of the diagonal. + +0:48:40.800 --> 0:48:46.422 +Of course, not exactly the diagonal will sum +some parts around it, but in order to make + +0:48:46.422 --> 0:48:47.891 +things more efficient. + +0:48:48.108 --> 0:48:55.713 +You can still do it around the diagonal because +if you say this is a parallel document, we + +0:48:55.713 --> 0:48:56.800 +assume that. + +0:48:56.836 --> 0:49:05.002 +We wouldn't have passed the document alignment, +therefore we wouldn't have seen it. + +0:49:05.505 --> 0:49:06.774 +In the underline. + +0:49:06.774 --> 0:49:10.300 +Then we are calculating the similarity for +these. + +0:49:10.270 --> 0:49:17.428 +Set this here based on the bilingual dictionary, +so it may be based on how much overlap you + +0:49:17.428 --> 0:49:17.895 +have. + +0:49:18.178 --> 0:49:24.148 +And then we are finding a path through it. + +0:49:24.148 --> 0:49:31.089 +You are finding a path which the lights ever +see. + +0:49:31.271 --> 0:49:41.255 +But you're trying to find a pass through your +document so that you get these parallel. + +0:49:41.201 --> 0:49:49.418 +And then the perfect ones here would be your +pass, where you just take this other parallel. + +0:49:51.011 --> 0:50:05.206 +The advantage is that on the one end limits +your search space, then centers alignment, + +0:50:05.206 --> 0:50:07.490 +and secondly. + +0:50:07.787 --> 0:50:10.013 +So what does it mean? + +0:50:10.013 --> 0:50:19.120 +So even if you have a very high probable pair, +you're not taking them on because overall. + +0:50:19.399 --> 0:50:27.063 +So sometimes it makes sense to also use this +global information and not only compare on + +0:50:27.063 --> 0:50:34.815 +individual sentences because what you're with +your parents is that sometimes it's only a + +0:50:34.815 --> 0:50:36.383 +good translation. 
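The path search through the sentence-to-sentence similarity matrix described above can be written as a small dynamic program. The sketch below is a simplified 1:1 monotone alignment: real aligners additionally allow 1:2 and 2:1 merges and restrict the search to a band around the diagonal for efficiency; `sim[i, j]` is assumed to be any similarity, for example bilingual-dictionary word overlap.

```python
import numpy as np

def align_monotone(sim: np.ndarray, skip_penalty: float = 0.0):
    """Monotone 1:1 sentence alignment via dynamic programming (higher sim is better)."""
    n, m = sim.shape
    score = np.zeros((n + 1, m + 1))
    score[1:, 0] = -skip_penalty * np.arange(1, n + 1)   # cost of skipping source sentences
    score[0, 1:] = -skip_penalty * np.arange(1, m + 1)   # cost of skipping target sentences
    back = np.zeros((n + 1, m + 1), dtype=int)           # 0 = align, 1 = skip source, 2 = skip target
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            options = (score[i - 1, j - 1] + sim[i - 1, j - 1],
                       score[i - 1, j] - skip_penalty,
                       score[i, j - 1] - skip_penalty)
            back[i, j] = int(np.argmax(options))
            score[i, j] = options[back[i, j]]
    # trace back the globally best monotone path
    pairs, i, j = [], n, m
    while i > 0 and j > 0:
        if back[i, j] == 0:
            pairs.append((i - 1, j - 1))
            i, j = i - 1, j - 1
        elif back[i, j] == 1:
            i -= 1
        else:
            j -= 1
    return pairs[::-1]

sim = np.array([[0.9, 0.1, 0.0],
                [0.2, 0.8, 0.1],
                [0.0, 0.1, 0.7]])
print(align_monotone(sim))   # [(0, 0), (1, 1), (2, 2)]
```

Because the path is optimized globally, a single locally high-scoring pair is not taken if it breaks the overall alignment, which is exactly the point made in the lecture.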
+ +0:50:38.118 --> 0:50:51.602 +So by this minion paste you're preventing +the system to do it at the border where there's + +0:50:51.602 --> 0:50:52.201 +no. + +0:50:53.093 --> 0:50:55.689 +So that might achieve you a bit better quality. + +0:50:56.636 --> 0:51:12.044 +The pack always ends if we write the button +for everybody, but it also means you couldn't + +0:51:12.044 --> 0:51:15.126 +necessarily have. + +0:51:15.375 --> 0:51:24.958 +Have some restrictions that is right, so first +of all they can't be translated out. + +0:51:25.285 --> 0:51:32.572 +So the handle line typically only really works +well if you have a relatively high quality. + +0:51:32.752 --> 0:51:39.038 +So if you have this more general data where +there's like some parts are translated and + +0:51:39.038 --> 0:51:39.471 +some. + +0:51:39.719 --> 0:51:43.604 +It doesn't really work, so it might. + +0:51:43.604 --> 0:51:53.157 +It's okay with having maybe at the end some +sentences which are missing, but in generally. + +0:51:53.453 --> 0:51:59.942 +So it's not robust against significant noise +on the. + +0:52:05.765 --> 0:52:12.584 +The second thing is is to what is referred +to as blue alibi. + +0:52:13.233 --> 0:52:16.982 +And this doesn't does, does not do us much. + +0:52:16.977 --> 0:52:30.220 +A global information you can translate each +sentence to English, and then you calculate + +0:52:30.220 --> 0:52:34.885 +the voice for the translation. + +0:52:35.095 --> 0:52:41.888 +And that you would get six answer points, +which are the ones in a purple ear. + +0:52:42.062 --> 0:52:56.459 +And then you have the ability to add some +points around it, which might be a bit lower. + +0:52:56.756 --> 0:53:06.962 +But here in this case you are able to deal +with reorderings, angles to deal with parts. + +0:53:07.247 --> 0:53:16.925 +Therefore, in this case we need a full scale +and key system to do this calculation while + +0:53:16.925 --> 0:53:17.686 +we're. + +0:53:18.318 --> 0:53:26.637 +Then, of course, the better your similarity +metric is, so the better you are able to do + +0:53:26.637 --> 0:53:35.429 +this comparison, the less you have to rely +on structural information that, in one sentence,. + +0:53:39.319 --> 0:53:53.411 +Anymore questions, and then there are things +like back in line which try to do the same. + +0:53:53.793 --> 0:53:59.913 +That means the idea is that you expect each +sentence. + +0:53:59.819 --> 0:54:02.246 +In a crossing will vector space. + +0:54:02.246 --> 0:54:08.128 +Crossing will vector space always means that +you have a vector or knight means. + +0:54:08.128 --> 0:54:14.598 +In this case you have a vector space where +sentences in different languages are near to + +0:54:14.598 --> 0:54:16.069 +each other if they. + +0:54:16.316 --> 0:54:23.750 +So you can have it again and so on, but just +next to each other and want to call you. + +0:54:24.104 --> 0:54:32.009 +And then you can of course measure now the +similarity by some distance matrix in this + +0:54:32.009 --> 0:54:32.744 +vector. + +0:54:33.033 --> 0:54:36.290 +And you're saying towards two senses are lying. + +0:54:36.290 --> 0:54:39.547 +If the distance in the vector space is somehow. + +0:54:40.240 --> 0:54:50.702 +We'll discuss that in a bit more heat soon +because these vector spades and bathings are + +0:54:50.702 --> 0:54:52.010 +even then. + +0:54:52.392 --> 0:54:55.861 +So the nice thing is with this. 
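The Bleualign-style idea mentioned here (translate each source sentence into English and look for target sentences with a high sentence-level BLEU) can be sketched as follows. This assumes the `sacrebleu` package and its `sentence_bleu` helper; the threshold and the toy sentences are illustrative only.

```python
import sacrebleu

def anchor_points(mt_of_source, target_sentences, threshold=15.0):
    """Keep (source index, target index) pairs whose MT output scores a high
    sentence-level BLEU against the target sentence; these serve as anchors."""
    anchors = []
    for i, hyp in enumerate(mt_of_source):
        for j, ref in enumerate(target_sentences):
            score = sacrebleu.sentence_bleu(hyp, [ref]).score
            if score >= threshold:
                anchors.append((i, j, score))
    return anchors

# English MT output of the source sentences (illustrative data).
mt_of_source = ["the man was sleeping in the forest"]
target_sentences = ["The man slept in the forest.", "Prices are listed below."]
print(anchor_points(mt_of_source, target_sentences, threshold=5.0))
```

As noted in the lecture, this needs a full MT system for the similarity computation, but it can handle reorderings and gaps better than purely structural alignment.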
+ +0:54:55.861 --> 0:55:05.508 +It's really good and good to get quite good +quality and can decide whether two sentences + +0:55:05.508 --> 0:55:08.977 +are translations of each other. + +0:55:08.888 --> 0:55:14.023 +In the fact-lined approach, but often they +even work on a global search way to really + +0:55:14.023 --> 0:55:15.575 +compare on everything to. + +0:55:16.236 --> 0:55:29.415 +What weak alignment also does is trying to +do to make this more efficient in finding the. + +0:55:29.309 --> 0:55:40.563 +If you don't want to compare everything to +everything, you first need sentence blocks, + +0:55:40.563 --> 0:55:41.210 +and. + +0:55:41.141 --> 0:55:42.363 +Then find him fast. + +0:55:42.562 --> 0:55:55.053 +You always have full sentence resolution, +but then you always compare on the area around. + +0:55:55.475 --> 0:56:11.501 +So if you do compare blocks on the source +of the target, then you have of your possibilities. + +0:56:11.611 --> 0:56:17.262 +So here the end times and comparison is a +lot less than the comparison you have here. + +0:56:17.777 --> 0:56:23.750 +And with neural embeddings you can also embed +not only single sentences and whole blocks. + +0:56:24.224 --> 0:56:28.073 +So how you make this in fast? + +0:56:28.073 --> 0:56:35.643 +You're starting from a coarse grain resolution +here where. + +0:56:36.176 --> 0:56:47.922 +Then you're getting a double pass where they +could be good and near this pass you're doing + +0:56:47.922 --> 0:56:49.858 +more and more. + +0:56:52.993 --> 0:56:54.601 +And yeah, what's the? + +0:56:54.601 --> 0:56:56.647 +This is the white egg lift. + +0:56:56.647 --> 0:56:59.352 +These are the sewers and the target. + +0:57:00.100 --> 0:57:16.163 +While it was sleeping in the forests and things, +I thought it was very strange to see this man. + +0:57:16.536 --> 0:57:25.197 +So you have the sentences, but if you do blocks +you have blocks that are in. + +0:57:30.810 --> 0:57:38.514 +This is the thing about the pipeline approach. + +0:57:38.514 --> 0:57:46.710 +We want to look at the global mining, but +before. + +0:57:53.633 --> 0:58:07.389 +In the global mining thing we have to also +do some filtering and so typically in the things + +0:58:07.389 --> 0:58:10.379 +they do they start. + +0:58:10.290 --> 0:58:14.256 +And then they are doing some pretty processing. + +0:58:14.254 --> 0:58:17.706 +So you try to at first to de-defecate paragraphs. + +0:58:17.797 --> 0:58:30.622 +So, of course, if you compare everything with +everything in two times the same input example, + +0:58:30.622 --> 0:58:35.748 +you will also: The hard thing is that you first +keep duplicating. + +0:58:35.748 --> 0:58:37.385 +You have each paragraph only one. + +0:58:37.958 --> 0:58:42.079 +There's a lot of text which occurs a lot of +times. + +0:58:42.079 --> 0:58:44.585 +They will happen all the time. + +0:58:44.884 --> 0:58:57.830 +There are pages about the cookie thing you +see and about accepting things. + +0:58:58.038 --> 0:59:04.963 +So you can already be duplicated here, or +your problem has crossed the website twice, + +0:59:04.963 --> 0:59:05.365 +and. + +0:59:06.066 --> 0:59:11.291 +Then you can remove low quality data like +cooking warnings that have biolabites start. + +0:59:12.012 --> 0:59:13.388 +Hey! + +0:59:13.173 --> 0:59:19.830 +So let you have maybe some other sentence, +and then you're doing a language idea. 
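The deduplication step described here (keep each paragraph only once, which also removes much of the repeated boilerplate such as cookie banners) can be done with a simple hash set over normalized paragraphs. A minimal sketch with toy data:

```python
import hashlib

def deduplicate_paragraphs(paragraphs):
    """Keep only the first occurrence of each whitespace-normalized, lowercased paragraph."""
    seen, kept = set(), []
    for p in paragraphs:
        key = hashlib.sha1(" ".join(p.lower().split()).encode("utf-8")).hexdigest()
        if key not in seen:
            seen.add(key)
            kept.append(p)
    return kept

pages = [
    "We use cookies to improve your experience.",
    "Our institute offers lectures on machine translation.",
    "We use cookies  to improve your experience.",   # same banner again, extra space
]
print(deduplicate_paragraphs(pages))
```

Without this step, identical boilerplate paragraphs would later show up as many spurious "perfect" matches in the mining stage.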
+ +0:59:19.830 --> 0:59:29.936 +That means you want to have a text, which +is: You want to know for each sentence a paragraph + +0:59:29.936 --> 0:59:38.695 +which language it has so that you then, of +course, if you want. + +0:59:39.259 --> 0:59:44.987 +Finally, there is some complexity based film +screenings to believe, for example, for very + +0:59:44.987 --> 0:59:46.069 +high complexity. + +0:59:46.326 --> 0:59:59.718 +That means, for example, data where there's +a lot of crazy names which are growing. + +1:00:00.520 --> 1:00:09.164 +Sometimes it also improves very high perplexity +data because that is then unmanned generated + +1:00:09.164 --> 1:00:09.722 +data. + +1:00:11.511 --> 1:00:17.632 +And then the model which is mostly used for +that is what is called a laser model. + +1:00:18.178 --> 1:00:21.920 +It's based on machine translation. + +1:00:21.920 --> 1:00:28.442 +Hope it all recognizes the machine translation +architecture. + +1:00:28.442 --> 1:00:37.103 +However, there is a difference between a general +machine translation system and. + +1:01:00.000 --> 1:01:13.322 +Machine translation system, so it's messy. + +1:01:14.314 --> 1:01:24.767 +See one bigger difference, which is great +if I'm excluding that object or the other. + +1:01:25.405 --> 1:01:39.768 +There is one difference to the other, one +with attention, so we are having. + +1:01:40.160 --> 1:01:43.642 +And then we are using that here in there each +time set up. + +1:01:44.004 --> 1:01:54.295 +Mean, therefore, it's maybe a bit similar +to original anti-system without attention. + +1:01:54.295 --> 1:01:56.717 +It's quite similar. + +1:01:57.597 --> 1:02:10.011 +However, it has this disadvantage saying that +we have to put everything in one sentence and + +1:02:10.011 --> 1:02:14.329 +that maybe not all information. + +1:02:15.055 --> 1:02:25.567 +However, now in this type of framework we +are not really interested in machine translation, + +1:02:25.567 --> 1:02:27.281 +so this model. + +1:02:27.527 --> 1:02:34.264 +So we are training it to do machine translation. + +1:02:34.264 --> 1:02:42.239 +What that means in the end should be as much +information. + +1:02:43.883 --> 1:03:01.977 +Only all the information in here is able to +really well do the machine translation. + +1:03:02.642 --> 1:03:07.801 +So that is the first step, so we are doing +here. + +1:03:07.801 --> 1:03:17.067 +We are building the MT system, not with the +goal of making the best MT system, but with + +1:03:17.067 --> 1:03:22.647 +learning and sentences, and hopefully all important. + +1:03:22.882 --> 1:03:26.116 +Because otherwise we won't be able to generate +the translation. + +1:03:26.906 --> 1:03:31.287 +So it's a bit more on the bottom neck like +to try to put as much information. + +1:03:32.012 --> 1:03:36.426 +And if you think if you want to do later finding +the bear's neighbor or something like. + +1:03:37.257 --> 1:03:48.680 +So finding similarities is typically possible +with fixed dimensional things, so we can do + +1:03:48.680 --> 1:03:56.803 +that in an end dimensional space and find the +nearest neighbor. + +1:03:57.857 --> 1:03:59.837 +Yeah, it would be very difficult. + +1:04:00.300 --> 1:04:03.865 +There's one thing that we also do. + +1:04:03.865 --> 1:04:09.671 +We don't want to find the nearest neighbor +in the other. + +1:04:10.570 --> 1:04:13.424 +Do you have an idea how we can train them? + +1:04:13.424 --> 1:04:16.542 +This is a set that embeddings can be compared. 
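The key property of the LASER-style encoder described here is that it compresses a whole sentence into one fixed-size vector (in LASER, by max-pooling over a bidirectional LSTM encoder), which is what makes fixed-dimensional nearest-neighbour search possible. Below is a minimal, untrained PyTorch sketch with made-up sizes; in the real model this encoder is trained as part of a translation system so that the pooled vector has to carry all the information needed for decoding.

```python
import torch
import torch.nn as nn

class SentenceEncoder(nn.Module):
    """BiLSTM encoder with max-pooling over time -> one fixed-size sentence vector."""

    def __init__(self, vocab_size=1000, emb_dim=64, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)

    def forward(self, token_ids):                       # (batch, time)
        states, _ = self.lstm(self.embed(token_ids))    # (batch, time, 2 * hidden_dim)
        return states.max(dim=1).values                 # (batch, 2 * hidden_dim)

encoder = SentenceEncoder()
sentence = torch.randint(0, 1000, (1, 7))               # one sentence, 7 token ids
print(encoder(sentence).shape)                           # torch.Size([1, 256])
```

The decoder in such a system sees only this fixed vector (no attention over individual source positions), which is the bottleneck effect discussed in the lecture.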
+ +1:04:23.984 --> 1:04:36.829 +Any idea do you think about two lectures, +a three lecture stack, one that did gave. + +1:04:41.301 --> 1:04:50.562 +We can train them on a multilingual setting +and that's how it's done in lasers so we're + +1:04:50.562 --> 1:04:56.982 +not doing it only from German to English but +we're training. + +1:04:57.017 --> 1:05:04.898 +Mean, if the English one has to be useful +for German, French and so on, and for German + +1:05:04.898 --> 1:05:13.233 +also, the German and the English and so have +to be useful, then somehow we'll automatically + +1:05:13.233 --> 1:05:16.947 +learn that these embattes are popularly. + +1:05:17.437 --> 1:05:28.562 +And then we can use an exact as we will plan +to have a similar sentence embedding. + +1:05:28.908 --> 1:05:39.734 +If you put in here a German and a French one +and always generate as they both have the same + +1:05:39.734 --> 1:05:48.826 +translations, you give these sentences: And +you should do exactly the same thing, so that's + +1:05:48.826 --> 1:05:50.649 +of course the easiest. + +1:05:51.151 --> 1:05:59.817 +If the sentence is very different then most +people will also hear the English decoder and + +1:05:59.817 --> 1:06:00.877 +therefore. + +1:06:02.422 --> 1:06:04.784 +So that is the first thing. + +1:06:04.784 --> 1:06:06.640 +Now we have this one. + +1:06:06.640 --> 1:06:10.014 +We have to be trained on parallel data. + +1:06:10.390 --> 1:06:22.705 +Then we can use these embeddings on our new +data and try to use them to make efficient + +1:06:22.705 --> 1:06:24.545 +comparisons. + +1:06:26.286 --> 1:06:30.669 +So how can you do comparison? + +1:06:30.669 --> 1:06:37.243 +Maybe the first thing you think of is to do. + +1:06:37.277 --> 1:06:44.365 +So you take all the German sentences, all +the French sentences. + +1:06:44.365 --> 1:06:49.460 +We compute the Cousin's simple limit between. + +1:06:49.469 --> 1:06:58.989 +And then you take all pairs where the similarity +is very high. + +1:07:00.180 --> 1:07:17.242 +So you have your French list, you have them, +and then you just take all sentences. + +1:07:19.839 --> 1:07:29.800 +It's an additional power method that we have, +but we have a lot of data who will find a point. + +1:07:29.800 --> 1:07:32.317 +It's a good point, but. + +1:07:35.595 --> 1:07:45.738 +It's also not that easy, so one problem is +that typically there are some sentences where. + +1:07:46.066 --> 1:07:48.991 +And other points where there is very few points +in the neighborhood. + +1:07:49.629 --> 1:08:06.241 +And then for things where a lot of things +are enabled you might extract not for one percent + +1:08:06.241 --> 1:08:08.408 +to do that. + +1:08:08.868 --> 1:08:18.341 +So what typically is happening is you do the +max merchant? + +1:08:18.341 --> 1:08:25.085 +How good is a pair compared to the other? + +1:08:25.305 --> 1:08:33.859 +So you take the similarity between X and Y, +and then you look at one of the eight nearest + +1:08:33.859 --> 1:08:35.190 +neighbors of. + +1:08:35.115 --> 1:08:48.461 +Of x and what are the eight nearest neighbors +of y, and the dividing of the similarity through + +1:08:48.461 --> 1:08:51.411 +the eight neighbors. + +1:08:51.671 --> 1:09:00.333 +So what you may be looking at are these two +sentences a lot more similar than all the other. + +1:09:00.840 --> 1:09:13.455 +And if these are exceptional and similar compared +to other sentences then they should be translations. + +1:09:16.536 --> 1:09:19.158 +Of course, that has also some. 
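The margin criterion described here (the cosine similarity of a candidate pair divided by the average similarity to its k nearest neighbours on both sides, with k = 8 mentioned in the lecture) can be sketched with NumPy. This brute-force version is for small toy data only; in practice the neighbour search is done with an approximate index.

```python
import numpy as np

def margin_scores(x_emb, y_emb, k=8):
    """Ratio-margin scores between all source embeddings x and target embeddings y."""
    x = x_emb / np.linalg.norm(x_emb, axis=1, keepdims=True)
    y = y_emb / np.linalg.norm(y_emb, axis=1, keepdims=True)
    sim = x @ y.T                                           # all cosine similarities
    k_x, k_y = min(k, sim.shape[1]), min(k, sim.shape[0])
    # average similarity of each x to its k nearest y's, and of each y to its k nearest x's
    nn_x = np.sort(sim, axis=1)[:, -k_x:].mean(axis=1)      # (num_x,)
    nn_y = np.sort(sim, axis=0)[-k_y:, :].mean(axis=0)      # (num_y,)
    denom = (nn_x[:, None] + nn_y[None, :]) / 2.0
    return sim / denom                                      # > 1 means "unusually similar"

x_emb = np.random.rand(5, 16)
y_emb = np.random.rand(6, 16)
print(margin_scores(x_emb, y_emb, k=3).shape)               # (5, 6)
```

Dividing by the neighbourhood similarity is what handles the density problem: a pair only scores highly if it is much more similar than everything else around it, not merely similar in absolute terms.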
+ +1:09:19.158 --> 1:09:24.148 +Then the good thing is there's a lot of similar +sentences. + +1:09:24.584 --> 1:09:30.641 +If there is a lot of similar sensations in +white then these are also very similar and + +1:09:30.641 --> 1:09:32.824 +you are doing more comparison. + +1:09:32.824 --> 1:09:36.626 +If all the arrows are far away then the translations. + +1:09:37.057 --> 1:09:40.895 +So think about this like short sentences. + +1:09:40.895 --> 1:09:47.658 +They might be that most things are similar, +but they are just in general. + +1:09:49.129 --> 1:09:59.220 +There are some problems that now we assume +there is only one pair of translations. + +1:09:59.759 --> 1:10:09.844 +So it has some problems in their two or three +ballad translations of that. + +1:10:09.844 --> 1:10:18.853 +Then, of course, this pair might not find +it, but in general this. + +1:10:19.139 --> 1:10:27.397 +For example, they have like all of these common +trawl. + +1:10:27.397 --> 1:10:32.802 +They have large parallel data sets. + +1:10:36.376 --> 1:10:38.557 +One point maybe also year. + +1:10:38.557 --> 1:10:45.586 +Of course, now it's important that we have +done the deduplication before because if we + +1:10:45.586 --> 1:10:52.453 +wouldn't have the deduplication, we would have +points which are the same coordinate. + +1:10:57.677 --> 1:11:03.109 +Maybe only one small things to that mean. + +1:11:03.109 --> 1:11:09.058 +A major issue in this case is still making +a. + +1:11:09.409 --> 1:11:18.056 +So you have to still do all of this comparison, +and that cannot be done just by simple. + +1:11:19.199 --> 1:11:27.322 +So what is done typically express the word, +you know things can be done in parallel. + +1:11:28.368 --> 1:11:36.024 +So calculating the embeddings and all that +stuff doesn't need to be sequential, but it's + +1:11:36.024 --> 1:11:37.143 +independent. + +1:11:37.357 --> 1:11:48.680 +What you typically do is create an event and +then you do some kind of projectization. + +1:11:48.708 --> 1:11:57.047 +So there is this space library which does +key nearest neighbor search very efficient + +1:11:57.047 --> 1:11:59.597 +in very high-dimensional. + +1:12:00.080 --> 1:12:03.410 +And then based on that you can now do comparison. + +1:12:03.410 --> 1:12:06.873 +You can even do the comparison in parallel +because. + +1:12:06.906 --> 1:12:13.973 +Can look at different areas of your space +and then compare the different pieces to find + +1:12:13.973 --> 1:12:14.374 +the. + +1:12:15.875 --> 1:12:30.790 +With this you are then able to do very fast +calculations on this type of sentence. + +1:12:31.451 --> 1:12:34.761 +So yeah this is currently one. + +1:12:35.155 --> 1:12:48.781 +Mean, those of them are covered with this, +so there's a parade. + +1:12:48.668 --> 1:12:55.543 +We are collected by that and most of them +are in a very big corporate for languages which + +1:12:55.543 --> 1:12:57.453 +you can hardly stand on. + +1:12:58.778 --> 1:13:01.016 +Do you have any more questions on this? + +1:13:05.625 --> 1:13:17.306 +And then some more words to this last set +here: So we have now done our pearl marker + +1:13:17.306 --> 1:13:25.165 +and we could assume that everything is fine +now. + +1:13:25.465 --> 1:13:35.238 +However, the problem with this noisy data +is that typically this is quite noisy still, + +1:13:35.238 --> 1:13:35.687 +so. 
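The library referred to here for efficient nearest-neighbour search in high-dimensional spaces is presumably FAISS. A minimal sketch with an exact inner-product index over L2-normalized embeddings (so that inner product equals cosine similarity); the data is random and only illustrates the API.

```python
import numpy as np
import faiss

d = 128                                             # embedding dimension
xb = np.random.rand(10_000, d).astype("float32")    # target-side sentence embeddings
xq = np.random.rand(5, d).astype("float32")         # query (source-side) embeddings

# Normalize so that inner product == cosine similarity.
faiss.normalize_L2(xb)
faiss.normalize_L2(xq)

index = faiss.IndexFlatIP(d)        # exact search; quantized indexes scale much further
index.add(xb)
scores, ids = index.search(xq, 8)   # 8 nearest neighbours per query
print(ids.shape, scores.shape)      # (5, 8) (5, 8)
```

Because the embedding computation and the search over different regions of the space are independent, the whole mining step parallelizes well, which is the point made in the lecture.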
+ +1:13:36.176 --> 1:13:44.533 +In order to make things efficient to have +a high recall, the final data is often not + +1:13:44.533 --> 1:13:49.547 +of the best quality, not the same type of quality. + +1:13:49.789 --> 1:13:58.870 +So it is essential to do another figuring +step and to remove senses which might seem + +1:13:58.870 --> 1:14:01.007 +to be translations. + +1:14:01.341 --> 1:14:08.873 +And here, of course, the final evaluation +matrix would be how much do my system improve? + +1:14:09.089 --> 1:14:23.476 +And there are even challenges on doing that +so: people getting this noisy data like symmetrics + +1:14:23.476 --> 1:14:25.596 +or something. + +1:14:27.707 --> 1:14:34.247 +However, all these steps is of course very +time consuming, so you might not always want + +1:14:34.247 --> 1:14:37.071 +to do the full pipeline and training. + +1:14:37.757 --> 1:14:51.614 +So how can you model that we want to get this +best and normally what we always want? + +1:14:51.871 --> 1:15:02.781 +You also want to have the best over translation +quality, but this is also normally not achieved + +1:15:02.781 --> 1:15:03.917 +with all. + +1:15:04.444 --> 1:15:12.389 +And that's why you're doing this two-step +approach first of the second alignment. + +1:15:12.612 --> 1:15:27.171 +And after once you do the sentence filtering, +we can put a lot more alphabet in all the comparisons. + +1:15:27.627 --> 1:15:37.472 +For example, you can just translate the source +and compare that translation with the original + +1:15:37.472 --> 1:15:40.404 +one and calculate how good. + +1:15:40.860 --> 1:15:49.467 +And this, of course, you can do with the filing +set, but you can't do with your initial set + +1:15:49.467 --> 1:15:50.684 +of millions. + +1:15:54.114 --> 1:16:01.700 +So what it is again is the ancient test where +you input as a sentence pair as here, and then + +1:16:01.700 --> 1:16:09.532 +once you have a biometria, these are sentence +pairs with a high quality, and these are sentence + +1:16:09.532 --> 1:16:11.653 +pairs avec a low quality. + +1:16:12.692 --> 1:16:17.552 +Does anybody see what might be a challenge +if you want to train this type of classifier? + +1:16:22.822 --> 1:16:24.264 +How do you measure exactly? + +1:16:24.264 --> 1:16:26.477 +The quality is probably about the problem. + +1:16:27.887 --> 1:16:39.195 +Yes, that is one, that is true, there is even +more, more simple one, and high quality data + +1:16:39.195 --> 1:16:42.426 +here is not so difficult. + +1:16:43.303 --> 1:16:46.844 +Globally, yeah, probably we have a class in +balance. + +1:16:46.844 --> 1:16:49.785 +We don't see many bad quality combinations. + +1:16:49.785 --> 1:16:54.395 +It's hard to get there at the beginning, so +maybe how can you argue? + +1:16:54.395 --> 1:16:58.405 +Where do you find bad quality and what type +of bad quality? + +1:16:58.798 --> 1:17:05.122 +Because if it's too easy, you just take a +random germ and the random innocence that is + +1:17:05.122 --> 1:17:05.558 +very. + +1:17:05.765 --> 1:17:15.747 +But what you're interested is like bad quality +data, which still passes your first initial + +1:17:15.747 --> 1:17:16.405 +step. + +1:17:17.257 --> 1:17:28.824 +What you can use for that is you can use any +type of network or model that in the beginning, + +1:17:28.824 --> 1:17:33.177 +like in random forests, would see. + +1:17:33.613 --> 1:17:38.912 +So the positive examples are quite easy to +get. + +1:17:38.912 --> 1:17:44.543 +You just take parallel data and high quality +data. 
+ +1:17:44.543 --> 1:17:45.095 +You. + +1:17:45.425 --> 1:17:47.565 +That is quite easy. + +1:17:47.565 --> 1:17:55.482 +You normally don't need a lot of data, then +to train in a few validation. + +1:17:57.397 --> 1:18:12.799 +The challenge is like the negative samples +because how would you generate negative samples? + +1:18:13.133 --> 1:18:17.909 +Because the negative examples are the ones +which ask the first step but don't ask the + +1:18:17.909 --> 1:18:18.353 +second. + +1:18:18.838 --> 1:18:23.682 +So how do you typically do it? + +1:18:23.682 --> 1:18:28.994 +You try to do synthetic examples. + +1:18:28.994 --> 1:18:33.369 +You can do random examples. + +1:18:33.493 --> 1:18:45.228 +But this is the typical error that you want +to detect when you do frequency based replacements. + +1:18:45.228 --> 1:18:52.074 +But this is one major issue when you generate +the data. + +1:18:52.132 --> 1:19:02.145 +That doesn't match well with what are the +real arrows that you're interested in. + +1:19:02.702 --> 1:19:13.177 +Is some of the most challenging here to find +the negative samples, which are hard enough + +1:19:13.177 --> 1:19:14.472 +to detect. + +1:19:17.537 --> 1:19:21.863 +And the other thing, which is difficult, is +of course the data ratio. + +1:19:22.262 --> 1:19:24.212 +Why is it important any? + +1:19:24.212 --> 1:19:29.827 +Why is the ratio between positive and negative +examples here important? + +1:19:30.510 --> 1:19:40.007 +Because in a case of plus imbalance we effectively +could learn to just that it's positive and + +1:19:40.007 --> 1:19:43.644 +high quality and we would be right. + +1:19:44.844 --> 1:19:46.654 +Yes, so I'm training. + +1:19:46.654 --> 1:19:51.180 +This is important, but otherwise it might +be too easy. + +1:19:51.180 --> 1:19:52.414 +You always do. + +1:19:52.732 --> 1:19:58.043 +And on the other head, of course, navy and +deputy, it's also important because if we have + +1:19:58.043 --> 1:20:03.176 +equal things, we're also assuming that this +might be the other one, and if the quality + +1:20:03.176 --> 1:20:06.245 +is worse or higher, we might also accept too +fewer. + +1:20:06.626 --> 1:20:10.486 +So this ratio is not easy to determine. + +1:20:13.133 --> 1:20:16.969 +What type of features can we use? + +1:20:16.969 --> 1:20:23.175 +Traditionally, we're also looking at word +translation. + +1:20:23.723 --> 1:20:37.592 +And nowadays, of course, we can model this +also with something like similar, so this is + +1:20:37.592 --> 1:20:38.696 +again. + +1:20:40.200 --> 1:20:42.306 +Language follow. + +1:20:42.462 --> 1:20:49.763 +So we can, for example, put the sentence in +there for the source and the target, and then + +1:20:49.763 --> 1:20:56.497 +based on this classification label we can classify +as this a parallel sentence or. + +1:20:56.476 --> 1:21:00.054 +So it's more like a normal classification +task. + +1:21:00.160 --> 1:21:09.233 +And by having a system which can have much +enable input, we can just put in two R. + +1:21:09.233 --> 1:21:16.886 +We can also put in two independent of each +other based on the hidden. + +1:21:17.657 --> 1:21:35.440 +You can, as you do any other type of classifier, +you can train them on top of. + +1:21:35.895 --> 1:21:42.801 +This so it tries to represent the full sentence +and that's what you also want to do on. + +1:21:43.103 --> 1:21:45.043 +The Other Thing What They Can't Do Is, of +Course. + +1:21:45.265 --> 1:21:46.881 +You can make here. 
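The synthetic negative examples mentioned above (random mismatches, truncations, word replacements derived from a clean parallel corpus) can be generated along these lines. This is only a sketch of the easy cases; as the lecture stresses, the hard part is producing negatives that resemble the real errors that survive the first mining step.

```python
import random

def make_negatives(parallel_pairs, seed=0):
    """Generate simple synthetic negatives from clean parallel sentence pairs."""
    rng = random.Random(seed)
    negatives = []
    sources = [s for s, _ in parallel_pairs]
    targets = [t for _, t in parallel_pairs]
    for src, tgt in parallel_pairs:
        # 1) random mismatch: pair the source with an unrelated target
        #    (may occasionally pick the true target; a real pipeline excludes that)
        negatives.append((src, rng.choice(targets)))
        # 2) truncation: drop the second half of the target sentence
        words = tgt.split()
        negatives.append((src, " ".join(words[: max(1, len(words) // 2)])))
        # 3) word replacement: overwrite one target word with a random source-side word
        if words:
            i = rng.randrange(len(words))
            words[i] = rng.choice(" ".join(sources).split())
            negatives.append((src, " ".join(words)))
    return negatives

pairs = [("Das Haus ist groß.", "The house is big."),
         ("Ich trinke Kaffee.", "I am drinking coffee.")]
print(make_negatives(pairs)[:3])
```

The ratio of such negatives to the (easy to obtain) positives then has to be tuned, for the class-imbalance reasons discussed above.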
+ +1:21:46.881 --> 1:21:52.837 +You can do your summation of all the hidden +statements that you said. + +1:21:58.698 --> 1:22:10.618 +Okay, and then one thing which we skipped +until now, and that is only briefly this fragment. + +1:22:10.630 --> 1:22:19.517 +So if we have sentences which are not really +parallel, can we also extract information from + +1:22:19.517 --> 1:22:20.096 +them? + +1:22:22.002 --> 1:22:25.627 +And so what here the test is? + +1:22:25.627 --> 1:22:33.603 +We have a sentence and we want to find within +or a sentence pair. + +1:22:33.603 --> 1:22:38.679 +We want to find within the sentence pair. + +1:22:39.799 --> 1:22:46.577 +And how that, for example, has been done is +using a lexical positive and negative association. + +1:22:47.187 --> 1:22:57.182 +And then you can transform your target sentence +into a signal and find a thing where you have. + +1:22:57.757 --> 1:23:00.317 +So I'm Going to Get a Clear Eye. + +1:23:00.480 --> 1:23:15.788 +So you hear the English sentence, the other +language, and you have an alignment between + +1:23:15.788 --> 1:23:18.572 +them, and then. + +1:23:18.818 --> 1:23:21.925 +This is not a light cell from a negative signal. + +1:23:22.322 --> 1:23:40.023 +And then you drink some sauce on there because +you want to have an area where there's. + +1:23:40.100 --> 1:23:51.728 +It doesn't matter if you have simple arrows +here by smooth saying you can't extract. + +1:23:51.972 --> 1:23:58.813 +So you try to find long segments here where +at least most of the words are somehow aligned. + +1:24:00.040 --> 1:24:10.069 +And then you take this one in the side and +extract that one as your parallel fragment, + +1:24:10.069 --> 1:24:10.645 +and. + +1:24:10.630 --> 1:24:21.276 +So in the end you not only have full sentences +but you also have partial sentences which might + +1:24:21.276 --> 1:24:27.439 +be helpful for especially if you have quite +low upset. + +1:24:32.332 --> 1:24:36.388 +That's everything work for today. + +1:24:36.388 --> 1:24:44.023 +What you hopefully remember is the thing about +how the general. + +1:24:44.184 --> 1:24:54.506 +We talked about how we can do the document +alignment and then we can do the sentence alignment, + +1:24:54.506 --> 1:24:57.625 +which can be done after the. + +1:24:59.339 --> 1:25:12.611 +Any more questions think on Thursday we had +to do a switch, so on Thursday there will be + +1:25:12.611 --> 1:25:15.444 +a practical thing. + +0:00:01.921 --> 0:00:16.424 +Hey welcome to today's lecture, what we today +want to look at is how we can make new. + +0:00:16.796 --> 0:00:26.458 +So until now we have this global system, the +encoder and the decoder mostly, and we haven't + +0:00:26.458 --> 0:00:29.714 +really thought about how long. + +0:00:30.170 --> 0:00:42.684 +And what we, for example, know is yeah, you +can make the systems bigger in different ways. + +0:00:42.684 --> 0:00:47.084 +We can make them deeper so the. + +0:00:47.407 --> 0:00:56.331 +And if we have at least enough data that typically +helps you make things performance better,. + +0:00:56.576 --> 0:01:00.620 +But of course leads to problems that we need +more resources. + +0:01:00.620 --> 0:01:06.587 +That is a problem at universities where we +have typically limited computation capacities. + +0:01:06.587 --> 0:01:11.757 +So at some point you have such big models +that you cannot train them anymore. 
+ +0:01:13.033 --> 0:01:23.792 +And also for companies is of course important +if it costs you like to generate translation + +0:01:23.792 --> 0:01:26.984 +just by power consumption. + +0:01:27.667 --> 0:01:35.386 +So yeah, there's different reasons why you +want to do efficient machine translation. + +0:01:36.436 --> 0:01:48.338 +One reason is there are different ways of +how you can improve your machine translation + +0:01:48.338 --> 0:01:50.527 +system once we. + +0:01:50.670 --> 0:01:55.694 +There can be different types of data we looked +into data crawling, monolingual data. + +0:01:55.875 --> 0:01:59.024 +All this data and the aim is always. + +0:01:59.099 --> 0:02:06.067 +Of course, we are not just purely interested +in having more data, but the idea why we want + +0:02:06.067 --> 0:02:12.959 +to have more data is that more data also means +that we have better quality because mostly + +0:02:12.959 --> 0:02:17.554 +we are interested in increasing the quality +of the machine. + +0:02:18.838 --> 0:02:24.892 +But there's also other ways of how you can +improve the quality of a machine translation. + +0:02:25.325 --> 0:02:36.450 +And what is, of course, that is where most +research is focusing on. + +0:02:36.450 --> 0:02:44.467 +It means all we want to build better algorithms. + +0:02:44.684 --> 0:02:48.199 +Course: The other things are normally as good. + +0:02:48.199 --> 0:02:54.631 +Sometimes it's easier to improve, so often +it's easier to just collect more data than + +0:02:54.631 --> 0:02:57.473 +to invent some great view algorithms. + +0:02:57.473 --> 0:03:00.315 +But yeah, both of them are important. + +0:03:00.920 --> 0:03:09.812 +But there is this third thing, especially +with neural machine translation, and that means + +0:03:09.812 --> 0:03:11.590 +we make a bigger. + +0:03:11.751 --> 0:03:16.510 +Can be, as said, that we have more layers, +that we have wider layers. + +0:03:16.510 --> 0:03:19.977 +The other thing we talked a bit about is ensemble. + +0:03:19.977 --> 0:03:24.532 +That means we are not building one new machine +translation system. + +0:03:24.965 --> 0:03:27.505 +And we can easily build four. + +0:03:27.505 --> 0:03:32.331 +What is the typical strategy to build different +systems? + +0:03:32.331 --> 0:03:33.177 +Remember. + +0:03:35.795 --> 0:03:40.119 +It should be of course a bit different if +you have the same. + +0:03:40.119 --> 0:03:44.585 +If they all predict the same then combining +them doesn't help. + +0:03:44.585 --> 0:03:48.979 +So what is the easiest way if you have to +build four systems? + +0:03:51.711 --> 0:04:01.747 +And the Charleston's will take, but this is +the best output of a single system. + +0:04:02.362 --> 0:04:10.165 +Mean now, it's really three different systems +so that you later can combine them and maybe + +0:04:10.165 --> 0:04:11.280 +the average. + +0:04:11.280 --> 0:04:16.682 +Ensembles are typically that the average is +all probabilities. + +0:04:19.439 --> 0:04:24.227 +The idea is to think about neural networks. + +0:04:24.227 --> 0:04:29.342 +There's one parameter which can easily adjust. + +0:04:29.342 --> 0:04:36.525 +That's exactly the easiest way to randomize +with three different. + +0:04:37.017 --> 0:04:43.119 +They have the same architecture, so all the +hydroparameters are the same, but they are + +0:04:43.119 --> 0:04:43.891 +different. + +0:04:43.891 --> 0:04:46.556 +They will have different predictions. + +0:04:48.228 --> 0:04:52.572 +So, of course, bigger amounts. 
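The seed-based ensembling described above (train several systems that differ only in their random initialization, then average their output probabilities) comes down to averaging the next-token distributions at each decoding step. A toy sketch with made-up probability vectors:

```python
import numpy as np

def ensemble_step(distributions):
    """Average the next-token probability distributions of several models."""
    return np.mean(np.stack(distributions, axis=0), axis=0)

# Next-token distributions of three models trained with different random seeds.
p1 = np.array([0.70, 0.20, 0.10])
p2 = np.array([0.60, 0.30, 0.10])
p3 = np.array([0.50, 0.25, 0.25])
avg = ensemble_step([p1, p2, p3])
print(avg, avg.argmax())   # averaged distribution; token 0 would be picked
```

The cost is that all ensemble members have to be kept in memory and evaluated at every step, which is exactly the efficiency concern raised next.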
+ +0:04:52.572 --> 0:05:05.325 +Some of these are a bit the easiest way of +improving your quality because you don't really + +0:05:05.325 --> 0:05:08.268 +have to do anything. + +0:05:08.588 --> 0:05:12.588 +There is limits on that bigger models only +get better. + +0:05:12.588 --> 0:05:19.132 +If you have enough training data you can't +do like a handheld layer and you will not work + +0:05:19.132 --> 0:05:24.877 +on very small data but with a recent amount +of data that is the easiest thing. + +0:05:25.305 --> 0:05:33.726 +However, they are challenging with making +better models, bigger motors, and that is the + +0:05:33.726 --> 0:05:34.970 +computation. + +0:05:35.175 --> 0:05:44.482 +So, of course, if you have a bigger model +that can mean that you have longer running + +0:05:44.482 --> 0:05:49.518 +times, if you have models, you have to times. + +0:05:51.171 --> 0:05:56.685 +Normally you cannot paralyze the different +layers because the input to one layer is always + +0:05:56.685 --> 0:06:02.442 +the output of the previous layer, so you propagate +that so it will also increase your runtime. + +0:06:02.822 --> 0:06:10.720 +Then you have to store all your models in +memory. + +0:06:10.720 --> 0:06:20.927 +If you have double weights you will have: +Is more difficult to then do back propagation. + +0:06:20.927 --> 0:06:27.680 +You have to store in between the activations, +so there's not only do you increase the model + +0:06:27.680 --> 0:06:31.865 +in your memory, but also all these other variables +that. + +0:06:34.414 --> 0:06:36.734 +And so in general it is more expensive. + +0:06:37.137 --> 0:06:54.208 +And therefore there's good reasons in looking +into can we make these models sound more efficient. + +0:06:54.134 --> 0:07:00.982 +So it's been through the viewer, you can have +it okay, have one and one day of training time, + +0:07:00.982 --> 0:07:01.274 +or. + +0:07:01.221 --> 0:07:07.535 +Forty thousand euros and then what is the +best machine translation system I can get within + +0:07:07.535 --> 0:07:08.437 +this budget. + +0:07:08.969 --> 0:07:19.085 +And then, of course, you can make the models +bigger, but then you have to train them shorter, + +0:07:19.085 --> 0:07:24.251 +and then we can make more efficient algorithms. + +0:07:25.925 --> 0:07:31.699 +If you think about efficiency, there's a bit +different scenarios. + +0:07:32.312 --> 0:07:43.635 +So if you're more of coming from the research +community, what you'll be doing is building + +0:07:43.635 --> 0:07:47.913 +a lot of models in your research. + +0:07:48.088 --> 0:07:58.645 +So you're having your test set of maybe sentences, +calculating the blue score, then another model. + +0:07:58.818 --> 0:08:08.911 +So what that means is typically you're training +on millions of cents, so your training time + +0:08:08.911 --> 0:08:14.944 +is long, maybe a day, but maybe in other cases +a week. + +0:08:15.135 --> 0:08:22.860 +The testing is not really the cost efficient, +but the training is very costly. + +0:08:23.443 --> 0:08:37.830 +If you are more thinking of building models +for application, the scenario is quite different. + +0:08:38.038 --> 0:08:46.603 +And then you keep it running, and maybe thousands +of customers are using it in translating. + +0:08:46.603 --> 0:08:47.720 +So in that. + +0:08:48.168 --> 0:08:59.577 +And we will see that it is not always the +same type of challenges you can paralyze some + +0:08:59.577 --> 0:09:07.096 +things in training, which you cannot paralyze +in testing. 
+ +0:09:07.347 --> 0:09:14.124 +For example, in training you have to do back +propagation, so you have to store the activations. + +0:09:14.394 --> 0:09:23.901 +Therefore, in testing we briefly discussed +that we would do it in more detail today in + +0:09:23.901 --> 0:09:24.994 +training. + +0:09:25.265 --> 0:09:36.100 +You know they're a target and you can process +everything in parallel while in testing. + +0:09:36.356 --> 0:09:46.741 +So you can only do one word at a time, and +so you can less paralyze this. + +0:09:46.741 --> 0:09:50.530 +Therefore, it's important. + +0:09:52.712 --> 0:09:55.347 +Is a specific task on this. + +0:09:55.347 --> 0:10:03.157 +For example, it's the efficiency task where +it's about making things as efficient. + +0:10:03.123 --> 0:10:09.230 +Is possible and they can look at different +resources. + +0:10:09.230 --> 0:10:14.207 +So how much deep fuel run time do you need? + +0:10:14.454 --> 0:10:19.366 +See how much memory you need or you can have +a fixed memory budget and then have to build + +0:10:19.366 --> 0:10:20.294 +the best system. + +0:10:20.500 --> 0:10:29.010 +And here is a bit like an example of that, +so there's three teams from Edinburgh from + +0:10:29.010 --> 0:10:30.989 +and they submitted. + +0:10:31.131 --> 0:10:36.278 +So then, of course, if you want to know the +most efficient system you have to do a bit + +0:10:36.278 --> 0:10:36.515 +of. + +0:10:36.776 --> 0:10:44.656 +You want to have a better quality or more +runtime and there's not the one solution. + +0:10:44.656 --> 0:10:46.720 +You can improve your. + +0:10:46.946 --> 0:10:49.662 +And that you see that there are different +systems. + +0:10:49.909 --> 0:11:06.051 +Here is how many words you can do for a second +on the clock, and you want to be as talk as + +0:11:06.051 --> 0:11:07.824 +possible. + +0:11:08.068 --> 0:11:08.889 +And you see here a bit. + +0:11:08.889 --> 0:11:09.984 +This is a little bit different. + +0:11:11.051 --> 0:11:27.717 +You want to be there on the top right corner +and you can get a score of something between + +0:11:27.717 --> 0:11:29.014 +words. + +0:11:30.250 --> 0:11:34.161 +Two hundred and fifty thousand, then you'll +ever come and score zero point three. + +0:11:34.834 --> 0:11:41.243 +There is, of course, any bit of a decision, +but the question is, like how far can you again? + +0:11:41.243 --> 0:11:47.789 +Some of all these points on this line would +be winners because they are somehow most efficient + +0:11:47.789 --> 0:11:53.922 +in a way that there's no system which achieves +the same quality with less computational. + +0:11:57.657 --> 0:12:04.131 +So there's the one question of which resources +are you interested. + +0:12:04.131 --> 0:12:07.416 +Are you running it on CPU or GPU? + +0:12:07.416 --> 0:12:11.668 +There's different ways of paralyzing stuff. + +0:12:14.654 --> 0:12:20.777 +Another dimension is how you process your +data. + +0:12:20.777 --> 0:12:27.154 +There's really the best processing and streaming. + +0:12:27.647 --> 0:12:34.672 +So in batch processing you have the whole +document available so you can translate all + +0:12:34.672 --> 0:12:39.981 +sentences in perimeter and then you're interested +in throughput. + +0:12:40.000 --> 0:12:43.844 +But you can then process, for example, especially +in GPS. 
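The sorting trick mentioned for batch (offline) translation, grouping sentences of similar length so that little padding is wasted inside each batch, can be sketched as follows; the original positions are kept so the outputs can be restored to document order afterwards.

```python
def make_batches(sentences, batch_size=4):
    """Sort by length, then cut into batches so padding inside a batch is minimal."""
    order = sorted(range(len(sentences)), key=lambda i: len(sentences[i].split()))
    batches = []
    for start in range(0, len(order), batch_size):
        idx = order[start:start + batch_size]
        batches.append([(i, sentences[i]) for i in idx])   # keep original indices
    return batches

docs = ["short one", "a somewhat longer sentence here", "tiny",
        "another fairly long example sentence for the batch", "mid length input"]
for batch in make_batches(docs, batch_size=2):
    print(batch)
```

In a streaming / live-translation setting this is not possible, since sentences have to be translated as they arrive, which is why latency rather than throughput becomes the relevant measure there.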
+ +0:12:43.844 --> 0:12:49.810 +That's interesting, you're not translating +one sentence at a time, but you're translating + +0:12:49.810 --> 0:12:56.108 +one hundred sentences or so in parallel, so +you have one more dimension where you can paralyze + +0:12:56.108 --> 0:12:57.964 +and then be more efficient. + +0:12:58.558 --> 0:13:14.863 +On the other hand, for example sorts of documents, +so we learned that if you do badge processing + +0:13:14.863 --> 0:13:16.544 +you have. + +0:13:16.636 --> 0:13:24.636 +Then, of course, it makes sense to sort the +sentences in order to have the minimum thing + +0:13:24.636 --> 0:13:25.535 +attached. + +0:13:27.427 --> 0:13:32.150 +The other scenario is more the streaming scenario +where you do life translation. + +0:13:32.512 --> 0:13:40.212 +So in that case you can't wait for the whole +document to pass, but you have to do. + +0:13:40.520 --> 0:13:49.529 +And then, for example, that's especially in +situations like speech translation, and then + +0:13:49.529 --> 0:13:53.781 +you're interested in things like latency. + +0:13:53.781 --> 0:14:00.361 +So how much do you have to wait to get the +output of a sentence? + +0:14:06.566 --> 0:14:16.956 +Finally, there is the thing about the implementation: +Today we're mainly looking at different algorithms, + +0:14:16.956 --> 0:14:23.678 +different models of how you can model them +in your machine translation system, but of + +0:14:23.678 --> 0:14:29.227 +course for the same algorithms there's also +different implementations. + +0:14:29.489 --> 0:14:38.643 +So, for example, for a machine translation +this tool could be very fast. + +0:14:38.638 --> 0:14:46.615 +So they have like coded a lot of the operations +very low resource, not low resource, low level + +0:14:46.615 --> 0:14:49.973 +on the directly on the QDAC kernels in. + +0:14:50.110 --> 0:15:00.948 +So the same attention network is typically +more efficient in that type of algorithm. + +0:15:00.880 --> 0:15:02.474 +Than in in any other. + +0:15:03.323 --> 0:15:13.105 +Of course, it might be other disadvantages, +so if you're a little worker or have worked + +0:15:13.105 --> 0:15:15.106 +in the practical. + +0:15:15.255 --> 0:15:22.604 +Because it's normally easier to understand, +easier to change, and so on, but there is again + +0:15:22.604 --> 0:15:23.323 +a train. + +0:15:23.483 --> 0:15:29.440 +You have to think about, do you want to include +this into my study or comparison or not? + +0:15:29.440 --> 0:15:36.468 +Should it be like I compare different implementations +and I also find the most efficient implementation? + +0:15:36.468 --> 0:15:39.145 +Or is it only about the pure algorithm? + +0:15:42.742 --> 0:15:50.355 +Yeah, when building these systems there is +a different trade-off to do. + +0:15:50.850 --> 0:15:56.555 +So there's one of the traders between memory +and throughput, so how many words can generate + +0:15:56.555 --> 0:15:57.299 +per second. + +0:15:57.557 --> 0:16:03.351 +So typically you can easily like increase +your scruple by increasing the batch size. + +0:16:03.643 --> 0:16:06.899 +So that means you are translating more sentences +in parallel. + +0:16:07.107 --> 0:16:09.241 +And gypsies are very good at that stuff. + +0:16:09.349 --> 0:16:15.161 +It should translate one sentence or one hundred +sentences, not the same time, but its. 
+ +0:16:15.115 --> 0:16:20.784 +Rough are very similar because they are at +this efficient metrics multiplication so that + +0:16:20.784 --> 0:16:24.415 +you can do the same operation on all sentences +parallel. + +0:16:24.415 --> 0:16:30.148 +So typically that means if you increase your +benchmark you can do more things in parallel + +0:16:30.148 --> 0:16:31.995 +and you will translate more. + +0:16:31.952 --> 0:16:33.370 +Second. + +0:16:33.653 --> 0:16:43.312 +On the other hand, with this advantage, of +course you will need higher badge sizes and + +0:16:43.312 --> 0:16:44.755 +more memory. + +0:16:44.965 --> 0:16:56.452 +To begin with, the other problem is that you +have such big models that you can only translate + +0:16:56.452 --> 0:16:59.141 +with lower bed sizes. + +0:16:59.119 --> 0:17:08.466 +If you are running out of memory with translating, +one idea to go on that is to decrease your. + +0:17:13.453 --> 0:17:24.456 +Then there is the thing about quality in Screwport, +of course, and before it's like larger models, + +0:17:24.456 --> 0:17:28.124 +but in generally higher quality. + +0:17:28.124 --> 0:17:31.902 +The first one is always this way. + +0:17:32.092 --> 0:17:38.709 +Course: Not always larger model helps you +have over fitting at some point, but in generally. + +0:17:43.883 --> 0:17:52.901 +And with this a bit on this training and testing +thing we had before. + +0:17:53.113 --> 0:17:58.455 +So it wears all the difference between training +and testing, and for the encoder and decoder. + +0:17:58.798 --> 0:18:06.992 +So if we are looking at what mentioned before +at training time, we have a source sentence + +0:18:06.992 --> 0:18:17.183 +here: And how this is processed on a is not +the attention here. + +0:18:17.183 --> 0:18:21.836 +That's a tubical transformer. + +0:18:22.162 --> 0:18:31.626 +And how we can do that on a is that we can +paralyze the ear ever since. + +0:18:31.626 --> 0:18:40.422 +The first thing to know is: So that is, of +course, not in all cases. + +0:18:40.422 --> 0:18:49.184 +We'll later talk about speech translation +where we might want to translate. + +0:18:49.389 --> 0:18:56.172 +Without the general case in, it's like you +have the full sentence you want to translate. + +0:18:56.416 --> 0:19:02.053 +So the important thing is we are here everything +available on the source side. + +0:19:03.323 --> 0:19:13.524 +And then this was one of the big advantages +that you can remember back of transformer. + +0:19:13.524 --> 0:19:15.752 +There are several. + +0:19:16.156 --> 0:19:25.229 +But the other one is now that we can calculate +the full layer. + +0:19:25.645 --> 0:19:29.318 +There is no dependency between this and this +state or this and this state. + +0:19:29.749 --> 0:19:36.662 +So we always did like here to calculate the +key value and query, and based on that you + +0:19:36.662 --> 0:19:37.536 +calculate. + +0:19:37.937 --> 0:19:46.616 +Which means we can do all these calculations +here in parallel and in parallel. + +0:19:48.028 --> 0:19:55.967 +And there, of course, is this very efficiency +because again for GPS it's too bigly possible + +0:19:55.967 --> 0:20:00.887 +to do these things in parallel and one after +each other. + +0:20:01.421 --> 0:20:10.311 +And then we can also for each layer one by +one, and then we calculate here the encoder. 
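The point that the encoder can compute the keys, values and queries for all source positions at once corresponds to a few matrix multiplications over the whole sentence. A NumPy sketch of one single-head self-attention layer (no masking, random toy weights):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def self_attention(X, Wq, Wk, Wv):
    """Single-head self-attention over all positions in parallel.

    X: (time, d_model) -- the whole source sentence at once.
    """
    Q, K, V = X @ Wq, X @ Wk, X @ Wv            # all positions in one matmul each
    scores = Q @ K.T / np.sqrt(K.shape[-1])     # (time, time) attention logits
    return softmax(scores, axis=-1) @ V         # (time, d_v)

d_model, d_k, T = 16, 8, 5
rng = np.random.default_rng(0)
X = rng.normal(size=(T, d_model))
out = self_attention(X, rng.normal(size=(d_model, d_k)),
                        rng.normal(size=(d_model, d_k)),
                        rng.normal(size=(d_model, d_k)))
print(out.shape)   # (5, 8)
```

Because there is no dependency between positions within a layer, a GPU can process the whole sentence in parallel; only the layers themselves have to be computed one after the other.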
+ +0:20:10.790 --> 0:20:21.921 +In training now an important thing is that +for the decoder we have the full sentence available + +0:20:21.921 --> 0:20:28.365 +because we know this is the target we should +generate. + +0:20:29.649 --> 0:20:33.526 +We have models now in a different way. + +0:20:33.526 --> 0:20:38.297 +This hidden state is only on the previous +ones. + +0:20:38.598 --> 0:20:51.887 +And the first thing here depends only on this +information, so you see if you remember we + +0:20:51.887 --> 0:20:56.665 +had this masked self-attention. + +0:20:56.896 --> 0:21:04.117 +So that means, of course, we can only calculate +the decoder once the encoder is done, but that's. + +0:21:04.444 --> 0:21:06.656 +Percent can calculate the end quarter. + +0:21:06.656 --> 0:21:08.925 +Then we can calculate here the decoder. + +0:21:09.569 --> 0:21:25.566 +But again in training we have x, y and that +is available so we can calculate everything + +0:21:25.566 --> 0:21:27.929 +in parallel. + +0:21:28.368 --> 0:21:40.941 +So the interesting thing or advantage of transformer +is in training. + +0:21:40.941 --> 0:21:46.408 +We can do it for the decoder. + +0:21:46.866 --> 0:21:54.457 +That means you will have more calculations +because you can only calculate one layer at + +0:21:54.457 --> 0:22:02.310 +a time, but for example the length which is +too bigly quite long or doesn't really matter + +0:22:02.310 --> 0:22:03.270 +that much. + +0:22:05.665 --> 0:22:10.704 +However, in testing this situation is different. + +0:22:10.704 --> 0:22:13.276 +In testing we only have. + +0:22:13.713 --> 0:22:20.622 +So this means we start with a sense: We don't +know the full sentence yet because we ought + +0:22:20.622 --> 0:22:29.063 +to regularly generate that so for the encoder +we have the same here but for the decoder. + +0:22:29.409 --> 0:22:39.598 +In this case we only have the first and the +second instinct, but only for all states in + +0:22:39.598 --> 0:22:40.756 +parallel. + +0:22:41.101 --> 0:22:51.752 +And then we can do the next step for y because +we are putting our most probable one. + +0:22:51.752 --> 0:22:58.643 +We do greedy search or beam search, but you +cannot do. + +0:23:03.663 --> 0:23:16.838 +Yes, so if we are interesting in making things +more efficient for testing, which we see, for + +0:23:16.838 --> 0:23:22.363 +example in the scenario of really our. + +0:23:22.642 --> 0:23:34.286 +It makes sense that we think about our architecture +and that we are currently working on attention + +0:23:34.286 --> 0:23:35.933 +based models. + +0:23:36.096 --> 0:23:44.150 +The decoder there is some of the most time +spent testing and testing. + +0:23:44.150 --> 0:23:47.142 +It's similar, but during. + +0:23:47.167 --> 0:23:50.248 +Nothing about beam search. + +0:23:50.248 --> 0:23:59.833 +It might be even more complicated because +in beam search you have to try different. + +0:24:02.762 --> 0:24:15.140 +So the question is what can you now do in +order to make your model more efficient and + +0:24:15.140 --> 0:24:21.905 +better in translation in these types of cases? + +0:24:24.604 --> 0:24:30.178 +And the one thing is to look into the encoded +decoder trailer. + +0:24:30.690 --> 0:24:43.898 +And then until now we typically assume that +the depth of the encoder and the depth of the + +0:24:43.898 --> 0:24:48.154 +decoder is roughly the same. + +0:24:48.268 --> 0:24:55.553 +So if you haven't thought about it, you just +take what is running well. + +0:24:55.553 --> 0:24:57.678 +You would try to do. 
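The autoregressive test-time decoding described in this part (one output token per decoder pass, in contrast to the fully parallel training case) looks schematically like the loop below. `model.step` is a hypothetical interface assumed only for this sketch: it returns next-token log-probabilities given the encoded source and the prefix generated so far.

```python
def greedy_decode(model, src_encoding, bos_id, eos_id, max_len=100):
    """Schematic greedy decoding: one decoder step per generated token.

    `model.step(src_encoding, prefix)` is assumed to return a sequence of
    next-token log-probabilities -- this interface is illustrative only.
    """
    prefix = [bos_id]
    for _ in range(max_len):
        log_probs = model.step(src_encoding, prefix)   # cannot be parallelized over time
        next_token = max(range(len(log_probs)), key=log_probs.__getitem__)
        prefix.append(next_token)
        if next_token == eos_id:
            break
    return prefix[1:]
```

With beam search the loop additionally keeps several candidate prefixes per step, which makes the decoder even more dominant in the overall test-time cost; that is why the following ideas focus on making the decoder cheaper.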
+ +0:24:58.018 --> 0:25:04.148 +However, we saw now that there is a quite +big challenge and the runtime is a lot longer + +0:25:04.148 --> 0:25:04.914 +than here. + +0:25:05.425 --> 0:25:14.018 +The question is also the case for the calculations, +or do we have there the same issue that we + +0:25:14.018 --> 0:25:21.887 +only get the good quality if we are having +high and high, so we know that making these + +0:25:21.887 --> 0:25:25.415 +more depths is increasing our quality. + +0:25:25.425 --> 0:25:31.920 +But what we haven't talked about is really +important that we increase the depth the same + +0:25:31.920 --> 0:25:32.285 +way. + +0:25:32.552 --> 0:25:41.815 +So what we can put instead also do is something +like this where you have a deep encoder and + +0:25:41.815 --> 0:25:42.923 +a shallow. + +0:25:43.163 --> 0:25:57.386 +So that would be that you, for example, have +instead of having layers on the encoder, and + +0:25:57.386 --> 0:25:59.757 +layers on the. + +0:26:00.080 --> 0:26:10.469 +So in this case the overall depth from start +to end would be similar and so hopefully. + +0:26:11.471 --> 0:26:21.662 +But we could a lot more things hear parallelized, +and hear what is costly at the end during decoding + +0:26:21.662 --> 0:26:22.973 +the decoder. + +0:26:22.973 --> 0:26:29.330 +Because that does change in an outer regressive +way, there we. + +0:26:31.411 --> 0:26:33.727 +And that that can be analyzed. + +0:26:33.727 --> 0:26:38.734 +So here is some examples: Where people have +done all this. + +0:26:39.019 --> 0:26:55.710 +So here it's mainly interested on the orange +things, which is auto-regressive about the + +0:26:55.710 --> 0:26:57.607 +speed up. + +0:26:57.717 --> 0:27:15.031 +You have the system, so agree is not exactly +the same, but it's similar. + +0:27:15.055 --> 0:27:23.004 +It's always the case if you look at speed +up. + +0:27:23.004 --> 0:27:31.644 +Think they put a speed of so that's the baseline. + +0:27:31.771 --> 0:27:35.348 +So between and times as fast. + +0:27:35.348 --> 0:27:42.621 +If you switch from a system to where you have +layers in the. + +0:27:42.782 --> 0:27:52.309 +You see that although you have slightly more +parameters, more calculations are also roughly + +0:27:52.309 --> 0:28:00.283 +the same, but you can speed out because now +during testing you can paralyze. + +0:28:02.182 --> 0:28:09.754 +The other thing is that you're speeding up, +but if you look at the performance it's similar, + +0:28:09.754 --> 0:28:13.500 +so sometimes you improve, sometimes you lose. + +0:28:13.500 --> 0:28:20.421 +There's a bit of losing English to Romania, +but in general the quality is very slow. + +0:28:20.680 --> 0:28:30.343 +So you see that you can keep a similar performance +while improving your speed by just having different. + +0:28:30.470 --> 0:28:34.903 +And you also see the encoder layers from speed. + +0:28:34.903 --> 0:28:38.136 +They don't really metal that much. + +0:28:38.136 --> 0:28:38.690 +Most. + +0:28:38.979 --> 0:28:50.319 +Because if you compare the 12th system to +the 6th system you have a lower performance + +0:28:50.319 --> 0:28:57.309 +with 6th and colder layers but the speed is +similar. + +0:28:57.897 --> 0:29:02.233 +And see the huge decrease is it maybe due +to a lack of data. + +0:29:03.743 --> 0:29:11.899 +Good idea would say it's not the case. + +0:29:11.899 --> 0:29:23.191 +Romanian English should have the same number +of data. + +0:29:24.224 --> 0:29:31.184 +Maybe it's just that something in that language. 
+ +0:29:31.184 --> 0:29:40.702 +If you generate Romanian maybe they need more +target dependencies. + +0:29:42.882 --> 0:29:46.263 +The Wine's the Eye Also Don't Know Any Sex +People Want To. + +0:29:47.887 --> 0:29:49.034 +There could be yeah the. + +0:29:49.889 --> 0:29:58.962 +As the maybe if you go from like a movie sphere +to a hybrid sphere, you can: It's very much + +0:29:58.962 --> 0:30:12.492 +easier to expand the vocabulary to English, +but it must be the vocabulary. + +0:30:13.333 --> 0:30:21.147 +Have to check, but would assume that in this +case the system is not retrained, but it's + +0:30:21.147 --> 0:30:22.391 +trained with. + +0:30:22.902 --> 0:30:30.213 +And that's why I was assuming that they have +the same, but maybe you'll write that in this + +0:30:30.213 --> 0:30:35.595 +piece, for example, if they were pre-trained, +the decoder English. + +0:30:36.096 --> 0:30:43.733 +But don't remember exactly if they do something +like that, but that could be a good. + +0:30:45.325 --> 0:30:52.457 +So this is some of the most easy way to speed +up. + +0:30:52.457 --> 0:31:01.443 +You just switch to hyperparameters, not to +implement anything. + +0:31:02.722 --> 0:31:08.367 +Of course, there's other ways of doing that. + +0:31:08.367 --> 0:31:11.880 +We'll look into two things. + +0:31:11.880 --> 0:31:16.521 +The other thing is the architecture. + +0:31:16.796 --> 0:31:28.154 +We are now at some of the baselines that we +are doing. + +0:31:28.488 --> 0:31:39.978 +However, in translation in the decoder side, +it might not be the best solution. + +0:31:39.978 --> 0:31:41.845 +There is no. + +0:31:42.222 --> 0:31:47.130 +So we can use different types of architectures, +also in the encoder and the. + +0:31:47.747 --> 0:31:52.475 +And there's two ways of what you could do +different, or there's more ways. + +0:31:52.912 --> 0:31:54.825 +We will look into two todays. + +0:31:54.825 --> 0:31:58.842 +The one is average attention, which is a very +simple solution. + +0:31:59.419 --> 0:32:01.464 +You can do as it says. + +0:32:01.464 --> 0:32:04.577 +It's not really attending anymore. + +0:32:04.577 --> 0:32:08.757 +It's just like equal attendance to everything. + +0:32:09.249 --> 0:32:23.422 +And the other idea, which is currently done +in most systems which are optimized to efficiency, + +0:32:23.422 --> 0:32:24.913 +is we're. + +0:32:25.065 --> 0:32:32.623 +But on the decoder side we are then not using +transformer or self attention, but we are using + +0:32:32.623 --> 0:32:39.700 +recurrent neural network because they are the +disadvantage of recurrent neural network. + +0:32:39.799 --> 0:32:48.353 +And then the recurrent is normally easier +to calculate because it only depends on inputs, + +0:32:48.353 --> 0:32:49.684 +the input on. + +0:32:51.931 --> 0:33:02.190 +So what is the difference between decoding +and why is the tension maybe not sufficient + +0:33:02.190 --> 0:33:03.841 +for decoding? + +0:33:04.204 --> 0:33:14.390 +If we want to populate the new state, we only +have to look at the input and the previous + +0:33:14.390 --> 0:33:15.649 +state, so. + +0:33:16.136 --> 0:33:19.029 +We are more conditional here networks. + +0:33:19.029 --> 0:33:19.994 +We have the. + +0:33:19.980 --> 0:33:31.291 +Dependency to a fixed number of previous ones, +but that's rarely used for decoding. + +0:33:31.291 --> 0:33:39.774 +In contrast, in transformer we have this large +dependency, so. 
+ +0:33:40.000 --> 0:33:52.760 +So from t minus one to y t so that is somehow +and mainly not very efficient in this way mean + +0:33:52.760 --> 0:33:56.053 +it's very good because. + +0:33:56.276 --> 0:34:03.543 +However, the disadvantage is that we also +have to do all these calculations, so if we + +0:34:03.543 --> 0:34:10.895 +more view from the point of view of efficient +calculation, this might not be the best. + +0:34:11.471 --> 0:34:20.517 +So the question is, can we change our architecture +to keep some of the advantages but make things + +0:34:20.517 --> 0:34:21.994 +more efficient? + +0:34:24.284 --> 0:34:31.131 +The one idea is what is called the average +attention, and the interesting thing is this + +0:34:31.131 --> 0:34:32.610 +work surprisingly. + +0:34:33.013 --> 0:34:38.917 +So the only idea what you're doing is doing +the decoder. + +0:34:38.917 --> 0:34:42.646 +You're not doing attention anymore. + +0:34:42.646 --> 0:34:46.790 +The attention weights are all the same. + +0:34:47.027 --> 0:35:00.723 +So you don't calculate with query and key +the different weights, and then you just take + +0:35:00.723 --> 0:35:03.058 +equal weights. + +0:35:03.283 --> 0:35:07.585 +So here would be one third from this, one +third from this, and one third. + +0:35:09.009 --> 0:35:14.719 +And while it is sufficient you can now do +precalculation and things get more efficient. + +0:35:15.195 --> 0:35:18.803 +So first go the formula that's maybe not directed +here. + +0:35:18.979 --> 0:35:38.712 +So the difference here is that your new hint +stage is the sum of all the hint states, then. + +0:35:38.678 --> 0:35:40.844 +So here would be with this. + +0:35:40.844 --> 0:35:45.022 +It would be one third of this plus one third +of this. + +0:35:46.566 --> 0:35:57.162 +But if you calculate it this way, it's not +yet being more efficient because you still + +0:35:57.162 --> 0:36:01.844 +have to sum over here all the hidden. + +0:36:04.524 --> 0:36:22.932 +But you can not easily speed up these things +by having an in between value, which is just + +0:36:22.932 --> 0:36:24.568 +always. + +0:36:25.585 --> 0:36:30.057 +If you take this as ten to one, you take this +one class this one. + +0:36:30.350 --> 0:36:36.739 +Because this one then was before this, and +this one was this, so in the end. + +0:36:37.377 --> 0:36:49.545 +So now this one is not the final one in order +to get the final one to do the average. + +0:36:49.545 --> 0:36:50.111 +So. + +0:36:50.430 --> 0:37:00.264 +But then if you do this calculation with speed +up you can do it with a fixed number of steps. + +0:37:00.180 --> 0:37:11.300 +Instead of the sun which depends on age, so +you only have to do calculations to calculate + +0:37:11.300 --> 0:37:12.535 +this one. + +0:37:12.732 --> 0:37:21.253 +Can you do a lakes on a wet spoon? + +0:37:21.253 --> 0:37:32.695 +For example, a light spoon here now takes +and. + +0:37:32.993 --> 0:37:38.762 +That's a very good point and that's why this +is now in the image. + +0:37:38.762 --> 0:37:44.531 +It's not very good so this is the one with +tilder and the tilder. + +0:37:44.884 --> 0:37:57.895 +So this one is just the sum of these two, +because this is just this one. + +0:37:58.238 --> 0:38:08.956 +So the sum of this is exactly as the sum of +these, and the sum of these is the sum of here. + +0:38:08.956 --> 0:38:15.131 +So you only do the sum in here, and the multiplying. + +0:38:15.255 --> 0:38:22.145 +So what you can mainly do here is you can +do it more mathematically. 
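[Editor's note: average attention as described above replaces the learned attention weights over the decoded prefix with uniform weights, and the running sum (the "tilde" accumulator mentioned in the lecture) turns the per-step average into a constant-time update. A minimal NumPy sketch of that incremental computation; names and shapes are illustrative.]

```python
import numpy as np

def average_attention_incremental(decoder_states):
    """Uniform 'attention' over the prefix, computed with a running sum.

    Instead of re-summing all previous hidden states at every step
    (cost grows with the prefix length), keep a running sum and divide
    by the current length, so each step is O(d_model).
    """
    d_model = decoder_states.shape[1]
    running_sum = np.zeros(d_model)     # the accumulator ("tilde" state)
    averaged = []
    for t, h_t in enumerate(decoder_states, start=1):
        running_sum += h_t                  # add the newest hidden state once
        averaged.append(running_sum / t)    # equal weight 1/t for every prefix state
    return np.stack(averaged)

states = np.random.rand(5, 8)               # (target length, d_model)
out = average_attention_incremental(states)
# sanity check against the naive per-step average
assert np.allclose(out[2], states[:3].mean(axis=0))
```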
+ +0:38:22.145 --> 0:38:31.531 +You can know this by tea taking out of the +sum, and then you can calculate the sum different. + +0:38:36.256 --> 0:38:42.443 +That maybe looks a bit weird and simple, so +we were all talking about this great attention + +0:38:42.443 --> 0:38:47.882 +that we can focus on different parts, and a +bit surprising on this work is now. + +0:38:47.882 --> 0:38:53.321 +In the end it might also work well without +really putting and just doing equal. + +0:38:53.954 --> 0:38:56.164 +Mean it's not that easy. + +0:38:56.376 --> 0:38:58.261 +It's like sometimes this is working. + +0:38:58.261 --> 0:39:00.451 +There's also report weight work that well. + +0:39:01.481 --> 0:39:05.848 +But I think it's an interesting way and it +maybe shows that a lot of. + +0:39:05.805 --> 0:39:10.669 +Things in the self or in the transformer paper +which are more put as like yet. + +0:39:10.669 --> 0:39:14.301 +These are some hyperparameters that are rounded +like that. + +0:39:14.301 --> 0:39:19.657 +You do the lay on all in between and that +you do a feat forward before and things like + +0:39:19.657 --> 0:39:20.026 +that. + +0:39:20.026 --> 0:39:25.567 +But these are also all important and the right +set up around that is also very important. + +0:39:28.969 --> 0:39:38.598 +The other thing you can do in the end is not +completely different from this one. + +0:39:38.598 --> 0:39:42.521 +It's just like a very different. + +0:39:42.942 --> 0:39:54.338 +And that is a recurrent network which also +has this type of highway connection that can + +0:39:54.338 --> 0:40:01.330 +ignore the recurrent unit and directly put +the input. + +0:40:01.561 --> 0:40:10.770 +It's not really adding out, but if you see +the hitting step is your input, but what you + +0:40:10.770 --> 0:40:15.480 +can do is somehow directly go to the output. + +0:40:17.077 --> 0:40:28.390 +These are the four components of the simple +return unit, and the unit is motivated by GIS + +0:40:28.390 --> 0:40:33.418 +and by LCMs, which we have seen before. + +0:40:33.513 --> 0:40:43.633 +And that has proven to be very good for iron +ends, which allows you to have a gate on your. + +0:40:44.164 --> 0:40:48.186 +In this thing we have two gates, the reset +gate and the forget gate. + +0:40:48.768 --> 0:40:57.334 +So first we have the general structure which +has a cell state. + +0:40:57.334 --> 0:41:01.277 +Here we have the cell state. + +0:41:01.361 --> 0:41:09.661 +And then this goes next, and we always get +the different cell states over the times that. + +0:41:10.030 --> 0:41:11.448 +This Is the South Stand. + +0:41:11.771 --> 0:41:16.518 +How do we now calculate that just assume we +have an initial cell safe here? + +0:41:17.017 --> 0:41:19.670 +But the first thing is we're doing the forget +game. + +0:41:20.060 --> 0:41:34.774 +The forgetting models should the new cell +state mainly depend on the previous cell state + +0:41:34.774 --> 0:41:40.065 +or should it depend on our age. + +0:41:40.000 --> 0:41:41.356 +Like Add to Them. + +0:41:41.621 --> 0:41:42.877 +How can we model that? + +0:41:44.024 --> 0:41:45.599 +First we were at a cocktail. + +0:41:45.945 --> 0:41:52.151 +The forget gait is depending on minus one. + +0:41:52.151 --> 0:41:56.480 +You also see here the former. + +0:41:57.057 --> 0:42:01.963 +So we are multiplying both the cell state +and our input. + +0:42:01.963 --> 0:42:04.890 +With some weights we are getting. 
+ +0:42:05.105 --> 0:42:08.472 +We are putting some Bay Inspector and then +we are doing Sigma Weed on that. + +0:42:08.868 --> 0:42:13.452 +So in the end we have numbers between zero +and one saying for each dimension. + +0:42:13.853 --> 0:42:22.041 +Like how much if it's near to zero we will +mainly use the new input. + +0:42:22.041 --> 0:42:31.890 +If it's near to one we will keep the input +and ignore the input at this dimension. + +0:42:33.313 --> 0:42:40.173 +And by this motivation we can then create +here the new sound state, and here you see + +0:42:40.173 --> 0:42:41.141 +the formal. + +0:42:41.601 --> 0:42:55.048 +So you take your foot back gate and multiply +it with your class. + +0:42:55.048 --> 0:43:00.427 +So if my was around then. + +0:43:00.800 --> 0:43:07.405 +In the other case, when the value was others, +that's what you added. + +0:43:07.405 --> 0:43:10.946 +Then you're adding a transformation. + +0:43:11.351 --> 0:43:24.284 +So if this value was maybe zero then you're +putting most of the information from inputting. + +0:43:25.065 --> 0:43:26.947 +Is already your element? + +0:43:26.947 --> 0:43:30.561 +The only question is now based on your element. + +0:43:30.561 --> 0:43:32.067 +What is the output? + +0:43:33.253 --> 0:43:47.951 +And there you have another opportunity so +you can either take the output or instead you + +0:43:47.951 --> 0:43:50.957 +prefer the input. + +0:43:52.612 --> 0:43:58.166 +So is the value also the same for the recept +game and the forget game. + +0:43:58.166 --> 0:43:59.417 +Yes, the movie. + +0:44:00.900 --> 0:44:10.004 +Yes exactly so the matrices are different +and therefore it can be and that should be + +0:44:10.004 --> 0:44:16.323 +and maybe there is sometimes you want to have +information. + +0:44:16.636 --> 0:44:23.843 +So here again we have this vector with values +between zero and which says controlling how + +0:44:23.843 --> 0:44:25.205 +the information. + +0:44:25.505 --> 0:44:36.459 +And then the output is calculated here similar +to a cell stage, but again input is from. + +0:44:36.536 --> 0:44:45.714 +So either the reset gate decides should give +what is currently stored in there, or. + +0:44:46.346 --> 0:44:58.647 +So it's not exactly as the thing we had before, +with the residual connections where we added + +0:44:58.647 --> 0:45:01.293 +up, but here we do. + +0:45:04.224 --> 0:45:08.472 +This is the general idea of a simple recurrent +neural network. + +0:45:08.472 --> 0:45:13.125 +Then we will now look at how we can make things +even more efficient. + +0:45:13.125 --> 0:45:17.104 +But first do you have more questions on how +it is working? + +0:45:23.063 --> 0:45:38.799 +Now these calculations are a bit where things +get more efficient because this somehow. + +0:45:38.718 --> 0:45:43.177 +It depends on all the other damage for the +second one also. + +0:45:43.423 --> 0:45:48.904 +Because if you do a matrix multiplication +with a vector like for the output vector, each + +0:45:48.904 --> 0:45:52.353 +diameter of the output vector depends on all +the other. + +0:45:52.973 --> 0:46:06.561 +The cell state here depends because this one +is used here, and somehow the first dimension + +0:46:06.561 --> 0:46:11.340 +of the cell state only depends. + +0:46:11.931 --> 0:46:17.973 +In order to make that, of course, is sometimes +again making things less paralyzeable if things + +0:46:17.973 --> 0:46:18.481 +depend. + +0:46:19.359 --> 0:46:35.122 +Can easily make that different by changing +from the metric product to not a vector. 
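[Editor's note: a small sketch of the simple recurrent unit described above, anticipating the element-wise trick that the next part spells out. This is a simplified variant under our own naming and shapes: the gates here depend only on the input, so every matrix multiplication can be done for all time steps at once and only cheap element-wise operations remain inside the time loop.]

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sru_layer(x, W, W_f, b_f, W_r, b_r):
    """Simplified simple recurrent unit (a sketch, not a reference implementation)."""
    T, d = x.shape
    # 1) Matrix multiplications involve only the inputs, so they can be
    #    computed for every time step in parallel.
    x_tilde = x @ W                 # candidate update       (T, d)
    f = sigmoid(x @ W_f + b_f)      # forget gate            (T, d)
    r = sigmoid(x @ W_r + b_r)      # reset / highway gate   (T, d)

    # 2) Element-wise recurrence: dimension i of the cell state depends
    #    only on dimension i of the previous cell state, never on others.
    c = np.zeros(d)
    h = np.zeros((T, d))
    for t in range(T):
        c = f[t] * c + (1.0 - f[t]) * x_tilde[t]   # new cell state
        h[t] = r[t] * c + (1.0 - r[t]) * x[t]      # highway: mix cell state and input
    return h

rng = np.random.default_rng(0)
d = 4
x = rng.normal(size=(6, d))
out = sru_layer(x, rng.normal(size=(d, d)),
                rng.normal(size=(d, d)), np.zeros(d),
                rng.normal(size=(d, d)), np.zeros(d))
print(out.shape)   # (6, 4)
```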
+ +0:46:35.295 --> 0:46:51.459 +So you do first, just like inside here, you +take like the first dimension, my second dimension. + +0:46:52.032 --> 0:46:53.772 +Is, of course, narrow. + +0:46:53.772 --> 0:46:59.294 +This should be reset or this should be because +it should be a different. + +0:46:59.899 --> 0:47:12.053 +Now the first dimension only depends on the +first dimension, so you don't have dependencies + +0:47:12.053 --> 0:47:16.148 +any longer between dimensions. + +0:47:18.078 --> 0:47:25.692 +Maybe it gets a bit clearer if you see about +it in this way, so what we have to do now. + +0:47:25.966 --> 0:47:31.911 +First, we have to do a metrics multiplication +on to gather and to get the. + +0:47:32.292 --> 0:47:38.041 +And then we only have the element wise operations +where we take this output. + +0:47:38.041 --> 0:47:38.713 +We take. + +0:47:39.179 --> 0:47:42.978 +Minus one and our original. + +0:47:42.978 --> 0:47:52.748 +Here we only have elemental abrasions which +can be optimally paralyzed. + +0:47:53.273 --> 0:48:07.603 +So here we have additional paralyzed things +across the dimension and don't have to do that. + +0:48:09.929 --> 0:48:24.255 +Yeah, but this you can do like in parallel +again for all xts. + +0:48:24.544 --> 0:48:33.014 +Here you can't do it in parallel, but you +only have to do it on each seat, and then you + +0:48:33.014 --> 0:48:34.650 +can parallelize. + +0:48:35.495 --> 0:48:39.190 +But this maybe for the dimension. + +0:48:39.190 --> 0:48:42.124 +Maybe it's also important. + +0:48:42.124 --> 0:48:46.037 +I don't know if they have tried it. + +0:48:46.037 --> 0:48:55.383 +I assume it's not only for dimension reduction, +but it's hard because you can easily. + +0:49:01.001 --> 0:49:08.164 +People have even like made the second thing +even more easy. + +0:49:08.164 --> 0:49:10.313 +So there is this. + +0:49:10.313 --> 0:49:17.954 +This is how we have the highway connections +in the transformer. + +0:49:17.954 --> 0:49:20.699 +Then it's like you do. + +0:49:20.780 --> 0:49:24.789 +So that is like how things are put together +as a transformer. + +0:49:25.125 --> 0:49:39.960 +And that is a similar and simple recurring +neural network where you do exactly the same + +0:49:39.960 --> 0:49:44.512 +for the so you don't have. + +0:49:46.326 --> 0:49:47.503 +This type of things. + +0:49:49.149 --> 0:50:01.196 +And with this we are at the end of how to +make efficient architectures before we go to + +0:50:01.196 --> 0:50:02.580 +the next. + +0:50:13.013 --> 0:50:24.424 +Between the ink or the trader and the architectures +there is a next technique which is used in + +0:50:24.424 --> 0:50:28.988 +nearly all deburning very successful. + +0:50:29.449 --> 0:50:43.463 +So the idea is can we extract the knowledge +from a large network into a smaller one, but + +0:50:43.463 --> 0:50:45.983 +it's similarly. + +0:50:47.907 --> 0:50:53.217 +And the nice thing is that this really works, +and it may be very, very surprising. + +0:50:53.673 --> 0:51:03.035 +So the idea is that we have a large strong +model which we train for long, and the question + +0:51:03.035 --> 0:51:07.870 +is: Can that help us to train a smaller model? + +0:51:08.148 --> 0:51:16.296 +So can what we refer to as teacher model tell +us better to build a small student model than + +0:51:16.296 --> 0:51:17.005 +before. + +0:51:17.257 --> 0:51:27.371 +So what we're before in it as a student model, +we learn from the data and that is how we train + +0:51:27.371 --> 0:51:28.755 +our systems. 
+ +0:51:29.249 --> 0:51:37.949 +The question is: Can we train this small model +better if we are not only learning from the + +0:51:37.949 --> 0:51:46.649 +data, but we are also learning from a large +model which has been trained maybe in the same + +0:51:46.649 --> 0:51:47.222 +data? + +0:51:47.667 --> 0:51:55.564 +So that you have then in the end a smaller +model that is somehow better performing than. + +0:51:55.895 --> 0:51:59.828 +And maybe that's on the first view. + +0:51:59.739 --> 0:52:05.396 +Very very surprising because it has seen the +same data so it should have learned the same + +0:52:05.396 --> 0:52:11.053 +so the baseline model trained only on the data +and the student teacher knowledge to still + +0:52:11.053 --> 0:52:11.682 +model it. + +0:52:11.682 --> 0:52:17.401 +They all have seen only this data because +your teacher modeling was also trained typically + +0:52:17.401 --> 0:52:19.161 +only on this model however. + +0:52:20.580 --> 0:52:30.071 +It has by now shown that by many ways the +model trained in the teacher and analysis framework + +0:52:30.071 --> 0:52:32.293 +is performing better. + +0:52:33.473 --> 0:52:40.971 +A bit of an explanation when we see how that +works. + +0:52:40.971 --> 0:52:46.161 +There's different ways of doing it. + +0:52:46.161 --> 0:52:47.171 +Maybe. + +0:52:47.567 --> 0:52:51.501 +So how does it work? + +0:52:51.501 --> 0:53:04.802 +This is our student network, the normal one, +some type of new network. + +0:53:04.802 --> 0:53:06.113 +We're. + +0:53:06.586 --> 0:53:17.050 +So we are training the model to predict the +same thing as we are doing that by calculating. + +0:53:17.437 --> 0:53:23.173 +The cross angry loss was defined in a way +where saying all the probabilities for the + +0:53:23.173 --> 0:53:25.332 +correct word should be as high. + +0:53:25.745 --> 0:53:32.207 +So you are calculating your alphabet probabilities +always, and each time step you have an alphabet + +0:53:32.207 --> 0:53:33.055 +probability. + +0:53:33.055 --> 0:53:38.669 +What is the most probable in the next word +and your training signal is put as much of + +0:53:38.669 --> 0:53:43.368 +your probability mass to the correct word to +the word that is there in. + +0:53:43.903 --> 0:53:51.367 +And this is the chief by this cross entry +loss, which says with some of the all training + +0:53:51.367 --> 0:53:58.664 +examples of all positions, with some of the +full vocabulary, and then this one is this + +0:53:58.664 --> 0:54:03.947 +one that this current word is the case word +in the vocabulary. + +0:54:04.204 --> 0:54:11.339 +And then we take here the lock for the ability +of that, so what we made me do is: We have + +0:54:11.339 --> 0:54:27.313 +this metric here, so each position of your +vocabulary size. + +0:54:27.507 --> 0:54:38.656 +In the end what you just do is some of these +three lock probabilities, and then you want + +0:54:38.656 --> 0:54:40.785 +to have as much. + +0:54:41.041 --> 0:54:54.614 +So although this is a thumb over this metric +here, in the end of each dimension you. + +0:54:54.794 --> 0:55:06.366 +So that is a normal cross end to be lost that +we have discussed at the very beginning of + +0:55:06.366 --> 0:55:07.016 +how. + +0:55:08.068 --> 0:55:15.132 +So what can we do differently in the teacher +network? + +0:55:15.132 --> 0:55:23.374 +We also have a teacher network which is trained +on large data. + +0:55:24.224 --> 0:55:35.957 +And of course this distribution might be better +than the one from the small model because it's. 
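[Editor's note: the loss described verbally above is the standard per-token cross entropy; word-level knowledge distillation, which the next part spells out, swaps the one-hot reference for the teacher's output distribution. Written out in notation chosen here for illustration.]

```latex
% Word-level losses (notation is ours). V is the vocabulary, T the target length.
\begin{align*}
  \mathcal{L}_{\mathrm{CE}} &= -\sum_{t=1}^{T}\sum_{k \in V}
      \mathbb{1}\!\left[y_t = k\right]\,\log p_{\theta}\!\left(k \mid y_{<t}, x\right)
      && \text{hard reference words}\\
  \mathcal{L}_{\mathrm{KD}} &= -\sum_{t=1}^{T}\sum_{k \in V}
      q_{\mathrm{teacher}}\!\left(k \mid y_{<t}, x\right)\,
      \log p_{\theta}\!\left(k \mid y_{<t}, x\right)
      && \text{soft teacher distribution}
\end{align*}
```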
+ +0:55:36.456 --> 0:55:40.941 +So in this case we have now the training signal +from the teacher network. + +0:55:41.441 --> 0:55:46.262 +And it's the same way as we had before. + +0:55:46.262 --> 0:55:56.507 +The only difference is we're training not +the ground truths per ability distribution + +0:55:56.507 --> 0:55:59.159 +year, which is sharp. + +0:55:59.299 --> 0:56:11.303 +That's also a probability, so this word has +a high probability, but have some probability. + +0:56:12.612 --> 0:56:19.577 +And that is the main difference. + +0:56:19.577 --> 0:56:30.341 +Typically you do like the interpretation of +these. + +0:56:33.213 --> 0:56:38.669 +Because there's more information contained +in the distribution than in the front booth, + +0:56:38.669 --> 0:56:44.187 +because it encodes more information about the +language, because language always has more + +0:56:44.187 --> 0:56:47.907 +options to put alone, that's the same sentence +yes exactly. + +0:56:47.907 --> 0:56:53.114 +So there's ambiguity in there that is encoded +hopefully very well in the complaint. + +0:56:53.513 --> 0:56:57.257 +Trade you two networks so better than a student +network you have in there from your learner. + +0:56:57.537 --> 0:57:05.961 +So maybe often there's only one correct word, +but it might be two or three, and then all + +0:57:05.961 --> 0:57:10.505 +of these three have a probability distribution. + +0:57:10.590 --> 0:57:21.242 +And then is the main advantage or one explanation +of why it's better to train from the. + +0:57:21.361 --> 0:57:32.652 +Of course, it's good to also keep the signal +in there because then you can prevent it because + +0:57:32.652 --> 0:57:33.493 +crazy. + +0:57:37.017 --> 0:57:49.466 +Any more questions on the first type of knowledge +distillation, also distribution changes. + +0:57:50.550 --> 0:58:02.202 +Coming around again, this would put it a bit +different, so this is not a solution to maintenance + +0:58:02.202 --> 0:58:04.244 +or distribution. + +0:58:04.744 --> 0:58:12.680 +But don't think it's performing worse than +only doing the ground tours because they also. + +0:58:13.113 --> 0:58:21.254 +So it's more like it's not improving you would +assume it's similarly helping you, but. + +0:58:21.481 --> 0:58:28.145 +Of course, if you now have a teacher, maybe +you have no danger on your target to Maine, + +0:58:28.145 --> 0:58:28.524 +but. + +0:58:28.888 --> 0:58:39.895 +Then you can use this one which is not the +ground truth but helpful to learn better for + +0:58:39.895 --> 0:58:42.147 +the distribution. + +0:58:46.326 --> 0:58:57.012 +The second idea is to do sequence level knowledge +distillation, so what we have in this case + +0:58:57.012 --> 0:59:02.757 +is we have looked at each position independently. + +0:59:03.423 --> 0:59:05.436 +Mean, we do that often. + +0:59:05.436 --> 0:59:10.972 +We are not generating a lot of sequences, +but that has a problem. + +0:59:10.972 --> 0:59:13.992 +We have this propagation of errors. + +0:59:13.992 --> 0:59:16.760 +We start with one area and then. + +0:59:17.237 --> 0:59:27.419 +So if we are doing word-level knowledge dissolution, +we are treating each word in the sentence independently. + +0:59:28.008 --> 0:59:32.091 +So we are not trying to like somewhat model +the dependency between. + +0:59:32.932 --> 0:59:47.480 +We can try to do that by sequence level knowledge +dissolution, but the problem is, of course,. + +0:59:47.847 --> 0:59:53.478 +So we can that for each position we can get +a distribution over all the words at this. 
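[Editor's note: before the transcript moves on to the sequence level, here is a minimal PyTorch sketch of the word-level objective just discussed, with the hard-label cross entropy and the soft teacher term interpolated. The interpolation weight and tensor shapes are illustrative assumptions.]

```python
import torch
import torch.nn.functional as F

def word_level_kd_loss(student_logits, teacher_logits, targets, alpha=0.5):
    """Interpolate hard-label cross entropy with a soft teacher target.

    student_logits, teacher_logits: (batch * length, vocab)
    targets:                        (batch * length,) reference word ids
    """
    # Usual cross entropy against the ground-truth words.
    ce = F.cross_entropy(student_logits, targets)
    # Divergence between the teacher's and the student's distributions.
    kd = F.kl_div(
        F.log_softmax(student_logits, dim=-1),
        F.softmax(teacher_logits, dim=-1),
        reduction="batchmean",
    )
    return alpha * ce + (1.0 - alpha) * kd

# Toy usage with random stand-ins for model outputs.
vocab, n = 32, 10
student = torch.randn(n, vocab, requires_grad=True)
teacher = torch.randn(n, vocab)             # would come from the large model
loss = word_level_kd_loss(student, teacher, torch.randint(vocab, (n,)))
loss.backward()
```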
+ +0:59:53.793 --> 1:00:05.305 +But if we want to have a distribution of all +possible target sentences, that's not possible + +1:00:05.305 --> 1:00:06.431 +because. + +1:00:08.508 --> 1:00:15.940 +Area, so we can then again do a bit of a heck +on that. + +1:00:15.940 --> 1:00:23.238 +If we can't have a distribution of all sentences, +it. + +1:00:23.843 --> 1:00:30.764 +So what we can't do is you can not use the +teacher network and sample different translations. + +1:00:31.931 --> 1:00:39.327 +And now we can do different ways to train +them. + +1:00:39.327 --> 1:00:49.343 +We can use them as their probability, the +easiest one to assume. + +1:00:50.050 --> 1:00:56.373 +So what that ends to is that we're taking +our teacher network, we're generating some + +1:00:56.373 --> 1:01:01.135 +translations, and these ones we're using as +additional trading. + +1:01:01.781 --> 1:01:11.382 +Then we have mainly done this sequence level +because the teacher network takes us. + +1:01:11.382 --> 1:01:17.513 +These are all probable translations of the +sentence. + +1:01:26.286 --> 1:01:34.673 +And then you can do a bit of a yeah, and you +can try to better make a bit of an interpolated + +1:01:34.673 --> 1:01:36.206 +version of that. + +1:01:36.716 --> 1:01:42.802 +So what people have also done is like subsequent +level interpolations. + +1:01:42.802 --> 1:01:52.819 +You generate here several translations: But +then you don't use all of them. + +1:01:52.819 --> 1:02:00.658 +You do some metrics on which of these ones. + +1:02:01.021 --> 1:02:12.056 +So it's a bit more training on this brown +chose which might be improbable or unreachable + +1:02:12.056 --> 1:02:16.520 +because we can generate everything. + +1:02:16.676 --> 1:02:23.378 +And we are giving it an easier solution which +is also good quality and training of that. + +1:02:23.703 --> 1:02:32.602 +So you're not training it on a very difficult +solution, but you're training it on an easier + +1:02:32.602 --> 1:02:33.570 +solution. + +1:02:36.356 --> 1:02:38.494 +Any More Questions to This. + +1:02:40.260 --> 1:02:41.557 +Yeah. + +1:02:41.461 --> 1:02:44.296 +Good. + +1:02:43.843 --> 1:03:01.642 +Is to look at the vocabulary, so the problem +is we have seen that vocabulary calculations + +1:03:01.642 --> 1:03:06.784 +are often very presuming. + +1:03:09.789 --> 1:03:19.805 +The thing is that most of the vocabulary is +not needed for each sentence, so in each sentence. + +1:03:20.280 --> 1:03:28.219 +The question is: Can we somehow easily precalculate, +which words are probable to occur in the sentence, + +1:03:28.219 --> 1:03:30.967 +and then only calculate these ones? + +1:03:31.691 --> 1:03:34.912 +And this can be done so. + +1:03:34.912 --> 1:03:43.932 +For example, if you have sentenced card, it's +probably not happening. + +1:03:44.164 --> 1:03:48.701 +So what you can try to do is to limit your +vocabulary. + +1:03:48.701 --> 1:03:51.093 +You're considering for each. + +1:03:51.151 --> 1:04:04.693 +So you're no longer taking the full vocabulary +as possible output, but you're restricting. + +1:04:06.426 --> 1:04:18.275 +That typically works is that we limit it by +the most frequent words we always take because + +1:04:18.275 --> 1:04:23.613 +these are not so easy to align to words. + +1:04:23.964 --> 1:04:32.241 +To take the most treatment taggin' words and +then work that often aligns with one of the + +1:04:32.241 --> 1:04:32.985 +source. 
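[Editor's note: to make the sequence-level distillation recipe from just above concrete before the vocabulary discussion continues: the training sources are decoded with the teacher and the student is trained on those outputs, optionally alongside the original references. A rough sketch; `teacher_translate` stands in for whatever decoding routine the teacher system provides and is purely hypothetical.]

```python
def build_sequence_kd_data(source_sentences, teacher_translate, keep_original=None):
    """Create student training pairs from teacher translations.

    teacher_translate: hypothetical callable, source sentence -> teacher output
                       (e.g. the beam-search result of the large model).
    keep_original:     optional dict {source: reference} if the original
                       references should be kept as additional training data.
    """
    pairs = []
    for src in source_sentences:
        pairs.append((src, teacher_translate(src)))    # teacher output as target
        if keep_original and src in keep_original:
            pairs.append((src, keep_original[src]))    # optionally keep the reference
    return pairs

# Toy usage with a dummy "teacher" that just upper-cases the input.
dummy_teacher = lambda s: s.upper()
data = build_sequence_kd_data(["ein kleiner test", "noch ein satz"], dummy_teacher)
print(data)
```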
+ +1:04:33.473 --> 1:04:46.770 +So for each source word you calculate the +word alignment on your training data, and then + +1:04:46.770 --> 1:04:51.700 +you calculate which words occur. + +1:04:52.352 --> 1:04:57.680 +And then for decoding you build this union +of maybe the source word list that other. + +1:04:59.960 --> 1:05:02.145 +Are like for each source work. + +1:05:02.145 --> 1:05:08.773 +One of the most frequent translations of these +source words, for example for each source work + +1:05:08.773 --> 1:05:13.003 +like in the most frequent ones, and then the +most frequent. + +1:05:13.193 --> 1:05:24.333 +In total, if you have short sentences, you +have a lot less words, so in most cases it's + +1:05:24.333 --> 1:05:26.232 +not more than. + +1:05:26.546 --> 1:05:33.957 +And so you have dramatically reduced your +vocabulary, and thereby can also fax a depot. + +1:05:35.495 --> 1:05:43.757 +That easy does anybody see what is challenging +here and why that might not always need. + +1:05:47.687 --> 1:05:54.448 +The performance is not why this might not. + +1:05:54.448 --> 1:06:01.838 +If you implement it, it might not be a strong. + +1:06:01.941 --> 1:06:06.053 +You have to store this list. + +1:06:06.053 --> 1:06:14.135 +You have to burn the union and of course your +safe time. + +1:06:14.554 --> 1:06:21.920 +The second thing the vocabulary is used in +our last step, so we have the hidden state, + +1:06:21.920 --> 1:06:23.868 +and then we calculate. + +1:06:24.284 --> 1:06:29.610 +Now we are not longer calculating them for +all output words, but for a subset of them. + +1:06:30.430 --> 1:06:35.613 +However, this metric multiplication is typically +parallelized with the perfect but good. + +1:06:35.956 --> 1:06:46.937 +But if you not only calculate some of them, +if you're not modeling it right, it will take + +1:06:46.937 --> 1:06:52.794 +as long as before because of the nature of +the. + +1:06:56.776 --> 1:07:07.997 +Here for beam search there's some ideas of +course you can go back to greedy search because + +1:07:07.997 --> 1:07:10.833 +that's more efficient. + +1:07:11.651 --> 1:07:18.347 +And better quality, and you can buffer some +states in between, so how much buffering it's + +1:07:18.347 --> 1:07:22.216 +again this tradeoff between calculation and +memory. + +1:07:25.125 --> 1:07:41.236 +Then at the end of today what we want to look +into is one last type of new machine translation + +1:07:41.236 --> 1:07:42.932 +approach. + +1:07:43.403 --> 1:07:53.621 +And the idea is what we've already seen in +our first two steps is that this ultra aggressive + +1:07:53.621 --> 1:07:57.246 +park is taking community coding. + +1:07:57.557 --> 1:08:04.461 +Can process everything in parallel, but we +are always taking the most probable and then. + +1:08:05.905 --> 1:08:10.476 +The question is: Do we really need to do that? + +1:08:10.476 --> 1:08:14.074 +Therefore, there is a bunch of work. + +1:08:14.074 --> 1:08:16.602 +Can we do it differently? + +1:08:16.602 --> 1:08:19.616 +Can we generate a full target? + +1:08:20.160 --> 1:08:29.417 +We'll see it's not that easy and there's still +an open debate whether this is really faster + +1:08:29.417 --> 1:08:31.832 +and quality, but think. + +1:08:32.712 --> 1:08:45.594 +So, as said, what we have done is our encoder +decoder where we can process our encoder color, + +1:08:45.594 --> 1:08:50.527 +and then the output always depends. 
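[Editor's note: coming back to the vocabulary shortlist described above before the non-autoregressive part continues. A minimal sketch of building a per-sentence candidate set from globally frequent words plus alignment-based translations, and of the pay-off in the output layer, where only the shortlisted rows of the projection matrix are multiplied. All counts, ids, and sizes are invented for illustration.]

```python
from collections import Counter

import torch

def build_shortlist(source_sentence, translation_table, frequent_ids, top_k=3):
    """Candidate target word ids for one source sentence.

    translation_table: {source word: Counter of aligned target word ids},
                       e.g. collected from word alignments on the training data.
    frequent_ids:      globally most frequent target word ids, always included.
    """
    candidates = set(frequent_ids)
    for word in source_sentence.split():
        counts = translation_table.get(word, Counter())
        candidates.update(i for i, _ in counts.most_common(top_k))
    return sorted(candidates)

# Toy example; alignment counts and word ids are made up.
table = {"haus": Counter({801: 120, 802: 40}), "grün": Counter({950: 200})}
shortlist = build_shortlist("das haus ist grün", table, frequent_ids=[5, 7, 11])

# The speed-up comes from the last step: multiply the decoder state only
# with the rows of the output projection that belong to shortlisted words.
vocab_size, d_model = 1000, 16
output_weights = torch.randn(vocab_size, d_model)
hidden = torch.randn(1, d_model)
ids = torch.tensor(shortlist)
small_logits = hidden @ output_weights[ids].t()    # (1, |shortlist|)
full_logits = hidden @ output_weights.t()          # (1, vocab_size)
assert torch.allclose(small_logits, full_logits[:, ids])
```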
+ +1:08:50.410 --> 1:08:54.709 +We generate the output and then we have to +put it here the wide because then everything + +1:08:54.709 --> 1:08:56.565 +depends on the purpose of the output. + +1:08:56.916 --> 1:09:10.464 +This is what is referred to as an outer-regressive +model and nearly outs speech generation and + +1:09:10.464 --> 1:09:16.739 +language generation or works in this outer. + +1:09:18.318 --> 1:09:21.132 +So the motivation is, can we do that more +efficiently? + +1:09:21.361 --> 1:09:31.694 +And can we somehow process all target words +in parallel? + +1:09:31.694 --> 1:09:41.302 +So instead of doing it one by one, we are +inputting. + +1:09:45.105 --> 1:09:46.726 +So how does it work? + +1:09:46.726 --> 1:09:50.587 +So let's first have a basic auto regressive +mode. + +1:09:50.810 --> 1:09:53.551 +So the encoder looks as it is before. + +1:09:53.551 --> 1:09:58.310 +That's maybe not surprising because here we +know we can paralyze. + +1:09:58.618 --> 1:10:04.592 +So we have put in here our ink holder and +generated the ink stash, so that's exactly + +1:10:04.592 --> 1:10:05.295 +the same. + +1:10:05.845 --> 1:10:16.229 +However, now we need to do one more thing: +One challenge is what we had before and that's + +1:10:16.229 --> 1:10:26.799 +a challenge of natural language generation +like machine translation. + +1:10:32.672 --> 1:10:38.447 +We generate until we generate this out of +end of center stock, but if we now generate + +1:10:38.447 --> 1:10:44.625 +everything at once that's no longer possible, +so we cannot generate as long because we only + +1:10:44.625 --> 1:10:45.632 +generated one. + +1:10:46.206 --> 1:10:58.321 +So the question is how can we now determine +how long the sequence is, and we can also accelerate. + +1:11:00.000 --> 1:11:06.384 +Yes, but there would be one idea, and there +is other work which tries to do that. + +1:11:06.806 --> 1:11:15.702 +However, in here there's some work already +done before and maybe you remember we had the + +1:11:15.702 --> 1:11:20.900 +IBM models and there was this concept of fertility. + +1:11:21.241 --> 1:11:26.299 +The concept of fertility is means like for +one saucepan, and how many target pores does + +1:11:26.299 --> 1:11:27.104 +it translate? + +1:11:27.847 --> 1:11:34.805 +And exactly that we try to do here, and that +means we are calculating like at the top we + +1:11:34.805 --> 1:11:36.134 +are calculating. + +1:11:36.396 --> 1:11:42.045 +So it says word is translated into word. + +1:11:42.045 --> 1:11:54.171 +Word might be translated into words into, +so we're trying to predict in how many words. + +1:11:55.935 --> 1:12:10.314 +And then the end of the anchor, so this is +like a length estimation. + +1:12:10.314 --> 1:12:15.523 +You can do it otherwise. + +1:12:16.236 --> 1:12:24.526 +You initialize your decoder input and we know +it's good with word embeddings so we're trying + +1:12:24.526 --> 1:12:28.627 +to do the same thing and what people then do. + +1:12:28.627 --> 1:12:35.224 +They initialize it again with word embedding +but in the frequency of the. + +1:12:35.315 --> 1:12:36.460 +So we have the cartilage. + +1:12:36.896 --> 1:12:47.816 +So one has two, so twice the is and then one +is, so that is then our initialization. + +1:12:48.208 --> 1:12:57.151 +In other words, if you don't predict fertilities +but predict lengths, you can just initialize + +1:12:57.151 --> 1:12:57.912 +second. + +1:12:58.438 --> 1:13:07.788 +This often works a bit better, but that's +the other. 
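[Editor's note: the fertility idea above means every source embedding is copied into the decoder input as many times as its predicted fertility, which also fixes the target length. A minimal sketch of that input construction; the fertility values here are made up and would normally come from a prediction head on top of the encoder.]

```python
import torch

def fertility_copy_decoder_input(source_embeddings, fertilities):
    """Build the non-autoregressive decoder input by copying source embeddings.

    source_embeddings: (source length, d_model) source-side embeddings
    fertilities:       (source length,) predicted target words per source word
    Fertility 2 means the embedding appears twice, fertility 0 drops it;
    the total target length is the sum of the fertilities.
    """
    return torch.repeat_interleave(source_embeddings, fertilities, dim=0)

emb = torch.arange(12.0).reshape(4, 3)     # 4 source words, d_model = 3
fert = torch.tensor([1, 2, 0, 1])          # predicted fertilities
dec_in = fertility_copy_decoder_input(emb, fert)
print(dec_in.shape)                        # torch.Size([4, 3]) -> target length 4
```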
+ +1:13:07.788 --> 1:13:16.432 +Now you have everything in training and testing. + +1:13:16.656 --> 1:13:18.621 +This is all available at once. + +1:13:20.280 --> 1:13:31.752 +Then we can generate everything in parallel, +so we have the decoder stack, and that is now + +1:13:31.752 --> 1:13:33.139 +as before. + +1:13:35.395 --> 1:13:41.555 +And then we're doing the translation predictions +here on top of it in order to do. + +1:13:43.083 --> 1:13:59.821 +And then we are predicting here the target +words and once predicted, and that is the basic + +1:13:59.821 --> 1:14:00.924 +idea. + +1:14:01.241 --> 1:14:08.171 +Machine translation: Where the idea is, we +don't have to do one by one what we're. + +1:14:10.210 --> 1:14:13.900 +So this looks really, really, really great. + +1:14:13.900 --> 1:14:20.358 +On the first view there's one challenge with +this, and this is the baseline. + +1:14:20.358 --> 1:14:27.571 +Of course there's some improvements, but in +general the quality is often significant. + +1:14:28.068 --> 1:14:32.075 +So here you see the baseline models. + +1:14:32.075 --> 1:14:38.466 +You have a loss of ten blue points or something +like that. + +1:14:38.878 --> 1:14:40.230 +So why does it change? + +1:14:40.230 --> 1:14:41.640 +So why is it happening? + +1:14:43.903 --> 1:14:56.250 +If you look at the errors there is repetitive +tokens, so you have like or things like that. + +1:14:56.536 --> 1:15:01.995 +Broken senses or influent senses, so that +exactly where algebra aggressive models are + +1:15:01.995 --> 1:15:04.851 +very good, we say that's a bit of a problem. + +1:15:04.851 --> 1:15:07.390 +They generate very fluid transcription. + +1:15:07.387 --> 1:15:10.898 +Translation: Sometimes there doesn't have +to do anything with the input. + +1:15:11.411 --> 1:15:14.047 +But generally it really looks always very +fluid. + +1:15:14.995 --> 1:15:20.865 +Here exactly the opposite, so the problem +is that we don't have really fluid translation. + +1:15:21.421 --> 1:15:26.123 +And that is mainly due to the challenge that +we have this independent assumption. + +1:15:26.646 --> 1:15:35.873 +So in this case, the probability of Y of the +second position is independent of the probability + +1:15:35.873 --> 1:15:40.632 +of X, so we don't know what was there generated. + +1:15:40.632 --> 1:15:43.740 +We're just generating it there. + +1:15:43.964 --> 1:15:55.439 +You can see it also in a bit of examples. + +1:15:55.439 --> 1:16:03.636 +You can over-panelize shifts. + +1:16:04.024 --> 1:16:10.566 +And the problem is this is already an improvement +again, but this is also similar to. + +1:16:11.071 --> 1:16:19.900 +So you can, for example, translate heeded +back, or maybe you could also translate it + +1:16:19.900 --> 1:16:31.105 +with: But on their feeling down in feeling +down, if the first position thinks of their + +1:16:31.105 --> 1:16:34.594 +feeling done and the second. + +1:16:35.075 --> 1:16:42.908 +So each position here and that is one of the +main issues here doesn't know what the other. + +1:16:43.243 --> 1:16:53.846 +And for example, if you are translating something +with, you can often translate things in two + +1:16:53.846 --> 1:16:58.471 +ways: German with a different agreement. + +1:16:58.999 --> 1:17:02.047 +And then here where you have to decide do +you have to use jewelry. + +1:17:02.162 --> 1:17:05.460 +Interpretator: It doesn't know which word +it has to select. 
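[Editor's note: the independence assumption discussed above can be written out directly: the autoregressive model conditions every target word on the previously generated words, while the non-autoregressive model (with a separately predicted length) conditions every position only on the source, so position two cannot see what position one chose. Notation chosen here for illustration.]

```latex
% Factorisation of the output distribution (notation is ours):
\begin{align*}
  \text{autoregressive:}     \quad & p(y \mid x) = \prod_{t=1}^{T} p\!\left(y_t \mid y_{<t},\, x\right)\\
  \text{non-autoregressive:} \quad & p(y \mid x) = p(T \mid x)\,\prod_{t=1}^{T} p\!\left(y_t \mid x\right)
\end{align*}
```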
+ +1:17:06.086 --> 1:17:14.789 +Mean, of course, it knows a hidden state, +but in the end you have a liability distribution. + +1:17:16.256 --> 1:17:20.026 +And that is the important thing in the outer +regressive month. + +1:17:20.026 --> 1:17:24.335 +You know that because you have put it in you +here, you don't know that. + +1:17:24.335 --> 1:17:29.660 +If it's equal probable here to two, you don't +Know Which Is Selected, and of course that + +1:17:29.660 --> 1:17:32.832 +depends on what should be the latest traction +under. + +1:17:33.333 --> 1:17:39.554 +Yep, that's the undershift, and we're going +to last last the next time. + +1:17:39.554 --> 1:17:39.986 +Yes. + +1:17:40.840 --> 1:17:44.935 +Doesn't this also appear in and like now we're +talking about physical training? + +1:17:46.586 --> 1:17:48.412 +The thing is in the auto regress. + +1:17:48.412 --> 1:17:50.183 +If you give it the correct one,. + +1:17:50.450 --> 1:17:55.827 +So if you predict here comma what the reference +is feeling then you tell the model here. + +1:17:55.827 --> 1:17:59.573 +The last one was feeling and then it knows +it has to be done. + +1:17:59.573 --> 1:18:04.044 +But here it doesn't know that because it doesn't +get as input as a right. + +1:18:04.204 --> 1:18:24.286 +Yes, that's a bit depending on what. + +1:18:24.204 --> 1:18:27.973 +But in training, of course, you just try to +make the highest one the current one. + +1:18:31.751 --> 1:18:38.181 +So what you can do is things like CDC loss +which can adjust for this. + +1:18:38.181 --> 1:18:42.866 +So then you can also have this shifted correction. + +1:18:42.866 --> 1:18:50.582 +If you're doing this type of correction in +the CDC loss you don't get full penalty. + +1:18:50.930 --> 1:18:58.486 +Just shifted by one, so it's a bit of a different +loss, which is mainly used in, but. + +1:19:00.040 --> 1:19:03.412 +It can be used in order to address this problem. + +1:19:04.504 --> 1:19:13.844 +The other problem is that outer regressively +we have the label buyers that tries to disimmigrate. + +1:19:13.844 --> 1:19:20.515 +That's the example did before was if you translate +thank you to Dung. + +1:19:20.460 --> 1:19:31.925 +And then it might end up because it learns +in the first position and the second also. + +1:19:32.492 --> 1:19:43.201 +In order to prevent that, it would be helpful +for one output, only one output, so that makes + +1:19:43.201 --> 1:19:47.002 +the system already better learn. + +1:19:47.227 --> 1:19:53.867 +Might be that for slightly different inputs +you have different outputs, but for the same. + +1:19:54.714 --> 1:19:57.467 +That we can luckily very easily solve. + +1:19:59.119 --> 1:19:59.908 +And it's done. + +1:19:59.908 --> 1:20:04.116 +We just learned the technique about it, which +is called knowledge distillation. + +1:20:04.985 --> 1:20:13.398 +So what we can do and the easiest solution +to prove your non-autoregressive model is to + +1:20:13.398 --> 1:20:16.457 +train an auto regressive model. + +1:20:16.457 --> 1:20:22.958 +Then you decode your whole training gamer +with this model and then. + +1:20:23.603 --> 1:20:27.078 +While the main advantage of that is that this +is more consistent,. + +1:20:27.407 --> 1:20:33.995 +So for the same input you always have the +same output. + +1:20:33.995 --> 1:20:41.901 +So you have to make your training data more +consistent and learn. 
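[Editor's note: the CTC loss mentioned a bit earlier scores all monotonic alignments of the reference to the output positions, so an output that is merely shifted is not fully penalised. A minimal PyTorch illustration of the loss itself; the shapes, lengths, and blank index are generic assumptions and not the exact non-autoregressive setup discussed in the lecture.]

```python
import torch
import torch.nn as nn

T, N, C = 20, 2, 30            # output positions, batch size, vocabulary incl. blank
log_probs = torch.randn(T, N, C).log_softmax(dim=-1)   # per-position distributions
targets = torch.randint(1, C, (N, 7))                  # reference words (no blanks)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 7, dtype=torch.long)

ctc = nn.CTCLoss(blank=0)      # marginalises over all monotonic alignments
loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())
```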
+ +1:20:42.482 --> 1:20:54.471 +So there is another advantage of knowledge +distillation and that advantage is you have + +1:20:54.471 --> 1:20:59.156 +more consistent training signals. + +1:21:04.884 --> 1:21:10.287 +There's another to make the things more easy +at the beginning. + +1:21:10.287 --> 1:21:16.462 +There's this plants model, black model where +you put in parts of input. + +1:21:16.756 --> 1:21:26.080 +So during training, especially at the beginning, +you give some correct solutions at the beginning. + +1:21:28.468 --> 1:21:38.407 +And there is this tokens at a time, so the +idea is to establish other regressive training. + +1:21:40.000 --> 1:21:50.049 +And some targets are open, so you always predict +only like first auto regression is K. + +1:21:50.049 --> 1:21:59.174 +It puts one, so you always have one input +and one output, then you do partial. + +1:21:59.699 --> 1:22:05.825 +So in that way you can slowly learn what is +a good and what is a bad answer. + +1:22:08.528 --> 1:22:10.862 +It doesn't sound very impressive. + +1:22:10.862 --> 1:22:12.578 +Don't contact me anyway. + +1:22:12.578 --> 1:22:15.323 +Go all over your training data several. + +1:22:15.875 --> 1:22:20.655 +You can even switch in between. + +1:22:20.655 --> 1:22:29.318 +There is a homework on this thing where you +try to start. + +1:22:31.271 --> 1:22:41.563 +You have to learn so there's a whole work +on that so this is often happening and it doesn't + +1:22:41.563 --> 1:22:46.598 +mean it's less efficient but still it helps. + +1:22:49.389 --> 1:22:57.979 +For later maybe here are some examples of +how much things help. + +1:22:57.979 --> 1:23:04.958 +Maybe one point here is that it's really important. + +1:23:05.365 --> 1:23:13.787 +Here's the translation performance and speed. + +1:23:13.787 --> 1:23:24.407 +One point which is a point is if you compare +researchers. + +1:23:24.784 --> 1:23:33.880 +So yeah, if you're compared to one very weak +baseline transformer even with beam search, + +1:23:33.880 --> 1:23:40.522 +then you're ten times slower than a very strong +auto regressive. + +1:23:40.961 --> 1:23:48.620 +If you make a strong baseline then it's going +down to depending on times and here like: You + +1:23:48.620 --> 1:23:53.454 +have a lot of different speed ups. + +1:23:53.454 --> 1:24:03.261 +Generally, it makes a strong baseline and +not very simple transformer. + +1:24:07.407 --> 1:24:20.010 +Yeah, with this one last thing that you can +do to speed up things and also reduce your + +1:24:20.010 --> 1:24:25.950 +memory is what is called half precision. + +1:24:26.326 --> 1:24:29.139 +And especially for decoding issues for training. + +1:24:29.139 --> 1:24:31.148 +Sometimes it also gets less stale. + +1:24:32.592 --> 1:24:45.184 +With this we close nearly wait a bit, so what +you should remember is that efficient machine + +1:24:45.184 --> 1:24:46.963 +translation. + +1:24:47.007 --> 1:24:51.939 +We have, for example, looked at knowledge +distillation. + +1:24:51.939 --> 1:24:55.991 +We have looked at non auto regressive models. + +1:24:55.991 --> 1:24:57.665 +We have different. + +1:24:58.898 --> 1:25:02.383 +For today and then only requests. + +1:25:02.383 --> 1:25:08.430 +So if you haven't done so, please fill out +the evaluation. + +1:25:08.388 --> 1:25:20.127 +So now if you have done so think then you +should have and with the online people hopefully. 
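[Editor's note: half precision, mentioned near the end above, mainly helps decoding: weights and activations take half the memory and GPU fp16 kernels are faster, while training purely in fp16 can become unstable. A minimal PyTorch inference sketch; the model here is a generic stand-in, not an MT system, and the cast is only applied on GPU where fp16 kernels are available.]

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))
model = model.to(device).eval()
if device == "cuda":
    model = model.half()                    # cast all weights to float16

x = torch.randn(8, 512, device=device)
if device == "cuda":
    x = x.half()                            # inputs must match the weight dtype
with torch.no_grad():
    y = model(x)
print(y.dtype)                              # float16 on GPU, float32 fallback on CPU
```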
+ +1:25:20.320 --> 1:25:29.758 +Only possibility to tell us what things are +good and what not the only one but the most + +1:25:29.758 --> 1:25:30.937 +efficient. + +1:25:31.851 --> 1:25:35.871 +So think of all the students doing it in this +case okay and then thank. + +0:00:01.921 --> 0:00:16.424 +Hey welcome to today's lecture, what we today +want to look at is how we can make new. + +0:00:16.796 --> 0:00:26.458 +So until now we have this global system, the +encoder and the decoder mostly, and we haven't + +0:00:26.458 --> 0:00:29.714 +really thought about how long. + +0:00:30.170 --> 0:00:42.684 +And what we, for example, know is yeah, you +can make the systems bigger in different ways. + +0:00:42.684 --> 0:00:47.084 +We can make them deeper so the. + +0:00:47.407 --> 0:00:56.331 +And if we have at least enough data that typically +helps you make things performance better,. + +0:00:56.576 --> 0:01:00.620 +But of course leads to problems that we need +more resources. + +0:01:00.620 --> 0:01:06.587 +That is a problem at universities where we +have typically limited computation capacities. + +0:01:06.587 --> 0:01:11.757 +So at some point you have such big models +that you cannot train them anymore. + +0:01:13.033 --> 0:01:23.792 +And also for companies is of course important +if it costs you like to generate translation + +0:01:23.792 --> 0:01:26.984 +just by power consumption. + +0:01:27.667 --> 0:01:35.386 +So yeah, there's different reasons why you +want to do efficient machine translation. + +0:01:36.436 --> 0:01:48.338 +One reason is there are different ways of +how you can improve your machine translation + +0:01:48.338 --> 0:01:50.527 +system once we. + +0:01:50.670 --> 0:01:55.694 +There can be different types of data we looked +into data crawling, monolingual data. + +0:01:55.875 --> 0:01:59.024 +All this data and the aim is always. + +0:01:59.099 --> 0:02:05.735 +Of course, we are not just purely interested +in having more data, but the idea why we want + +0:02:05.735 --> 0:02:12.299 +to have more data is that more data also means +that we have better quality because mostly + +0:02:12.299 --> 0:02:17.550 +we are interested in increasing the quality +of the machine translation. + +0:02:18.838 --> 0:02:24.892 +But there's also other ways of how you can +improve the quality of a machine translation. + +0:02:25.325 --> 0:02:36.450 +And what is, of course, that is where most +research is focusing on. + +0:02:36.450 --> 0:02:44.467 +It means all we want to build better algorithms. + +0:02:44.684 --> 0:02:48.199 +Course: The other things are normally as good. + +0:02:48.199 --> 0:02:54.631 +Sometimes it's easier to improve, so often +it's easier to just collect more data than + +0:02:54.631 --> 0:02:57.473 +to invent some great view algorithms. + +0:02:57.473 --> 0:03:00.315 +But yeah, both of them are important. + +0:03:00.920 --> 0:03:09.812 +But there is this third thing, especially +with neural machine translation, and that means + +0:03:09.812 --> 0:03:11.590 +we make a bigger. + +0:03:11.751 --> 0:03:16.510 +Can be, as said, that we have more layers, +that we have wider layers. + +0:03:16.510 --> 0:03:19.977 +The other thing we talked a bit about is ensemble. + +0:03:19.977 --> 0:03:24.532 +That means we are not building one new machine +translation system. + +0:03:24.965 --> 0:03:27.505 +And we can easily build four. + +0:03:27.505 --> 0:03:32.331 +What is the typical strategy to build different +systems? + +0:03:32.331 --> 0:03:33.177 +Remember. 
+ +0:03:35.795 --> 0:03:40.119 +It should be of course a bit different if +you have the same. + +0:03:40.119 --> 0:03:44.585 +If they all predict the same then combining +them doesn't help. + +0:03:44.585 --> 0:03:48.979 +So what is the easiest way if you have to +build four systems? + +0:03:51.711 --> 0:04:01.747 +And the Charleston's will take, but this is +the best output of a single system. + +0:04:02.362 --> 0:04:10.165 +Mean now, it's really three different systems +so that you later can combine them and maybe + +0:04:10.165 --> 0:04:11.280 +the average. + +0:04:11.280 --> 0:04:16.682 +Ensembles are typically that the average is +all probabilities. + +0:04:19.439 --> 0:04:24.227 +The idea is to think about neural networks. + +0:04:24.227 --> 0:04:29.342 +There's one parameter which can easily adjust. + +0:04:29.342 --> 0:04:36.525 +That's exactly the easiest way to randomize +with three different. + +0:04:37.017 --> 0:04:43.119 +They have the same architecture, so all the +hydroparameters are the same, but they are + +0:04:43.119 --> 0:04:43.891 +different. + +0:04:43.891 --> 0:04:46.556 +They will have different predictions. + +0:04:48.228 --> 0:04:52.572 +So, of course, bigger amounts. + +0:04:52.572 --> 0:05:05.325 +Some of these are a bit the easiest way of +improving your quality because you don't really + +0:05:05.325 --> 0:05:08.268 +have to do anything. + +0:05:08.588 --> 0:05:12.588 +There is limits on that bigger models only +get better. + +0:05:12.588 --> 0:05:19.132 +If you have enough training data you can't +do like a handheld layer and you will not work + +0:05:19.132 --> 0:05:24.877 +on very small data but with a recent amount +of data that is the easiest thing. + +0:05:25.305 --> 0:05:33.726 +However, they are challenging with making +better models, bigger motors, and that is the + +0:05:33.726 --> 0:05:34.970 +computation. + +0:05:35.175 --> 0:05:44.482 +So, of course, if you have a bigger model +that can mean that you have longer running + +0:05:44.482 --> 0:05:49.518 +times, if you have models, you have to times. + +0:05:51.171 --> 0:05:56.685 +Normally you cannot paralyze the different +layers because the input to one layer is always + +0:05:56.685 --> 0:06:02.442 +the output of the previous layer, so you propagate +that so it will also increase your runtime. + +0:06:02.822 --> 0:06:10.720 +Then you have to store all your models in +memory. + +0:06:10.720 --> 0:06:20.927 +If you have double weights you will have: +Is more difficult to then do back propagation. + +0:06:20.927 --> 0:06:27.680 +You have to store in between the activations, +so there's not only do you increase the model + +0:06:27.680 --> 0:06:31.865 +in your memory, but also all these other variables +that. + +0:06:34.414 --> 0:06:36.734 +And so in general it is more expensive. + +0:06:37.137 --> 0:06:54.208 +And therefore there's good reasons in looking +into can we make these models sound more efficient. + +0:06:54.134 --> 0:07:00.982 +So it's been through the viewer, you can have +it okay, have one and one day of training time, + +0:07:00.982 --> 0:07:01.274 +or. + +0:07:01.221 --> 0:07:07.535 +Forty thousand euros and then what is the +best machine translation system I can get within + +0:07:07.535 --> 0:07:08.437 +this budget. + +0:07:08.969 --> 0:07:19.085 +And then, of course, you can make the models +bigger, but then you have to train them shorter, + +0:07:19.085 --> 0:07:24.251 +and then we can make more efficient algorithms. 
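[Editor's note: the ensembling described above (same architecture, different random seeds, then averaging the output probabilities) can be sketched in a few lines. The "models" below are just random linear layers standing in for real systems; the shapes are illustrative.]

```python
import torch

def ensemble_next_word_probs(models, decoder_state):
    """Average the output distributions of several models (e.g. different seeds)."""
    probs = [torch.softmax(m(decoder_state), dim=-1) for m in models]
    return torch.stack(probs).mean(dim=0)   # ensembling = averaging probabilities

# Toy usage: three "models" that differ only in their random initialisation.
vocab, d = 100, 16
models = []
for seed in (1, 2, 3):
    torch.manual_seed(seed)
    layer = torch.nn.Linear(d, vocab)
    models.append(lambda h, layer=layer: layer(h))

state = torch.randn(d)
avg = ensemble_next_word_probs(models, state)
print(avg.sum())     # ~1.0, still a probability distribution
```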
+ +0:07:25.925 --> 0:07:31.699 +If you think about efficiency, there's a bit +different scenarios. + +0:07:32.312 --> 0:07:43.635 +So if you're more of coming from the research +community, what you'll be doing is building + +0:07:43.635 --> 0:07:47.913 +a lot of models in your research. + +0:07:48.088 --> 0:07:58.645 +So you're having your test set of maybe sentences, +calculating the blue score, then another model. + +0:07:58.818 --> 0:08:08.911 +So what that means is typically you're training +on millions of cents, so your training time + +0:08:08.911 --> 0:08:14.944 +is long, maybe a day, but maybe in other cases +a week. + +0:08:15.135 --> 0:08:22.860 +The testing is not really the cost efficient, +but the training is very costly. + +0:08:23.443 --> 0:08:37.830 +If you are more thinking of building models +for application, the scenario is quite different. + +0:08:38.038 --> 0:08:46.603 +And then you keep it running, and maybe thousands +of customers are using it in translating. + +0:08:46.603 --> 0:08:47.720 +So in that. + +0:08:48.168 --> 0:08:59.577 +And we will see that it is not always the +same type of challenges you can paralyze some + +0:08:59.577 --> 0:09:07.096 +things in training, which you cannot paralyze +in testing. + +0:09:07.347 --> 0:09:14.124 +For example, in training you have to do back +propagation, so you have to store the activations. + +0:09:14.394 --> 0:09:23.901 +Therefore, in testing we briefly discussed +that we would do it in more detail today in + +0:09:23.901 --> 0:09:24.994 +training. + +0:09:25.265 --> 0:09:36.100 +You know they're a target and you can process +everything in parallel while in testing. + +0:09:36.356 --> 0:09:46.741 +So you can only do one word at a time, and +so you can less paralyze this. + +0:09:46.741 --> 0:09:50.530 +Therefore, it's important. + +0:09:52.712 --> 0:09:55.347 +Is a specific task on this. + +0:09:55.347 --> 0:10:03.157 +For example, it's the efficiency task where +it's about making things as efficient. + +0:10:03.123 --> 0:10:09.230 +Is possible and they can look at different +resources. + +0:10:09.230 --> 0:10:14.207 +So how much deep fuel run time do you need? + +0:10:14.454 --> 0:10:19.366 +See how much memory you need or you can have +a fixed memory budget and then have to build + +0:10:19.366 --> 0:10:20.294 +the best system. + +0:10:20.500 --> 0:10:29.010 +And here is a bit like an example of that, +so there's three teams from Edinburgh from + +0:10:29.010 --> 0:10:30.989 +and they submitted. + +0:10:31.131 --> 0:10:36.278 +So then, of course, if you want to know the +most efficient system you have to do a bit + +0:10:36.278 --> 0:10:36.515 +of. + +0:10:36.776 --> 0:10:44.656 +You want to have a better quality or more +runtime and there's not the one solution. + +0:10:44.656 --> 0:10:46.720 +You can improve your. + +0:10:46.946 --> 0:10:49.662 +And that you see that there are different +systems. + +0:10:49.909 --> 0:11:06.051 +Here is how many words you can do for a second +on the clock, and you want to be as talk as + +0:11:06.051 --> 0:11:07.824 +possible. + +0:11:08.068 --> 0:11:08.889 +And you see here a bit. + +0:11:08.889 --> 0:11:09.984 +This is a little bit different. + +0:11:11.051 --> 0:11:27.717 +You want to be there on the top right corner +and you can get a score of something between + +0:11:27.717 --> 0:11:29.014 +words. + +0:11:30.250 --> 0:11:34.161 +Two hundred and fifty thousand, then you'll +ever come and score zero point three. 
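[Editor's note: the comparison described around here is essentially a quality/speed trade-off curve, where a system "wins" if no other system reaches at least the same quality with at least the same speed. A small sketch of picking those non-dominated points out of a list of results; the names and numbers are invented for illustration.]

```python
def pareto_front(systems):
    """Keep systems not dominated by another with higher speed AND quality."""
    front = []
    for name, speed, quality in systems:
        dominated = any(
            s >= speed and q >= quality and (s > speed or q > quality)
            for _, s, q in systems
        )
        if not dominated:
            front.append((name, speed, quality))
    return front

# Invented example values: (name, words per second, quality score)
results = [("big", 1_000, 33.0), ("base", 5_000, 32.0),
           ("small", 20_000, 29.5), ("tiny-bad", 15_000, 27.0)]
print(pareto_front(results))   # "tiny-bad" is dominated by "small"
```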
+ +0:11:34.834 --> 0:11:41.243 +There is, of course, any bit of a decision, +but the question is, like how far can you again? + +0:11:41.243 --> 0:11:47.789 +Some of all these points on this line would +be winners because they are somehow most efficient + +0:11:47.789 --> 0:11:53.922 +in a way that there's no system which achieves +the same quality with less computational. + +0:11:57.657 --> 0:12:04.131 +So there's the one question of which resources +are you interested. + +0:12:04.131 --> 0:12:07.416 +Are you running it on CPU or GPU? + +0:12:07.416 --> 0:12:11.668 +There's different ways of paralyzing stuff. + +0:12:14.654 --> 0:12:20.777 +Another dimension is how you process your +data. + +0:12:20.777 --> 0:12:27.154 +There's really the best processing and streaming. + +0:12:27.647 --> 0:12:34.672 +So in batch processing you have the whole +document available so you can translate all + +0:12:34.672 --> 0:12:39.981 +sentences in perimeter and then you're interested +in throughput. + +0:12:40.000 --> 0:12:43.844 +But you can then process, for example, especially +in GPS. + +0:12:43.844 --> 0:12:49.810 +That's interesting, you're not translating +one sentence at a time, but you're translating + +0:12:49.810 --> 0:12:56.108 +one hundred sentences or so in parallel, so +you have one more dimension where you can paralyze + +0:12:56.108 --> 0:12:57.964 +and then be more efficient. + +0:12:58.558 --> 0:13:14.863 +On the other hand, for example sorts of documents, +so we learned that if you do badge processing + +0:13:14.863 --> 0:13:16.544 +you have. + +0:13:16.636 --> 0:13:24.636 +Then, of course, it makes sense to sort the +sentences in order to have the minimum thing + +0:13:24.636 --> 0:13:25.535 +attached. + +0:13:27.427 --> 0:13:32.150 +The other scenario is more the streaming scenario +where you do life translation. + +0:13:32.512 --> 0:13:40.212 +So in that case you can't wait for the whole +document to pass, but you have to do. + +0:13:40.520 --> 0:13:49.529 +And then, for example, that's especially in +situations like speech translation, and then + +0:13:49.529 --> 0:13:53.781 +you're interested in things like latency. + +0:13:53.781 --> 0:14:00.361 +So how much do you have to wait to get the +output of a sentence? + +0:14:06.566 --> 0:14:16.956 +Finally, there is the thing about the implementation: +Today we're mainly looking at different algorithms, + +0:14:16.956 --> 0:14:23.678 +different models of how you can model them +in your machine translation system, but of + +0:14:23.678 --> 0:14:29.227 +course for the same algorithms there's also +different implementations. + +0:14:29.489 --> 0:14:38.643 +So, for example, for a machine translation +this tool could be very fast. + +0:14:38.638 --> 0:14:46.615 +So they have like coded a lot of the operations +very low resource, not low resource, low level + +0:14:46.615 --> 0:14:49.973 +on the directly on the QDAC kernels in. + +0:14:50.110 --> 0:15:00.948 +So the same attention network is typically +more efficient in that type of algorithm. + +0:15:00.880 --> 0:15:02.474 +Than in in any other. + +0:15:03.323 --> 0:15:13.105 +Of course, it might be other disadvantages, +so if you're a little worker or have worked + +0:15:13.105 --> 0:15:15.106 +in the practical. + +0:15:15.255 --> 0:15:22.604 +Because it's normally easier to understand, +easier to change, and so on, but there is again + +0:15:22.604 --> 0:15:23.323 +a train. 
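[Editor's note: sorting by length before batching, as mentioned above, keeps sentences of similar length in the same batch so little computation is wasted on padding. A tiny sketch of that bucketing step; whitespace tokenisation is used only for illustration.]

```python
def length_sorted_batches(sentences, batch_size):
    """Group sentences of similar length to minimise padding inside a batch."""
    ordered = sorted(sentences, key=lambda s: len(s.split()))
    return [ordered[i:i + batch_size] for i in range(0, len(ordered), batch_size)]

docs = ["a b", "a b c d e f", "a", "a b c", "a b c d", "a b c d e"]
for batch in length_sorted_batches(docs, batch_size=2):
    print(batch)   # short sentences end up together, long ones together
```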
+ +0:15:23.483 --> 0:15:29.440 +You have to think about, do you want to include +this into my study or comparison or not? + +0:15:29.440 --> 0:15:36.468 +Should it be like I compare different implementations +and I also find the most efficient implementation? + +0:15:36.468 --> 0:15:39.145 +Or is it only about the pure algorithm? + +0:15:42.742 --> 0:15:50.355 +Yeah, when building these systems there is +a different trade-off to do. + +0:15:50.850 --> 0:15:56.555 +So there's one of the traders between memory +and throughput, so how many words can generate + +0:15:56.555 --> 0:15:57.299 +per second. + +0:15:57.557 --> 0:16:03.351 +So typically you can easily like increase +your scruple by increasing the batch size. + +0:16:03.643 --> 0:16:06.899 +So that means you are translating more sentences +in parallel. + +0:16:07.107 --> 0:16:09.241 +And gypsies are very good at that stuff. + +0:16:09.349 --> 0:16:15.161 +It should translate one sentence or one hundred +sentences, not the same time, but its. + +0:16:15.115 --> 0:16:20.997 +Rough are very similar because they have these +efficient metrics multiplication so that you + +0:16:20.997 --> 0:16:24.386 +can do the same operation on all sentences +parallel. + +0:16:24.386 --> 0:16:30.141 +So typically that means if you increase your +benchmark you can do more things in parallel + +0:16:30.141 --> 0:16:31.995 +and you will translate more. + +0:16:31.952 --> 0:16:33.370 +Second. + +0:16:33.653 --> 0:16:43.312 +On the other hand, with this advantage, of +course you will need higher badge sizes and + +0:16:43.312 --> 0:16:44.755 +more memory. + +0:16:44.965 --> 0:16:56.452 +To begin with, the other problem is that you +have such big models that you can only translate + +0:16:56.452 --> 0:16:59.141 +with lower bed sizes. + +0:16:59.119 --> 0:17:08.466 +If you are running out of memory with translating, +one idea to go on that is to decrease your. + +0:17:13.453 --> 0:17:24.456 +Then there is the thing about quality in Screwport, +of course, and before it's like larger models, + +0:17:24.456 --> 0:17:28.124 +but in generally higher quality. + +0:17:28.124 --> 0:17:31.902 +The first one is always this way. + +0:17:32.092 --> 0:17:38.709 +Course: Not always larger model helps you +have over fitting at some point, but in generally. + +0:17:43.883 --> 0:17:52.901 +And with this a bit on this training and testing +thing we had before. + +0:17:53.113 --> 0:17:58.455 +So it wears all the difference between training +and testing, and for the encoder and decoder. + +0:17:58.798 --> 0:18:06.992 +So if we are looking at what mentioned before +at training time, we have a source sentence + +0:18:06.992 --> 0:18:17.183 +here: And how this is processed on a is not +the attention here. + +0:18:17.183 --> 0:18:21.836 +That's a tubical transformer. + +0:18:22.162 --> 0:18:31.626 +And how we can do that on a is that we can +paralyze the ear ever since. + +0:18:31.626 --> 0:18:40.422 +The first thing to know is: So that is, of +course, not in all cases. + +0:18:40.422 --> 0:18:49.184 +We'll later talk about speech translation +where we might want to translate. + +0:18:49.389 --> 0:18:56.172 +Without the general case in, it's like you +have the full sentence you want to translate. + +0:18:56.416 --> 0:19:02.053 +So the important thing is we are here everything +available on the source side. + +0:19:03.323 --> 0:19:13.524 +And then this was one of the big advantages +that you can remember back of transformer. + +0:19:13.524 --> 0:19:15.752 +There are several. 
+ +0:19:16.156 --> 0:19:25.229 +But the other one is now that we can calculate +the full layer. + +0:19:25.645 --> 0:19:29.318 +There is no dependency between this and this +state or this and this state. + +0:19:29.749 --> 0:19:36.662 +So we always did like here to calculate the +key value and query, and based on that you + +0:19:36.662 --> 0:19:37.536 +calculate. + +0:19:37.937 --> 0:19:46.616 +Which means we can do all these calculations +here in parallel and in parallel. + +0:19:48.028 --> 0:19:55.967 +And there, of course, is this very efficiency +because again for GPS it's too bigly possible + +0:19:55.967 --> 0:20:00.887 +to do these things in parallel and one after +each other. + +0:20:01.421 --> 0:20:10.311 +And then we can also for each layer one by +one, and then we calculate here the encoder. + +0:20:10.790 --> 0:20:21.921 +In training now an important thing is that +for the decoder we have the full sentence available + +0:20:21.921 --> 0:20:28.365 +because we know this is the target we should +generate. + +0:20:29.649 --> 0:20:33.526 +We have models now in a different way. + +0:20:33.526 --> 0:20:38.297 +This hidden state is only on the previous +ones. + +0:20:38.598 --> 0:20:51.887 +And the first thing here depends only on this +information, so you see if you remember we + +0:20:51.887 --> 0:20:56.665 +had this masked self-attention. + +0:20:56.896 --> 0:21:04.117 +So that means, of course, we can only calculate +the decoder once the encoder is done, but that's. + +0:21:04.444 --> 0:21:06.656 +Percent can calculate the end quarter. + +0:21:06.656 --> 0:21:08.925 +Then we can calculate here the decoder. + +0:21:09.569 --> 0:21:25.566 +But again in training we have x, y and that +is available so we can calculate everything + +0:21:25.566 --> 0:21:27.929 +in parallel. + +0:21:28.368 --> 0:21:40.941 +So the interesting thing or advantage of transformer +is in training. + +0:21:40.941 --> 0:21:46.408 +We can do it for the decoder. + +0:21:46.866 --> 0:21:54.457 +That means you will have more calculations +because you can only calculate one layer at + +0:21:54.457 --> 0:22:02.310 +a time, but for example the length which is +too bigly quite long or doesn't really matter + +0:22:02.310 --> 0:22:03.270 +that much. + +0:22:05.665 --> 0:22:10.704 +However, in testing this situation is different. + +0:22:10.704 --> 0:22:13.276 +In testing we only have. + +0:22:13.713 --> 0:22:20.622 +So this means we start with a sense: We don't +know the full sentence yet because we ought + +0:22:20.622 --> 0:22:29.063 +to regularly generate that so for the encoder +we have the same here but for the decoder. + +0:22:29.409 --> 0:22:39.598 +In this case we only have the first and the +second instinct, but only for all states in + +0:22:39.598 --> 0:22:40.756 +parallel. + +0:22:41.101 --> 0:22:51.752 +And then we can do the next step for y because +we are putting our most probable one. + +0:22:51.752 --> 0:22:58.643 +We do greedy search or beam search, but you +cannot do. + +0:23:03.663 --> 0:23:16.838 +Yes, so if we are interesting in making things +more efficient for testing, which we see, for + +0:23:16.838 --> 0:23:22.363 +example in the scenario of really our. + +0:23:22.642 --> 0:23:34.286 +It makes sense that we think about our architecture +and that we are currently working on attention + +0:23:34.286 --> 0:23:35.933 +based models. + +0:23:36.096 --> 0:23:44.150 +The decoder there is some of the most time +spent testing and testing. + +0:23:44.150 --> 0:23:47.142 +It's similar, but during. 
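The difference described here is easy to see in code: at training time all target positions are known, so the masked self-attention lets the decoder run over the whole sequence in one parallel pass, while at test time tokens must be produced one after the other. A minimal numpy sketch; `decoder_step` is a hypothetical stand-in for a real model that returns next-token scores given the prefix generated so far.

```python
import numpy as np


def causal_mask(length):
    """Training-time mask: position t may only attend to positions <= t,
    yet all positions can still be computed in a single parallel pass."""
    return np.tril(np.ones((length, length), dtype=bool))


def greedy_decode(decoder_step, encoder_states, bos_id, eos_id, max_len=100):
    """Test-time decoding: each step depends on the previously chosen token,
    so the loop is inherently sequential."""
    tokens = [bos_id]
    for _ in range(max_len):
        logits = decoder_step(encoder_states, tokens)  # assumed model call
        next_id = int(np.argmax(logits))               # greedy choice
        tokens.append(next_id)
        if next_id == eos_id:
            break
    return tokens
```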
+ +0:23:47.167 --> 0:23:50.248 +Nothing about beam search. + +0:23:50.248 --> 0:23:59.833 +It might be even more complicated because +in beam search you have to try different. + +0:24:02.762 --> 0:24:15.140 +So the question is what can you now do in +order to make your model more efficient and + +0:24:15.140 --> 0:24:21.905 +better in translation in these types of cases? + +0:24:24.604 --> 0:24:30.178 +And the one thing is to look into the encoded +decoder trailer. + +0:24:30.690 --> 0:24:43.898 +And then until now we typically assume that +the depth of the encoder and the depth of the + +0:24:43.898 --> 0:24:48.154 +decoder is roughly the same. + +0:24:48.268 --> 0:24:55.553 +So if you haven't thought about it, you just +take what is running well. + +0:24:55.553 --> 0:24:57.678 +You would try to do. + +0:24:58.018 --> 0:25:04.148 +However, we saw now that there is a quite +big challenge and the runtime is a lot longer + +0:25:04.148 --> 0:25:04.914 +than here. + +0:25:05.425 --> 0:25:14.018 +The question is also the case for the calculations, +or do we have there the same issue that we + +0:25:14.018 --> 0:25:21.887 +only get the good quality if we are having +high and high, so we know that making these + +0:25:21.887 --> 0:25:25.415 +more depths is increasing our quality. + +0:25:25.425 --> 0:25:31.920 +But what we haven't talked about is really +important that we increase the depth the same + +0:25:31.920 --> 0:25:32.285 +way. + +0:25:32.552 --> 0:25:41.815 +So what we can put instead also do is something +like this where you have a deep encoder and + +0:25:41.815 --> 0:25:42.923 +a shallow. + +0:25:43.163 --> 0:25:57.386 +So that would be that you, for example, have +instead of having layers on the encoder, and + +0:25:57.386 --> 0:25:59.757 +layers on the. + +0:26:00.080 --> 0:26:10.469 +So in this case the overall depth from start +to end would be similar and so hopefully. + +0:26:11.471 --> 0:26:21.662 +But we could a lot more things hear parallelized, +and hear what is costly at the end during decoding + +0:26:21.662 --> 0:26:22.973 +the decoder. + +0:26:22.973 --> 0:26:29.330 +Because that does change in an outer regressive +way, there we. + +0:26:31.411 --> 0:26:33.727 +And that that can be analyzed. + +0:26:33.727 --> 0:26:38.734 +So here is some examples: Where people have +done all this. + +0:26:39.019 --> 0:26:55.710 +So here it's mainly interested on the orange +things, which is auto-regressive about the + +0:26:55.710 --> 0:26:57.607 +speed up. + +0:26:57.717 --> 0:27:15.031 +You have the system, so agree is not exactly +the same, but it's similar. + +0:27:15.055 --> 0:27:23.004 +It's always the case if you look at speed +up. + +0:27:23.004 --> 0:27:31.644 +Think they put a speed of so that's the baseline. + +0:27:31.771 --> 0:27:35.348 +So between and times as fast. + +0:27:35.348 --> 0:27:42.621 +If you switch from a system to where you have +layers in the. + +0:27:42.782 --> 0:27:52.309 +You see that although you have slightly more +parameters, more calculations are also roughly + +0:27:52.309 --> 0:28:00.283 +the same, but you can speed out because now +during testing you can paralyze. + +0:28:02.182 --> 0:28:09.754 +The other thing is that you're speeding up, +but if you look at the performance it's similar, + +0:28:09.754 --> 0:28:13.500 +so sometimes you improve, sometimes you lose. + +0:28:13.500 --> 0:28:20.421 +There's a bit of losing English to Romania, +but in general the quality is very slow. 
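The deep-encoder/shallow-decoder idea above amounts to nothing more than choosing different layer counts for the two sides, since the encoder runs once in parallel while the decoder layers are executed again and again during auto-regressive decoding. A minimal sketch with PyTorch's generic transformer modules; the 12/1 split is just the kind of configuration discussed, not a recommendation.

```python
import torch.nn as nn

d_model, nhead = 512, 8

# Deep encoder: runs once per source sentence and parallelises well.
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead), num_layers=12)

# Shallow decoder: these are the layers repeated at every decoding step,
# so fewer of them directly cut decoding time.
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead), num_layers=1)
```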
+ +0:28:20.680 --> 0:28:30.343 +So you see that you can keep a similar performance +while improving your speed by just having different. + +0:28:30.470 --> 0:28:34.903 +And you also see the encoder layers from speed. + +0:28:34.903 --> 0:28:38.136 +They don't really metal that much. + +0:28:38.136 --> 0:28:38.690 +Most. + +0:28:38.979 --> 0:28:50.319 +Because if you compare the 12th system to +the 6th system you have a lower performance + +0:28:50.319 --> 0:28:57.309 +with 6th and colder layers but the speed is +similar. + +0:28:57.897 --> 0:29:02.233 +And see the huge decrease is it maybe due +to a lack of data. + +0:29:03.743 --> 0:29:11.899 +Good idea would say it's not the case. + +0:29:11.899 --> 0:29:23.191 +Romanian English should have the same number +of data. + +0:29:24.224 --> 0:29:31.184 +Maybe it's just that something in that language. + +0:29:31.184 --> 0:29:40.702 +If you generate Romanian maybe they need more +target dependencies. + +0:29:42.882 --> 0:29:46.263 +The Wine's the Eye Also Don't Know Any Sex +People Want To. + +0:29:47.887 --> 0:29:49.034 +There could be yeah the. + +0:29:49.889 --> 0:29:58.962 +As the maybe if you go from like a movie sphere +to a hybrid sphere, you can: It's very much + +0:29:58.962 --> 0:30:12.492 +easier to expand the vocabulary to English, +but it must be the vocabulary. + +0:30:13.333 --> 0:30:21.147 +Have to check, but would assume that in this +case the system is not retrained, but it's + +0:30:21.147 --> 0:30:22.391 +trained with. + +0:30:22.902 --> 0:30:30.213 +And that's why I was assuming that they have +the same, but maybe you'll write that in this + +0:30:30.213 --> 0:30:35.595 +piece, for example, if they were pre-trained, +the decoder English. + +0:30:36.096 --> 0:30:43.733 +But don't remember exactly if they do something +like that, but that could be a good. + +0:30:45.325 --> 0:30:52.457 +So this is some of the most easy way to speed +up. + +0:30:52.457 --> 0:31:01.443 +You just switch to hyperparameters, not to +implement anything. + +0:31:02.722 --> 0:31:08.367 +Of course, there's other ways of doing that. + +0:31:08.367 --> 0:31:11.880 +We'll look into two things. + +0:31:11.880 --> 0:31:16.521 +The other thing is the architecture. + +0:31:16.796 --> 0:31:28.154 +We are now at some of the baselines that we +are doing. + +0:31:28.488 --> 0:31:39.978 +However, in translation in the decoder side, +it might not be the best solution. + +0:31:39.978 --> 0:31:41.845 +There is no. + +0:31:42.222 --> 0:31:47.130 +So we can use different types of architectures, +also in the encoder and the. + +0:31:47.747 --> 0:31:52.475 +And there's two ways of what you could do +different, or there's more ways. + +0:31:52.912 --> 0:31:54.825 +We will look into two todays. + +0:31:54.825 --> 0:31:58.842 +The one is average attention, which is a very +simple solution. + +0:31:59.419 --> 0:32:01.464 +You can do as it says. + +0:32:01.464 --> 0:32:04.577 +It's not really attending anymore. + +0:32:04.577 --> 0:32:08.757 +It's just like equal attendance to everything. + +0:32:09.249 --> 0:32:23.422 +And the other idea, which is currently done +in most systems which are optimized to efficiency, + +0:32:23.422 --> 0:32:24.913 +is we're. + +0:32:25.065 --> 0:32:32.623 +But on the decoder side we are then not using +transformer or self attention, but we are using + +0:32:32.623 --> 0:32:39.700 +recurrent neural network because they are the +disadvantage of recurrent neural network. 
+ +0:32:39.799 --> 0:32:48.353 +And then the recurrent is normally easier +to calculate because it only depends on inputs, + +0:32:48.353 --> 0:32:49.684 +the input on. + +0:32:51.931 --> 0:33:02.190 +So what is the difference between decoding +and why is the tension maybe not sufficient + +0:33:02.190 --> 0:33:03.841 +for decoding? + +0:33:04.204 --> 0:33:14.390 +If we want to populate the new state, we only +have to look at the input and the previous + +0:33:14.390 --> 0:33:15.649 +state, so. + +0:33:16.136 --> 0:33:19.029 +We are more conditional here networks. + +0:33:19.029 --> 0:33:19.994 +We have the. + +0:33:19.980 --> 0:33:31.291 +Dependency to a fixed number of previous ones, +but that's rarely used for decoding. + +0:33:31.291 --> 0:33:39.774 +In contrast, in transformer we have this large +dependency, so. + +0:33:40.000 --> 0:33:52.760 +So from t minus one to y t so that is somehow +and mainly not very efficient in this way mean + +0:33:52.760 --> 0:33:56.053 +it's very good because. + +0:33:56.276 --> 0:34:03.543 +However, the disadvantage is that we also +have to do all these calculations, so if we + +0:34:03.543 --> 0:34:10.895 +more view from the point of view of efficient +calculation, this might not be the best. + +0:34:11.471 --> 0:34:20.517 +So the question is, can we change our architecture +to keep some of the advantages but make things + +0:34:20.517 --> 0:34:21.994 +more efficient? + +0:34:24.284 --> 0:34:31.131 +The one idea is what is called the average +attention, and the interesting thing is this + +0:34:31.131 --> 0:34:32.610 +work surprisingly. + +0:34:33.013 --> 0:34:38.917 +So the only idea what you're doing is doing +the decoder. + +0:34:38.917 --> 0:34:42.646 +You're not doing attention anymore. + +0:34:42.646 --> 0:34:46.790 +The attention weights are all the same. + +0:34:47.027 --> 0:35:00.723 +So you don't calculate with query and key +the different weights, and then you just take + +0:35:00.723 --> 0:35:03.058 +equal weights. + +0:35:03.283 --> 0:35:07.585 +So here would be one third from this, one +third from this, and one third. + +0:35:09.009 --> 0:35:14.719 +And while it is sufficient you can now do +precalculation and things get more efficient. + +0:35:15.195 --> 0:35:18.803 +So first go the formula that's maybe not directed +here. + +0:35:18.979 --> 0:35:38.712 +So the difference here is that your new hint +stage is the sum of all the hint states, then. + +0:35:38.678 --> 0:35:40.844 +So here would be with this. + +0:35:40.844 --> 0:35:45.022 +It would be one third of this plus one third +of this. + +0:35:46.566 --> 0:35:57.162 +But if you calculate it this way, it's not +yet being more efficient because you still + +0:35:57.162 --> 0:36:01.844 +have to sum over here all the hidden. + +0:36:04.524 --> 0:36:22.932 +But you can not easily speed up these things +by having an in between value, which is just + +0:36:22.932 --> 0:36:24.568 +always. + +0:36:25.585 --> 0:36:30.057 +If you take this as ten to one, you take this +one class this one. + +0:36:30.350 --> 0:36:36.739 +Because this one then was before this, and +this one was this, so in the end. + +0:36:37.377 --> 0:36:49.545 +So now this one is not the final one in order +to get the final one to do the average. + +0:36:49.545 --> 0:36:50.111 +So. + +0:36:50.430 --> 0:37:00.264 +But then if you do this calculation with speed +up you can do it with a fixed number of steps. 
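The average-attention trick described here replaces the learned attention weights over previous decoder states with a uniform average, and that average can be updated incrementally with one running sum instead of re-summing over the whole prefix at every step. A minimal numpy sketch, assuming the inputs are the decoder's per-position hidden vectors:

```python
import numpy as np


class AverageAttention:
    """Uniform 'attention' over all previous positions, kept as a running sum.

    At step t the context is (h_1 + ... + h_t) / t; storing the sum makes
    each step O(d) instead of O(t * d).
    """

    def __init__(self, dim):
        self.running_sum = np.zeros(dim)
        self.steps = 0

    def step(self, hidden):
        self.running_sum += hidden               # one update per new position
        self.steps += 1
        return self.running_sum / self.steps     # equal weight 1/t for every prefix state
```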
+ +0:37:00.180 --> 0:37:11.300 +Instead of the sun which depends on age, so +you only have to do calculations to calculate + +0:37:11.300 --> 0:37:12.535 +this one. + +0:37:12.732 --> 0:37:21.183 +Can you do the lakes and the lakes? + +0:37:21.183 --> 0:37:32.687 +For example, light bulb here now takes and +then. + +0:37:32.993 --> 0:37:38.762 +That's a very good point and that's why this +is now in the image. + +0:37:38.762 --> 0:37:44.531 +It's not very good so this is the one with +tilder and the tilder. + +0:37:44.884 --> 0:37:57.895 +So this one is just the sum of these two, +because this is just this one. + +0:37:58.238 --> 0:38:08.956 +So the sum of this is exactly as the sum of +these, and the sum of these is the sum of here. + +0:38:08.956 --> 0:38:15.131 +So you only do the sum in here, and the multiplying. + +0:38:15.255 --> 0:38:22.145 +So what you can mainly do here is you can +do it more mathematically. + +0:38:22.145 --> 0:38:31.531 +You can know this by tea taking out of the +sum, and then you can calculate the sum different. + +0:38:36.256 --> 0:38:42.443 +That maybe looks a bit weird and simple, so +we were all talking about this great attention + +0:38:42.443 --> 0:38:47.882 +that we can focus on different parts, and a +bit surprising on this work is now. + +0:38:47.882 --> 0:38:53.321 +In the end it might also work well without +really putting and just doing equal. + +0:38:53.954 --> 0:38:56.164 +Mean it's not that easy. + +0:38:56.376 --> 0:38:58.261 +It's like sometimes this is working. + +0:38:58.261 --> 0:39:00.451 +There's also report weight work that well. + +0:39:01.481 --> 0:39:05.848 +But I think it's an interesting way and it +maybe shows that a lot of. + +0:39:05.805 --> 0:39:10.669 +Things in the self or in the transformer paper +which are more put as like yet. + +0:39:10.669 --> 0:39:14.301 +These are some hyperparameters that are rounded +like that. + +0:39:14.301 --> 0:39:19.657 +You do the lay on all in between and that +you do a feat forward before and things like + +0:39:19.657 --> 0:39:20.026 +that. + +0:39:20.026 --> 0:39:25.567 +But these are also all important and the right +set up around that is also very important. + +0:39:28.969 --> 0:39:38.598 +The other thing you can do in the end is not +completely different from this one. + +0:39:38.598 --> 0:39:42.521 +It's just like a very different. + +0:39:42.942 --> 0:39:54.338 +And that is a recurrent network which also +has this type of highway connection that can + +0:39:54.338 --> 0:40:01.330 +ignore the recurrent unit and directly put +the input. + +0:40:01.561 --> 0:40:10.770 +It's not really adding out, but if you see +the hitting step is your input, but what you + +0:40:10.770 --> 0:40:15.480 +can do is somehow directly go to the output. + +0:40:17.077 --> 0:40:28.390 +These are the four components of the simple +return unit, and the unit is motivated by GIS + +0:40:28.390 --> 0:40:33.418 +and by LCMs, which we have seen before. + +0:40:33.513 --> 0:40:43.633 +And that has proven to be very good for iron +ends, which allows you to have a gate on your. + +0:40:44.164 --> 0:40:48.186 +In this thing we have two gates, the reset +gate and the forget gate. + +0:40:48.768 --> 0:40:57.334 +So first we have the general structure which +has a cell state. + +0:40:57.334 --> 0:41:01.277 +Here we have the cell state. + +0:41:01.361 --> 0:41:09.661 +And then this goes next, and we always get +the different cell states over the times that. + +0:41:10.030 --> 0:41:11.448 +This Is the South Stand. 
+ +0:41:11.771 --> 0:41:16.518 +How do we now calculate that just assume we +have an initial cell safe here? + +0:41:17.017 --> 0:41:19.670 +But the first thing is we're doing the forget +game. + +0:41:20.060 --> 0:41:34.774 +The forgetting models should the new cell +state mainly depend on the previous cell state + +0:41:34.774 --> 0:41:40.065 +or should it depend on our age. + +0:41:40.000 --> 0:41:41.356 +Like Add to Them. + +0:41:41.621 --> 0:41:42.877 +How can we model that? + +0:41:44.024 --> 0:41:45.599 +First we were at a cocktail. + +0:41:45.945 --> 0:41:52.151 +The forget gait is depending on minus one. + +0:41:52.151 --> 0:41:56.480 +You also see here the former. + +0:41:57.057 --> 0:42:01.963 +So we are multiplying both the cell state +and our input. + +0:42:01.963 --> 0:42:04.890 +With some weights we are getting. + +0:42:05.105 --> 0:42:08.472 +We are putting some Bay Inspector and then +we are doing Sigma Weed on that. + +0:42:08.868 --> 0:42:13.452 +So in the end we have numbers between zero +and one saying for each dimension. + +0:42:13.853 --> 0:42:22.041 +Like how much if it's near to zero we will +mainly use the new input. + +0:42:22.041 --> 0:42:31.890 +If it's near to one we will keep the input +and ignore the input at this dimension. + +0:42:33.313 --> 0:42:40.173 +And by this motivation we can then create +here the new sound state, and here you see + +0:42:40.173 --> 0:42:41.141 +the formal. + +0:42:41.601 --> 0:42:55.048 +So you take your foot back gate and multiply +it with your class. + +0:42:55.048 --> 0:43:00.427 +So if my was around then. + +0:43:00.800 --> 0:43:07.405 +In the other case, when the value was others, +that's what you added. + +0:43:07.405 --> 0:43:10.946 +Then you're adding a transformation. + +0:43:11.351 --> 0:43:24.284 +So if this value was maybe zero then you're +putting most of the information from inputting. + +0:43:25.065 --> 0:43:26.947 +Is already your element? + +0:43:26.947 --> 0:43:30.561 +The only question is now based on your element. + +0:43:30.561 --> 0:43:32.067 +What is the output? + +0:43:33.253 --> 0:43:47.951 +And there you have another opportunity so +you can either take the output or instead you + +0:43:47.951 --> 0:43:50.957 +prefer the input. + +0:43:52.612 --> 0:43:58.166 +So is the value also the same for the recept +game and the forget game. + +0:43:58.166 --> 0:43:59.417 +Yes, the movie. + +0:44:00.900 --> 0:44:10.004 +Yes exactly so the matrices are different +and therefore it can be and that should be + +0:44:10.004 --> 0:44:16.323 +and maybe there is sometimes you want to have +information. + +0:44:16.636 --> 0:44:23.843 +So here again we have this vector with values +between zero and which says controlling how + +0:44:23.843 --> 0:44:25.205 +the information. + +0:44:25.505 --> 0:44:36.459 +And then the output is calculated here similar +to a cell stage, but again input is from. + +0:44:36.536 --> 0:44:45.714 +So either the reset gate decides should give +what is currently stored in there, or. + +0:44:46.346 --> 0:44:58.647 +So it's not exactly as the thing we had before, +with the residual connections where we added + +0:44:58.647 --> 0:45:01.293 +up, but here we do. + +0:45:04.224 --> 0:45:08.472 +This is the general idea of a simple recurrent +neural network. + +0:45:08.472 --> 0:45:13.125 +Then we will now look at how we can make things +even more efficient. + +0:45:13.125 --> 0:45:17.104 +But first do you have more questions on how +it is working? 
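The gates described above can be written down compactly: the forget gate decides how much of the previous cell state to keep versus the transformed input, and the reset gate decides whether the output comes from the cell state or directly from the input (the highway part). A minimal numpy sketch of one step; the exact parameterisation varies between papers, so the element-wise weights v_f, v_r on the previous cell state and the assumption that input and hidden state share the same dimension are simplifications, not the definitive formulation.

```python
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sru_step(x_t, c_prev, W, W_f, W_r, v_f, v_r, b_f, b_r):
    """One step of a simple-recurrent-unit-style cell with forget and reset gates."""
    x_tilde = W @ x_t                                   # transformed input
    f_t = sigmoid(W_f @ x_t + v_f * c_prev + b_f)       # forget gate in (0, 1)
    r_t = sigmoid(W_r @ x_t + v_r * c_prev + b_r)       # reset gate in (0, 1)
    c_t = f_t * c_prev + (1.0 - f_t) * x_tilde          # keep old state vs. take new input
    h_t = r_t * c_t + (1.0 - r_t) * x_t                 # highway: cell state vs. raw input
    return h_t, c_t
```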
+ +0:45:23.063 --> 0:45:38.799 +Now these calculations are a bit where things +get more efficient because this somehow. + +0:45:38.718 --> 0:45:43.177 +It depends on all the other damage for the +second one also. + +0:45:43.423 --> 0:45:48.904 +Because if you do a matrix multiplication +with a vector like for the output vector, each + +0:45:48.904 --> 0:45:52.353 +diameter of the output vector depends on all +the other. + +0:45:52.973 --> 0:46:06.561 +The cell state here depends because this one +is used here, and somehow the first dimension + +0:46:06.561 --> 0:46:11.340 +of the cell state only depends. + +0:46:11.931 --> 0:46:17.973 +In order to make that, of course, is sometimes +again making things less paralyzeable if things + +0:46:17.973 --> 0:46:18.481 +depend. + +0:46:19.359 --> 0:46:35.122 +Can easily make that different by changing +from the metric product to not a vector. + +0:46:35.295 --> 0:46:51.459 +So you do first, just like inside here, you +take like the first dimension, my second dimension. + +0:46:52.032 --> 0:46:53.772 +Is, of course, narrow. + +0:46:53.772 --> 0:46:59.294 +This should be reset or this should be because +it should be a different. + +0:46:59.899 --> 0:47:12.053 +Now the first dimension only depends on the +first dimension, so you don't have dependencies + +0:47:12.053 --> 0:47:16.148 +any longer between dimensions. + +0:47:18.078 --> 0:47:25.692 +Maybe it gets a bit clearer if you see about +it in this way, so what we have to do now. + +0:47:25.966 --> 0:47:31.911 +First, we have to do a metrics multiplication +on to gather and to get the. + +0:47:32.292 --> 0:47:38.041 +And then we only have the element wise operations +where we take this output. + +0:47:38.041 --> 0:47:38.713 +We take. + +0:47:39.179 --> 0:47:42.978 +Minus one and our original. + +0:47:42.978 --> 0:47:52.748 +Here we only have elemental abrasions which +can be optimally paralyzed. + +0:47:53.273 --> 0:48:07.603 +So here we have additional paralyzed things +across the dimension and don't have to do that. + +0:48:09.929 --> 0:48:24.255 +Yeah, but this you can do like in parallel +again for all xts. + +0:48:24.544 --> 0:48:33.014 +Here you can't do it in parallel, but you +only have to do it on each seat, and then you + +0:48:33.014 --> 0:48:34.650 +can parallelize. + +0:48:35.495 --> 0:48:39.190 +But this maybe for the dimension. + +0:48:39.190 --> 0:48:42.124 +Maybe it's also important. + +0:48:42.124 --> 0:48:46.037 +I don't know if they have tried it. + +0:48:46.037 --> 0:48:55.383 +I assume it's not only for dimension reduction, +but it's hard because you can easily. + +0:49:01.001 --> 0:49:08.164 +People have even like made the second thing +even more easy. + +0:49:08.164 --> 0:49:10.313 +So there is this. + +0:49:10.313 --> 0:49:17.954 +This is how we have the highway connections +in the transformer. + +0:49:17.954 --> 0:49:20.699 +Then it's like you do. + +0:49:20.780 --> 0:49:24.789 +So that is like how things are put together +as a transformer. + +0:49:25.125 --> 0:49:39.960 +And that is a similar and simple recurring +neural network where you do exactly the same + +0:49:39.960 --> 0:49:44.512 +for the so you don't have. + +0:49:46.326 --> 0:49:47.503 +This type of things. + +0:49:49.149 --> 0:50:01.196 +And with this we are at the end of how to +make efficient architectures before we go to + +0:50:01.196 --> 0:50:02.580 +the next. 
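The efficiency argument above is that all matrix multiplications involve only the inputs, so they can be computed for every time step at once, and what remains sequential is a purely element-wise recurrence with no dependencies between dimensions. A sketch of that reorganisation, in the simplified variant where the gates depend only on the input (which is what makes the batching possible); shapes and the choice of what feeds the highway connection are assumptions.

```python
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sru_sequence(X, W, W_f, W_r, b_f, b_r):
    """X: (T, d). All matmuls are batched over time; only the cheap
    element-wise cell update runs step by step."""
    X_tilde = X @ W.T                          # (T, d) transformed inputs, fully parallel
    F = sigmoid(X @ W_f.T + b_f)               # (T, d) forget gates, fully parallel
    R = sigmoid(X @ W_r.T + b_r)               # (T, d) reset gates, fully parallel

    c = np.zeros(X.shape[1])
    H = np.zeros_like(X_tilde)
    for t in range(X.shape[0]):                # sequential, but element-wise only
        c = F[t] * c + (1.0 - F[t]) * X_tilde[t]
        H[t] = R[t] * c + (1.0 - R[t]) * X[t]  # highway straight from the input
    return H
```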
+ +0:50:13.013 --> 0:50:24.424 +Between the ink or the trader and the architectures +there is a next technique which is used in + +0:50:24.424 --> 0:50:28.988 +nearly all deburning very successful. + +0:50:29.449 --> 0:50:43.463 +So the idea is can we extract the knowledge +from a large network into a smaller one, but + +0:50:43.463 --> 0:50:45.983 +it's similarly. + +0:50:47.907 --> 0:50:53.217 +And the nice thing is that this really works, +and it may be very, very surprising. + +0:50:53.673 --> 0:51:03.000 +So the idea is that we have a large straw +model which we train for long, and the question + +0:51:03.000 --> 0:51:07.871 +is: Can that help us to train a smaller model? + +0:51:08.148 --> 0:51:16.296 +So can what we refer to as teacher model tell +us better to build a small student model than + +0:51:16.296 --> 0:51:17.005 +before. + +0:51:17.257 --> 0:51:27.371 +So what we're before in it as a student model, +we learn from the data and that is how we train + +0:51:27.371 --> 0:51:28.755 +our systems. + +0:51:29.249 --> 0:51:37.949 +The question is: Can we train this small model +better if we are not only learning from the + +0:51:37.949 --> 0:51:46.649 +data, but we are also learning from a large +model which has been trained maybe in the same + +0:51:46.649 --> 0:51:47.222 +data? + +0:51:47.667 --> 0:51:55.564 +So that you have then in the end a smaller +model that is somehow better performing than. + +0:51:55.895 --> 0:51:59.828 +And maybe that's on the first view. + +0:51:59.739 --> 0:52:05.396 +Very very surprising because it has seen the +same data so it should have learned the same + +0:52:05.396 --> 0:52:11.053 +so the baseline model trained only on the data +and the student teacher knowledge to still + +0:52:11.053 --> 0:52:11.682 +model it. + +0:52:11.682 --> 0:52:17.401 +They all have seen only this data because +your teacher modeling was also trained typically + +0:52:17.401 --> 0:52:19.161 +only on this model however. + +0:52:20.580 --> 0:52:30.071 +It has by now shown that by many ways the +model trained in the teacher and analysis framework + +0:52:30.071 --> 0:52:32.293 +is performing better. + +0:52:33.473 --> 0:52:40.971 +A bit of an explanation when we see how that +works. + +0:52:40.971 --> 0:52:46.161 +There's different ways of doing it. + +0:52:46.161 --> 0:52:47.171 +Maybe. + +0:52:47.567 --> 0:52:51.501 +So how does it work? + +0:52:51.501 --> 0:53:04.802 +This is our student network, the normal one, +some type of new network. + +0:53:04.802 --> 0:53:06.113 +We're. + +0:53:06.586 --> 0:53:17.050 +So we are training the model to predict the +same thing as we are doing that by calculating. + +0:53:17.437 --> 0:53:23.173 +The cross angry loss was defined in a way +where saying all the probabilities for the + +0:53:23.173 --> 0:53:25.332 +correct word should be as high. + +0:53:25.745 --> 0:53:31.576 +So your calculating gear out of probability +is always and each time step you have an out + +0:53:31.576 --> 0:53:32.624 +of probability. + +0:53:32.624 --> 0:53:38.258 +What is the most probable in the next word +and your training signal is put as much of + +0:53:38.258 --> 0:53:43.368 +your probability mass to the correct word to +the word that is there in train. 
+ +0:53:43.903 --> 0:53:51.367 +And this is the chief by this cross entry +loss, which says with some of the all training + +0:53:51.367 --> 0:53:58.664 +examples of all positions, with some of the +full vocabulary, and then this one is this + +0:53:58.664 --> 0:54:03.947 +one that this current word is the case word +in the vocabulary. + +0:54:04.204 --> 0:54:11.339 +And then we take here the lock for the ability +of that, so what we made me do is: We have + +0:54:11.339 --> 0:54:27.313 +this metric here, so each position of your +vocabulary size. + +0:54:27.507 --> 0:54:38.656 +In the end what you just do is some of these +three lock probabilities, and then you want + +0:54:38.656 --> 0:54:40.785 +to have as much. + +0:54:41.041 --> 0:54:54.614 +So although this is a thumb over this metric +here, in the end of each dimension you. + +0:54:54.794 --> 0:55:06.366 +So that is a normal cross end to be lost that +we have discussed at the very beginning of + +0:55:06.366 --> 0:55:07.016 +how. + +0:55:08.068 --> 0:55:15.132 +So what can we do differently in the teacher +network? + +0:55:15.132 --> 0:55:23.374 +We also have a teacher network which is trained +on large data. + +0:55:24.224 --> 0:55:35.957 +And of course this distribution might be better +than the one from the small model because it's. + +0:55:36.456 --> 0:55:40.941 +So in this case we have now the training signal +from the teacher network. + +0:55:41.441 --> 0:55:46.262 +And it's the same way as we had before. + +0:55:46.262 --> 0:55:56.507 +The only difference is we're training not +the ground truths per ability distribution + +0:55:56.507 --> 0:55:59.159 +year, which is sharp. + +0:55:59.299 --> 0:56:11.303 +That's also a probability, so this word has +a high probability, but have some probability. + +0:56:12.612 --> 0:56:19.577 +And that is the main difference. + +0:56:19.577 --> 0:56:30.341 +Typically you do like the interpretation of +these. + +0:56:33.213 --> 0:56:38.669 +Because there's more information contained +in the distribution than in the front booth, + +0:56:38.669 --> 0:56:44.187 +because it encodes more information about the +language, because language always has more + +0:56:44.187 --> 0:56:47.907 +options to put alone, that's the same sentence +yes exactly. + +0:56:47.907 --> 0:56:53.114 +So there's ambiguity in there that is encoded +hopefully very well in the complaint. + +0:56:53.513 --> 0:56:57.257 +Trade you two networks so better than a student +network you have in there from your learner. + +0:56:57.537 --> 0:57:05.961 +So maybe often there's only one correct word, +but it might be two or three, and then all + +0:57:05.961 --> 0:57:10.505 +of these three have a probability distribution. + +0:57:10.590 --> 0:57:21.242 +And then is the main advantage or one explanation +of why it's better to train from the. + +0:57:21.361 --> 0:57:32.652 +Of course, it's good to also keep the signal +in there because then you can prevent it because + +0:57:32.652 --> 0:57:33.493 +crazy. + +0:57:37.017 --> 0:57:49.466 +Any more questions on the first type of knowledge +distillation, also distribution changes. + +0:57:50.550 --> 0:58:02.202 +Coming around again, this would put it a bit +different, so this is not a solution to maintenance + +0:58:02.202 --> 0:58:04.244 +or distribution. + +0:58:04.744 --> 0:58:12.680 +But don't think it's performing worse than +only doing the ground tours because they also. + +0:58:13.113 --> 0:58:21.254 +So it's more like it's not improving you would +assume it's similarly helping you, but. 
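The change described here is only in the training target: instead of putting all probability mass on the single reference word, the student is also trained towards the teacher's full output distribution, typically as an interpolation of the two losses. A minimal numpy sketch; `student_logits` and `teacher_probs` are assumed to be per-position outputs of the two models, and the interpolation weight is a free choice.

```python
import numpy as np


def log_softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))


def distillation_loss(student_logits, teacher_probs, reference_ids, alpha=0.5):
    """Word-level knowledge distillation.

    student_logits: (T, V) raw scores of the student
    teacher_probs:  (T, V) probability distribution of the teacher
    reference_ids:  (T,)   indices of the ground-truth words
    alpha:          interpolation weight between teacher and reference signal
    """
    log_p = log_softmax(student_logits)
    ce_reference = -log_p[np.arange(len(reference_ids)), reference_ids].mean()
    ce_teacher = -(teacher_probs * log_p).sum(axis=-1).mean()   # soft targets
    return alpha * ce_teacher + (1.0 - alpha) * ce_reference
```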
+ +0:58:21.481 --> 0:58:28.145 +Of course, if you now have a teacher, maybe +you have no danger on your target to Maine, + +0:58:28.145 --> 0:58:28.524 +but. + +0:58:28.888 --> 0:58:39.895 +Then you can use this one which is not the +ground truth but helpful to learn better for + +0:58:39.895 --> 0:58:42.147 +the distribution. + +0:58:46.326 --> 0:58:57.012 +The second idea is to do sequence level knowledge +distillation, so what we have in this case + +0:58:57.012 --> 0:59:02.757 +is we have looked at each position independently. + +0:59:03.423 --> 0:59:05.436 +Mean, we do that often. + +0:59:05.436 --> 0:59:10.972 +We are not generating a lot of sequences, +but that has a problem. + +0:59:10.972 --> 0:59:13.992 +We have this propagation of errors. + +0:59:13.992 --> 0:59:16.760 +We start with one area and then. + +0:59:17.237 --> 0:59:27.419 +So if we are doing word-level knowledge dissolution, +we are treating each word in the sentence independently. + +0:59:28.008 --> 0:59:32.091 +So we are not trying to like somewhat model +the dependency between. + +0:59:32.932 --> 0:59:47.480 +We can try to do that by sequence level knowledge +dissolution, but the problem is, of course,. + +0:59:47.847 --> 0:59:53.478 +So we can that for each position we can get +a distribution over all the words at this. + +0:59:53.793 --> 1:00:05.305 +But if we want to have a distribution of all +possible target sentences, that's not possible + +1:00:05.305 --> 1:00:06.431 +because. + +1:00:08.508 --> 1:00:15.940 +Area, so we can then again do a bit of a heck +on that. + +1:00:15.940 --> 1:00:23.238 +If we can't have a distribution of all sentences, +it. + +1:00:23.843 --> 1:00:30.764 +So what we can't do is you can not use the +teacher network and sample different translations. + +1:00:31.931 --> 1:00:39.327 +And now we can do different ways to train +them. + +1:00:39.327 --> 1:00:49.343 +We can use them as their probability, the +easiest one to assume. + +1:00:50.050 --> 1:00:56.373 +So what that ends to is that we're taking +our teacher network, we're generating some + +1:00:56.373 --> 1:01:01.135 +translations, and these ones we're using as +additional trading. + +1:01:01.781 --> 1:01:11.382 +Then we have mainly done this sequence level +because the teacher network takes us. + +1:01:11.382 --> 1:01:17.513 +These are all probable translations of the +sentence. + +1:01:26.286 --> 1:01:34.673 +And then you can do a bit of a yeah, and you +can try to better make a bit of an interpolated + +1:01:34.673 --> 1:01:36.206 +version of that. + +1:01:36.716 --> 1:01:42.802 +So what people have also done is like subsequent +level interpolations. + +1:01:42.802 --> 1:01:52.819 +You generate here several translations: But +then you don't use all of them. + +1:01:52.819 --> 1:02:00.658 +You do some metrics on which of these ones. + +1:02:01.021 --> 1:02:12.056 +So it's a bit more training on this brown +chose which might be improbable or unreachable + +1:02:12.056 --> 1:02:16.520 +because we can generate everything. + +1:02:16.676 --> 1:02:23.378 +And we are giving it an easier solution which +is also good quality and training of that. + +1:02:23.703 --> 1:02:32.602 +So you're not training it on a very difficult +solution, but you're training it on an easier + +1:02:32.602 --> 1:02:33.570 +solution. + +1:02:36.356 --> 1:02:38.494 +Any More Questions to This. + +1:02:40.260 --> 1:02:41.557 +Yeah. + +1:02:41.461 --> 1:02:44.296 +Good. 
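Sequence-level knowledge distillation as described above is mostly a data-preparation step: decode the source side of the training data with the teacher and use those outputs as targets for the student, optionally mixed with the original references. A minimal sketch; `teacher_translate` and `train` are assumed toolkit functions, not a specific API.

```python
def sequence_level_distillation(train_src, train_tgt, teacher_translate, train,
                                keep_reference=True):
    """Build a distilled training set and train the student on it.

    teacher_translate: maps a list of source sentences to translations (assumed)
    train:             trains a student model on parallel (src, tgt) lists (assumed)
    """
    distilled_tgt = teacher_translate(train_src)   # teacher decodes the training data
    src, tgt = list(train_src), list(distilled_tgt)
    if keep_reference:                              # optionally keep the real references too
        src += list(train_src)
        tgt += list(train_tgt)
    return train(src, tgt)
```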
+ +1:02:43.843 --> 1:03:01.642 +Is to look at the vocabulary, so the problem +is we have seen that vocabulary calculations + +1:03:01.642 --> 1:03:06.784 +are often very presuming. + +1:03:09.789 --> 1:03:19.805 +The thing is that most of the vocabulary is +not needed for each sentence, so in each sentence. + +1:03:20.280 --> 1:03:28.219 +The question is: Can we somehow easily precalculate, +which words are probable to occur in the sentence, + +1:03:28.219 --> 1:03:30.967 +and then only calculate these ones? + +1:03:31.691 --> 1:03:34.912 +And this can be done so. + +1:03:34.912 --> 1:03:43.932 +For example, if you have sentenced card, it's +probably not happening. + +1:03:44.164 --> 1:03:48.701 +So what you can try to do is to limit your +vocabulary. + +1:03:48.701 --> 1:03:51.093 +You're considering for each. + +1:03:51.151 --> 1:04:04.693 +So you're no longer taking the full vocabulary +as possible output, but you're restricting. + +1:04:06.426 --> 1:04:18.275 +That typically works is that we limit it by +the most frequent words we always take because + +1:04:18.275 --> 1:04:23.613 +these are not so easy to align to words. + +1:04:23.964 --> 1:04:32.241 +To take the most treatment taggin' words and +then work that often aligns with one of the + +1:04:32.241 --> 1:04:32.985 +source. + +1:04:33.473 --> 1:04:46.770 +So for each source word you calculate the +word alignment on your training data, and then + +1:04:46.770 --> 1:04:51.700 +you calculate which words occur. + +1:04:52.352 --> 1:04:57.680 +And then for decoding you build this union +of maybe the source word list that other. + +1:04:59.960 --> 1:05:02.145 +Are like for each source work. + +1:05:02.145 --> 1:05:08.773 +One of the most frequent translations of these +source words, for example for each source work + +1:05:08.773 --> 1:05:13.003 +like in the most frequent ones, and then the +most frequent. + +1:05:13.193 --> 1:05:24.333 +In total, if you have short sentences, you +have a lot less words, so in most cases it's + +1:05:24.333 --> 1:05:26.232 +not more than. + +1:05:26.546 --> 1:05:33.957 +And so you have dramatically reduced your +vocabulary, and thereby can also fax a depot. + +1:05:35.495 --> 1:05:43.757 +That easy does anybody see what is challenging +here and why that might not always need. + +1:05:47.687 --> 1:05:54.448 +The performance is not why this might not. + +1:05:54.448 --> 1:06:01.838 +If you implement it, it might not be a strong. + +1:06:01.941 --> 1:06:06.053 +You have to store this list. + +1:06:06.053 --> 1:06:14.135 +You have to burn the union and of course your +safe time. + +1:06:14.554 --> 1:06:21.920 +The second thing the vocabulary is used in +our last step, so we have the hidden state, + +1:06:21.920 --> 1:06:23.868 +and then we calculate. + +1:06:24.284 --> 1:06:29.610 +Now we are not longer calculating them for +all output words, but for a subset of them. + +1:06:30.430 --> 1:06:35.613 +However, this metric multiplication is typically +parallelized with the perfect but good. + +1:06:35.956 --> 1:06:46.937 +But if you not only calculate some of them, +if you're not modeling it right, it will take + +1:06:46.937 --> 1:06:52.794 +as long as before because of the nature of +the. + +1:06:56.776 --> 1:07:07.997 +Here for beam search there's some ideas of +course you can go back to greedy search because + +1:07:07.997 --> 1:07:10.833 +that's more efficient. 
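The vocabulary-selection idea above can be sketched as: always keep the most frequent target words, and for every source word in the input add its most frequently aligned translations; the output layer is then only evaluated on this union. The alignment-based translation table is assumed to have been extracted from the training data beforehand.

```python
def build_shortlist(source_tokens, translation_table, frequent_targets, per_word=20):
    """Restrict the output vocabulary for one input sentence.

    translation_table: dict source word -> target words, ordered by how often
                       they were aligned in the training data (assumed given)
    frequent_targets:  globally most frequent target words, always included
    """
    shortlist = set(frequent_targets)
    for word in source_tokens:
        shortlist.update(translation_table.get(word, [])[:per_word])
    return sorted(shortlist)

# Whether restricting the softmax to the shortlist actually saves time depends
# on how well the reduced matrix product is implemented, as discussed above.
```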
+ +1:07:11.651 --> 1:07:18.347 +And better quality, and you can buffer some +states in between, so how much buffering it's + +1:07:18.347 --> 1:07:22.216 +again this tradeoff between calculation and +memory. + +1:07:25.125 --> 1:07:41.236 +Then at the end of today what we want to look +into is one last type of new machine translation + +1:07:41.236 --> 1:07:42.932 +approach. + +1:07:43.403 --> 1:07:53.621 +And the idea is what we've already seen in +our first two steps is that this ultra aggressive + +1:07:53.621 --> 1:07:57.246 +park is taking community coding. + +1:07:57.557 --> 1:08:04.461 +Can process everything in parallel, but we +are always taking the most probable and then. + +1:08:05.905 --> 1:08:10.476 +The question is: Do we really need to do that? + +1:08:10.476 --> 1:08:14.074 +Therefore, there is a bunch of work. + +1:08:14.074 --> 1:08:16.602 +Can we do it differently? + +1:08:16.602 --> 1:08:19.616 +Can we generate a full target? + +1:08:20.160 --> 1:08:29.417 +We'll see it's not that easy and there's still +an open debate whether this is really faster + +1:08:29.417 --> 1:08:31.832 +and quality, but think. + +1:08:32.712 --> 1:08:45.594 +So, as said, what we have done is our encoder +decoder where we can process our encoder color, + +1:08:45.594 --> 1:08:50.527 +and then the output always depends. + +1:08:50.410 --> 1:08:54.709 +We generate the output and then we have to +put it here the wide because then everything + +1:08:54.709 --> 1:08:56.565 +depends on the purpose of the output. + +1:08:56.916 --> 1:09:10.464 +This is what is referred to as an outer-regressive +model and nearly outs speech generation and + +1:09:10.464 --> 1:09:16.739 +language generation or works in this outer. + +1:09:18.318 --> 1:09:21.132 +So the motivation is, can we do that more +efficiently? + +1:09:21.361 --> 1:09:31.694 +And can we somehow process all target words +in parallel? + +1:09:31.694 --> 1:09:41.302 +So instead of doing it one by one, we are +inputting. + +1:09:45.105 --> 1:09:46.726 +So how does it work? + +1:09:46.726 --> 1:09:50.587 +So let's first have a basic auto regressive +mode. + +1:09:50.810 --> 1:09:53.551 +So the encoder looks as it is before. + +1:09:53.551 --> 1:09:58.310 +That's maybe not surprising because here we +know we can paralyze. + +1:09:58.618 --> 1:10:04.592 +So we have put in here our ink holder and +generated the ink stash, so that's exactly + +1:10:04.592 --> 1:10:05.295 +the same. + +1:10:05.845 --> 1:10:16.229 +However, now we need to do one more thing: +One challenge is what we had before and that's + +1:10:16.229 --> 1:10:26.799 +a challenge of natural language generation +like machine translation. + +1:10:32.672 --> 1:10:38.447 +We generate until we generate this out of +end of center stock, but if we now generate + +1:10:38.447 --> 1:10:44.625 +everything at once that's no longer possible, +so we cannot generate as long because we only + +1:10:44.625 --> 1:10:45.632 +generated one. + +1:10:46.206 --> 1:10:58.321 +So the question is how can we now determine +how long the sequence is, and we can also accelerate. + +1:11:00.000 --> 1:11:06.384 +Yes, but there would be one idea, and there +is other work which tries to do that. + +1:11:06.806 --> 1:11:15.702 +However, in here there's some work already +done before and maybe you remember we had the + +1:11:15.702 --> 1:11:20.900 +IBM models and there was this concept of fertility. 
+ +1:11:21.241 --> 1:11:26.299 +The concept of fertility is means like for +one saucepan, and how many target pores does + +1:11:26.299 --> 1:11:27.104 +it translate? + +1:11:27.847 --> 1:11:34.805 +And exactly that we try to do here, and that +means we are calculating like at the top we + +1:11:34.805 --> 1:11:36.134 +are calculating. + +1:11:36.396 --> 1:11:42.045 +So it says word is translated into word. + +1:11:42.045 --> 1:11:54.171 +Word might be translated into words into, +so we're trying to predict in how many words. + +1:11:55.935 --> 1:12:10.314 +And then the end of the anchor, so this is +like a length estimation. + +1:12:10.314 --> 1:12:15.523 +You can do it otherwise. + +1:12:16.236 --> 1:12:24.526 +You initialize your decoder input and we know +it's good with word embeddings so we're trying + +1:12:24.526 --> 1:12:28.627 +to do the same thing and what people then do. + +1:12:28.627 --> 1:12:35.224 +They initialize it again with word embedding +but in the frequency of the. + +1:12:35.315 --> 1:12:36.460 +So we have the cartilage. + +1:12:36.896 --> 1:12:47.816 +So one has two, so twice the is and then one +is, so that is then our initialization. + +1:12:48.208 --> 1:12:57.151 +In other words, if you don't predict fertilities +but predict lengths, you can just initialize + +1:12:57.151 --> 1:12:57.912 +second. + +1:12:58.438 --> 1:13:07.788 +This often works a bit better, but that's +the other. + +1:13:07.788 --> 1:13:16.432 +Now you have everything in training and testing. + +1:13:16.656 --> 1:13:18.621 +This is all available at once. + +1:13:20.280 --> 1:13:31.752 +Then we can generate everything in parallel, +so we have the decoder stack, and that is now + +1:13:31.752 --> 1:13:33.139 +as before. + +1:13:35.395 --> 1:13:41.555 +And then we're doing the translation predictions +here on top of it in order to do. + +1:13:43.083 --> 1:13:59.821 +And then we are predicting here the target +words and once predicted, and that is the basic + +1:13:59.821 --> 1:14:00.924 +idea. + +1:14:01.241 --> 1:14:08.171 +Machine translation: Where the idea is, we +don't have to do one by one what we're. + +1:14:10.210 --> 1:14:13.900 +So this looks really, really, really great. + +1:14:13.900 --> 1:14:20.358 +On the first view there's one challenge with +this, and this is the baseline. + +1:14:20.358 --> 1:14:27.571 +Of course there's some improvements, but in +general the quality is often significant. + +1:14:28.068 --> 1:14:32.075 +So here you see the baseline models. + +1:14:32.075 --> 1:14:38.466 +You have a loss of ten blue points or something +like that. + +1:14:38.878 --> 1:14:40.230 +So why does it change? + +1:14:40.230 --> 1:14:41.640 +So why is it happening? + +1:14:43.903 --> 1:14:56.250 +If you look at the errors there is repetitive +tokens, so you have like or things like that. + +1:14:56.536 --> 1:15:01.995 +Broken senses or influent senses, so that +exactly where algebra aggressive models are + +1:15:01.995 --> 1:15:04.851 +very good, we say that's a bit of a problem. + +1:15:04.851 --> 1:15:07.390 +They generate very fluid transcription. + +1:15:07.387 --> 1:15:10.898 +Translation: Sometimes there doesn't have +to do anything with the input. + +1:15:11.411 --> 1:15:14.047 +But generally it really looks always very +fluid. + +1:15:14.995 --> 1:15:20.865 +Here exactly the opposite, so the problem +is that we don't have really fluid translation. + +1:15:21.421 --> 1:15:26.123 +And that is mainly due to the challenge that +we have this independent assumption. 
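The fertility idea above fixes both the target length and the decoder input: each source embedding is copied as many times as its predicted fertility, and the decoder then predicts all target words in one parallel pass. A minimal sketch; `predict_fertilities` and `decode_parallel` are hypothetical stand-ins for model components.

```python
def build_decoder_input(source_embeddings, fertilities):
    """Copy each source embedding according to its predicted fertility.

    source_embeddings: list of vectors, one per source token
    fertilities:       list of non-negative ints of the same length
    The total number of copies determines the target length.
    """
    decoder_input = []
    for emb, n in zip(source_embeddings, fertilities):
        decoder_input.extend([emb] * n)
    return decoder_input


def non_autoregressive_translate(source_embeddings, predict_fertilities, decode_parallel):
    fertilities = predict_fertilities(source_embeddings)       # assumed model call
    decoder_input = build_decoder_input(source_embeddings, fertilities)
    return decode_parallel(decoder_input)                      # all positions at once
```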
+ +1:15:26.646 --> 1:15:35.873 +So in this case, the probability of Y of the +second position is independent of the probability + +1:15:35.873 --> 1:15:40.632 +of X, so we don't know what was there generated. + +1:15:40.632 --> 1:15:43.740 +We're just generating it there. + +1:15:43.964 --> 1:15:55.439 +You can see it also in a bit of examples. + +1:15:55.439 --> 1:16:03.636 +You can over-panelize shifts. + +1:16:04.024 --> 1:16:10.566 +And the problem is this is already an improvement +again, but this is also similar to. + +1:16:11.071 --> 1:16:19.900 +So you can, for example, translate heeded +back, or maybe you could also translate it + +1:16:19.900 --> 1:16:31.105 +with: But on their feeling down in feeling +down, if the first position thinks of their + +1:16:31.105 --> 1:16:34.594 +feeling done and the second. + +1:16:35.075 --> 1:16:42.908 +So each position here and that is one of the +main issues here doesn't know what the other. + +1:16:43.243 --> 1:16:53.846 +And for example, if you are translating something +with, you can often translate things in two + +1:16:53.846 --> 1:16:58.471 +ways: German with a different agreement. + +1:16:58.999 --> 1:17:02.058 +And then here where you have to decide do +a used jet. + +1:17:02.162 --> 1:17:05.460 +Interpretator: It doesn't know which word +it has to select. + +1:17:06.086 --> 1:17:14.789 +Mean, of course, it knows a hidden state, +but in the end you have a liability distribution. + +1:17:16.256 --> 1:17:20.026 +And that is the important thing in the outer +regressive month. + +1:17:20.026 --> 1:17:24.335 +You know that because you have put it in you +here, you don't know that. + +1:17:24.335 --> 1:17:29.660 +If it's equal probable here to two, you don't +Know Which Is Selected, and of course that + +1:17:29.660 --> 1:17:32.832 +depends on what should be the latest traction +under. + +1:17:33.333 --> 1:17:39.554 +Yep, that's the undershift, and we're going +to last last the next time. + +1:17:39.554 --> 1:17:39.986 +Yes. + +1:17:40.840 --> 1:17:44.934 +Doesn't this also appear in and like now we're +talking about physical training or. + +1:17:46.586 --> 1:17:48.412 +The thing is in the auto regress. + +1:17:48.412 --> 1:17:50.183 +If you give it the correct one,. + +1:17:50.450 --> 1:17:55.827 +So if you predict here comma what the reference +is feeling then you tell the model here. + +1:17:55.827 --> 1:17:59.573 +The last one was feeling and then it knows +it has to be done. + +1:17:59.573 --> 1:18:04.044 +But here it doesn't know that because it doesn't +get as input as a right. + +1:18:04.204 --> 1:18:24.286 +Yes, that's a bit depending on what. + +1:18:24.204 --> 1:18:27.973 +But in training, of course, you just try to +make the highest one the current one. + +1:18:31.751 --> 1:18:38.181 +So what you can do is things like CDC loss +which can adjust for this. + +1:18:38.181 --> 1:18:42.866 +So then you can also have this shifted correction. + +1:18:42.866 --> 1:18:50.582 +If you're doing this type of correction in +the CDC loss you don't get full penalty. + +1:18:50.930 --> 1:18:58.486 +Just shifted by one, so it's a bit of a different +loss, which is mainly used in, but. + +1:19:00.040 --> 1:19:03.412 +It can be used in order to address this problem. + +1:19:04.504 --> 1:19:13.844 +The other problem is that outer regressively +we have the label buyers that tries to disimmigrate. + +1:19:13.844 --> 1:19:20.515 +That's the example did before was if you translate +thank you to Dung. 
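The independence problem above can be illustrated with the "thank you" example: if two translations are both probable, picking the most probable word at every position independently can mix them. A toy sketch with made-up, purely illustrative probabilities:

```python
# Two good translations of "thank you": "vielen dank" and "danke schön".
# Per-position distributions of a non-autoregressive model (illustrative numbers):
position_1 = {"vielen": 0.55, "danke": 0.45}
position_2 = {"dank": 0.45, "schön": 0.55}

# Independent argmax per position yields "vielen schön", a mix of the two modes,
# because position 2 never sees what position 1 actually chose.
output = [max(p, key=p.get) for p in (position_1, position_2)]
print(" ".join(output))   # vielen schön
```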
+ +1:19:20.460 --> 1:19:31.925 +And then it might end up because it learns +in the first position and the second also. + +1:19:32.492 --> 1:19:43.201 +In order to prevent that, it would be helpful +for one output, only one output, so that makes + +1:19:43.201 --> 1:19:47.002 +the system already better learn. + +1:19:47.227 --> 1:19:53.867 +Might be that for slightly different inputs +you have different outputs, but for the same. + +1:19:54.714 --> 1:19:57.467 +That we can luckily very easily solve. + +1:19:59.119 --> 1:19:59.908 +And it's done. + +1:19:59.908 --> 1:20:04.116 +We just learned the technique about it, which +is called knowledge distillation. + +1:20:04.985 --> 1:20:13.398 +So what we can do and the easiest solution +to prove your non-autoregressive model is to + +1:20:13.398 --> 1:20:16.457 +train an auto regressive model. + +1:20:16.457 --> 1:20:22.958 +Then you decode your whole training gamer +with this model and then. + +1:20:23.603 --> 1:20:27.078 +While the main advantage of that is that this +is more consistent,. + +1:20:27.407 --> 1:20:33.995 +So for the same input you always have the +same output. + +1:20:33.995 --> 1:20:41.901 +So you have to make your training data more +consistent and learn. + +1:20:42.482 --> 1:20:54.471 +So there is another advantage of knowledge +distillation and that advantage is you have + +1:20:54.471 --> 1:20:59.156 +more consistent training signals. + +1:21:04.884 --> 1:21:10.630 +There's another to make the things more easy +at the beginning. + +1:21:10.630 --> 1:21:16.467 +There's this plants model, black model where +you do more masks. + +1:21:16.756 --> 1:21:26.080 +So during training, especially at the beginning, +you give some correct solutions at the beginning. + +1:21:28.468 --> 1:21:38.407 +And there is this tokens at a time, so the +idea is to establish other regressive training. + +1:21:40.000 --> 1:21:50.049 +And some targets are open, so you always predict +only like first auto regression is K. + +1:21:50.049 --> 1:21:59.174 +It puts one, so you always have one input +and one output, then you do partial. + +1:21:59.699 --> 1:22:05.825 +So in that way you can slowly learn what is +a good and what is a bad answer. + +1:22:08.528 --> 1:22:10.862 +It doesn't sound very impressive. + +1:22:10.862 --> 1:22:12.578 +Don't contact me anyway. + +1:22:12.578 --> 1:22:15.323 +Go all over your training data several. + +1:22:15.875 --> 1:22:20.655 +You can even switch in between. + +1:22:20.655 --> 1:22:29.318 +There is a homework on this thing where you +try to start. + +1:22:31.271 --> 1:22:41.563 +You have to learn so there's a whole work +on that so this is often happening and it doesn't + +1:22:41.563 --> 1:22:46.598 +mean it's less efficient but still it helps. + +1:22:49.389 --> 1:22:57.979 +For later maybe here are some examples of +how much things help. + +1:22:57.979 --> 1:23:04.958 +Maybe one point here is that it's really important. + +1:23:05.365 --> 1:23:13.787 +Here's the translation performance and speed. + +1:23:13.787 --> 1:23:24.407 +One point which is a point is if you compare +researchers. + +1:23:24.784 --> 1:23:33.880 +So yeah, if you're compared to one very weak +baseline transformer even with beam search, + +1:23:33.880 --> 1:23:40.522 +then you're ten times slower than a very strong +auto regressive. + +1:23:40.961 --> 1:23:48.620 +If you make a strong baseline then it's going +down to depending on times and here like: You + +1:23:48.620 --> 1:23:53.454 +have a lot of different speed ups. 
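The training tricks mentioned above (glancing-style masking and decoding a few tokens at a time) both boil down to revealing part of the target to the decoder so the task starts out easier and gets harder over training. Below is a heavily simplified sketch of the masking side with a purely random choice of revealed positions; the actual glancing strategy decides how much to reveal based on how many tokens the model currently gets wrong, and the schedule for the ratio is an assumption.

```python
import random


def glancing_input(target_tokens, mask_token, reveal_ratio):
    """Reveal a random fraction of the gold target tokens to the decoder.

    Early in training reveal_ratio is high (easy task); it is lowered over
    time until the model has to predict everything from masks alone.
    """
    k = int(reveal_ratio * len(target_tokens))
    revealed = set(random.sample(range(len(target_tokens)), k))
    return [tok if i in revealed else mask_token
            for i, tok in enumerate(target_tokens)]

# Example: at reveal_ratio=0.5 roughly half of the decoder input positions
# contain the correct words and the rest are mask placeholders.
```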
+ +1:23:53.454 --> 1:24:03.261 +Generally, it makes a strong baseline and +not very simple transformer. + +1:24:07.407 --> 1:24:20.010 +Yeah, with this one last thing that you can +do to speed up things and also reduce your + +1:24:20.010 --> 1:24:25.950 +memory is what is called half precision. + +1:24:26.326 --> 1:24:29.139 +And especially for decoding issues for training. + +1:24:29.139 --> 1:24:31.148 +Sometimes it also gets less stale. + +1:24:32.592 --> 1:24:45.184 +With this we close nearly wait a bit, so what +you should remember is that efficient machine + +1:24:45.184 --> 1:24:46.963 +translation. + +1:24:47.007 --> 1:24:51.939 +We have, for example, looked at knowledge +distillation. + +1:24:51.939 --> 1:24:55.991 +We have looked at non auto regressive models. + +1:24:55.991 --> 1:24:57.665 +We have different. + +1:24:58.898 --> 1:25:02.383 +For today and then only requests. + +1:25:02.383 --> 1:25:08.430 +So if you haven't done so, please fill out +the evaluation. + +1:25:08.388 --> 1:25:20.127 +So now if you have done so think then you +should have and with the online people hopefully. + +1:25:20.320 --> 1:25:29.758 +Only possibility to tell us what things are +good and what not the only one but the most + +1:25:29.758 --> 1:25:30.937 +efficient. + +1:25:31.851 --> 1:25:35.875 +So think of all the students doing it in the +next okay, then thank. + +0:00:03.243 --> 0:00:18.400 +Hey welcome to our video, small room today +and to the lecture machine translation. + +0:00:19.579 --> 0:00:32.295 +So the idea is we have like last time we started +addressing problems and building machine translation. + +0:00:32.772 --> 0:00:39.140 +And we looked into different ways of how we +can use other types of resources. + +0:00:39.379 --> 0:00:54.656 +Last time we looked into language models and +especially pre-trained models which are different + +0:00:54.656 --> 0:00:59.319 +paradigms and learning data. + +0:01:00.480 --> 0:01:07.606 +However, there is one other way of getting +data and that is just searching for more data. + +0:01:07.968 --> 0:01:14.637 +And the nice thing is it was a worldwide web. + +0:01:14.637 --> 0:01:27.832 +We have a very big data resource where there's +various types of data which we can all use. + +0:01:28.128 --> 0:01:38.902 +If you want to build a machine translation +for a specific language or specific to Maine, + +0:01:38.902 --> 0:01:41.202 +it might be worse. + +0:01:46.586 --> 0:01:55.399 +In general, the other year we had different +types of additional resources we can have. + +0:01:55.399 --> 0:01:59.654 +Today we look into the state of crawling. + +0:01:59.654 --> 0:02:05.226 +It always depends a bit on what type of task +you have. + +0:02:05.525 --> 0:02:08.571 +We're crawling, you point off no possibilities. + +0:02:08.828 --> 0:02:14.384 +We have seen some weeks ago that Maje Lingo +models another thing where you can try to share + +0:02:14.384 --> 0:02:16.136 +knowledge between languages. + +0:02:16.896 --> 0:02:26.774 +Last we looked into monolingual data and next +we also unsupervised them too which is purely + +0:02:26.774 --> 0:02:29.136 +based on monolingual. + +0:02:29.689 --> 0:02:35.918 +What we today will focus on is really web +crawling of parallel data. + +0:02:35.918 --> 0:02:40.070 +We will focus not on the crawling pad itself. + +0:02:41.541 --> 0:02:49.132 +Networking lecture is something about one +of the best techniques to do web trolleying + +0:02:49.132 --> 0:02:53.016 +and then we'll just rely on existing tools. 
+ +0:02:53.016 --> 0:02:59.107 +But the challenge is normally if you have +web data that's pure text. + +0:03:00.920 --> 0:03:08.030 +And these are all different ways of how we +can do that, and today is focused on that. + +0:03:08.508 --> 0:03:21.333 +So why would we be interested in that there +is quite different ways of collecting data? + +0:03:21.333 --> 0:03:28.473 +If you're currently when we talk about parallel. + +0:03:28.548 --> 0:03:36.780 +The big difference is that you focus on one +specific website so you can manually check + +0:03:36.780 --> 0:03:37.632 +how you. + +0:03:38.278 --> 0:03:49.480 +This you can do for dedicated resources where +you have high quality data. + +0:03:50.510 --> 0:03:56.493 +Another thing which has been developed or +has been done for several tasks is also is + +0:03:56.493 --> 0:03:59.732 +like you can do something like crowdsourcing. + +0:03:59.732 --> 0:04:05.856 +I don't know if you know about sites like +Amazon Mechanical Turing or things like that + +0:04:05.856 --> 0:04:08.038 +so you can there get a lot of. + +0:04:07.988 --> 0:04:11.544 +Writing between cheap labors would like easy +translations for you. + +0:04:12.532 --> 0:04:22.829 +Of course you can't collect millions of sentences, +but if it's like thousands of sentences that's + +0:04:22.829 --> 0:04:29.134 +also sourced, it's often interesting when you +have somehow. + +0:04:29.509 --> 0:04:36.446 +However, this is a field of itself, so crowdsourcing +is not that easy. + +0:04:36.446 --> 0:04:38.596 +It's not like upload. + +0:04:38.738 --> 0:04:50.806 +If you're doing that you will have very poor +quality, for example in the field of machine + +0:04:50.806 --> 0:04:52.549 +translation. + +0:04:52.549 --> 0:04:57.511 +Crowdsourcing is very commonly used. + +0:04:57.397 --> 0:05:00.123 +The problem there is. + +0:05:00.480 --> 0:05:08.181 +Since they are paid quite bad, of course, +a lot of people also try to make it put into + +0:05:08.181 --> 0:05:09.598 +it as possible. + +0:05:09.869 --> 0:05:21.076 +So if you're just using it without any control +mechanisms, the quality will be bad. + +0:05:21.076 --> 0:05:27.881 +What you can do is like doing additional checking. + +0:05:28.188 --> 0:05:39.084 +And think recently read a paper that now these +things can be worse because people don't do + +0:05:39.084 --> 0:05:40.880 +it themselves. + +0:05:41.281 --> 0:05:46.896 +So it's a very interesting topic. + +0:05:46.896 --> 0:05:55.320 +There has been a lot of resources created +by this. + +0:05:57.657 --> 0:06:09.796 +It's really about large scale data, then of +course doing some type of web crawling is the + +0:06:09.796 --> 0:06:10.605 +best. + +0:06:10.930 --> 0:06:17.296 +However, the biggest issue in this case is +in the quality. + +0:06:17.296 --> 0:06:22.690 +So how can we ensure that somehow the quality +of. + +0:06:23.003 --> 0:06:28.656 +Because if you just, we all know that in the +Internet there's also a lot of tools. + +0:06:29.149 --> 0:06:37.952 +Low quality staff, and especially now the +bigger question is how can we ensure that translations + +0:06:37.952 --> 0:06:41.492 +are really translations of each other? + +0:06:45.065 --> 0:06:58.673 +Why is this interesting so we had this number +before so there is some estimates that roughly + +0:06:58.673 --> 0:07:05.111 +a human reads around three hundred million. 
+ +0:07:05.525 --> 0:07:16.006 +If you look into the web you will have millions +of words there so you can really get a large + +0:07:16.006 --> 0:07:21.754 +amount of data and if you think about monolingual. + +0:07:22.042 --> 0:07:32.702 +So at least for some language pairs there +is a large amount of data you can have. + +0:07:32.852 --> 0:07:37.783 +Languages are official languages in one country. + +0:07:37.783 --> 0:07:46.537 +There's always a very great success because +a lot of websites from the government need + +0:07:46.537 --> 0:07:48.348 +to be translated. + +0:07:48.568 --> 0:07:58.777 +For example, a large purpose like in India, +which we have worked with in India, so you + +0:07:58.777 --> 0:08:00.537 +have parallel. + +0:08:01.201 --> 0:08:02.161 +Two questions. + +0:08:02.161 --> 0:08:08.438 +First of all, if jet GPS and machine translation +tools are more becoming ubiquitous and everybody + +0:08:08.438 --> 0:08:14.138 +uses them, don't we get a problem because we +want to crawl the web and use the data and. + +0:08:15.155 --> 0:08:18.553 +Yes, that is a severe problem. + +0:08:18.553 --> 0:08:26.556 +Of course, are we only training on training +data which is automatically? + +0:08:26.766 --> 0:08:41.182 +And if we are doing that, of course, we talked +about the synthetic data where we do back translation. + +0:08:41.341 --> 0:08:46.446 +But of course it gives you some aren't up +about norm, you cannot be much better than + +0:08:46.446 --> 0:08:46.806 +this. + +0:08:48.308 --> 0:08:57.194 +That is, we'll get more and more on issues, +so maybe at some point we won't look at the + +0:08:57.194 --> 0:09:06.687 +current Internet, but focus on oats like image +of the Internet, which are created by Archive. + +0:09:07.527 --> 0:09:18.611 +There's lots of classification algorithms +on how to classify automatic data they had + +0:09:18.611 --> 0:09:26.957 +a very interesting paper on how to watermark +their translation. + +0:09:27.107 --> 0:09:32.915 +So there's like two scenarios of course in +this program: The one thing you might want + +0:09:32.915 --> 0:09:42.244 +to find your own translation if you're a big +company and say do an antisystem that may be + +0:09:42.244 --> 0:09:42.866 +used. + +0:09:43.083 --> 0:09:49.832 +This problem might be that most of the translation +out there is created by you. + +0:09:49.832 --> 0:10:02.007 +You might be able: And there is a relatively +easy way of doing that so that there are other + +0:10:02.007 --> 0:10:09.951 +peoples' mainly that can do like the search +or teacher. + +0:10:09.929 --> 0:10:12.878 +They are different, but there is not the one +correction station. + +0:10:13.153 --> 0:10:23.763 +So what you then can't do is you can't output +the best one to the user, but the highest value. + +0:10:23.763 --> 0:10:30.241 +For example, it's easy, but you can take the +translation. + +0:10:30.870 --> 0:10:40.713 +And if you always give the translation of +your investments, which are all good with the + +0:10:40.713 --> 0:10:42.614 +most ease, then. + +0:10:42.942 --> 0:10:55.503 +But of course this you can only do with most +of the data generated by your model. + +0:10:55.503 --> 0:11:02.855 +What we are now seeing is not only checks, +but. + +0:11:03.163 --> 0:11:13.295 +But it's definitely an additional research +question that might get more and more importance, + +0:11:13.295 --> 0:11:18.307 +and it might be an additional filtering step. 
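The watermarking idea sketched above can be made concrete: instead of always returning the single best hypothesis, the provider deterministically prefers, among the near-equally-good candidates of the n-best list, the one favoured by a secret keyed hash; text crawled later can then be tested for this statistical fingerprint. The following is only a minimal sketch of the selection step, with the score margin and all names chosen for illustration; real watermarking schemes are considerably more elaborate.

```python
import hashlib

def keyed_value(text: str, key: str) -> float:
    # Map a candidate translation to a pseudo-random value in [0, 1) derived from a secret key.
    digest = hashlib.sha256((key + text).encode("utf-8")).hexdigest()
    return int(digest[:8], 16) / 0x100000000

def select_watermarked(nbest, key: str, margin: float = 0.5) -> str:
    """nbest: list of (translation, model_score) sorted best-first.
    Among all candidates whose score is within `margin` of the best one
    (i.e. all considered roughly equally good), return the candidate the
    keyed hash prefers, instead of always returning the single best."""
    best_score = nbest[0][1]
    pool = [t for t, s in nbest if best_score - s <= margin]
    return max(pool, key=lambda t: keyed_value(t, key))

print(select_watermarked(
    [("the cat sits on the mat", -1.0), ("the cat is sitting on the mat", -1.1)],
    key="secret"))
```

Detection would regenerate the candidate list for a suspect sentence and check whether the observed output maximises the keyed hash far more often than chance would predict.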
+ +0:11:18.838 --> 0:11:29.396 +There are other issues in data quality, so +in which direction wasn't translated, so that + +0:11:29.396 --> 0:11:31.650 +is not interested. + +0:11:31.891 --> 0:11:35.672 +But if you're now reaching better and better +quality, it makes a difference. + +0:11:35.672 --> 0:11:39.208 +The original data was from German to English +or from English to German. + +0:11:39.499 --> 0:11:44.797 +Because translation, they call it translate +Chinese. + +0:11:44.797 --> 0:11:53.595 +So if you generate German from English, it +has a more similar structure as if you would + +0:11:53.595 --> 0:11:55.195 +directly speak. + +0:11:55.575 --> 0:11:57.187 +So um. + +0:11:57.457 --> 0:12:03.014 +These are all issues which you then might +do like do additional training to remove them + +0:12:03.014 --> 0:12:07.182 +or you first train on them and later train +on other quality data. + +0:12:07.182 --> 0:12:11.034 +But yet that's a general view on so it's an +important issue. + +0:12:11.034 --> 0:12:17.160 +But until now I think it hasn't been addressed +that much maybe because the quality was decently. + +0:12:18.858 --> 0:12:23.691 +Actually, I think we're sure if we have the +time we use the Internet. + +0:12:23.691 --> 0:12:29.075 +The problem is, it's a lot of English speaking +text, but most used languages. + +0:12:29.075 --> 0:12:34.460 +I don't know some language in Africa that's +spoken, but we do about that one. + +0:12:34.460 --> 0:12:37.566 +I mean, that's why most data is English too. + +0:12:38.418 --> 0:12:42.259 +Other languages, and then you get the best. + +0:12:42.259 --> 0:12:46.013 +If there is no data on the Internet, then. + +0:12:46.226 --> 0:12:48.255 +So there is still a lot of data collection. + +0:12:48.255 --> 0:12:50.976 +Also in the wild way you try to improve there +and collect. + +0:12:51.431 --> 0:12:57.406 +But English is the most in the world, but +you find surprisingly much data also for other + +0:12:57.406 --> 0:12:58.145 +languages. + +0:12:58.678 --> 0:13:04.227 +Of course, only if they're written remember. + +0:13:04.227 --> 0:13:15.077 +Most languages are not written at all, but +for them you might find some video, but it's + +0:13:15.077 --> 0:13:17.420 +difficult to find. + +0:13:17.697 --> 0:13:22.661 +So this is mainly done for the web trawling. + +0:13:22.661 --> 0:13:29.059 +It's mainly done for languages which are commonly +spoken. + +0:13:30.050 --> 0:13:37.907 +Is exactly the next point, so this is that +much data is only true for English and some + +0:13:37.907 --> 0:13:41.972 +other languages, but of course there's many. + +0:13:41.982 --> 0:13:50.285 +And therefore a lot of research on how to +make things efficient and efficient and learn + +0:13:50.285 --> 0:13:54.248 +faster from pure data is still essential. + +0:13:59.939 --> 0:14:06.326 +So what we are interested in now on data is +parallel data. + +0:14:06.326 --> 0:14:10.656 +We assume always we have parallel data. + +0:14:10.656 --> 0:14:12.820 +That means we have. + +0:14:13.253 --> 0:14:20.988 +To be careful when you start crawling from +the web, we might get only related types of. + +0:14:21.421 --> 0:14:30.457 +So one comedy thing is what people refer as +noisy parallel data where there is documents + +0:14:30.457 --> 0:14:34.315 +which are translations of each other. + +0:14:34.434 --> 0:14:44.300 +So you have senses where there is no translation +on the other side because you have. 
+ +0:14:44.484 --> 0:14:50.445 +So if you have these types of documents your +algorithm to extract parallel data might be + +0:14:50.445 --> 0:14:51.918 +a bit more difficult. + +0:14:52.352 --> 0:15:04.351 +Know if you can still remember in the beginning +of the lecture when we talked about different + +0:15:04.351 --> 0:15:06.393 +data resources. + +0:15:06.286 --> 0:15:11.637 +But the first step is then approached to a +light source and target sentences, and it was + +0:15:11.637 --> 0:15:16.869 +about like a steep vocabulary, and then you +have some probabilities for one to one and + +0:15:16.869 --> 0:15:17.590 +one to one. + +0:15:17.590 --> 0:15:23.002 +It's very like simple algorithm, but yet it +works fine for really a high quality parallel + +0:15:23.002 --> 0:15:23.363 +data. + +0:15:23.623 --> 0:15:30.590 +But when we're talking about noisy data, we +might have to do additional steps and use more + +0:15:30.590 --> 0:15:35.872 +advanced models to extract what is parallel +and to get high quality. + +0:15:36.136 --> 0:15:44.682 +So if we just had no easy parallel data, the +document might not be as easy to extract. + +0:15:49.249 --> 0:15:54.877 +And then there is even the more extreme pains, +which has also been used to be honest. + +0:15:54.877 --> 0:15:58.214 +The use of this data is reasoning not that +common. + +0:15:58.214 --> 0:16:04.300 +It was more interested maybe like ten or fifteen +years ago, and that is what people referred + +0:16:04.300 --> 0:16:05.871 +to as comparative data. + +0:16:06.266 --> 0:16:17.167 +And then the idea is you even don't have translations +like sentences which are translations of each + +0:16:17.167 --> 0:16:25.234 +other, but you have more news documents or +articles about the same topic. + +0:16:25.205 --> 0:16:32.410 +But it's more that you find phrases which +are too big in the user, so even black fragments. + +0:16:32.852 --> 0:16:44.975 +So if you think about the pedia, for example, +these articles have to be written in like the + +0:16:44.975 --> 0:16:51.563 +Wikipedia general idea independent of each +other. + +0:16:51.791 --> 0:17:01.701 +They have different information in there, +and I mean, the German movie gets more detail + +0:17:01.701 --> 0:17:04.179 +than the English one. + +0:17:04.179 --> 0:17:07.219 +However, it might be that. + +0:17:07.807 --> 0:17:20.904 +And the same thing is that you think about +newspaper articles if they're at the same time. + +0:17:21.141 --> 0:17:25.603 +And so this is an ability to learn. + +0:17:25.603 --> 0:17:36.760 +For example, new phrases, vocabulary and stature +if you don't have monitor all time long. + +0:17:37.717 --> 0:17:49.020 +And then not everything will be the same, +but there might be an overlap about events. + +0:17:54.174 --> 0:18:00.348 +So if we're talking about web trolling said +in the beginning it was really about specific. + +0:18:00.660 --> 0:18:18.878 +They do very good things by hand and really +focus on them and do a very specific way of + +0:18:18.878 --> 0:18:20.327 +doing. + +0:18:20.540 --> 0:18:23.464 +The European Parliament was very focused in +Ted. + +0:18:23.464 --> 0:18:26.686 +Maybe you even have looked in the particular +session. + +0:18:27.427 --> 0:18:40.076 +And these are still important, but they are +of course very specific in covering different + +0:18:40.076 --> 0:18:41.341 +pockets. 
+ +0:18:42.002 --> 0:18:55.921 +Then there was a focus on language centering, +so there was a big drawer, for example, that + +0:18:55.921 --> 0:18:59.592 +you can check websites. + +0:19:00.320 --> 0:19:07.918 +Apparently what really people like is a more +general approach where you just have to specify. + +0:19:07.918 --> 0:19:15.355 +I'm interested in data from German to Lithuanian +and then you can as automatic as possible. + +0:19:15.355 --> 0:19:19.640 +You can collect data and extract codelator +for this. + +0:19:21.661 --> 0:19:25.633 +So is this our interest? + +0:19:25.633 --> 0:19:36.435 +Of course, the question is how can we build +these types of systems? + +0:19:36.616 --> 0:19:52.913 +The first are more general web crawling base +systems, so there is nothing about. + +0:19:53.173 --> 0:19:57.337 +Based on the websites you have, you have to +do like text extraction. + +0:19:57.597 --> 0:20:06.503 +We are typically not that much interested +in text and images in there, so we try to extract + +0:20:06.503 --> 0:20:07.083 +text. + +0:20:07.227 --> 0:20:16.919 +This is also not specific to machine translation, +but it's a more traditional way of doing web + +0:20:16.919 --> 0:20:17.939 +trolling. + +0:20:18.478 --> 0:20:22.252 +And at the end you have mirror like some other +set of document collectors. + +0:20:22.842 --> 0:20:37.025 +Is the idea, so you have the text, and often +this is a document, and so in the end. + +0:20:37.077 --> 0:20:51.523 +And that is some of your starting point now +for doing the more machine translation. + +0:20:52.672 --> 0:21:05.929 +One way of doing that now is very similar +to what you might have think about the traditional + +0:21:05.929 --> 0:21:06.641 +one. + +0:21:06.641 --> 0:21:10.633 +The first thing is to do a. + +0:21:11.071 --> 0:21:22.579 +So you have this based on the initial fact +that you know this is a German website in the + +0:21:22.579 --> 0:21:25.294 +English translation. + +0:21:25.745 --> 0:21:31.037 +And based on this document alignment, then +you can do your sentence alignment. + +0:21:31.291 --> 0:21:39.072 +And this is similar to what we had before +with the church accordion. + +0:21:39.072 --> 0:21:43.696 +This is typically more noisy peril data. + +0:21:43.623 --> 0:21:52.662 +So that you are not assuming that everything +is on both sides, that the order is the same, + +0:21:52.662 --> 0:21:56.635 +so you should do more flexible systems. + +0:21:58.678 --> 0:22:14.894 +Then it depends if the documents you were +drawing were really some type of parallel data. + +0:22:15.115 --> 0:22:35.023 +Say then you should do what is referred to +as fragmented extraction. + +0:22:36.136 --> 0:22:47.972 +One problem with these types of models is +if you are doing errors in your document alignment,. + +0:22:48.128 --> 0:22:55.860 +It means that if you are saying these two +documents are align then you can only find + +0:22:55.860 --> 0:22:58.589 +sense and if you are missing. + +0:22:59.259 --> 0:23:15.284 +Is very different, only small parts of the +document are parallel, and most parts are independent + +0:23:15.284 --> 0:23:17.762 +of each other. + +0:23:19.459 --> 0:23:31.318 +Therefore, more recently, there is also the +idea of directly doing sentence aligned so + +0:23:31.318 --> 0:23:35.271 +that you're directly taking. + +0:23:36.036 --> 0:23:41.003 +Was already one challenge of this one, the +second approach. + +0:23:42.922 --> 0:23:50.300 +Yes, so one big challenge on here, beef, then +you have to do a lot of comparison. 
+ +0:23:50.470 --> 0:23:59.270 +You have to cook out every source, every target +set and square. + +0:23:59.270 --> 0:24:06.283 +If you think of a million or trillion pairs, +then. + +0:24:07.947 --> 0:24:12.176 +And this also gives you a reason for a last +step in both cases. + +0:24:12.176 --> 0:24:18.320 +So in both of them you have to remember you're +typically eating here in this very large data + +0:24:18.320 --> 0:24:18.650 +set. + +0:24:18.650 --> 0:24:24.530 +So all of these and also the document alignment +here they should be done very efficient. + +0:24:24.965 --> 0:24:42.090 +And if you want to do it very efficiently, +that means your quality will go lower. + +0:24:41.982 --> 0:24:47.348 +Because you just have to ever see it fast, +and then yeah you can put less computation + +0:24:47.348 --> 0:24:47.910 +on each. + +0:24:48.688 --> 0:25:06.255 +Therefore, in a lot of scenarios it makes +sense to make an additional filtering step + +0:25:06.255 --> 0:25:08.735 +at the end. + +0:25:08.828 --> 0:25:13.370 +And then we do a second filtering step where +we now can put a lot more effort. + +0:25:13.433 --> 0:25:20.972 +Because now we don't have like any square +possible combinations anymore, we have already + +0:25:20.972 --> 0:25:26.054 +selected and maybe in dimension of maybe like +two or three. + +0:25:26.054 --> 0:25:29.273 +For each sentence we even don't have. + +0:25:29.429 --> 0:25:39.234 +And then we can put a lot more effort in each +individual example and build a high quality + +0:25:39.234 --> 0:25:42.611 +classic fire to really select. + +0:25:45.125 --> 0:26:00.506 +Two or one example for that, so one of the +biggest projects doing this is the so-called + +0:26:00.506 --> 0:26:03.478 +Paratrol Corpus. + +0:26:03.343 --> 0:26:11.846 +Typically it's like before the picturing so +there are a lot of challenges on how you can. + +0:26:12.272 --> 0:26:25.808 +And the steps they start to be with the seatbelt, +so what you should give at the beginning is: + +0:26:26.146 --> 0:26:36.908 +Then they do the problem, the text extraction, +the document alignment, the sentence alignment, + +0:26:36.908 --> 0:26:45.518 +and the sentence filter, and it swings down +to implementing the text store. + +0:26:46.366 --> 0:26:51.936 +We'll see later for a lot of language pairs +exist so it's easier to download them and then + +0:26:51.936 --> 0:26:52.793 +like improve. + +0:26:53.073 --> 0:27:08.270 +For example, the crawling one thing they often +do is even not throw the direct website because + +0:27:08.270 --> 0:27:10.510 +there's also. + +0:27:10.770 --> 0:27:14.540 +Black parts of the Internet that they can +work on today. + +0:27:14.854 --> 0:27:22.238 +In more detail, this is a bit shown here. + +0:27:22.238 --> 0:27:31.907 +All the steps you can see are different possibilities. + +0:27:32.072 --> 0:27:39.018 +You need a bit of knowledge to do that, or +you can build a machine translation system. + +0:27:39.239 --> 0:27:47.810 +There are two different ways of deduction +and alignment. + +0:27:47.810 --> 0:27:52.622 +You can use sentence alignment. + +0:27:53.333 --> 0:28:02.102 +And how you can do the flexigrade exam, for +example, the lexic graph, or you can chin. + +0:28:02.422 --> 0:28:05.826 +To the next step in a bit more detail. + +0:28:05.826 --> 0:28:13.680 +But before we're doing it, I need more questions +about the general overview of how these. + +0:28:22.042 --> 0:28:37.058 +Yeah, so two or three things to web-drawing, +so you normally start with the URLs. 
+ +0:28:37.058 --> 0:28:40.903 +It's most promising. + +0:28:41.021 --> 0:28:48.652 +What you found is that if you're interested +in German to English, you would: Companies + +0:28:48.652 --> 0:29:01.074 +where you know they have a German and an English +website are from agencies which might be: And + +0:29:01.074 --> 0:29:10.328 +then we can use one of these tools to start +from there using standard web calling techniques. + +0:29:11.071 --> 0:29:23.942 +There are several challenges when doing that, +so if you request a website too often you can: + +0:29:25.305 --> 0:29:37.819 +You have to keep in history of the sites and +you click on all the links and then click on + +0:29:37.819 --> 0:29:40.739 +all the links again. + +0:29:41.721 --> 0:29:49.432 +To be very careful about legal issues starting +from this robotics day so get allowed to use. + +0:29:49.549 --> 0:29:58.941 +Mean, that's the one major thing about what +trolley general is. + +0:29:58.941 --> 0:30:05.251 +The problem is how you deal with property. + +0:30:05.685 --> 0:30:13.114 +That is why it is easier sometimes to start +with some quick fold data that you don't have. + +0:30:13.893 --> 0:30:22.526 +Of course, the network issues you retry, so +there's more technical things, but there's + +0:30:22.526 --> 0:30:23.122 +good. + +0:30:24.724 --> 0:30:35.806 +Another thing which is very helpful and is +often done is instead of doing the web trolling + +0:30:35.806 --> 0:30:38.119 +yourself, relying. + +0:30:38.258 --> 0:30:44.125 +And one thing is it's common crawl from the +web. + +0:30:44.125 --> 0:30:51.190 +Think on this common crawl a lot of these +language models. + +0:30:51.351 --> 0:30:59.763 +So think in American Company or organization +which really works on like writing. + +0:31:00.000 --> 0:31:01.111 +Possible. + +0:31:01.111 --> 0:31:10.341 +So the nice thing is if you start with this +you don't have to worry about network. + +0:31:10.250 --> 0:31:16.086 +I don't think you can do that because it's +too big, but you can do a pipeline on how to + +0:31:16.086 --> 0:31:16.683 +process. + +0:31:17.537 --> 0:31:28.874 +That is, of course, a general challenge in +all this web crawling and parallel web mining. + +0:31:28.989 --> 0:31:38.266 +That means you cannot just don't know the +data and study the processes. + +0:31:39.639 --> 0:31:45.593 +Here it might make sense to directly fields +of both domains that in some way bark just + +0:31:45.593 --> 0:31:46.414 +marginally. + +0:31:49.549 --> 0:31:59.381 +Then you can do the text extraction, which +means like converging two HTML and then splitting + +0:31:59.381 --> 0:32:01.707 +things from the HTML. + +0:32:01.841 --> 0:32:04.802 +Often very important is to do the language +I need. + +0:32:05.045 --> 0:32:16.728 +It's not that clear even if it's links which +language it is, but they are quite good tools + +0:32:16.728 --> 0:32:22.891 +like that can't identify from relatively short. + +0:32:23.623 --> 0:32:36.678 +And then you are now in the situation that +you have all your danger and that you can start. + +0:32:37.157 --> 0:32:43.651 +After the text extraction you have now a collection +or a large collection of of data where it's + +0:32:43.651 --> 0:32:49.469 +like text and maybe the document at use of +some meta information and now the question + +0:32:49.469 --> 0:32:55.963 +is based on this monolingual text or multilingual +text so text in many languages but not align. + +0:32:56.036 --> 0:32:59.863 +How can you now do a generate power? 
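For the language-identification step just mentioned, off-the-shelf classifiers work reasonably well even on fairly short snippets. A small sketch using the fastText language-identification model (my assumption; the tool is not named in the lecture, and the pretrained file lid.176.ftz has to be downloaded separately):

```python
# pip install fasttext; assumes the pretrained model file "lid.176.ftz" is available locally.
import fasttext

lid_model = fasttext.load_model("lid.176.ftz")

def detect_language(text: str):
    # fastText expects single-line input; labels come back as "__label__de", "__label__en", ...
    labels, probs = lid_model.predict(text.replace("\n", " "), k=1)
    return labels[0].replace("__label__", ""), float(probs[0])

print(detect_language("Dies ist ein kurzer deutscher Satz."))  # e.g. ('de', 0.99)
```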
+ +0:33:01.461 --> 0:33:06.289 +And UM. + +0:33:05.705 --> 0:33:12.965 +So the main thing, if we're not seeing it +as a task, or if we want to do it in a machine + +0:33:12.965 --> 0:33:20.388 +learning way, what we have is we have a set +of sentences and a suits language, and we have + +0:33:20.388 --> 0:33:23.324 +a set Of sentences from the target. + +0:33:23.823 --> 0:33:27.814 +This is the target language. + +0:33:27.814 --> 0:33:31.392 +This is the data we have. + +0:33:31.392 --> 0:33:37.034 +We kind of directly assume any ordering. + +0:33:38.018 --> 0:33:44.502 +More documents there are not really in line +or there is maybe a graph and what we are interested + +0:33:44.502 --> 0:33:50.518 +in is finding these alignments so which senses +are aligned to each other and which senses + +0:33:50.518 --> 0:33:53.860 +we can remove but we don't have translations +for. + +0:33:53.974 --> 0:34:00.339 +But exactly this mapping is what we are interested +in and what we need to find. + +0:34:01.901 --> 0:34:17.910 +And if we are modeling it more from the machine +translation point of view, what can model that + +0:34:17.910 --> 0:34:21.449 +as a classification? + +0:34:21.681 --> 0:34:34.850 +And so the main challenge is to build this +type of classifier and you want to decide is + +0:34:34.850 --> 0:34:36.646 +a parallel. + +0:34:42.402 --> 0:34:50.912 +However, the biggest challenge has already +pointed out in the beginning is the sites if + +0:34:50.912 --> 0:34:53.329 +we have millions target. + +0:34:53.713 --> 0:35:05.194 +The number of comparison is n square, so this +very path is very inefficient, and we need + +0:35:05.194 --> 0:35:06.355 +to find. + +0:35:07.087 --> 0:35:16.914 +And traditionally there is the first one mentioned +before the local or the hierarchical meaning + +0:35:16.914 --> 0:35:20.292 +mining and there the idea is OK. + +0:35:20.292 --> 0:35:23.465 +First we are lining documents. + +0:35:23.964 --> 0:35:32.887 +Move back the things and align them, and once +you have the alignment you only need to remind. + +0:35:33.273 --> 0:35:51.709 +That of course makes anything more efficient +because we don't have to do all the comparison. + +0:35:53.253 --> 0:35:56.411 +Then it's, for example, in the before mentioned +apparel. + +0:35:57.217 --> 0:36:11.221 +But it has the issue that if this document +is bad you have error propagation and you can + +0:36:11.221 --> 0:36:14.211 +recover from that. + +0:36:14.494 --> 0:36:20.715 +Because then document that cannot say ever, +there are some sentences which are: Therefore, + +0:36:20.715 --> 0:36:24.973 +more recently there is also was referred to +as global mining. + +0:36:26.366 --> 0:36:31.693 +And there we really do this. + +0:36:31.693 --> 0:36:43.266 +Although it's in the square, we are doing +all the comparisons. + +0:36:43.523 --> 0:36:52.588 +So the idea is that you can do represent all +the sentences in a vector space. + +0:36:52.892 --> 0:37:06.654 +And then it's about nearest neighbor search +and there is a lot of very efficient algorithms. + +0:37:07.067 --> 0:37:20.591 +Then if you only compare them to your nearest +neighbors you don't have to do like a comparison + +0:37:20.591 --> 0:37:22.584 +but you have. + +0:37:26.186 --> 0:37:40.662 +So in the first step what we want to look +at is this: This document classification refers + +0:37:40.662 --> 0:37:49.584 +to the document alignment, and then we do the +sentence alignment. 
+ +0:37:51.111 --> 0:37:58.518 +And if we're talking about document alignment, +there's like typically two steps in that: We + +0:37:58.518 --> 0:38:01.935 +first do a candidate selection. + +0:38:01.935 --> 0:38:10.904 +Often we have several steps and that is again +to make more things more efficiently. + +0:38:10.904 --> 0:38:13.360 +We have the candidate. + +0:38:13.893 --> 0:38:18.402 +The candidate select means OK, which documents +do we want to compare? + +0:38:19.579 --> 0:38:35.364 +Then if we have initial candidates which might +be parallel, we can do a classification test. + +0:38:35.575 --> 0:38:37.240 +And there is different ways. + +0:38:37.240 --> 0:38:40.397 +We can use lexical similarity or we can use +ten basic. + +0:38:41.321 --> 0:38:48.272 +The first and easiest thing is to take off +possible candidates. + +0:38:48.272 --> 0:38:55.223 +There's one possibility, the other one, is +based on structural. + +0:38:55.235 --> 0:39:05.398 +So based on how your website looks like, you +might find that there are only translations. + +0:39:05.825 --> 0:39:14.789 +This is typically the only case where we try +to do some kind of major information, which + +0:39:14.789 --> 0:39:22.342 +can be very useful because we know that websites, +for example, are linked. + +0:39:22.722 --> 0:39:35.586 +We can try to use some URL patterns, so if +we have some website which ends with the. + +0:39:35.755 --> 0:39:43.932 +So that can be easily used in order to find +candidates. + +0:39:43.932 --> 0:39:49.335 +Then we only compare websites where. + +0:39:49.669 --> 0:40:05.633 +The language and the translation of each other, +but typically you hear several heuristics to + +0:40:05.633 --> 0:40:07.178 +do that. + +0:40:07.267 --> 0:40:16.606 +Then you don't have to compare all websites, +but you only have to compare web sites. + +0:40:17.277 --> 0:40:27.607 +Cruiser problems especially with an hour day's +content management system. + +0:40:27.607 --> 0:40:32.912 +Sometimes it's nice and easy to read. + +0:40:33.193 --> 0:40:44.452 +So on the one hand there typically leads from +the parent's side to different languages. + +0:40:44.764 --> 0:40:46.632 +Now I can look at the kit websites. + +0:40:46.632 --> 0:40:49.381 +It's the same thing you can check on the difference. + +0:40:49.609 --> 0:41:06.833 +Languages: You can either do that from the +parent website or you can also click on English. + +0:41:06.926 --> 0:41:10.674 +You can therefore either like prepare to all +the websites. + +0:41:10.971 --> 0:41:18.205 +Can be even more focused and checked if the +link is somehow either flexible or the language + +0:41:18.205 --> 0:41:18.677 +name. + +0:41:19.019 --> 0:41:24.413 +So there really depends on how much you want +to filter out. + +0:41:24.413 --> 0:41:29.178 +There is always a trade-off between being +efficient. + +0:41:33.913 --> 0:41:49.963 +Based on that we then have our candidate list, +so we now have two independent sets of German + +0:41:49.963 --> 0:41:52.725 +documents, but. + +0:41:53.233 --> 0:42:03.515 +And now the task is, we want to extract these, +which are really translations of each other. + +0:42:03.823 --> 0:42:10.201 +So the question of how can we measure the +document similarity? + +0:42:10.201 --> 0:42:14.655 +Because what we then do is, we measure the. + +0:42:14.955 --> 0:42:27.096 +And here you already see why this is also +that problematic from where it's partial or + +0:42:27.096 --> 0:42:28.649 +similarly. 
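A minimal sketch of the URL-pattern heuristic for candidate selection described above: documents are bucketed by their URL with the language marker stripped, and only documents that land in the same bucket are compared later. The regular expression, the language codes and the German–English pairing are illustrative assumptions.

```python
import re
from collections import defaultdict

# Assumption: language codes appear as path segments such as /de/ or /en/.
LANG_MARKER = re.compile(r"/(en|de|fr|lt)(/|$)")

def candidate_pairs(urls_by_lang):
    """urls_by_lang: dict lang -> list of URLs.
    Pair documents whose URLs differ only in the language segment,
    e.g. example.org/de/kontakt <-> example.org/en/kontakt."""
    buckets = defaultdict(dict)
    for lang, urls in urls_by_lang.items():
        for url in urls:
            key = LANG_MARKER.sub("/", url)      # strip the language marker
            buckets[key][lang] = url
    return [(b["de"], b["en"]) for b in buckets.values() if "de" in b and "en" in b]

print(candidate_pairs({
    "de": ["https://example.org/de/kontakt"],
    "en": ["https://example.org/en/kontakt", "https://example.org/en/only-english"],
}))
```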
+ +0:42:30.330 --> 0:42:37.594 +All you can do that is again two folds. + +0:42:37.594 --> 0:42:48.309 +You can do it more content based or more structural +based. + +0:42:48.188 --> 0:42:53.740 +Calculating a lot of features and then maybe +training a classic pyramid small set which + +0:42:53.740 --> 0:42:57.084 +stands like based on the spesse feature is +the data. + +0:42:57.084 --> 0:42:58.661 +It is a corpus parallel. + +0:43:00.000 --> 0:43:10.955 +One way of doing that is to have traction +features, so the idea is the text length, so + +0:43:10.955 --> 0:43:12.718 +the document. + +0:43:13.213 --> 0:43:20.511 +Of course, text links will not be the same, +but if the one document has fifty words and + +0:43:20.511 --> 0:43:24.907 +the other five thousand words, it's quite realistic. + +0:43:25.305 --> 0:43:29.274 +So you can use the text length as one proxy +of. + +0:43:29.274 --> 0:43:32.334 +Is this might be a good translation? + +0:43:32.712 --> 0:43:41.316 +Now the thing is the alignment between the +structure. + +0:43:41.316 --> 0:43:52.151 +If you have here the website you can create +some type of structure. + +0:43:52.332 --> 0:44:04.958 +You can compare that to the French version +and then calculate some similarities because + +0:44:04.958 --> 0:44:07.971 +you see translation. + +0:44:08.969 --> 0:44:12.172 +Of course, it's getting more and more problematic. + +0:44:12.172 --> 0:44:16.318 +It does be a different structure than these +features are helpful. + +0:44:16.318 --> 0:44:22.097 +However, if you are doing it more in a trained +way, you can automatically learn how helpful + +0:44:22.097 --> 0:44:22.725 +they are. + +0:44:24.704 --> 0:44:37.516 +Then there are different ways of yeah: Content +based things: One easy thing, especially if + +0:44:37.516 --> 0:44:48.882 +you have systems that are using the same script +that you are looking for. + +0:44:48.888 --> 0:44:49.611 +The legs. + +0:44:49.611 --> 0:44:53.149 +We call them a beggar words and we'll look +into. + +0:44:53.149 --> 0:44:55.027 +You can use some type of. + +0:44:55.635 --> 0:44:58.418 +And neural embedding is also to abate him +at. + +0:45:02.742 --> 0:45:06.547 +And as then mean we have machine translation,. + +0:45:06.906 --> 0:45:14.640 +And one idea that you can also do is really +use the machine translation. + +0:45:14.874 --> 0:45:22.986 +Because this one is one which takes more effort, +so what you then have to do is put more effort. + +0:45:23.203 --> 0:45:37.526 +You wouldn't do this type of machine translation +based approach for a system which has product. + +0:45:38.018 --> 0:45:53.712 +But maybe your first of thinking why can't +do that because I'm collecting data to build + +0:45:53.712 --> 0:45:55.673 +an system. + +0:45:55.875 --> 0:46:01.628 +So you can use an initial system to translate +it, and then you can collect more data. + +0:46:01.901 --> 0:46:06.879 +And one way of doing that is, you're translating, +for example, all documents even to English. + +0:46:07.187 --> 0:46:25.789 +Then you only need two English data and you +do it in the example with three grams. + +0:46:25.825 --> 0:46:33.253 +For example, the current induction in 1 in +the Spanish, which is German induction in 1, + +0:46:33.253 --> 0:46:37.641 +which was Spanish induction in 2, which was +French. + +0:46:37.637 --> 0:46:52.225 +You're creating this index and then based +on that you can calculate how similar the documents. 
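The translate-then-match idea can be sketched with a word n-gram index over the English side: the foreign documents are first translated into English with an initial MT system, both sides are turned into tf-idf vectors over (up to) trigrams, and document pairs are then scored by vector similarity. A toy illustration, assuming scikit-learn and already-translated inputs:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Original English pages and foreign pages already translated to English by an initial MT system.
english_docs    = ["the induction cooker is delivered in two days", "contact our sales team"]
translated_docs = ["the induction cooker will be delivered in two days", "read our press releases"]

# Word n-grams up to trigrams, as sketched in the lecture.
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 3))
matrix = vectorizer.fit_transform(english_docs + translated_docs)

n = len(english_docs)
similarities = cosine_similarity(matrix[n:], matrix[:n])  # translated docs vs. English docs
best_match = similarities.argmax(axis=1)                  # most similar English doc per translated doc
print(similarities, best_match)
```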
+ +0:46:52.092 --> 0:46:58.190 +And then you can use the Cossack similarity +to really calculate which of the most similar + +0:46:58.190 --> 0:47:00.968 +document or how similar is the document. + +0:47:00.920 --> 0:47:04.615 +And then measure if this is a possible translation. + +0:47:05.285 --> 0:47:14.921 +Mean, of course, the document will not be +exactly the same, and even if you have a parallel + +0:47:14.921 --> 0:47:18.483 +document, French and German, and. + +0:47:18.898 --> 0:47:29.086 +You'll have not a perfect translation, therefore +it's looking into five front overlap since + +0:47:29.086 --> 0:47:31.522 +there should be last. + +0:47:34.074 --> 0:47:42.666 +Okay, before we take the next step and go +into the sentence alignment, there are more + +0:47:42.666 --> 0:47:44.764 +questions about the. + +0:47:51.131 --> 0:47:55.924 +Too Hot and. + +0:47:56.997 --> 0:47:59.384 +Well um. + +0:48:00.200 --> 0:48:05.751 +There is different ways of doing sentence +alignment. + +0:48:05.751 --> 0:48:12.036 +Here's one way to describe is to call the +other line again. + +0:48:12.172 --> 0:48:17.590 +Of course, we have the advantage that we have +only documents, so we might have like hundred + +0:48:17.590 --> 0:48:20.299 +sentences and hundred sentences in the tower. + +0:48:20.740 --> 0:48:31.909 +Although it still might be difficult to compare +all the things in parallel, and. + +0:48:31.791 --> 0:48:37.541 +And therefore typically these even assume +that we are only interested in a line character + +0:48:37.541 --> 0:48:40.800 +that can be identified on the sum of the diagonal. + +0:48:40.800 --> 0:48:46.422 +Of course, not exactly the diagonal will sum +some parts around it, but in order to make + +0:48:46.422 --> 0:48:47.891 +things more efficient. + +0:48:48.108 --> 0:48:55.713 +You can still do it around the diagonal because +if you say this is a parallel document, we + +0:48:55.713 --> 0:48:56.800 +assume that. + +0:48:56.836 --> 0:49:05.002 +We wouldn't have passed the document alignment, +therefore we wouldn't have seen it. + +0:49:05.505 --> 0:49:06.774 +In the underline. + +0:49:06.774 --> 0:49:10.300 +Then we are calculating the similarity for +these. + +0:49:10.270 --> 0:49:17.428 +Set this here based on the bilingual dictionary, +so it may be based on how much overlap you + +0:49:17.428 --> 0:49:17.895 +have. + +0:49:18.178 --> 0:49:24.148 +And then we are finding a path through it. + +0:49:24.148 --> 0:49:31.089 +You are finding a path which the lights ever +see. + +0:49:31.271 --> 0:49:41.255 +But you're trying to find a pass through your +document so that you get these parallel. + +0:49:41.201 --> 0:49:49.418 +And then the perfect ones here would be your +pass, where you just take this other parallel. + +0:49:51.011 --> 0:50:05.579 +The advantage is that, of course, on the one +end limits your search space. + +0:50:05.579 --> 0:50:07.521 +That is,. + +0:50:07.787 --> 0:50:10.013 +So what does it mean? + +0:50:10.013 --> 0:50:19.120 +So even if you have a very high probable pair, +you're not taking them on because overall. + +0:50:19.399 --> 0:50:27.063 +So sometimes it makes sense to also use this +global information and not only compare on + +0:50:27.063 --> 0:50:34.815 +individual sentences because what you're with +your parents is that sometimes it's only a + +0:50:34.815 --> 0:50:36.383 +good translation. + +0:50:38.118 --> 0:50:51.602 +So by this minion paste you're preventing +the system to do it at the border where there's + +0:50:51.602 --> 0:50:52.201 +no. 
+ +0:50:53.093 --> 0:50:55.689 +So that might achieve you a bit better quality. + +0:50:56.636 --> 0:51:12.044 +The pack always ends if we write the button +for everybody, but it also means you couldn't + +0:51:12.044 --> 0:51:15.126 +necessarily have. + +0:51:15.375 --> 0:51:24.958 +Have some restrictions that is right, so first +of all they can't be translated out. + +0:51:25.285 --> 0:51:32.572 +So the handle line typically only really works +well if you have a relatively high quality. + +0:51:32.752 --> 0:51:39.038 +So if you have this more general data where +there's like some parts are translated and + +0:51:39.038 --> 0:51:39.471 +some. + +0:51:39.719 --> 0:51:43.604 +It doesn't really work, so it might. + +0:51:43.604 --> 0:51:53.157 +It's okay with having maybe at the end some +sentences which are missing, but in generally. + +0:51:53.453 --> 0:51:59.942 +So it's not robust against significant noise +on the. + +0:52:05.765 --> 0:52:12.584 +The second thing is is to what is referred +to as blue alibi. + +0:52:13.233 --> 0:52:16.982 +And this doesn't does, does not do us much. + +0:52:16.977 --> 0:52:30.220 +A global information you can translate each +sentence to English, and then you calculate + +0:52:30.220 --> 0:52:34.885 +the voice for the translation. + +0:52:35.095 --> 0:52:41.888 +And that you would get six answer points, +which are the ones in a purple ear. + +0:52:42.062 --> 0:52:56.459 +And then you have the ability to add some +points around it, which might be a bit lower. + +0:52:56.756 --> 0:53:06.962 +But here in this case you are able to deal +with reorderings, angles to deal with parts. + +0:53:07.247 --> 0:53:16.925 +Therefore, in this case we need a full scale +and key system to do this calculation while + +0:53:16.925 --> 0:53:17.686 +we're. + +0:53:18.318 --> 0:53:26.637 +Then, of course, the better your similarity +metric is, so the better you are able to do + +0:53:26.637 --> 0:53:35.429 +this comparison, the less you have to rely +on structural information that, in one sentence,. + +0:53:39.319 --> 0:53:53.411 +Anymore questions, and then there are things +like back in line which try to do the same. + +0:53:53.793 --> 0:53:59.913 +That means the idea is that you expect each +sentence. + +0:53:59.819 --> 0:54:02.246 +In a crossing will vector space. + +0:54:02.246 --> 0:54:08.128 +Crossing will vector space always means that +you have a vector or knight means. + +0:54:08.128 --> 0:54:14.598 +In this case you have a vector space where +sentences in different languages are near to + +0:54:14.598 --> 0:54:16.069 +each other if they. + +0:54:16.316 --> 0:54:23.750 +So you can have it again and so on, but just +next to each other and want to call you. + +0:54:24.104 --> 0:54:32.009 +And then you can of course measure now the +similarity by some distance matrix in this + +0:54:32.009 --> 0:54:32.744 +vector. + +0:54:33.033 --> 0:54:36.290 +And you're saying towards two senses are lying. + +0:54:36.290 --> 0:54:39.547 +If the distance in the vector space is somehow. + +0:54:40.240 --> 0:54:50.702 +We'll discuss that in a bit more heat soon +because these vector spades and bathings are + +0:54:50.702 --> 0:54:52.010 +even then. + +0:54:52.392 --> 0:54:55.861 +So the nice thing is with this. + +0:54:55.861 --> 0:55:05.508 +It's really good and good to get quite good +quality and can decide whether two sentences + +0:55:05.508 --> 0:55:08.977 +are translations of each other. 
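A sketch of the embedding-based comparison just described, using an off-the-shelf cross-lingual sentence encoder (LaBSE here, purely as an assumption; the LASER encoder discussed later in the lecture plays the same role): sentences from both languages are mapped into one vector space and candidate pairs are scored by cosine similarity.

```python
# pip install sentence-transformers  (assumption: LaBSE as the cross-lingual encoder)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/LaBSE")

de = ["Der Hund schläft im Garten.", "Wie spät ist es?"]
en = ["The dog is sleeping in the garden.", "Where is the train station?"]

emb_de = model.encode(de, normalize_embeddings=True)
emb_en = model.encode(en, normalize_embeddings=True)

# With unit-length vectors the dot product equals the cosine similarity.
scores = emb_de @ emb_en.T
print(scores)  # scores[0, 0] should clearly be the largest entry
```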
+ +0:55:08.888 --> 0:55:14.023 +In the fact-lined approach, but often they +even work on a global search way to really + +0:55:14.023 --> 0:55:15.575 +compare on everything to. + +0:55:16.236 --> 0:55:29.415 +What weak alignment also does is trying to +do to make this more efficient in finding the. + +0:55:29.309 --> 0:55:40.563 +If you don't want to compare everything to +everything, you first need sentence blocks, + +0:55:40.563 --> 0:55:41.210 +and. + +0:55:41.141 --> 0:55:42.363 +Then find him fast. + +0:55:42.562 --> 0:55:55.053 +You always have full sentence resolution, +but then you always compare on the area around. + +0:55:55.475 --> 0:56:11.501 +So if you do compare blocks on the source +of the target, then you have of your possibilities. + +0:56:11.611 --> 0:56:17.262 +So here the end times and comparison is a +lot less than the comparison you have here. + +0:56:17.777 --> 0:56:23.750 +And with neural embeddings you can also embed +not only single sentences and whole blocks. + +0:56:24.224 --> 0:56:28.073 +So how you make this in fast? + +0:56:28.073 --> 0:56:35.643 +You're starting from a coarse grain resolution +here where. + +0:56:36.176 --> 0:56:47.922 +Then you're getting a double pass where they +could be good and near this pass you're doing + +0:56:47.922 --> 0:56:49.858 +more and more. + +0:56:52.993 --> 0:56:54.601 +And yeah, what's the? + +0:56:54.601 --> 0:56:56.647 +This is the white egg lift. + +0:56:56.647 --> 0:56:59.352 +These are the sewers and the target. + +0:57:00.100 --> 0:57:16.163 +While it was sleeping in the forests and things, +I thought it was very strange to see this man. + +0:57:16.536 --> 0:57:25.197 +So you have the sentences, but if you do blocks +you have blocks that are in. + +0:57:30.810 --> 0:57:38.514 +This is the thing about the pipeline approach. + +0:57:38.514 --> 0:57:46.710 +We want to look at the global mining, but +before. + +0:57:53.633 --> 0:58:07.389 +In the global mining thing we have to also +do some filtering and so typically in the things + +0:58:07.389 --> 0:58:10.379 +they do they start. + +0:58:10.290 --> 0:58:14.256 +And then they are doing some pretty processing. + +0:58:14.254 --> 0:58:17.706 +So you try to at first to de-defecate paragraphs. + +0:58:17.797 --> 0:58:30.622 +So, of course, if you compare everything with +everything in two times the same input example, + +0:58:30.622 --> 0:58:35.748 +you will also: The hard thing is that you first +keep duplicating. + +0:58:35.748 --> 0:58:37.385 +You have each paragraph only one. + +0:58:37.958 --> 0:58:42.079 +There's a lot of text which occurs a lot of +times. + +0:58:42.079 --> 0:58:44.585 +They will happen all the time. + +0:58:44.884 --> 0:58:57.830 +There are pages about the cookie thing you +see and about accepting things. + +0:58:58.038 --> 0:59:04.963 +So you can already be duplicated here, or +your problem has crossed the website twice, + +0:59:04.963 --> 0:59:05.365 +and. + +0:59:06.066 --> 0:59:11.291 +Then you can remove low quality data like +cooking warnings that have biolabites start. + +0:59:12.012 --> 0:59:13.388 +Hey! + +0:59:13.173 --> 0:59:19.830 +So let you have maybe some other sentence, +and then you're doing a language idea. + +0:59:19.830 --> 0:59:29.936 +That means you want to have a text, which +is: You want to know for each sentence a paragraph + +0:59:29.936 --> 0:59:38.695 +which language it has so that you then, of +course, if you want. 
+ +0:59:39.259 --> 0:59:44.987 +Finally, there is some complexity based film +screenings to believe, for example, for very + +0:59:44.987 --> 0:59:46.069 +high complexity. + +0:59:46.326 --> 0:59:59.718 +That means, for example, data where there's +a lot of crazy names which are growing. + +1:00:00.520 --> 1:00:09.164 +Sometimes it also improves very high perplexity +data because that is then unmanned generated + +1:00:09.164 --> 1:00:09.722 +data. + +1:00:11.511 --> 1:00:17.632 +And then the model which is mostly used for +that is what is called a laser model. + +1:00:18.178 --> 1:00:21.920 +It's based on machine translation. + +1:00:21.920 --> 1:00:28.442 +Hope it all recognizes the machine translation +architecture. + +1:00:28.442 --> 1:00:37.103 +However, there is a difference between a general +machine translation system and. + +1:01:00.000 --> 1:01:13.322 +Machine translation system, so it's messy. + +1:01:14.314 --> 1:01:24.767 +See one bigger difference, which is great +if I'm excluding that object or the other. + +1:01:25.405 --> 1:01:39.768 +There is one difference to the other, one +with attention, so we are having. + +1:01:40.160 --> 1:01:43.642 +And then we are using that here in there each +time set up. + +1:01:44.004 --> 1:01:54.295 +Mean, therefore, it's maybe a bit similar +to original anti-system without attention. + +1:01:54.295 --> 1:01:56.717 +It's quite similar. + +1:01:57.597 --> 1:02:10.011 +However, it has this disadvantage saying that +we have to put everything in one sentence and + +1:02:10.011 --> 1:02:14.329 +that maybe not all information. + +1:02:15.055 --> 1:02:25.567 +However, now in this type of framework we +are not really interested in machine translation, + +1:02:25.567 --> 1:02:27.281 +so this model. + +1:02:27.527 --> 1:02:34.264 +So we are training it to do machine translation. + +1:02:34.264 --> 1:02:42.239 +What that means in the end should be as much +information. + +1:02:43.883 --> 1:03:01.977 +Only all the information in here is able to +really well do the machine translation. + +1:03:02.642 --> 1:03:07.801 +So that is the first step, so we are doing +here. + +1:03:07.801 --> 1:03:17.067 +We are building the MT system, not with the +goal of making the best MT system, but with + +1:03:17.067 --> 1:03:22.647 +learning and sentences, and hopefully all important. + +1:03:22.882 --> 1:03:26.116 +Because otherwise we won't be able to generate +the translation. + +1:03:26.906 --> 1:03:31.287 +So it's a bit more on the bottom neck like +to try to put as much information. + +1:03:32.012 --> 1:03:36.426 +And if you think if you want to do later finding +the bear's neighbor or something like. + +1:03:37.257 --> 1:03:48.680 +So finding similarities is typically possible +with fixed dimensional things, so we can do + +1:03:48.680 --> 1:03:56.803 +that in an end dimensional space and find the +nearest neighbor. + +1:03:57.857 --> 1:03:59.837 +Yeah, it would be very difficult. + +1:04:00.300 --> 1:04:03.865 +There's one thing that we also do. + +1:04:03.865 --> 1:04:09.671 +We don't want to find the nearest neighbor +in the other. + +1:04:10.570 --> 1:04:13.424 +Do you have an idea how we can train them? + +1:04:13.424 --> 1:04:16.542 +This is a set that embeddings can be compared. + +1:04:23.984 --> 1:04:36.829 +Any idea do you think about two lectures, +a three lecture stack, one that did gave. 
+ +1:04:41.301 --> 1:04:50.562 +We can train them on a multilingual setting +and that's how it's done in lasers so we're + +1:04:50.562 --> 1:04:56.982 +not doing it only from German to English but +we're training. + +1:04:57.017 --> 1:05:04.898 +Mean, if the English one has to be useful +for German, French and so on, and for German + +1:05:04.898 --> 1:05:13.233 +also, the German and the English and so have +to be useful, then somehow we'll automatically + +1:05:13.233 --> 1:05:16.947 +learn that these embattes are popularly. + +1:05:17.437 --> 1:05:28.562 +And then we can use an exact as we will plan +to have a similar sentence embedding. + +1:05:28.908 --> 1:05:39.734 +If you put in here a German and a French one +and always generate as they both have the same + +1:05:39.734 --> 1:05:48.826 +translations, you give these sentences: And +you should do exactly the same thing, so that's + +1:05:48.826 --> 1:05:50.649 +of course the easiest. + +1:05:51.151 --> 1:05:59.817 +If the sentence is very different then most +people will also hear the English decoder and + +1:05:59.817 --> 1:06:00.877 +therefore. + +1:06:02.422 --> 1:06:04.784 +So that is the first thing. + +1:06:04.784 --> 1:06:06.640 +Now we have this one. + +1:06:06.640 --> 1:06:10.014 +We have to be trained on parallel data. + +1:06:10.390 --> 1:06:22.705 +Then we can use these embeddings on our new +data and try to use them to make efficient + +1:06:22.705 --> 1:06:24.545 +comparisons. + +1:06:26.286 --> 1:06:30.669 +So how can you do comparison? + +1:06:30.669 --> 1:06:37.243 +Maybe the first thing you think of is to do. + +1:06:37.277 --> 1:06:44.365 +So you take all the German sentences, all +the French sentences. + +1:06:44.365 --> 1:06:49.460 +We compute the Cousin's simple limit between. + +1:06:49.469 --> 1:06:58.989 +And then you take all pairs where the similarity +is very high. + +1:07:00.180 --> 1:07:17.242 +So you have your French list, you have them, +and then you just take all sentences. + +1:07:19.839 --> 1:07:29.800 +It's an additional power method that we have, +but we have a lot of data who will find a point. + +1:07:29.800 --> 1:07:32.317 +It's a good point, but. + +1:07:35.595 --> 1:07:45.738 +It's also not that easy, so one problem is +that typically there are some sentences where. + +1:07:46.066 --> 1:07:48.991 +And other points where there is very few points +in the neighborhood. + +1:07:49.629 --> 1:08:06.241 +And then for things where a lot of things +are enabled you might extract not for one percent + +1:08:06.241 --> 1:08:08.408 +to do that. + +1:08:08.868 --> 1:08:18.341 +So what typically is happening is you do the +max merchant? + +1:08:18.341 --> 1:08:25.085 +How good is a pair compared to the other? + +1:08:25.305 --> 1:08:33.859 +So you take the similarity between X and Y, +and then you look at one of the eight nearest + +1:08:33.859 --> 1:08:35.190 +neighbors of. + +1:08:35.115 --> 1:08:48.461 +Of x and what are the eight nearest neighbors +of y, and the dividing of the similarity through + +1:08:48.461 --> 1:08:51.411 +the eight neighbors. + +1:08:51.671 --> 1:09:00.333 +So what you may be looking at are these two +sentences a lot more similar than all the other. + +1:09:00.840 --> 1:09:13.455 +And if these are exceptional and similar compared +to other sentences then they should be translations. + +1:09:16.536 --> 1:09:19.158 +Of course, that has also some. + +1:09:19.158 --> 1:09:24.148 +Then the good thing is there's a lot of similar +sentences. 
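The ratio described here, the similarity of a pair divided by the average similarity of each side to its k nearest neighbours, is usually called margin-based scoring. A small numpy sketch over pre-computed, L2-normalised sentence embeddings; for readability it builds the full similarity matrix, whereas at scale only the k-nearest-neighbour similarities returned by an approximate search index would be used.

```python
import numpy as np

def margin_scores(src_emb, tgt_emb, k=8):
    """src_emb, tgt_emb: L2-normalised embedding matrices (n_src x d, n_tgt x d), k <= n_src, n_tgt.
    Returns 'ratio margin' scores: the cosine similarity of a pair divided by the
    average similarity of each side to its k nearest neighbours on the other side."""
    sims = src_emb @ tgt_emb.T                            # all pairwise cosine similarities
    knn_src = np.sort(sims, axis=1)[:, -k:].mean(axis=1)  # avg similarity of each source to its k NNs
    knn_tgt = np.sort(sims, axis=0)[-k:, :].mean(axis=0)  # avg similarity of each target to its k NNs
    denom = (knn_src[:, None] + knn_tgt[None, :]) / 2.0
    return sims / denom

# Pairs whose margin score is exceptionally high (e.g. above some tuned threshold)
# are kept as mined translations; plain cosine thresholds would over-select in dense regions.
```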
+ +1:09:24.584 --> 1:09:30.641 +If there is a lot of similar sensations in +white then these are also very similar and + +1:09:30.641 --> 1:09:32.824 +you are doing more comparison. + +1:09:32.824 --> 1:09:36.626 +If all the arrows are far away then the translations. + +1:09:37.057 --> 1:09:40.895 +So think about this like short sentences. + +1:09:40.895 --> 1:09:47.658 +They might be that most things are similar, +but they are just in general. + +1:09:49.129 --> 1:09:59.220 +There are some problems that now we assume +there is only one pair of translations. + +1:09:59.759 --> 1:10:09.844 +So it has some problems in their two or three +ballad translations of that. + +1:10:09.844 --> 1:10:18.853 +Then, of course, this pair might not find +it, but in general this. + +1:10:19.139 --> 1:10:27.397 +For example, they have like all of these common +trawl. + +1:10:27.397 --> 1:10:32.802 +They have large parallel data sets. + +1:10:36.376 --> 1:10:38.557 +One point maybe also year. + +1:10:38.557 --> 1:10:45.586 +Of course, now it's important that we have +done the deduplication before because if we + +1:10:45.586 --> 1:10:52.453 +wouldn't have the deduplication, we would have +points which are the same coordinate. + +1:10:57.677 --> 1:11:03.109 +Maybe only one small things to that mean. + +1:11:03.109 --> 1:11:09.058 +A major issue in this case is still making +a. + +1:11:09.409 --> 1:11:18.056 +So you have to still do all of this comparison, +and that cannot be done just by simple. + +1:11:19.199 --> 1:11:27.322 +So what is done typically express the word, +you know things can be done in parallel. + +1:11:28.368 --> 1:11:36.024 +So calculating the embeddings and all that +stuff doesn't need to be sequential, but it's + +1:11:36.024 --> 1:11:37.143 +independent. + +1:11:37.357 --> 1:11:48.680 +What you typically do is create an event and +then you do some kind of projectization. + +1:11:48.708 --> 1:11:57.047 +So there is this space library which does +key nearest neighbor search very efficient + +1:11:57.047 --> 1:11:59.597 +in very high-dimensional. + +1:12:00.080 --> 1:12:03.410 +And then based on that you can now do comparison. + +1:12:03.410 --> 1:12:06.873 +You can even do the comparison in parallel +because. + +1:12:06.906 --> 1:12:13.973 +Can look at different areas of your space +and then compare the different pieces to find + +1:12:13.973 --> 1:12:14.374 +the. + +1:12:15.875 --> 1:12:30.790 +With this you are then able to do very fast +calculations on this type of sentence. + +1:12:31.451 --> 1:12:34.761 +So yeah this is currently one. + +1:12:35.155 --> 1:12:48.781 +Mean, those of them are covered with this, +so there's a parade. + +1:12:48.668 --> 1:12:55.543 +We are collected by that and most of them +are in a very big corporate for languages which + +1:12:55.543 --> 1:12:57.453 +you can hardly stand on. + +1:12:58.778 --> 1:13:01.016 +Do you have any more questions on this? + +1:13:05.625 --> 1:13:17.306 +And then some more words to this last set +here: So we have now done our pearl marker + +1:13:17.306 --> 1:13:25.165 +and we could assume that everything is fine +now. + +1:13:25.465 --> 1:13:35.238 +However, the problem with this noisy data +is that typically this is quite noisy still, + +1:13:35.238 --> 1:13:35.687 +so. + +1:13:36.176 --> 1:13:44.533 +In order to make things efficient to have +a high recall, the final data is often not + +1:13:44.533 --> 1:13:49.547 +of the best quality, not the same type of quality. 
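The nearest-neighbour library alluded to here is presumably FAISS. The sketch below uses exact inner-product search over normalised vectors, which equals cosine similarity; random vectors stand in for real sentence embeddings, and at the scale discussed in the lecture approximate IVF/PQ indexes and sharding across machines would be used instead.

```python
import faiss            # pip install faiss-cpu
import numpy as np

d = 768                                       # embedding dimension (assumption)
tgt_emb = np.random.rand(10000, d).astype("float32")
src_emb = np.random.rand(8, d).astype("float32")
faiss.normalize_L2(tgt_emb)                   # after L2 normalisation, inner product = cosine
faiss.normalize_L2(src_emb)

index = faiss.IndexFlatIP(d)                  # exact inner-product index
index.add(tgt_emb)

scores, ids = index.search(src_emb, 8)        # 8 nearest target sentences per source sentence
print(ids[0], scores[0])
```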
+ +1:13:49.789 --> 1:13:58.870 +So it is essential to do another figuring +step and to remove senses which might seem + +1:13:58.870 --> 1:14:01.007 +to be translations. + +1:14:01.341 --> 1:14:08.873 +And here, of course, the final evaluation +matrix would be how much do my system improve? + +1:14:09.089 --> 1:14:23.476 +And there are even challenges on doing that +so: people getting this noisy data like symmetrics + +1:14:23.476 --> 1:14:25.596 +or something. + +1:14:27.707 --> 1:14:34.247 +However, all these steps is of course very +time consuming, so you might not always want + +1:14:34.247 --> 1:14:37.071 +to do the full pipeline and training. + +1:14:37.757 --> 1:14:51.614 +So how can you model that we want to get this +best and normally what we always want? + +1:14:51.871 --> 1:15:02.781 +You also want to have the best over translation +quality, but this is also normally not achieved + +1:15:02.781 --> 1:15:03.917 +with all. + +1:15:04.444 --> 1:15:12.389 +And that's why you're doing this two-step +approach first of the second alignment. + +1:15:12.612 --> 1:15:27.171 +And after once you do the sentence filtering, +we can put a lot more alphabet in all the comparisons. + +1:15:27.627 --> 1:15:37.472 +For example, you can just translate the source +and compare that translation with the original + +1:15:37.472 --> 1:15:40.404 +one and calculate how good. + +1:15:40.860 --> 1:15:49.467 +And this, of course, you can do with the filing +set, but you can't do with your initial set + +1:15:49.467 --> 1:15:50.684 +of millions. + +1:15:54.114 --> 1:16:01.700 +So what it is again is the ancient test where +you input as a sentence pair as here, and then + +1:16:01.700 --> 1:16:09.532 +once you have a biometria, these are sentence +pairs with a high quality, and these are sentence + +1:16:09.532 --> 1:16:11.653 +pairs avec a low quality. + +1:16:12.692 --> 1:16:17.552 +Does anybody see what might be a challenge +if you want to train this type of classifier? + +1:16:22.822 --> 1:16:24.264 +How do you measure exactly? + +1:16:24.264 --> 1:16:26.477 +The quality is probably about the problem. + +1:16:27.887 --> 1:16:39.195 +Yes, that is one, that is true, there is even +more, more simple one, and high quality data + +1:16:39.195 --> 1:16:42.426 +here is not so difficult. + +1:16:43.303 --> 1:16:46.844 +Globally, yeah, probably we have a class in +balance. + +1:16:46.844 --> 1:16:49.785 +We don't see many bad quality combinations. + +1:16:49.785 --> 1:16:54.395 +It's hard to get there at the beginning, so +maybe how can you argue? + +1:16:54.395 --> 1:16:58.405 +Where do you find bad quality and what type +of bad quality? + +1:16:58.798 --> 1:17:05.122 +Because if it's too easy, you just take a +random germ and the random innocence that is + +1:17:05.122 --> 1:17:05.558 +very. + +1:17:05.765 --> 1:17:15.747 +But what you're interested is like bad quality +data, which still passes your first initial + +1:17:15.747 --> 1:17:16.405 +step. + +1:17:17.257 --> 1:17:28.824 +What you can use for that is you can use any +type of network or model that in the beginning, + +1:17:28.824 --> 1:17:33.177 +like in random forests, would see. + +1:17:33.613 --> 1:17:38.912 +So the positive examples are quite easy to +get. + +1:17:38.912 --> 1:17:44.543 +You just take parallel data and high quality +data. + +1:17:44.543 --> 1:17:45.095 +You. + +1:17:45.425 --> 1:17:47.565 +That is quite easy. + +1:17:47.565 --> 1:17:55.482 +You normally don't need a lot of data, then +to train in a few validation. 
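Since genuine negative examples, pairs that pass the first mining step but are not translations, are hard to collect, they are typically synthesised. The sketch below shows three common heuristics, random mismatch, truncation and word replacement, as a rough illustration only; as the lecture goes on to point out, such synthetic errors often do not match the real errors one actually wants to catch.

```python
import random

def make_negatives(parallel_pairs, n_replace=2):
    """Toy heuristics for synthetic negatives (illustrative assumptions, not the
    lecture's exact recipe); expects several non-empty sentence pairs."""
    targets = [tgt for _, tgt in parallel_pairs]
    vocab = [w for tgt in targets for w in tgt.split()]
    negatives = []
    for src, tgt in parallel_pairs:
        # 1) mismatch: pair the source with some other target sentence
        negatives.append((src, random.choice([t for t in targets if t != tgt] or targets)))
        # 2) truncation: keep only the first half of the target
        words = tgt.split()
        negatives.append((src, " ".join(words[: max(1, len(words) // 2)])))
        # 3) word replacement: overwrite a few target words with random corpus words
        noisy = list(words)
        for _ in range(min(n_replace, len(noisy))):
            noisy[random.randrange(len(noisy))] = random.choice(vocab)
        negatives.append((src, " ".join(noisy)))
    return negatives
```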
+ +1:17:57.397 --> 1:18:12.799 +The challenge is like the negative samples +because how would you generate negative samples? + +1:18:13.133 --> 1:18:17.909 +Because the negative examples are the ones +which ask the first step but don't ask the + +1:18:17.909 --> 1:18:18.353 +second. + +1:18:18.838 --> 1:18:23.682 +So how do you typically do it? + +1:18:23.682 --> 1:18:28.994 +You try to do synthetic examples. + +1:18:28.994 --> 1:18:33.369 +You can do random examples. + +1:18:33.493 --> 1:18:45.228 +But this is the typical error that you want +to detect when you do frequency based replacements. + +1:18:45.228 --> 1:18:52.074 +But this is one major issue when you generate +the data. + +1:18:52.132 --> 1:19:02.145 +That doesn't match well with what are the +real arrows that you're interested in. + +1:19:02.702 --> 1:19:13.177 +Is some of the most challenging here to find +the negative samples, which are hard enough + +1:19:13.177 --> 1:19:14.472 +to detect. + +1:19:17.537 --> 1:19:21.863 +And the other thing, which is difficult, is +of course the data ratio. + +1:19:22.262 --> 1:19:24.212 +Why is it important any? + +1:19:24.212 --> 1:19:29.827 +Why is the ratio between positive and negative +examples here important? + +1:19:30.510 --> 1:19:40.007 +Because in a case of plus imbalance we effectively +could learn to just that it's positive and + +1:19:40.007 --> 1:19:43.644 +high quality and we would be right. + +1:19:44.844 --> 1:19:46.654 +Yes, so I'm training. + +1:19:46.654 --> 1:19:51.180 +This is important, but otherwise it might +be too easy. + +1:19:51.180 --> 1:19:52.414 +You always do. + +1:19:52.732 --> 1:19:58.043 +And on the other head, of course, navy and +deputy, it's also important because if we have + +1:19:58.043 --> 1:20:03.176 +equal things, we're also assuming that this +might be the other one, and if the quality + +1:20:03.176 --> 1:20:06.245 +is worse or higher, we might also accept too +fewer. + +1:20:06.626 --> 1:20:10.486 +So this ratio is not easy to determine. + +1:20:13.133 --> 1:20:16.969 +What type of features can we use? + +1:20:16.969 --> 1:20:23.175 +Traditionally, we're also looking at word +translation. + +1:20:23.723 --> 1:20:37.592 +And nowadays, of course, we can model this +also with something like similar, so this is + +1:20:37.592 --> 1:20:38.696 +again. + +1:20:40.200 --> 1:20:42.306 +Language follow. + +1:20:42.462 --> 1:20:49.763 +So we can, for example, put the sentence in +there for the source and the target, and then + +1:20:49.763 --> 1:20:56.497 +based on this classification label we can classify +as this a parallel sentence or. + +1:20:56.476 --> 1:21:00.054 +So it's more like a normal classification +task. + +1:21:00.160 --> 1:21:09.233 +And by having a system which can have much +enable input, we can just put in two R. + +1:21:09.233 --> 1:21:16.886 +We can also put in two independent of each +other based on the hidden. + +1:21:17.657 --> 1:21:35.440 +You can, as you do any other type of classifier, +you can train them on top of. + +1:21:35.895 --> 1:21:42.801 +This so it tries to represent the full sentence +and that's what you also want to do on. + +1:21:43.103 --> 1:21:45.043 +The Other Thing What They Can't Do Is, of +Course. + +1:21:45.265 --> 1:21:46.881 +You can make here. + +1:21:46.881 --> 1:21:52.837 +You can do your summation of all the hidden +statements that you said. + +1:21:58.698 --> 1:22:10.618 +Okay, and then one thing which we skipped +until now, and that is only briefly this fragment. 
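Before the lecture moves on to fragment extraction, here is a sketch of the pair classifier just described: source and target sentence are fed jointly into a multilingual pretrained encoder with a classification head. The checkpoint name is a placeholder, assuming such a model has been fine-tuned on the positive pairs and synthetic negatives discussed above.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "your-org/xlmr-sentence-pair-filter"   # placeholder, not a published model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=2)

def is_parallel(src: str, tgt: str, threshold: float = 0.5) -> bool:
    # Encode the two sentences as one input pair and classify them jointly.
    inputs = tokenizer(src, tgt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        probs = model(**inputs).logits.softmax(dim=-1)
    return probs[0, 1].item() > threshold      # label 1 = "is a translation pair" (assumption)
```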
+ +1:22:10.630 --> 1:22:19.517 +So if we have sentences which are not really +parallel, can we also extract information from + +1:22:19.517 --> 1:22:20.096 +them? + +1:22:22.002 --> 1:22:25.627 +And so what here the test is? + +1:22:25.627 --> 1:22:33.603 +We have a sentence and we want to find within +or a sentence pair. + +1:22:33.603 --> 1:22:38.679 +We want to find within the sentence pair. + +1:22:39.799 --> 1:22:46.577 +And how that, for example, has been done is +using a lexical positive and negative association. + +1:22:47.187 --> 1:22:57.182 +And then you can transform your target sentence +into a signal and find a thing where you have. + +1:22:57.757 --> 1:23:00.317 +So I'm Going to Get a Clear Eye. + +1:23:00.480 --> 1:23:15.788 +So you hear the English sentence, the other +language, and you have an alignment between + +1:23:15.788 --> 1:23:18.572 +them, and then. + +1:23:18.818 --> 1:23:21.925 +This is not a light cell from a negative signal. + +1:23:22.322 --> 1:23:40.023 +And then you drink some sauce on there because +you want to have an area where there's. + +1:23:40.100 --> 1:23:51.742 +It doesn't matter if you have simple arrows +here by smooth saying you can't. + +1:23:51.972 --> 1:23:58.813 +So you try to find long segments here where +at least most of the words are somehow aligned. + +1:24:00.040 --> 1:24:10.069 +And then you take this one in the side and +extract that one as your parallel fragment, + +1:24:10.069 --> 1:24:10.645 +and. + +1:24:10.630 --> 1:24:21.276 +So in the end you not only have full sentences +but you also have partial sentences which might + +1:24:21.276 --> 1:24:27.439 +be helpful for especially if you have quite +low upset. + +1:24:32.332 --> 1:24:36.388 +That's everything work for today. + +1:24:36.388 --> 1:24:44.023 +What you hopefully remember is the thing about +how the general. + +1:24:44.184 --> 1:24:54.506 +We talked about how we can do the document +alignment and then we can do the sentence alignment, + +1:24:54.506 --> 1:24:57.625 +which can be done after the. + +1:24:59.339 --> 1:25:12.611 +Any more questions think on Thursday we had +to do a switch, so on Thursday there will be + +1:25:12.611 --> 1:25:15.444 +a practical thing. + diff --git a/demo_data/lectures/Lecture-12-20.06.2023/video.mp4 b/demo_data/lectures/Lecture-12-20.06.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d7e7afba97445d4963e4ab303e22486484316d41 --- /dev/null +++ b/demo_data/lectures/Lecture-12-20.06.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86b4df900483ac17cf6e78c131d83ab5f7df2a0790c7ae034502bdce61554f3 +size 158173841 diff --git a/demo_data/lectures/Lecture-13-04.07.2023/English.vtt b/demo_data/lectures/Lecture-13-04.07.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..91db54e6d4bb1cfcbc44e806bca152281fcc4c53 --- /dev/null +++ b/demo_data/lectures/Lecture-13-04.07.2023/English.vtt @@ -0,0 +1,2696 @@ +WEBVTT + +0:00:01.641 --> 0:00:06.302 +Hey so what again to today's lecture on machine +translation. + +0:00:07.968 --> 0:00:15.152 +This week we'll have a bit of different focus, +so last two weeks or so we have looking into. + +0:00:15.655 --> 0:00:28.073 +How we can improve our system by having more +data, other data sources, or using them to + +0:00:28.073 --> 0:00:30.331 +more efficient. + +0:00:30.590 --> 0:00:38.046 +And we'll have a bit more of that next week +with the anti-travised and the context. 
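Referring back to the fragment-extraction procedure at the end of the Lecture-12 transcript above (just before the Lecture-13 file starts): a rough sketch of scoring target tokens by alignment, smoothing the signal, and keeping the longest positive span. The window size and minimum fragment length are illustrative choices, not values from the lecture.

```python
import numpy as np

def extract_fragment(tgt_tokens, aligned_positions, window=3, min_len=4):
    """Score each target token +1 if it is aligned to some source word, -1
    otherwise, smooth the signal with a moving average, and return the longest
    span whose smoothed score stays positive."""
    if not tgt_tokens:
        return []
    signal = np.array([1.0 if i in aligned_positions else -1.0
                       for i in range(len(tgt_tokens))])
    window = max(1, min(window, len(signal)))
    smooth = np.convolve(signal, np.ones(window) / window, mode="same")
    best, start = (0, 0), None
    for i, v in enumerate(list(smooth) + [-1.0]):    # sentinel closes any open span
        if v > 0 and start is None:
            start = i
        elif v <= 0 and start is not None:
            if i - start > best[1] - best[0]:
                best = (start, i)
            start = None
    s, e = best
    return tgt_tokens[s:e] if e - s >= min_len else []
```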
+ +0:00:38.338 --> 0:00:47.415 +So that we are shifting from this idea of +we treat each sentence independently, but treat + +0:00:47.415 --> 0:00:49.129 +the translation. + +0:00:49.129 --> 0:00:58.788 +Because maybe you can remember from the beginning, +there are phenomenon in machine translation + +0:00:58.788 --> 0:01:02.143 +that you cannot correctly check. + +0:01:03.443 --> 0:01:14.616 +However, today we want to more look into what +challenges arise, specifically when we're practically + +0:01:14.616 --> 0:01:16.628 +applying machine. + +0:01:17.017 --> 0:01:23.674 +And this block will be a total of four different +lectures. + +0:01:23.674 --> 0:01:29.542 +What type of biases are in machine translation +can. + +0:01:29.729 --> 0:01:37.646 +Just then can we try to improve this, but +of course the first focus can be at least the. + +0:01:37.717 --> 0:01:41.375 +And this, of course, gets more and more important. + +0:01:41.375 --> 0:01:48.333 +The more often you apply this type of technology, +when it was mainly a basic research tool which + +0:01:48.333 --> 0:01:53.785 +you were using in a research environment, it's +not directly that important. + +0:01:54.054 --> 0:02:00.370 +But once you apply it to the question, is +it performed the same for everybody or is it + +0:02:00.370 --> 0:02:04.436 +performance of some people less good than other +people? + +0:02:04.436 --> 0:02:10.462 +Does it have specific challenges and we are +seeing that especially in translation? + +0:02:10.710 --> 0:02:13.420 +We have the major challenge. + +0:02:13.420 --> 0:02:20.333 +We have the grammatical gender and this is +not the same in all languages. + +0:02:20.520 --> 0:02:35.431 +In English, it's not clear if you talk about +some person, if it's male or female, and so + +0:02:35.431 --> 0:02:39.787 +hopefully you've learned. + +0:02:41.301 --> 0:02:50.034 +Just as a brief view, so based on this one +aspect of application will then have two other + +0:02:50.034 --> 0:02:57.796 +aspects: On Thursday we'll look into adaptation, +so how can we adapt to specific situations? + +0:02:58.718 --> 0:03:09.127 +Because we have seen that your systems perform +well when the test case is similar to the training + +0:03:09.127 --> 0:03:15.181 +case, it's always the case you should get training +data. + +0:03:16.036 --> 0:03:27.577 +However, in practical applications, it's not +always possible to collect really the best + +0:03:27.577 --> 0:03:31.642 +fitting data, so in that case. + +0:03:32.092 --> 0:03:39.269 +And then the third larger group of applications +will then be speech translation. + +0:03:39.269 --> 0:03:42.991 +What do we have to change in our machine? + +0:03:43.323 --> 0:03:53.569 +If we are now not translating text, but if +we want to translate speech, that will be more + +0:03:53.569 --> 0:03:54.708 +lectures. + +0:04:00.180 --> 0:04:12.173 +So what are we talking about when we are talking +about bias from a definition point? + +0:04:12.092 --> 0:04:21.799 +Means we are introducing systematic errors +when testing, and then we encourage the selection + +0:04:21.799 --> 0:04:24.408 +of the specific answers. + +0:04:24.804 --> 0:04:36.862 +The most prominent case, which is analyzed +most in the research community, is a bias based + +0:04:36.862 --> 0:04:38.320 +on gender. + +0:04:38.320 --> 0:04:43.355 +One example: she works in a hospital. + +0:04:43.523 --> 0:04:50.787 +It is not directly able to assess whether +this is now a point or a friend. 
+ +0:04:51.251 --> 0:05:07.095 +And although in this one even there is, it's +possible to ambiguate this based on the context. + +0:05:07.127 --> 0:05:14.391 +However, there is yeah, this relation to learn +is of course not that easy. + +0:05:14.614 --> 0:05:27.249 +So the system might also learn more like shortcut +connections, which might be that in your training + +0:05:27.249 --> 0:05:31.798 +data most of the doctors are males. + +0:05:32.232 --> 0:05:41.725 +That is like that was too bigly analyzed and +biased, and we'll focus on that also in this. + +0:05:41.641 --> 0:05:47.664 +In this lecture, however, of course, the system +might be a lot of other biases too, which have + +0:05:47.664 --> 0:05:50.326 +been partly investigated in other fields. + +0:05:50.326 --> 0:05:53.496 +But I think machine translation is not that +much. + +0:05:53.813 --> 0:05:57.637 +For example, it can be based on your originals. + +0:05:57.737 --> 0:06:09.405 +So there is an example for a sentiment analysis +that's a bit prominent. + +0:06:09.405 --> 0:06:15.076 +A sentiment analysis means you're. + +0:06:15.035 --> 0:06:16.788 +Like you're seeing it in reviews. + +0:06:17.077 --> 0:06:24.045 +And then you can show that with baseline models, +if the name is Mohammed then the sentiment + +0:06:24.045 --> 0:06:30.786 +in a lot of systems will be more negative than +if it's like a traditional European name. + +0:06:31.271 --> 0:06:33.924 +Are with foods that is simple. + +0:06:33.924 --> 0:06:36.493 +It's this type of restaurant. + +0:06:36.493 --> 0:06:38.804 +It's positive and another. + +0:06:39.319 --> 0:06:49.510 +You have other aspects, so we have seen this. + +0:06:49.510 --> 0:06:59.480 +We have done some experiments in Vietnamese. + +0:06:59.559 --> 0:07:11.040 +And then, for example, you can analyze that +if it's like he's Germany will address it more + +0:07:11.040 --> 0:07:18.484 +formal, while if he is North Korean he'll use +an informal. + +0:07:18.838 --> 0:07:24.923 +So these are also possible types of gender. + +0:07:24.923 --> 0:07:31.009 +However, this is difficult types of biases. + +0:07:31.251 --> 0:07:38.903 +However, especially in translation, the bias +for gender is the most challenging because + +0:07:38.903 --> 0:07:42.989 +we are treating gender in different languages. + +0:07:45.405 --> 0:07:46.930 +Hi this is challenging. + +0:07:48.148 --> 0:07:54.616 +The reason for that is that there is a translation +mismatch and we have, I mean, one reason for + +0:07:54.616 --> 0:08:00.140 +that is there's a translation mismatch and +that's the most challenging situation. + +0:08:00.140 --> 0:08:05.732 +So there is there is different information +in the Sears language or in the target. + +0:08:06.046 --> 0:08:08.832 +So if we have the English word dot player,. + +0:08:09.029 --> 0:08:12.911 +It's there is no information about the gender +in there. + +0:08:12.911 --> 0:08:19.082 +However, if you want to translate in German, +you cannot easily generate a word without a + +0:08:19.082 --> 0:08:20.469 +gender information. + +0:08:20.469 --> 0:08:27.056 +Or man, you can't do something like Shubila +in, but that sounds a bit weird if you're talking. + +0:08:27.027 --> 0:08:29.006 +About a specific person. + +0:08:29.006 --> 0:08:32.331 +Then you should use the appropriate font. + +0:08:32.692 --> 0:08:44.128 +And so it's most challenging translation as +always in this situation where you have less + +0:08:44.128 --> 0:08:50.939 +information on the source side but more information. 
+ +0:08:51.911 --> 0:08:57.103 +Similar things like if you think about Japanese, +for example where there's different formality + +0:08:57.103 --> 0:08:57.540 +levels. + +0:08:57.540 --> 0:09:02.294 +If in German there is no formality or like +two only or in English there's no formality + +0:09:02.294 --> 0:09:02.677 +level. + +0:09:02.862 --> 0:09:08.139 +And now you have to estimate the formality +level. + +0:09:08.139 --> 0:09:10.884 +Of course, it takes some. + +0:09:10.884 --> 0:09:13.839 +It's not directly possible. + +0:09:14.094 --> 0:09:20.475 +What nowadays systems are doing is at least +assess. + +0:09:20.475 --> 0:09:27.470 +This is a situation where don't have enough +information. + +0:09:27.567 --> 0:09:28.656 +Translation. + +0:09:28.656 --> 0:09:34.938 +So here you have that suggesting it can be +doctor or doctorate in Spanish. + +0:09:35.115 --> 0:09:37.051 +So that is a possibility. + +0:09:37.051 --> 0:09:41.595 +However, it is of course very, very challenging +to find out. + +0:09:42.062 --> 0:09:46.130 +Is there two really different meanings, or +is it not the case? + +0:09:46.326 --> 0:09:47.933 +You can do the big rule base here. + +0:09:47.933 --> 0:09:49.495 +Maybe don't know how they did it. + +0:09:49.990 --> 0:09:57.469 +You can, of course, if you are focusing on +gender, the source and the target is different, + +0:09:57.469 --> 0:09:57.879 +and. + +0:09:58.118 --> 0:10:05.799 +But if you want to do it more general, it's +not that easy because there's always. + +0:10:06.166 --> 0:10:18.255 +But it's not clear if these are really different +or if there's only slight differences. + +0:10:22.142 --> 0:10:36.451 +Between that another reason why there is a +bias in there is typically the system tries + +0:10:36.451 --> 0:10:41.385 +to always do the most simple. + +0:10:42.262 --> 0:10:54.483 +And also in your training data there are unintended +shortcuts or clues only in the training data + +0:10:54.483 --> 0:10:59.145 +because you sample them in some way. + +0:10:59.379 --> 0:11:06.257 +This example, if she works in a hospital and +my friend is a nurse, then it might be that + +0:11:06.257 --> 0:11:07.184 +one friend. + +0:11:08.168 --> 0:11:18.979 +Male and female because it has learned that +in your trained doctor is a male and a nurse + +0:11:18.979 --> 0:11:20.802 +is doing this. + +0:11:20.880 --> 0:11:29.587 +And of course, if we are doing maximum likelihood +approximation as we are doing it in general, + +0:11:29.587 --> 0:11:30.962 +we are always. + +0:11:30.951 --> 0:11:43.562 +So that means if in your training data this +correlation is maybe in the case then your + +0:11:43.562 --> 0:11:48.345 +predictions are always the same. + +0:11:48.345 --> 0:11:50.375 +It typically. + +0:11:55.035 --> 0:12:06.007 +What does it mean, of course, if we are having +this type of fires and if we are applying? + +0:12:05.925 --> 0:12:14.821 +It might be that the benefit of machine translation +rice so more and more people can benefit from + +0:12:14.821 --> 0:12:20.631 +the ability to talk to people in different +languages and so on. + +0:12:20.780 --> 0:12:27.261 +But if you more often use it, problems of +the system also get more and more important. + +0:12:27.727 --> 0:12:36.984 +And so if we are seeing that these problems +and people nowadays only start to analyze these + +0:12:36.984 --> 0:12:46.341 +problems partly, also because if it hasn't +been used, it's not that important if the quality + +0:12:46.341 --> 0:12:47.447 +is so bad. 
+ +0:12:47.627 --> 0:12:51.907 +Version or is mixing it all the time like +we have seen in old systems. + +0:12:51.907 --> 0:12:52.993 +Then, of course,. + +0:12:53.053 --> 0:12:57.303 +The issue is not that you have biased issues +that you at first need to create a right view. + +0:12:57.637 --> 0:13:10.604 +So only with the wide application of the good +quality this becomes important, and then of + +0:13:10.604 --> 0:13:15.359 +course you should look into how. + +0:13:15.355 --> 0:13:23.100 +In order to first get aware of what are the +challenges, and that is a general idea not + +0:13:23.100 --> 0:13:24.613 +only about bias. + +0:13:24.764 --> 0:13:31.868 +Of course, we have learned about blue scores, +so how can you evaluate the over quality and + +0:13:31.868 --> 0:13:36.006 +they are very important, either blue or any +of that. + +0:13:36.006 --> 0:13:40.378 +However, they are somehow giving us a general +overview. + +0:13:40.560 --> 0:13:58.410 +And if we want to improve our systems, of +course it's important that we also do more + +0:13:58.410 --> 0:14:00.510 +detailed. + +0:14:00.340 --> 0:14:05.828 +Test sets which are very challenging in order +to attend to see how good these systems. + +0:14:06.446 --> 0:14:18.674 +Of course, one last reminder to that if you +do a challenge that says it's typically good + +0:14:18.674 --> 0:14:24.581 +to keep track of your general performance. + +0:14:24.784 --> 0:14:28.648 +You don't want to improve normally then on +the general quality. + +0:14:28.688 --> 0:14:41.555 +So if you build a system which will mitigate +some biases then the aim is that if you evaluate + +0:14:41.555 --> 0:14:45.662 +it on the challenging biases. + +0:14:45.745 --> 0:14:53.646 +You don't need to get better because the aggregated +versions don't really measure that aspect well, + +0:14:53.646 --> 0:14:57.676 +but if you significantly drop in performance +then. + +0:15:00.000 --> 0:15:19.164 +What are, in generally calms, people report +about that or why should you care about? + +0:15:19.259 --> 0:15:23.598 +And you're even then amplifying this type +of stereotypes. + +0:15:23.883 --> 0:15:33.879 +And that is not what you want to achieve with +using this technology. + +0:15:33.879 --> 0:15:39.384 +It's not working through some groups. + +0:15:39.819 --> 0:15:47.991 +And secondly what is referred to as allocational +parts. + +0:15:47.991 --> 0:15:54.119 +The system might not perform as well for. + +0:15:54.314 --> 0:16:00.193 +So another example of which we would like +to see is that sometimes the translation depends + +0:16:00.193 --> 0:16:01.485 +on who is speaking. + +0:16:01.601 --> 0:16:03.463 +So Here You Have It in French. + +0:16:03.723 --> 0:16:16.359 +Not say it, but the word happy or French has +to be expressed differently, whether it's a + +0:16:16.359 --> 0:16:20.902 +male person or a female person. + +0:16:21.121 --> 0:16:28.917 +It's nearly impossible to guess that or it's +impossible, so then you always select one. + +0:16:29.189 --> 0:16:37.109 +And of course, since we do greedy search, +it will always generate the same, so you will + +0:16:37.109 --> 0:16:39.449 +have a worse performance. + +0:16:39.779 --> 0:16:46.826 +And of course not what we want to achieve +in average. + +0:16:46.826 --> 0:16:54.004 +You might be then good, but you also have +the ability. + +0:16:54.234 --> 0:17:08.749 +This is a biased problem or an interface problem +because mean you can say well. 
+ +0:17:09.069 --> 0:17:17.358 +And if you do it, we still have a system that +generates unusable output. + +0:17:17.358 --> 0:17:24.057 +If you don't tell it what you want to do, +so in this case. + +0:17:24.244 --> 0:17:27.173 +So in this case it's like if we don't have +enough information. + +0:17:27.467 --> 0:17:34.629 +So you have to adapt your system in some way +that can either access the information or output. + +0:17:34.894 --> 0:17:46.144 +But yeah, how you mean there's different ways +of how to improve over that first thing is + +0:17:46.144 --> 0:17:47.914 +you find out. + +0:17:48.688 --> 0:17:53.826 +Then there is different ways of addressing +them, and they of course differ. + +0:17:53.826 --> 0:17:57.545 +Isn't the situation where the information's +available? + +0:17:58.038 --> 0:18:12.057 +That's the first case we have, or is it a +situation where we don't have the information + +0:18:12.057 --> 0:18:13.332 +either? + +0:18:14.154 --> 0:18:28.787 +Or should give the system maybe the opportunity +to output those or say don't know this is still + +0:18:28.787 --> 0:18:29.701 +open. + +0:18:29.769 --> 0:18:35.470 +And even if they have enough information, +need this additional information, but they + +0:18:35.470 --> 0:18:36.543 +are just doing. + +0:18:36.776 --> 0:18:51.132 +Which is a bit based on how we find that there +is research on that, but it's not that easy + +0:18:51.132 --> 0:18:52.710 +to solve. + +0:18:52.993 --> 0:19:05.291 +But in general, detecting do have enough information +to do a good translation or are information + +0:19:05.291 --> 0:19:06.433 +missing? + +0:19:09.669 --> 0:19:18.951 +But before we come on how we will address +it or try to change it, and before we look + +0:19:18.951 --> 0:19:22.992 +at how we can assess it, of course,. + +0:19:23.683 --> 0:19:42.820 +And therefore wanted to do a bit of a review +on how gender is represented in languages. + +0:19:43.743 --> 0:19:48.920 +Course: You can have more fine grained. + +0:19:48.920 --> 0:20:00.569 +It's not that everything in the group is the +same, but in general you have a large group. + +0:20:01.381 --> 0:20:08.347 +For example, you even don't say ishi or but +it's just one word for it written. + +0:20:08.347 --> 0:20:16.107 +Oh, don't know how it's pronounced, so you +cannot say from a sentence whether it's ishi + +0:20:16.107 --> 0:20:16.724 +or it. + +0:20:17.937 --> 0:20:29.615 +Of course, there are some exceptions for whether +it's a difference between male and female. + +0:20:29.615 --> 0:20:35.962 +They have different names for brother and +sister. + +0:20:36.036 --> 0:20:41.772 +So normally you cannot infer whether this +is a male speaker or speaking about a male + +0:20:41.772 --> 0:20:42.649 +or a female. + +0:20:44.304 --> 0:20:50.153 +Examples for these languages are, for example, +Finnish and Turkish. + +0:20:50.153 --> 0:21:00.370 +There are more languages, but these are: Then +we have no nutritional gender languages where + +0:21:00.370 --> 0:21:05.932 +there's some gender information in there, but +it's. + +0:21:05.905 --> 0:21:08.169 +And this is an example. + +0:21:08.169 --> 0:21:15.149 +This is English, which is in that way a nice +example because most people. + +0:21:15.415 --> 0:21:20.164 +So you have there some lexicogender and phenomenal +gender. + +0:21:20.164 --> 0:21:23.303 +I mean mamadeta there she-hee and him. 
+ +0:21:23.643 --> 0:21:31.171 +And very few words are marked like actor and +actress, but in general most words are not + +0:21:31.171 --> 0:21:39.468 +marked, so it's teacher and lecturer and friend, +so in all these words the gender is not marked, + +0:21:39.468 --> 0:21:41.607 +and so you cannot infer. + +0:21:42.622 --> 0:21:48.216 +So the initial Turkish sentence here would +be translated to either he is a good friend + +0:21:48.216 --> 0:21:49.373 +or she is a good. + +0:21:51.571 --> 0:22:05.222 +In this case you would have them gender information +in there, but of course there's a good friend. + +0:22:07.667 --> 0:22:21.077 +And then finally there is the grammatical +German languages where each noun has a gender. + +0:22:21.077 --> 0:22:25.295 +That's the case in Spanish. + +0:22:26.186 --> 0:22:34.025 +This is mostly formal, but at least if you're +talking about a human that also agrees. + +0:22:34.214 --> 0:22:38.209 +Of course, it's like the sun. + +0:22:38.209 --> 0:22:50.463 +There is no clear thing why the sun should +be female, and in other language it's different. + +0:22:50.390 --> 0:22:56.100 +The matching, and then you also have more +agreements with this that makes things more + +0:22:56.100 --> 0:22:56.963 +complicated. + +0:22:57.958 --> 0:23:08.571 +Here he is a good friend and the good is also +depending whether it's male or went up so it's + +0:23:08.571 --> 0:23:17.131 +changing also based on the gender so you have +a lot of gender information. + +0:23:17.777 --> 0:23:21.364 +Get them, but do you always get them correctly? + +0:23:21.364 --> 0:23:25.099 +It might be that they're in English, for example. + +0:23:28.748 --> 0:23:36.154 +And since this is the case, and you need to +like often express the gender even though you + +0:23:36.154 --> 0:23:37.059 +might not. + +0:23:37.377 --> 0:23:53.030 +Aware of it or it's not possible, there's +some ways in German how to mark mutual forms. + +0:23:54.194 --> 0:24:03.025 +But then it's again from the machine learning +side of view, of course quite challenging because + +0:24:03.025 --> 0:24:05.417 +you only want to use the. + +0:24:05.625 --> 0:24:11.108 +If it's known to the reader you want to use +the correct, the not mutual form but either + +0:24:11.108 --> 0:24:12.354 +the male or female. + +0:24:13.013 --> 0:24:21.771 +So they are assessing what is known to the +reader as a challenge which needs to in some + +0:24:21.771 --> 0:24:23.562 +way be addressed. + +0:24:26.506 --> 0:24:30.887 +Here why does that happen? + +0:24:30.887 --> 0:24:42.084 +Three reasons we have that in a bit so one +is, of course, that your. + +0:24:42.162 --> 0:24:49.003 +Example: If you look at the Europe High Corpus, +which is an important resource for doing machine + +0:24:49.003 --> 0:24:49.920 +translation. + +0:24:50.010 --> 0:24:59.208 +Then there's only thirty percent of the speakers +are female, and so if you train a model on + +0:24:59.208 --> 0:25:06.606 +that data, if you're translating to French, +there will be a male version. + +0:25:06.746 --> 0:25:10.762 +And so you'll just have a lot more like seventy +percent of your mail for it. + +0:25:10.971 --> 0:25:18.748 +And that will be Yep will make the model therefore +from this data sub. + +0:25:18.898 --> 0:25:25.882 +And of course this will be in the data for +a very long time. + +0:25:25.882 --> 0:25:33.668 +So if there's more female speakers in the +European Parliament, but. 
+ +0:25:33.933 --> 0:25:42.338 +But we are training on historical data, so +even if there is for a long time, it will not + +0:25:42.338 --> 0:25:43.377 +be in the. + +0:25:46.346 --> 0:25:57.457 +Then besides these preexisting data there +is of course technical biases which will amplify + +0:25:57.457 --> 0:25:58.800 +this type. + +0:25:59.039 --> 0:26:04.027 +So one we already address, that's for example +sampling or beam search. + +0:26:04.027 --> 0:26:06.416 +You get the most probable output. + +0:26:06.646 --> 0:26:16.306 +So if there's a bias in your model, it will +amplify that not only in the case we had before, + +0:26:16.306 --> 0:26:19.423 +and produce the male version. + +0:26:20.040 --> 0:26:32.873 +So if you have the same source sentence like +am happy and in your training data it will + +0:26:32.873 --> 0:26:38.123 +be male and female if you're doing. + +0:26:38.418 --> 0:26:44.510 +So in that way by doing this type of algorithmic +design you will have. + +0:26:44.604 --> 0:26:59.970 +Another use case is if you think about a multilingual +machine translation, for example if you are + +0:26:59.970 --> 0:27:04.360 +now doing a pivot language. + +0:27:04.524 --> 0:27:13.654 +But if you're first trying to English this +information might get lost and then you translate + +0:27:13.654 --> 0:27:14.832 +to Spanish. + +0:27:15.075 --> 0:27:21.509 +So while in general in this class there is +not this type of bias there,. + +0:27:22.922 --> 0:27:28.996 +You might introduce it because you might have +good reasons for doing a modular system because + +0:27:28.996 --> 0:27:31.968 +you don't have enough training data or so on. + +0:27:31.968 --> 0:27:37.589 +It's performing better in average, but of +course by doing this choice you'll introduce + +0:27:37.589 --> 0:27:40.044 +an additional type of bias into your. + +0:27:45.805 --> 0:27:52.212 +And then there is what people refer to as +emergent bias, and that is, if you use a system + +0:27:52.212 --> 0:27:58.903 +for a different use case as we see in, generally +it is the case that is performing worse, but + +0:27:58.903 --> 0:28:02.533 +then of course you can have even more challenging. + +0:28:02.942 --> 0:28:16.196 +So the extreme case would be if you train +a system only on male speakers, then of course + +0:28:16.196 --> 0:28:22.451 +it will perform worse on female speakers. + +0:28:22.902 --> 0:28:36.287 +So, of course, if you're doing this type of +problem, if you use a system for a different + +0:28:36.287 --> 0:28:42.152 +situation where it was original, then. + +0:28:44.004 --> 0:28:54.337 +And with this we would then go for type of +evaluation, but before we are looking at how + +0:28:54.337 --> 0:28:56.333 +we can evaluate. + +0:29:00.740 --> 0:29:12.176 +Before we want to look into how we can improve +the system, think yeah, maybe at the moment + +0:29:12.176 --> 0:29:13.559 +most work. + +0:29:13.954 --> 0:29:21.659 +And the one thing is the system trying to +look into stereotypes. + +0:29:21.659 --> 0:29:26.164 +So how does a system use stereotypes? + +0:29:26.466 --> 0:29:29.443 +So if you have the Hungarian sentence,. + +0:29:29.729 --> 0:29:33.805 +Which should be he is an engineer or she is +an engineer. + +0:29:35.375 --> 0:29:43.173 +And you cannot guess that because we saw that +he and she is not different in Hungary. + +0:29:43.423 --> 0:29:57.085 +Then you can have a test set where you have +these type of ailanomal occupations. 
+ +0:29:56.977 --> 0:30:03.862 +You have statistics from how is the distribution +by gender so you can automatically generate + +0:30:03.862 --> 0:30:04.898 +the sentence. + +0:30:04.985 --> 0:30:21.333 +Then you could put in jobs which are mostly +done by a man and then you can check how is + +0:30:21.333 --> 0:30:22.448 +your. + +0:30:22.542 --> 0:30:31.315 +That is one type of evaluating stereotypes +that one of the most famous benchmarks called + +0:30:31.315 --> 0:30:42.306 +vino is exactly: The second type of evaluation +is about gender preserving. + +0:30:42.342 --> 0:30:51.201 +So that is exactly what we have seen beforehand. + +0:30:51.201 --> 0:31:00.240 +If these information are not in the text itself,. + +0:31:00.320 --> 0:31:01.875 +Gender as a speaker. + +0:31:02.062 --> 0:31:04.450 +And how good does a system do that? + +0:31:04.784 --> 0:31:09.675 +And we'll see there's, for example, one benchmark +on this. + +0:31:09.675 --> 0:31:16.062 +For example: For Arabic there is one benchmark +on this foot: Audio because if you're now think + +0:31:16.062 --> 0:31:16.781 +already of the. + +0:31:17.157 --> 0:31:25.257 +From when we're talking about speech translation, +it might be interesting because in the speech + +0:31:25.257 --> 0:31:32.176 +signal you should have a better guess on whether +it's a male or a female speaker. + +0:31:32.432 --> 0:31:38.928 +So but mean current systems, mostly you can +always add, and they will just first transcribe. + +0:31:42.562 --> 0:31:45.370 +Yes, so how do these benchmarks? + +0:31:45.305 --> 0:31:51.356 +Look like that, the first one is here. + +0:31:51.356 --> 0:32:02.837 +There's an occupation test where it looks +like a simple test set because. + +0:32:03.023 --> 0:32:10.111 +So I've known either hurry him or pronounce +the name for a long time. + +0:32:10.111 --> 0:32:13.554 +My friend works as an occupation. + +0:32:13.833 --> 0:32:16.771 +So that is like all sentences in that look +like that. + +0:32:17.257 --> 0:32:28.576 +So in this case you haven't had the biggest +work in here, which is friends. + +0:32:28.576 --> 0:32:33.342 +So your only checking later is. + +0:32:34.934 --> 0:32:46.981 +This can be inferred from whether it's her +or her or her, or if it's a proper name, so + +0:32:46.981 --> 0:32:55.013 +can you infer it from the name, and then you +can compare. + +0:32:55.115 --> 0:33:01.744 +So is this because the job description is +nearer to friend. + +0:33:01.744 --> 0:33:06.937 +Does the system get disturbed by this type +of. + +0:33:08.828 --> 0:33:14.753 +And there you can then automatically assess +yeah this type. + +0:33:14.774 --> 0:33:18.242 +Of course, that's what said at the beginning. + +0:33:18.242 --> 0:33:24.876 +You shouldn't only rely on that because if +you only rely on it you can easily trick the + +0:33:24.876 --> 0:33:25.479 +system. + +0:33:25.479 --> 0:33:31.887 +So one type of sentence is translated, but +of course it can give you very important. + +0:33:33.813 --> 0:33:35.309 +Any questions yeah. + +0:33:36.736 --> 0:33:44.553 +Much like the evaluation of stereotype, we +want the system to agree with stereotypes because + +0:33:44.553 --> 0:33:46.570 +it increases precision. + +0:33:46.786 --> 0:33:47.979 +No, no, no. + +0:33:47.979 --> 0:33:53.149 +In this case, if we say oh yeah, he is an +engineer. + +0:33:53.149 --> 0:34:01.600 +From the example, it's probably the most likely +translation, probably in more cases. 
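The occupation template test described above can be generated mechanically; this sketch uses a tiny illustrative occupation list and one English template, not the actual benchmark data. The gender is given unambiguously by the pronoun, and the later check is whether the translation of "friend" follows the pronoun or the occupation stereotype.

```python
OCCUPATIONS = ["engineer", "nurse", "doctor", "cleaner"]   # illustrative only

def occupation_test_set(occupations=OCCUPATIONS):
    """Generate template sentences in the style of the occupation test."""
    template = "I have known {pron} for a long time, my friend works as {art}{occ}."
    examples = []
    for occ in occupations:
        art = "an " if occ[0] in "aeiou" else "a "
        for pron, gender in [("her", "female"), ("him", "male")]:
            examples.append({"src": template.format(pron=pron, art=art, occ=occ),
                             "gender": gender, "occupation": occ})
    return examples
```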
+ +0:34:02.702 --> 0:34:08.611 +Now there is two things, so yeah yeah, so +there is two ways of evaluating. + +0:34:08.611 --> 0:34:15.623 +The one thing is in this case he's using that +he's an engineer, but there is conflicting + +0:34:15.623 --> 0:34:19.878 +information that in this case the engineer +is female. + +0:34:20.380 --> 0:34:21.890 +So anything was. + +0:34:22.342 --> 0:34:29.281 +Information yes, so that is the one in the +other case. + +0:34:29.281 --> 0:34:38.744 +Typically it's not evaluated in that, but +in that time you really want it. + +0:34:38.898 --> 0:34:52.732 +That's why most of those cases you have evaluated +in scenarios where you have context information. + +0:34:53.453 --> 0:34:58.878 +How to deal with the other thing is even more +challenging to one case where it is the case + +0:34:58.878 --> 0:35:04.243 +is what I said before is when it's about the +speaker so that the speech translation test. + +0:35:04.584 --> 0:35:17.305 +And there they try to look in a way that can +you use, so use the audio also as input. + +0:35:18.678 --> 0:35:20.432 +Yeah. + +0:35:20.640 --> 0:35:30.660 +So if we have a reference where she is an +engineer okay, are there efforts to adjust + +0:35:30.660 --> 0:35:37.497 +the metric so that our transmissions go into +the correct? + +0:35:37.497 --> 0:35:38.676 +We don't. + +0:35:38.618 --> 0:35:40.389 +Only done for mean this is evaluation. + +0:35:40.389 --> 0:35:42.387 +You are not pushing the model for anything. + +0:35:43.023 --> 0:35:53.458 +But if you want to do it in training, that +you're not doing it this way. + +0:35:53.458 --> 0:35:58.461 +I'm not aware of any direct model. + +0:35:58.638 --> 0:36:04.146 +Because you have to find out, is it known +in this scenario or not? + +0:36:05.725 --> 0:36:12.622 +So at least I'm not aware of there's like +the directive doing training try to assess + +0:36:12.622 --> 0:36:13.514 +more than. + +0:36:13.813 --> 0:36:18.518 +Mean there is data augmentation in the way +that is done. + +0:36:18.518 --> 0:36:23.966 +Think we'll have that later, so what you can +do is generate more. + +0:36:24.144 --> 0:36:35.355 +You can do that automatically or there's ways +of biasing so that you can try to make your + +0:36:35.355 --> 0:36:36.600 +training. + +0:36:36.957 --> 0:36:46.228 +That's typically not done with focusing on +scenarios where you check before or do have + +0:36:46.228 --> 0:36:47.614 +information. + +0:36:49.990 --> 0:36:58.692 +Mean, but for everyone it's not clear and +agree with you in this scenario, the normal + +0:36:58.692 --> 0:37:01.222 +evaluation system where. + +0:37:01.341 --> 0:37:07.006 +Maybe you could say it shouldn't do always +the same but have a distribution like a training + +0:37:07.006 --> 0:37:12.733 +data or something like that because otherwise +we're amplifying but that current system can't + +0:37:12.733 --> 0:37:15.135 +do current systems can't predict both. + +0:37:15.135 --> 0:37:17.413 +That's why we see all the beginning. + +0:37:17.413 --> 0:37:20.862 +They have this extra interface where they +then propose. + +0:37:24.784 --> 0:37:33.896 +Another thing is the vino empty system and +it started from a challenge set for co-reference + +0:37:33.896 --> 0:37:35.084 +resolution. + +0:37:35.084 --> 0:37:43.502 +Co-reference resolution means we have pear +on him and we need to find out what it's. 
+ +0:37:43.823 --> 0:37:53.620 +So you have the doctor off the nurse to help +her in the procedure, and now her does not + +0:37:53.620 --> 0:37:55.847 +refer to the nurse. + +0:37:56.556 --> 0:38:10.689 +And there you of course have the same type +of stewardesses and the same type of buyers + +0:38:10.689 --> 0:38:15.237 +as the machine translation. + +0:38:16.316 --> 0:38:25.165 +And no think that normally yeah mean maybe +that's also biased. + +0:38:27.687 --> 0:38:37.514 +No, but if you ask somebody, I guess if you +ask somebody, then I mean syntectically it's + +0:38:37.514 --> 0:38:38.728 +ambiguous. + +0:38:38.918 --> 0:38:50.248 +If you ask somebody to help, then the horror +has to refer to that. + +0:38:50.248 --> 0:38:54.983 +So it should also help the. + +0:38:56.396 --> 0:38:57.469 +Of the time. + +0:38:57.469 --> 0:39:03.906 +The doctor is female and says please have +me in the procedure, but the other. + +0:39:04.904 --> 0:39:09.789 +Oh, you mean that it's helping the third person. + +0:39:12.192 --> 0:39:16.140 +Yeah, agree that it could also be yes. + +0:39:16.140 --> 0:39:19.077 +Don't know how easy that is. + +0:39:19.077 --> 0:39:21.102 +Only know the test. + +0:39:21.321 --> 0:39:31.820 +Then guess yeah, then you need a situation +context where you know the situation, the other + +0:39:31.820 --> 0:39:34.589 +person having problems. + +0:39:36.936 --> 0:39:42.251 +Yeah no yeah that is like here when there +is additional ambiguity in there. + +0:39:45.465 --> 0:39:48.395 +See that pure text models is not always okay. + +0:39:48.395 --> 0:39:51.134 +How full mean there is a lot of work also. + +0:39:52.472 --> 0:40:00.119 +Will not cover that in the lecture, but there +are things like multimodal machine translation + +0:40:00.119 --> 0:40:07.109 +where you try to add pictures or something +like that to have more context, and then. + +0:40:10.370 --> 0:40:23.498 +Yeah, it starts with this, so in order to +evaluate that what it does is that you translate + +0:40:23.498 --> 0:40:25.229 +the system. + +0:40:25.305 --> 0:40:32.310 +It's doing stereotyping so the doctor is male +and the nurse is female. + +0:40:32.492 --> 0:40:42.362 +And then you're using word alignment, and +then you check whether this gender maps with + +0:40:42.362 --> 0:40:52.345 +the annotated gender of there, and that is +how you evaluate in this type of vino empty. + +0:40:52.832 --> 0:40:59.475 +Mean, as you see, you're only focusing on +the situation where you can or where the gender + +0:40:59.475 --> 0:41:00.214 +is known. + +0:41:00.214 --> 0:41:06.930 +Why for this one you don't do any evaluation, +but because nurses can in that case be those + +0:41:06.930 --> 0:41:08.702 +and you cannot, as has. + +0:41:08.728 --> 0:41:19.112 +The benchmarks are at the moment designed +in a way that you only evaluate things that + +0:41:19.112 --> 0:41:20.440 +are known. + +0:41:23.243 --> 0:41:25.081 +Then yeah, you can have a look. + +0:41:25.081 --> 0:41:28.931 +For example, here what people are looking +is you can do the first. + +0:41:28.931 --> 0:41:32.149 +Oh well, the currency, how often does it do +it correct? + +0:41:32.552 --> 0:41:41.551 +And there you see these numbers are a bit +older. + +0:41:41.551 --> 0:41:51.835 +There's more work on that, but this is the +first color. + +0:41:51.731 --> 0:42:01.311 +Because they do it like in this test, they +do it twice, one with him and one with her. + +0:42:01.311 --> 0:42:04.834 +So the chance is fifty percent. 
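A sketch of the alignment-based check used in this style of evaluation: translate, word-align, look up the grammatical gender of the target word aligned to the annotated entity, and compare with the gold label, reporting accuracy per gender. `translate`, `align` and `target_gender` are placeholders for existing components, not a specific toolkit.

```python
def winomt_style_accuracy(examples, translate, align, target_gender):
    """examples: dicts with 'src', 'entity_index' (source token position of the
    annotated entity) and 'gender'. Returns accuracy separately for male and
    female items, so the male/female gap can also be inspected."""
    correct = {"male": 0, "female": 0}
    total = {"male": 0, "female": 0}
    for ex in examples:
        hyp_tokens = translate(ex["src"]).split()
        alignment = align(ex["src"].split(), hyp_tokens)   # list of (src_i, tgt_j) pairs
        tgt_idx = [j for i, j in alignment if i == ex["entity_index"]]
        total[ex["gender"]] += 1
        if tgt_idx and target_gender(hyp_tokens[tgt_idx[0]]) == ex["gender"]:
            correct[ex["gender"]] += 1
    return {g: correct[g] / max(total[g], 1) for g in total}
```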
+ +0:42:05.065 --> 0:42:12.097 +Except somehow here, the one system seems +to be quite good there that everything. + +0:42:13.433 --> 0:42:30.863 +What you can also do is look at the difference, +where you need to predict female and the difference. + +0:42:30.850 --> 0:42:40.338 +It's more often correct on the male forms +than on the female forms, and you see that + +0:42:40.338 --> 0:42:43.575 +it's except for this system. + +0:42:43.603 --> 0:42:53.507 +So would assume that they maybe in this one +language did some type of method in there. + +0:42:55.515 --> 0:42:57.586 +If you are more often mean there is like. + +0:42:58.178 --> 0:43:01.764 +It's not a lot lower, there's one. + +0:43:01.764 --> 0:43:08.938 +I don't know why, but if you're always to +the same then it should be. + +0:43:08.938 --> 0:43:14.677 +You seem to be counter intuitive, so maybe +it's better. + +0:43:15.175 --> 0:43:18.629 +Don't know exactly how yes, but it's, it's +true. + +0:43:19.019 --> 0:43:20.849 +Mean, there's very few cases. + +0:43:20.849 --> 0:43:22.740 +I also don't know for Russian. + +0:43:22.740 --> 0:43:27.559 +I mean, there is, I think, mainly for Russian +where you have very low numbers. + +0:43:27.559 --> 0:43:30.183 +I mean, I would say like forty five or so. + +0:43:30.183 --> 0:43:32.989 +There can be more about renting and sampling. + +0:43:32.989 --> 0:43:37.321 +I don't know if they have even more gender +or if they have a new tool. + +0:43:37.321 --> 0:43:38.419 +I don't think so. + +0:43:40.040 --> 0:43:46.901 +Then you have typically even a stronger bias +here where you not do the differentiation between + +0:43:46.901 --> 0:43:53.185 +how often is it correct for me and the female, +but you are distinguishing between the. + +0:43:53.553 --> 0:44:00.503 +So you're here, for you can check for each +occupation, which is the most important. + +0:44:00.440 --> 0:44:06.182 +A comment one based on statistics, and then +you take that on the one side and the anti + +0:44:06.182 --> 0:44:12.188 +stereotypically on the other side, and you +see that not in all cases but in a lot of cases + +0:44:12.188 --> 0:44:16.081 +that null probabilities are even higher than +on the other. + +0:44:21.061 --> 0:44:24.595 +Ah, I'm telling you there's something. + +0:44:28.668 --> 0:44:32.850 +But it has to be for a doctor. + +0:44:32.850 --> 0:44:39.594 +For example, for a doctor there three don't +know. + +0:44:40.780 --> 0:44:44.275 +Yeah, but guess here it's mainly imminent +job description. + +0:44:44.275 --> 0:44:45.104 +So yeah, but. + +0:44:50.050 --> 0:45:01.145 +And then there is the Arabic capital gender +corpus where it is about more assessing how + +0:45:01.145 --> 0:45:03.289 +strong a singer. + +0:45:03.483 --> 0:45:09.445 +How that is done is the open subtitles. + +0:45:09.445 --> 0:45:18.687 +Corpus is like a corpus of subtitles generated +by volunteers. + +0:45:18.558 --> 0:45:23.426 +For the Words Like I Mean Myself. + +0:45:23.303 --> 0:45:30.670 +And mine, and then they annotated the Arabic +sentences, whether here I refer to as a female + +0:45:30.670 --> 0:45:38.198 +and masculine, or whether it's ambiguous, and +then from the male and female one they generate + +0:45:38.198 --> 0:45:40.040 +types of translations. + +0:45:43.703 --> 0:45:51.921 +And then a bit more different test sets as +the last one that is referred to as the machine. + +0:45:52.172 --> 0:45:57.926 +Corpus, which is based on these lectures. 
+ +0:45:57.926 --> 0:46:05.462 +In general, this lecture is very important +because it. + +0:46:05.765 --> 0:46:22.293 +And here is also interesting because you also +have the obvious signal and it's done in the + +0:46:22.293 --> 0:46:23.564 +worst. + +0:46:23.763 --> 0:46:27.740 +In the first case is where it can only be +determined based on the speaker. + +0:46:27.968 --> 0:46:30.293 +So something like am a good speaker. + +0:46:30.430 --> 0:46:32.377 +You cannot do that correctly. + +0:46:32.652 --> 0:46:36.970 +However, if you would have the audio signal +you should have a lot better guests. + +0:46:37.257 --> 0:46:47.812 +So it wasn't evaluated, especially machine +translation and speech translation system, + +0:46:47.812 --> 0:46:53.335 +which take this into account or, of course,. + +0:46:57.697 --> 0:47:04.265 +The second thing is where you can do it based +on the context. + +0:47:04.265 --> 0:47:08.714 +In this case we are not using artificial. + +0:47:11.011 --> 0:47:15.550 +Cope from the from the real data, so it's +not like artificial creative data, but. + +0:47:15.815 --> 0:47:20.939 +Of course, in a lot more work you have to +somehow find these in the corpus and use them + +0:47:20.939 --> 0:47:21.579 +as a test. + +0:47:21.601 --> 0:47:27.594 +Is something she got together with two of +her dearest friends, this older woman, and + +0:47:27.594 --> 0:47:34.152 +then, of course, here friends can we get from +the context, but it might be that some systems + +0:47:34.152 --> 0:47:36.126 +ignore that that should be. + +0:47:36.256 --> 0:47:43.434 +So you have two test sets in there, two types +of benchmarks, and you want to determine which + +0:47:43.434 --> 0:47:43.820 +one. + +0:47:47.787 --> 0:47:55.801 +Yes, this is how we can evaluate it, so the +next question is how can we improve our systems + +0:47:55.801 --> 0:48:03.728 +because that's normally how we do evaluation +and why we do evaluation so before we go into + +0:48:03.728 --> 0:48:04.251 +that? + +0:48:08.508 --> 0:48:22.685 +One idea is to do what is referred to as modeling, +so the idea is somehow change the model in + +0:48:22.685 --> 0:48:24.495 +a way that. + +0:48:24.965 --> 0:48:38.271 +And yes, one idea is, of course, if we are +giving him more information, the system doesn't + +0:48:38.271 --> 0:48:44.850 +need to do a guess without this information. + +0:48:44.724 --> 0:48:47.253 +In order to just ambiguate the bias,. + +0:48:47.707 --> 0:48:59.746 +The first thing is you can do that on the +sentence level, for example, especially if + +0:48:59.746 --> 0:49:03.004 +you have the speakers. + +0:49:03.063 --> 0:49:12.518 +You can annotate the sentence with whether +a speaker is made or a female, and then you + +0:49:12.518 --> 0:49:25.998 +can: Here we're seeing one thing which is very +successful in neuromachine translation and + +0:49:25.998 --> 0:49:30.759 +other kinds of neural networks. + +0:49:31.711 --> 0:49:39.546 +However, in neuromachine translation, since +we have no longer the strong correlation between + +0:49:39.546 --> 0:49:47.043 +input and output, the nice thing is you can +normally put everything into your input, and + +0:49:47.043 --> 0:49:50.834 +if you have enough data, it's well balanced. + +0:49:51.151 --> 0:50:00.608 +So how you can do it here is you can add the +token here saying female or male if the speaker + +0:50:00.608 --> 0:50:01.523 +is male. + +0:50:01.881 --> 0:50:07.195 +So, of course, this is no longer for human +correct translation. 
+ +0:50:07.195 --> 0:50:09.852 +It's like female Madam because. + +0:50:10.090 --> 0:50:22.951 +If you are doing the same thing then the translation +would not be to translate female but can use + +0:50:22.951 --> 0:50:25.576 +it to disintegrate. + +0:50:25.865 --> 0:50:43.573 +And so this type of tagging is a very commonly +used method in order to add more information. + +0:50:47.107 --> 0:50:54.047 +So this is first of all a very good thing, +a very easy one. + +0:50:54.047 --> 0:50:57.633 +You don't have to change your. + +0:50:58.018 --> 0:51:04.581 +For example, has also been done if you think +about formality in German. + +0:51:04.581 --> 0:51:11.393 +Whether you have to produce or, you can: We'll +see it on Thursday. + +0:51:11.393 --> 0:51:19.628 +It's a very common approach for domains, so +you put in the domain beforehand. + +0:51:19.628 --> 0:51:24.589 +This is from a Twitter or something like that. + +0:51:24.904 --> 0:51:36.239 +Of course, it only learns it if it has seen +it and it dees them out, but in this case you + +0:51:36.239 --> 0:51:38.884 +don't need an equal. + +0:51:39.159 --> 0:51:42.593 +But however, it's still like challenging to +get this availability. + +0:51:42.983 --> 0:51:55.300 +If you would do that on the first of all, +of course, it only works if you really have + +0:51:55.300 --> 0:52:02.605 +data from speaking because otherwise it's unclear. + +0:52:02.642 --> 0:52:09.816 +You would only have the text and you would +not easily see whether it is the mayor or the + +0:52:09.816 --> 0:52:14.895 +female speaker because this information has +been removed from. + +0:52:16.456 --> 0:52:18.745 +Does anybody of you have an idea of how it +fits? + +0:52:20.000 --> 0:52:25.480 +Manage that and still get the data of whether +it's made or not speaking. + +0:52:32.152 --> 0:52:34.270 +Can do a small trick. + +0:52:34.270 --> 0:52:37.834 +We can just look on the target side. + +0:52:37.937 --> 0:52:43.573 +Mean this is, of course, only important if +in the target side this is the case. + +0:52:44.004 --> 0:52:50.882 +So for your training data you can irritate +it based on your target site in German you + +0:52:50.882 --> 0:52:51.362 +know. + +0:52:51.362 --> 0:52:58.400 +In German you don't know but in Spanish for +example you know because different and then + +0:52:58.400 --> 0:53:00.400 +you can use grammatical. + +0:53:00.700 --> 0:53:10.964 +Of course, the test day would still need to +do that more interface decision. + +0:53:13.954 --> 0:53:18.829 +And: You can, of course, do it even more advanced. + +0:53:18.898 --> 0:53:30.659 +You can even try to add these information +to each word, so you're not doing it for the + +0:53:30.659 --> 0:53:32.687 +full sentence. + +0:53:32.572 --> 0:53:42.129 +If it's unknown, if it's female or if it's +male, you know word alignment so you can't + +0:53:42.129 --> 0:53:42.573 +do. + +0:53:42.502 --> 0:53:55.919 +Here then you can do a word alignment, which +is of course not always perfect, but roughly + +0:53:55.919 --> 0:53:59.348 +then you can annotate. + +0:54:01.401 --> 0:54:14.165 +Now you have these type of inputs where you +have one information per word, but on the one + +0:54:14.165 --> 0:54:16.718 +end you have the. + +0:54:17.517 --> 0:54:26.019 +This has been used before in other scenarios, +so you might not put in the gender, but in + +0:54:26.019 --> 0:54:29.745 +general this can be other information. 
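A minimal sketch of the sentence-level tagging trick discussed above, including the target-side annotation of the training data; the Spanish-like word lists are a toy stand-in for a proper morphological analysis, and the tag strings are arbitrary.

```python
GENDERED_TARGET_WORDS = {"female": {"doctora", "ingeniera", "cansada"},
                         "male":   {"doctor", "ingeniero", "cansado"}}   # toy lexicon

def annotate_training_pair(src, tgt):
    """Infer the gender from the *target* side, where the grammatical gender is
    visible, and prepend a pseudo-token to the source sentence."""
    tgt_tokens = set(tgt.lower().split())
    if tgt_tokens & GENDERED_TARGET_WORDS["female"]:
        tag = "<speaker:female>"
    elif tgt_tokens & GENDERED_TARGET_WORDS["male"]:
        tag = "<speaker:male>"
    else:
        tag = "<speaker:unknown>"
    return f"{tag} {src}", tgt

def tag_for_inference(src, gender):
    """At test time the tag has to come from the interface or the audio signal."""
    return f"<speaker:{gender}> {src}"
```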
+ +0:54:30.090 --> 0:54:39.981 +And people refer to that or have used that +as a factored translation model, so what you + +0:54:39.981 --> 0:54:42.454 +may do is you factor. + +0:54:42.742 --> 0:54:45.612 +You have the word itself. + +0:54:45.612 --> 0:54:48.591 +You might have the gender. + +0:54:48.591 --> 0:54:55.986 +You could have more information like don't +know the paddle speech. + +0:54:56.316 --> 0:54:58.564 +And then you have an embedding for each of +them. + +0:54:59.199 --> 0:55:03.599 +And you congratulate them, and then you have +years of congratulated a bedding. + +0:55:03.563 --> 0:55:09.947 +Which says okay, this is a female plumber +or a male plumber or so on. + +0:55:09.947 --> 0:55:18.064 +This has additional information and then you +can train this factory model where you have + +0:55:18.064 --> 0:55:22.533 +the ability to give the model extra information. + +0:55:23.263 --> 0:55:35.702 +And of course now if you are training this +way directly you always need to have this information. + +0:55:36.576 --> 0:55:45.396 +So that might not be the best way if you want +to use a translation system and sometimes don't + +0:55:45.396 --> 0:55:45.959 +have. + +0:55:46.866 --> 0:55:57.987 +So any idea of how you can train it or what +machine learning technique you can use to deal + +0:55:57.987 --> 0:55:58.720 +with. + +0:56:03.263 --> 0:56:07.475 +Mainly despite it already, many of your things. + +0:56:14.154 --> 0:56:21.521 +Drop out so you sometimes put information +in there and then you can use dropouts to inputs. + +0:56:21.861 --> 0:56:27.599 +Is sometimes put in this information in there, +sometimes not, and the system is then able + +0:56:27.599 --> 0:56:28.874 +to deal with those. + +0:56:28.874 --> 0:56:34.803 +If it doesn't have the information, it's doing +some of the best it can do, but if it has the + +0:56:34.803 --> 0:56:39.202 +information, it can use the information and +maybe do a more rounded. + +0:56:46.766 --> 0:56:52.831 +So then there is, of course, more ways to +try to do a moderately biased one. + +0:56:52.993 --> 0:57:01.690 +We will only want to mention here because +you'll have a full lecture on that next week + +0:57:01.690 --> 0:57:08.188 +and that is referred to where context based +machine translation. + +0:57:08.728 --> 0:57:10.397 +Good, and in this other ones, but. + +0:57:10.750 --> 0:57:16.830 +If you translate several sentences well, of +course, there are more situations where you + +0:57:16.830 --> 0:57:17.866 +can dissemble. + +0:57:18.118 --> 0:57:23.996 +Because it might be that the information is +not in the current sentence, but it's in the + +0:57:23.996 --> 0:57:25.911 +previous sentence or before. + +0:57:26.967 --> 0:57:33.124 +If you have the mean with the speaker maybe +not, but if it's referring to, you can core + +0:57:33.124 --> 0:57:33.963 +references. + +0:57:34.394 --> 0:57:40.185 +They are often referring to things in the +previous sentence so you can use them in order + +0:57:40.185 --> 0:57:44.068 +to: And that can be done basically and very +easy. + +0:57:44.068 --> 0:57:47.437 +You'll see more advanced options, but the +main. + +0:57:48.108 --> 0:57:58.516 +Mean, no machine translation is a sequence +to sequence model, which can use any input + +0:57:58.516 --> 0:58:02.993 +sequence to output sequence mapping. + +0:58:02.993 --> 0:58:04.325 +So now at. + +0:58:04.484 --> 0:58:11.281 +So then you can do, for example, five to five +translations, or also five to one, or so there's. 
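The factored-input idea, with dropout on the extra factor so the model also copes with missing annotations at test time, might be implemented roughly as follows; all dimensions and the dropout rate are illustrative, not values from the lecture.

```python
import torch
import torch.nn as nn

class FactoredEmbedding(nn.Module):
    """Concatenate a word embedding with an embedding of a per-word factor
    (here: gender = unknown/female/male). Randomly resetting the factor to
    'unknown' during training makes the model robust to missing annotations."""
    def __init__(self, vocab_size, n_factors=3, d_word=480, d_factor=32, p_drop=0.3):
        super().__init__()
        self.word = nn.Embedding(vocab_size, d_word)
        self.factor = nn.Embedding(n_factors, d_factor)   # 0=unknown, 1=female, 2=male
        self.p_drop = p_drop

    def forward(self, word_ids, factor_ids):
        if self.training:
            drop = torch.rand_like(factor_ids, dtype=torch.float) < self.p_drop
            factor_ids = factor_ids.masked_fill(drop, 0)   # pretend the factor is unknown
        return torch.cat([self.word(word_ids), self.factor(factor_ids)], dim=-1)
```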
+ +0:58:11.811 --> 0:58:19.211 +This is not a method like only dedicated to +buying, of course, but the hope is. + +0:58:19.139 --> 0:58:25.534 +If you're using this because I mean bias often, +we have seen that it rises in situations where + +0:58:25.534 --> 0:58:27.756 +we're not having enough context. + +0:58:27.756 --> 0:58:32.940 +So the idea is if we generally increase our +context, it will also help this. + +0:58:32.932 --> 0:58:42.378 +Of course, it will help other situations where +you need context to disintegrate. + +0:58:43.603 --> 0:58:45.768 +Get There If You're Saying I'm Going to the +Bank. + +0:58:46.286 --> 0:58:54.761 +It's not directly from this sentence clear +whether it's the finance institute or the bank + +0:58:54.761 --> 0:58:59.093 +for sitting, but maybe if you say afterward,. + +0:59:02.322 --> 0:59:11.258 +And then there is in generally a very large +amount of work on debiasing the word embelling. + +0:59:11.258 --> 0:59:20.097 +So the one I hear like, I mean, I think that +partly comes from the fact that like a first. + +0:59:21.041 --> 0:59:26.925 +Or that first research was done often on inspecting +the word embeddings and seeing whether they + +0:59:26.925 --> 0:59:32.503 +are biased or not, and people found out how +there is some bias in there, and then the idea + +0:59:32.503 --> 0:59:38.326 +is oh, if you remove them from the word embedded +in already, then maybe your system later will + +0:59:38.326 --> 0:59:39.981 +not have that strong of a. + +0:59:40.520 --> 0:59:44.825 +So how can that work? + +0:59:44.825 --> 0:59:56.369 +Or like maybe first, how do words encounter +bias in there? + +0:59:56.369 --> 0:59:57.152 +So. + +0:59:57.137 --> 1:00:05.555 +So you can look at the word embedding, and +then you can compare the distance of the word + +1:00:05.555 --> 1:00:11.053 +compared: And there's like interesting findings. + +1:00:11.053 --> 1:00:18.284 +For example, you have the difference in occupation +and how similar. + +1:00:18.678 --> 1:00:33.068 +And of course it's not a perfect correlation, +but you see some type of correlation: jobs + +1:00:33.068 --> 1:00:37.919 +which have a high occupation. + +1:00:37.797 --> 1:00:41.387 +They also are more similar to the word what +we're going to be talking about. + +1:00:43.023 --> 1:00:50.682 +Maybe a secretary is also a bit difficult, +but because yeah maybe it's more often. + +1:00:50.610 --> 1:00:52.438 +Done in general by by women. + +1:00:52.438 --> 1:00:58.237 +However, there is a secretary like the Secretary +of State or so, the German minister, which + +1:00:58.237 --> 1:01:03.406 +I of course know that many so in the statistics +they are not counting that often. + +1:01:03.543 --> 1:01:11.576 +But in data they of course cook quite often, +so there's different ways of different meanings. + +1:01:14.154 --> 1:01:23.307 +So how can you not try to remove this type +of bias? + +1:01:23.307 --> 1:01:32.988 +One way is the idea of hearts, devices and +embeddings. + +1:01:33.113 --> 1:01:39.354 +So if you remember on word embeddings think +we have this image that you can do the difference + +1:01:39.354 --> 1:01:44.931 +between man and woman and add this difference +to king and then look at your screen. + +1:01:45.865 --> 1:01:57.886 +So here's the idea we want to remove this +gender information from some things which should + +1:01:57.886 --> 1:02:00.132 +not have gender. + +1:02:00.120 --> 1:02:01.386 +The word engineer. 
+ +1:02:01.386 --> 1:02:06.853 +There is no information about the gender in +that, so you should remove this type. + +1:02:07.347 --> 1:02:16.772 +Of course, you first need to find out where +these inflammations are and you can. + +1:02:17.037 --> 1:02:23.603 +However, normally if you do the difference +like the subspace by only one example, it's + +1:02:23.603 --> 1:02:24.659 +not the best. + +1:02:24.924 --> 1:02:31.446 +So you can do the same thing for things like +brother and sister, man and dad, and then you + +1:02:31.446 --> 1:02:38.398 +can somehow take the average of these differences +saying this is a vector which maps a male from + +1:02:38.398 --> 1:02:39.831 +to the female form. + +1:02:40.660 --> 1:02:50.455 +And then you can try to neutralize this gender +information on this dimension. + +1:02:50.490 --> 1:02:57.951 +You can find it's subspace or dimensional. + +1:02:57.951 --> 1:03:08.882 +It would be a line, but now this is dimensional, +and then you. + +1:03:08.728 --> 1:03:13.104 +Representation: Where you remove this type +of embellishment. + +1:03:15.595 --> 1:03:18.178 +This is, of course, quite strong of the questions. + +1:03:18.178 --> 1:03:19.090 +How good does it? + +1:03:19.090 --> 1:03:20.711 +Thanks tell them for one other. + +1:03:20.880 --> 1:03:28.256 +But it's an idea we are trying to after learning +before we are using the Word and Banks for + +1:03:28.256 --> 1:03:29.940 +machine translation. + +1:03:29.940 --> 1:03:37.315 +We are trying to remove the gender information +from the jobs and then have a representation + +1:03:37.315 --> 1:03:38.678 +which hopefully. + +1:03:40.240 --> 1:03:45.047 +Similar idea is the one of agenda neutral +glove. + +1:03:45.047 --> 1:03:50.248 +Glove is another technique to learn word embeddings. + +1:03:50.750 --> 1:03:52.870 +Think we discussed one shortly. + +1:03:52.870 --> 1:03:56.182 +It was too back, which was some of the first +one. + +1:03:56.456 --> 1:04:04.383 +But there are other of course methods how +you can train word embeddings and glove as + +1:04:04.383 --> 1:04:04.849 +one. + +1:04:04.849 --> 1:04:07.460 +The idea is we're training. + +1:04:07.747 --> 1:04:19.007 +At least this is somehow a bit separated, +so where you have part of the vector is gender + +1:04:19.007 --> 1:04:20.146 +neutral. + +1:04:20.300 --> 1:04:29.247 +What you need therefore is three sets of words, +so you have male words and you have words. + +1:04:29.769 --> 1:04:39.071 +And then you're trying to learn some type +of vector where some dimensions are not. + +1:04:39.179 --> 1:04:51.997 +So the idea is can learn a representation +where at least know that this part is gender + +1:04:51.997 --> 1:04:56.123 +neutral and the other part. + +1:05:00.760 --> 1:05:03.793 +How can we do that? + +1:05:03.793 --> 1:05:12.435 +How can we change the system to learn anything +specific? + +1:05:12.435 --> 1:05:20.472 +Nearly in all cases this works by the loss +function. + +1:05:20.520 --> 1:05:26.206 +And that is more a general approach in machine +translation. + +1:05:26.206 --> 1:05:30.565 +The general loss function is we are learning. + +1:05:31.111 --> 1:05:33.842 +Here is the same idea. + +1:05:33.842 --> 1:05:44.412 +You have the general loss function in order +to learn good embeddings and then you try to + +1:05:44.412 --> 1:05:48.687 +introduce additional loss function. 
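A sketch of the hard-debiasing step described above: estimate a gender direction from definitional pairs and remove its projection from words that should carry no gender information. The original method identifies the subspace with a PCA over several pairs; the mean difference used here is a simpler approximation, and the word lists are illustrative.

```python
import numpy as np

DEFINITIONAL_PAIRS = [("man", "woman"), ("he", "she"),
                      ("brother", "sister"), ("father", "mother")]

def gender_direction(emb):
    """Average the difference vectors of definitional pairs into one unit vector."""
    diffs = [emb[m] - emb[f] for m, f in DEFINITIONAL_PAIRS if m in emb and f in emb]
    d = np.mean(diffs, axis=0)
    return d / np.linalg.norm(d)

def neutralize(emb, neutral_words):
    """Remove the projection onto the gender direction from words that should be
    gender-neutral (e.g. occupation nouns like 'engineer')."""
    g = gender_direction(emb)
    for w in neutral_words:
        if w in emb:
            v = emb[w]
            emb[w] = v - np.dot(v, g) * g
    return emb
```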
+ +1:05:48.969 --> 1:05:58.213 +Yes, I think yes, yes, that's the solution, +and how you make sure that if I have training + +1:05:58.213 --> 1:06:07.149 +for all nurses of email, how do you make sure +that the algorithm puts it into neutral? + +1:06:07.747 --> 1:06:12.448 +And you need, so this is like for only the +first learning of word embeddings. + +1:06:12.448 --> 1:06:18.053 +Then the idea is if you have word embeddings +where the gender is separate and then you train + +1:06:18.053 --> 1:06:23.718 +on top of that machine translation where you +don't change the embeddings, it should hopefully + +1:06:23.718 --> 1:06:25.225 +be less and less biased. + +1:06:25.865 --> 1:06:33.465 +And in order to train that yes you need additional +information so these information need to be + +1:06:33.465 --> 1:06:40.904 +hence defined and they can't be general so +you need to have a list of these are male persons + +1:06:40.904 --> 1:06:44.744 +or males these are nouns for females and these. + +1:06:49.429 --> 1:06:52.575 +So the first step, of course, we still want +to have good word inventings. + +1:06:54.314 --> 1:07:04.100 +So you have the normal objective function +of the word embedding. + +1:07:04.100 --> 1:07:09.519 +It's something like the similarity. + +1:07:09.849 --> 1:07:19.751 +How it's exactly derived is not that important +because we're not interested in love itself, + +1:07:19.751 --> 1:07:23.195 +but you have any loss function. + +1:07:23.195 --> 1:07:26.854 +Of course, you have to keep that. + +1:07:27.167 --> 1:07:37.481 +And then there's three more lost functions +that you can add: So the one is you take the + +1:07:37.481 --> 1:07:51.341 +average value of all the male words and the +average word embedding of all the female words. + +1:07:51.731 --> 1:08:00.066 +So the good thing about this is we don't always +need to have for one word the male and the + +1:08:00.066 --> 1:08:05.837 +female worship, so it's only like we have a +set of male words. + +1:08:06.946 --> 1:08:21.719 +So this is just saying yeah, we want these +two should be somehow similar to each other. + +1:08:21.719 --> 1:08:25.413 +It shouldn't be that. + +1:08:30.330 --> 1:08:40.081 +Should be the other one, or think this should +be it. + +1:08:40.081 --> 1:08:45.969 +This is agenda, the average of. + +1:08:45.945 --> 1:09:01.206 +The average should be the same, but if you're +looking at the female should be at the other. + +1:09:01.681 --> 1:09:06.959 +This is like on these dimensions, the male +should be on the one and the female on the + +1:09:06.959 --> 1:09:07.388 +other. + +1:09:07.627 --> 1:09:16.123 +The same yeah, this gender information should +be there, so you're pushing all the males to + +1:09:16.123 --> 1:09:17.150 +the other. + +1:09:21.541 --> 1:09:23.680 +Then their words should be. + +1:09:23.680 --> 1:09:30.403 +If you have that you see the neutral words, +they should be in the middle of between the + +1:09:30.403 --> 1:09:32.008 +male and the female. + +1:09:32.012 --> 1:09:48.261 +So you say is the middle point between all +male and female words and just somehow putting + +1:09:48.261 --> 1:09:51.691 +the neutral words. + +1:09:52.912 --> 1:09:56.563 +And then you're learning them, and then you +can apply them in different ways. + +1:09:57.057 --> 1:10:03.458 +So you have this a bit in the pre-training +thing. + +1:10:03.458 --> 1:10:10.372 +You can use the pre-trained inbeddings on +the output. 
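
The three auxiliary losses sketched above for a gender-neutral GloVe variant could look roughly as follows. This is a guess at one plausible formulation, not the exact objective of any published model: the last `k_gender` dimensions are assumed to carry gender, the male/female group averages are pulled together on the remaining dimensions and pushed apart (towards ±1) on the gender dimensions, and neutral words are pulled to the midpoint. `gender_aux_losses` and the ±1 targets are assumptions for illustration.

```python
import torch

def gender_aux_losses(E, male_idx, female_idx, neutral_idx, k_gender=1):
    """Auxiliary losses added on top of the usual GloVe objective.
    E: (vocab, d) embedding matrix; the index lists select the three word sets."""
    male, female, neutral = E[male_idx], E[female_idx], E[neutral_idx]
    m_mean, f_mean = male.mean(0), female.mean(0)

    # 1) on the gender-neutral dimensions the two group averages should agree
    l_same = (m_mean[:-k_gender] - f_mean[:-k_gender]).pow(2).sum()

    # 2) on the gender dimensions the groups are pushed to opposite sides
    l_apart = (m_mean[-k_gender:] - 1.0).pow(2).sum() + (f_mean[-k_gender:] + 1.0).pow(2).sum()

    # 3) neutral words should sit at the midpoint of the two group averages
    midpoint = 0.5 * (m_mean[-k_gender:] + f_mean[-k_gender:])
    l_neutral = (neutral[:, -k_gender:] - midpoint).pow(2).sum()

    return l_same + l_apart + l_neutral

# usage (hypothetical): total_loss = glove_loss + lambda_aux * gender_aux_losses(E, m, f, n)
```
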
+ +1:10:10.372 --> 1:10:23.117 +All you can use are: And then you can analyze +what happens instead of training them directly. + +1:10:23.117 --> 1:10:30.504 +If have this additional loss, which tries +to optimize. + +1:10:32.432 --> 1:10:42.453 +And then it was evaluated exactly on the sentences +we had at the beginning where it is about know + +1:10:42.453 --> 1:10:44.600 +her for a long time. + +1:10:44.600 --> 1:10:48.690 +My friend works as an accounting cling. + +1:10:48.788 --> 1:10:58.049 +So all these examples are not very difficult +to translation, but the question is how often + +1:10:58.049 --> 1:10:58.660 +does? + +1:11:01.621 --> 1:11:06.028 +That it's not that complicated as you see +here, so even the baseline. + +1:11:06.366 --> 1:11:10.772 +If you're doing nothing is working quite well, +it's most challenging. + +1:11:10.772 --> 1:11:16.436 +It seems overall in the situation where it's +a name, so for he and him he has learned the + +1:11:16.436 --> 1:11:22.290 +correlation because that's maybe not surprisingly +because this correlation occurs more often + +1:11:22.290 --> 1:11:23.926 +than with any name there. + +1:11:24.044 --> 1:11:31.749 +If you have a name that you can extract, that +is talking about Mary, that's female is a lot + +1:11:31.749 --> 1:11:34.177 +harder to extract than this. + +1:11:34.594 --> 1:11:40.495 +So you'll see already in the bass line this +is yeah, not working, not working. + +1:11:43.403 --> 1:11:47.159 +And for all the other cases it's working very +well. + +1:11:47.787 --> 1:11:53.921 +Where all the best one is achieved here with +an arc debiasing both on the encoder, on the. + +1:11:57.077 --> 1:12:09.044 +It makes sense that a hard debasing on the +decoder doesn't really work because there you + +1:12:09.044 --> 1:12:12.406 +have gender information. + +1:12:14.034 --> 1:12:17.406 +For glove it seems to already work here. + +1:12:17.406 --> 1:12:20.202 +That's maybe surprising and yeah. + +1:12:20.260 --> 1:12:28.263 +So there is no clear else we don't have numbers +for that doesn't really work well on the other. + +1:12:28.263 --> 1:12:30.513 +So how much do I use then? + +1:12:33.693 --> 1:12:44.720 +Then as a last way of improving that is a +bit what we had mentioned before. + +1:12:44.720 --> 1:12:48.493 +That is what is referred. + +1:12:48.488 --> 1:12:59.133 +One problem is the bias in the data so you +can adapt your data so you can just try to + +1:12:59.133 --> 1:13:01.485 +find equal amount. + +1:13:01.561 --> 1:13:11.368 +In your data like you adapt your data and +then you find your data on the smaller but + +1:13:11.368 --> 1:13:12.868 +you can try. + +1:13:18.298 --> 1:13:19.345 +This is line okay. + +1:13:19.345 --> 1:13:21.605 +We have access to the data to the model. + +1:13:21.605 --> 1:13:23.038 +We can improve our model. + +1:13:24.564 --> 1:13:31.328 +One situation we haven't talked a lot about +but another situation might also be and that's + +1:13:31.328 --> 1:13:37.942 +even getting more important is oh you want +to work with a model which you don't have but + +1:13:37.942 --> 1:13:42.476 +you want to improve the model without having +access so when. + +1:13:42.862 --> 1:13:49.232 +Nowadays there are a lot of companies who +are not developing their own system but they're + +1:13:49.232 --> 1:13:52.983 +using or something like that or machine translation. + +1:13:53.313 --> 1:13:59.853 +So there is interest that you might not be +able to find children with models completely. 
+ +1:14:00.080 --> 1:14:09.049 +So the question is, can you do some type of +black box adaptation of a system that takes + +1:14:09.049 --> 1:14:19.920 +the black box system but tries to improve it +in some ways through: There's some ways of + +1:14:19.920 --> 1:14:21.340 +doing that. + +1:14:21.340 --> 1:14:30.328 +One is called black box injection and that's +what is referred to as prompt. + +1:14:30.730 --> 1:14:39.793 +So the problem is if you have sentences you +don't have information about the speakers. + +1:14:39.793 --> 1:14:43.127 +So how can you put information? + +1:14:43.984 --> 1:14:53.299 +And what we know from a large language model, +we just prompt them, and you can do that. + +1:14:53.233 --> 1:14:59.545 +Translating directly, I love you, you said +she said to him, I love you, and then of course + +1:14:59.545 --> 1:15:01.210 +you have to strip away. + +1:15:01.181 --> 1:15:06.629 +I mean, you cannot prevent the model from +translating that, but you should be able to + +1:15:06.629 --> 1:15:08.974 +see what is the translation of this. + +1:15:08.974 --> 1:15:14.866 +One can strip that away, and now the system +had hopefully the information that it's somebody + +1:15:14.866 --> 1:15:15.563 +like that. + +1:15:15.563 --> 1:15:17.020 +The speaker is female. + +1:15:18.198 --> 1:15:23.222 +Because you're no longer translating love +you, but you're translating the sentence she + +1:15:23.222 --> 1:15:24.261 +said to him love. + +1:15:24.744 --> 1:15:37.146 +And so you insert this information as contextual +information around it and don't have to change + +1:15:37.146 --> 1:15:38.567 +the model. + +1:15:41.861 --> 1:15:56.946 +Last idea is to do what is referred to as +letters rescoring, so the idea there is you + +1:15:56.946 --> 1:16:01.156 +generate a translation. + +1:16:01.481 --> 1:16:18.547 +And now you have an additional component which +tries to add possibilities where gender information + +1:16:18.547 --> 1:16:21.133 +might be lost. + +1:16:21.261 --> 1:16:29.687 +It's just a graph in this way, a simplified +graph where there's always one word between + +1:16:29.687 --> 1:16:31.507 +two notes and you. + +1:16:31.851 --> 1:16:35.212 +So you have something like Zi is an ads or +a Zi is an ads. + +1:16:35.535 --> 1:16:41.847 +And then you can generate all possible variants. + +1:16:41.847 --> 1:16:49.317 +Then, of course, we're not done because the +final output. + +1:16:50.530 --> 1:16:56.999 +Then you can re-score the system by a gender +de-biased model. + +1:16:56.999 --> 1:17:03.468 +So the nice thing is why why don't we directly +use our model? + +1:17:03.468 --> 1:17:10.354 +The idea is our model, which is only focusing +on gender devising. + +1:17:10.530 --> 1:17:16.470 +It can be, for example, if it's just trained +on some synthetical data, it will not be that + +1:17:16.470 --> 1:17:16.862 +well. + +1:17:16.957 --> 1:17:21.456 +But what we can do then is now you can rescore +the possible translations in here. + +1:17:21.721 --> 1:17:31.090 +And here the cases of course in general structure +is already done how to translate the words. + +1:17:31.051 --> 1:17:42.226 +Then you're only using the second component +in order to react for some variants and then + +1:17:42.226 --> 1:17:45.490 +get the best translation. + +1:17:45.925 --> 1:17:58.479 +And: As the last one there is the post processing +so you can't have it. + +1:17:58.538 --> 1:18:02.830 +Mean this was one way of post-processing was +to generate the lattice and retranslate it. 
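
The black-box injection idea described above (wrap the sentence in context that reveals the speaker's gender, translate with the unmodified system, then strip the wrapper again) amounts to simple prompt construction around an opaque MT call. A minimal sketch, where `translate` stands for any black-box MT function and `strip_quoted` is a naive, hypothetical heuristic that only works if the target system keeps the quotation marks:

```python
import re

def translate_with_speaker_gender(sentence, speaker_gender, translate, strip):
    """Prepend context encoding the speaker's gender, translate with the
    unmodified black-box system, then remove the translated wrapper."""
    wrapper = "She said to him: " if speaker_gender == "female" else "He said to her: "
    output = translate(wrapper + '"' + sentence + '"')
    return strip(output)

def strip_quoted(target):
    """Naive wrapper removal: keep only the quoted part of the translation."""
    m = re.search(r'[„"](.+?)["“”]', target)
    return m.group(1) if m else target

# translate_with_speaker_gender("I love you", "female", my_api_call, strip_quoted)
```
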
+ +1:18:03.123 --> 1:18:08.407 +But you can also have a processing, for example +only on the target side where you have additional + +1:18:08.407 --> 1:18:12.236 +components with checks about the gender which +maybe only knows gender. + +1:18:12.236 --> 1:18:17.089 +So it's not a machine translation component +but more like a grammatical checker which can + +1:18:17.089 --> 1:18:19.192 +be used as most processing to do that. + +1:18:19.579 --> 1:18:22.926 +Think about it a bit like when you use PPT. + +1:18:22.926 --> 1:18:25.892 +There's also a lot of post processing. + +1:18:25.892 --> 1:18:32.661 +If you use a directive, it would tell you +how to build a bond, but they have some checks + +1:18:32.661 --> 1:18:35.931 +either before and after to prevent things. + +1:18:36.356 --> 1:18:40.580 +So often there might be an application system. + +1:18:40.580 --> 1:18:44.714 +There might be extra pre and post processing. + +1:18:48.608 --> 1:18:52.589 +And yeah, with this we're at the end of. + +1:18:52.512 --> 1:19:09.359 +To this lecture where we focused on the bias, +but think a lot of these techniques we have + +1:19:09.359 --> 1:19:11.418 +seen here. + +1:19:11.331 --> 1:19:17.664 +So we saw, on the one hand, we saw that evaluating +just pure blues first might not always be. + +1:19:17.677 --> 1:19:18.947 +Mean it's very important. + +1:19:20.000 --> 1:19:30.866 +Always do that, but if you want to check and +some specific things are important, then you + +1:19:30.866 --> 1:19:35.696 +might have to do dedicated evaluations. + +1:19:36.036 --> 1:19:44.296 +It is now translating for the President and +it is like in German that guess it is not very + +1:19:44.296 --> 1:19:45.476 +appropriate. + +1:19:45.785 --> 1:19:53.591 +So it might be important if characteristics +of your system are essential to have dedicated + +1:19:53.591 --> 1:19:54.620 +evaluation. + +1:19:55.135 --> 1:20:02.478 +And then if you have that, of course, it might +be also important to develop delicate techniques. + +1:20:02.862 --> 1:20:10.988 +We have seen today some how to mitigate biases, +but I hope you see that a lot of these techniques + +1:20:10.988 --> 1:20:13.476 +you can also use to mitigate. + +1:20:13.573 --> 1:20:31.702 +At least related things you can adjust the +training data you can do for other things. + +1:20:33.253 --> 1:20:36.022 +Before we have been finishing, we have any +more questions. + +1:20:41.761 --> 1:20:47.218 +Then thanks a lot, and then we will see each +other again on the first step. + diff --git a/demo_data/lectures/Lecture-13-04.07.2023/video.mp4 b/demo_data/lectures/Lecture-13-04.07.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a9bf3ebb6684685450dbb6a822043811bf487fd3 --- /dev/null +++ b/demo_data/lectures/Lecture-13-04.07.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f89fc932d5818061ea4e7490a1ea9a58c6b937b7696d69d117fca50623f0a2 +size 108699463 diff --git a/demo_data/lectures/Lecture-14-27.06.2023/English.vtt b/demo_data/lectures/Lecture-14-27.06.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..44f242579e60fd28f3157e07422abbeb36b2d958 --- /dev/null +++ b/demo_data/lectures/Lecture-14-27.06.2023/English.vtt @@ -0,0 +1,2747 @@ +WEBVTT + +0:00:01.921 --> 0:00:16.424 +Hey welcome to today's lecture, what we today +want to look at is how we can make new. 
+ +0:00:16.796 --> 0:00:26.458 +So until now we have this global system, the +encoder and the decoder mostly, and we haven't + +0:00:26.458 --> 0:00:29.714 +really thought about how long. + +0:00:30.170 --> 0:00:42.684 +And what we, for example, know is yeah, you +can make the systems bigger in different ways. + +0:00:42.684 --> 0:00:47.084 +We can make them deeper so the. + +0:00:47.407 --> 0:00:56.331 +And if we have at least enough data that typically +helps you make things performance better,. + +0:00:56.576 --> 0:01:00.620 +But of course leads to problems that we need +more resources. + +0:01:00.620 --> 0:01:06.587 +That is a problem at universities where we +have typically limited computation capacities. + +0:01:06.587 --> 0:01:11.757 +So at some point you have such big models +that you cannot train them anymore. + +0:01:13.033 --> 0:01:23.792 +And also for companies is of course important +if it costs you like to generate translation + +0:01:23.792 --> 0:01:26.984 +just by power consumption. + +0:01:27.667 --> 0:01:35.386 +So yeah, there's different reasons why you +want to do efficient machine translation. + +0:01:36.436 --> 0:01:48.338 +One reason is there are different ways of +how you can improve your machine translation + +0:01:48.338 --> 0:01:50.527 +system once we. + +0:01:50.670 --> 0:01:55.694 +There can be different types of data we looked +into data crawling, monolingual data. + +0:01:55.875 --> 0:01:59.024 +All this data and the aim is always. + +0:01:59.099 --> 0:02:05.735 +Of course, we are not just purely interested +in having more data, but the idea why we want + +0:02:05.735 --> 0:02:12.299 +to have more data is that more data also means +that we have better quality because mostly + +0:02:12.299 --> 0:02:17.550 +we are interested in increasing the quality +of the machine translation. + +0:02:18.838 --> 0:02:24.892 +But there's also other ways of how you can +improve the quality of a machine translation. + +0:02:25.325 --> 0:02:36.450 +And what is, of course, that is where most +research is focusing on. + +0:02:36.450 --> 0:02:44.467 +It means all we want to build better algorithms. + +0:02:44.684 --> 0:02:48.199 +Course: The other things are normally as good. + +0:02:48.199 --> 0:02:54.631 +Sometimes it's easier to improve, so often +it's easier to just collect more data than + +0:02:54.631 --> 0:02:57.473 +to invent some great view algorithms. + +0:02:57.473 --> 0:03:00.315 +But yeah, both of them are important. + +0:03:00.920 --> 0:03:09.812 +But there is this third thing, especially +with neural machine translation, and that means + +0:03:09.812 --> 0:03:11.590 +we make a bigger. + +0:03:11.751 --> 0:03:16.510 +Can be, as said, that we have more layers, +that we have wider layers. + +0:03:16.510 --> 0:03:19.977 +The other thing we talked a bit about is ensemble. + +0:03:19.977 --> 0:03:24.532 +That means we are not building one new machine +translation system. + +0:03:24.965 --> 0:03:27.505 +And we can easily build four. + +0:03:27.505 --> 0:03:32.331 +What is the typical strategy to build different +systems? + +0:03:32.331 --> 0:03:33.177 +Remember. + +0:03:35.795 --> 0:03:40.119 +It should be of course a bit different if +you have the same. + +0:03:40.119 --> 0:03:44.585 +If they all predict the same then combining +them doesn't help. + +0:03:44.585 --> 0:03:48.979 +So what is the easiest way if you have to +build four systems? + +0:03:51.711 --> 0:04:01.747 +And the Charleston's will take, but this is +the best output of a single system. 
+ +0:04:02.362 --> 0:04:10.165 +Mean now, it's really three different systems +so that you later can combine them and maybe + +0:04:10.165 --> 0:04:11.280 +the average. + +0:04:11.280 --> 0:04:16.682 +Ensembles are typically that the average is +all probabilities. + +0:04:19.439 --> 0:04:24.227 +The idea is to think about neural networks. + +0:04:24.227 --> 0:04:29.342 +There's one parameter which can easily adjust. + +0:04:29.342 --> 0:04:36.525 +That's exactly the easiest way to randomize +with three different. + +0:04:37.017 --> 0:04:43.119 +They have the same architecture, so all the +hydroparameters are the same, but they are + +0:04:43.119 --> 0:04:43.891 +different. + +0:04:43.891 --> 0:04:46.556 +They will have different predictions. + +0:04:48.228 --> 0:04:52.572 +So, of course, bigger amounts. + +0:04:52.572 --> 0:05:05.325 +Some of these are a bit the easiest way of +improving your quality because you don't really + +0:05:05.325 --> 0:05:08.268 +have to do anything. + +0:05:08.588 --> 0:05:12.588 +There is limits on that bigger models only +get better. + +0:05:12.588 --> 0:05:19.132 +If you have enough training data you can't +do like a handheld layer and you will not work + +0:05:19.132 --> 0:05:24.877 +on very small data but with a recent amount +of data that is the easiest thing. + +0:05:25.305 --> 0:05:33.726 +However, they are challenging with making +better models, bigger motors, and that is the + +0:05:33.726 --> 0:05:34.970 +computation. + +0:05:35.175 --> 0:05:44.482 +So, of course, if you have a bigger model +that can mean that you have longer running + +0:05:44.482 --> 0:05:49.518 +times, if you have models, you have to times. + +0:05:51.171 --> 0:05:56.685 +Normally you cannot paralyze the different +layers because the input to one layer is always + +0:05:56.685 --> 0:06:02.442 +the output of the previous layer, so you propagate +that so it will also increase your runtime. + +0:06:02.822 --> 0:06:10.720 +Then you have to store all your models in +memory. + +0:06:10.720 --> 0:06:20.927 +If you have double weights you will have: +Is more difficult to then do back propagation. + +0:06:20.927 --> 0:06:27.680 +You have to store in between the activations, +so there's not only do you increase the model + +0:06:27.680 --> 0:06:31.865 +in your memory, but also all these other variables +that. + +0:06:34.414 --> 0:06:36.734 +And so in general it is more expensive. + +0:06:37.137 --> 0:06:54.208 +And therefore there's good reasons in looking +into can we make these models sound more efficient. + +0:06:54.134 --> 0:07:00.982 +So it's been through the viewer, you can have +it okay, have one and one day of training time, + +0:07:00.982 --> 0:07:01.274 +or. + +0:07:01.221 --> 0:07:07.535 +Forty thousand euros and then what is the +best machine translation system I can get within + +0:07:07.535 --> 0:07:08.437 +this budget. + +0:07:08.969 --> 0:07:19.085 +And then, of course, you can make the models +bigger, but then you have to train them shorter, + +0:07:19.085 --> 0:07:24.251 +and then we can make more efficient algorithms. + +0:07:25.925 --> 0:07:31.699 +If you think about efficiency, there's a bit +different scenarios. + +0:07:32.312 --> 0:07:43.635 +So if you're more of coming from the research +community, what you'll be doing is building + +0:07:43.635 --> 0:07:47.913 +a lot of models in your research. + +0:07:48.088 --> 0:07:58.645 +So you're having your test set of maybe sentences, +calculating the blue score, then another model. 
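
The seed-based ensembling mentioned earlier (train the same architecture several times with different random seeds, then average the output probabilities at decoding time) can be sketched as below. The `model(src, prefix)` interface returning next-token logits is an assumption for illustration, not a real library API.

```python
import torch

def ensemble_next_word_distribution(models, src, prefix):
    """Average the per-step output distributions of several models that share
    one architecture and differ only in their random initialization."""
    probs = [torch.softmax(m(src, prefix), dim=-1) for m in models]
    return torch.stack(probs).mean(dim=0)

# training-time difference is just the seed, e.g.
# for seed in (1, 2, 3, 4):
#     torch.manual_seed(seed)   # different init -> a different model
#     train(model, data)        # hypothetical training loop
```
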
+ +0:07:58.818 --> 0:08:08.911 +So what that means is typically you're training +on millions of cents, so your training time + +0:08:08.911 --> 0:08:14.944 +is long, maybe a day, but maybe in other cases +a week. + +0:08:15.135 --> 0:08:22.860 +The testing is not really the cost efficient, +but the training is very costly. + +0:08:23.443 --> 0:08:37.830 +If you are more thinking of building models +for application, the scenario is quite different. + +0:08:38.038 --> 0:08:46.603 +And then you keep it running, and maybe thousands +of customers are using it in translating. + +0:08:46.603 --> 0:08:47.720 +So in that. + +0:08:48.168 --> 0:08:59.577 +And we will see that it is not always the +same type of challenges you can paralyze some + +0:08:59.577 --> 0:09:07.096 +things in training, which you cannot paralyze +in testing. + +0:09:07.347 --> 0:09:14.124 +For example, in training you have to do back +propagation, so you have to store the activations. + +0:09:14.394 --> 0:09:23.901 +Therefore, in testing we briefly discussed +that we would do it in more detail today in + +0:09:23.901 --> 0:09:24.994 +training. + +0:09:25.265 --> 0:09:36.100 +You know they're a target and you can process +everything in parallel while in testing. + +0:09:36.356 --> 0:09:46.741 +So you can only do one word at a time, and +so you can less paralyze this. + +0:09:46.741 --> 0:09:50.530 +Therefore, it's important. + +0:09:52.712 --> 0:09:55.347 +Is a specific task on this. + +0:09:55.347 --> 0:10:03.157 +For example, it's the efficiency task where +it's about making things as efficient. + +0:10:03.123 --> 0:10:09.230 +Is possible and they can look at different +resources. + +0:10:09.230 --> 0:10:14.207 +So how much deep fuel run time do you need? + +0:10:14.454 --> 0:10:19.366 +See how much memory you need or you can have +a fixed memory budget and then have to build + +0:10:19.366 --> 0:10:20.294 +the best system. + +0:10:20.500 --> 0:10:29.010 +And here is a bit like an example of that, +so there's three teams from Edinburgh from + +0:10:29.010 --> 0:10:30.989 +and they submitted. + +0:10:31.131 --> 0:10:36.278 +So then, of course, if you want to know the +most efficient system you have to do a bit + +0:10:36.278 --> 0:10:36.515 +of. + +0:10:36.776 --> 0:10:44.656 +You want to have a better quality or more +runtime and there's not the one solution. + +0:10:44.656 --> 0:10:46.720 +You can improve your. + +0:10:46.946 --> 0:10:49.662 +And that you see that there are different +systems. + +0:10:49.909 --> 0:11:06.051 +Here is how many words you can do for a second +on the clock, and you want to be as talk as + +0:11:06.051 --> 0:11:07.824 +possible. + +0:11:08.068 --> 0:11:08.889 +And you see here a bit. + +0:11:08.889 --> 0:11:09.984 +This is a little bit different. + +0:11:11.051 --> 0:11:27.717 +You want to be there on the top right corner +and you can get a score of something between + +0:11:27.717 --> 0:11:29.014 +words. + +0:11:30.250 --> 0:11:34.161 +Two hundred and fifty thousand, then you'll +ever come and score zero point three. + +0:11:34.834 --> 0:11:41.243 +There is, of course, any bit of a decision, +but the question is, like how far can you again? + +0:11:41.243 --> 0:11:47.789 +Some of all these points on this line would +be winners because they are somehow most efficient + +0:11:47.789 --> 0:11:53.922 +in a way that there's no system which achieves +the same quality with less computational. + +0:11:57.657 --> 0:12:04.131 +So there's the one question of which resources +are you interested. 
+ +0:12:04.131 --> 0:12:07.416 +Are you running it on CPU or GPU? + +0:12:07.416 --> 0:12:11.668 +There's different ways of paralyzing stuff. + +0:12:14.654 --> 0:12:20.777 +Another dimension is how you process your +data. + +0:12:20.777 --> 0:12:27.154 +There's really the best processing and streaming. + +0:12:27.647 --> 0:12:34.672 +So in batch processing you have the whole +document available so you can translate all + +0:12:34.672 --> 0:12:39.981 +sentences in perimeter and then you're interested +in throughput. + +0:12:40.000 --> 0:12:43.844 +But you can then process, for example, especially +in GPS. + +0:12:43.844 --> 0:12:49.810 +That's interesting, you're not translating +one sentence at a time, but you're translating + +0:12:49.810 --> 0:12:56.108 +one hundred sentences or so in parallel, so +you have one more dimension where you can paralyze + +0:12:56.108 --> 0:12:57.964 +and then be more efficient. + +0:12:58.558 --> 0:13:14.863 +On the other hand, for example sorts of documents, +so we learned that if you do badge processing + +0:13:14.863 --> 0:13:16.544 +you have. + +0:13:16.636 --> 0:13:24.636 +Then, of course, it makes sense to sort the +sentences in order to have the minimum thing + +0:13:24.636 --> 0:13:25.535 +attached. + +0:13:27.427 --> 0:13:32.150 +The other scenario is more the streaming scenario +where you do life translation. + +0:13:32.512 --> 0:13:40.212 +So in that case you can't wait for the whole +document to pass, but you have to do. + +0:13:40.520 --> 0:13:49.529 +And then, for example, that's especially in +situations like speech translation, and then + +0:13:49.529 --> 0:13:53.781 +you're interested in things like latency. + +0:13:53.781 --> 0:14:00.361 +So how much do you have to wait to get the +output of a sentence? + +0:14:06.566 --> 0:14:16.956 +Finally, there is the thing about the implementation: +Today we're mainly looking at different algorithms, + +0:14:16.956 --> 0:14:23.678 +different models of how you can model them +in your machine translation system, but of + +0:14:23.678 --> 0:14:29.227 +course for the same algorithms there's also +different implementations. + +0:14:29.489 --> 0:14:38.643 +So, for example, for a machine translation +this tool could be very fast. + +0:14:38.638 --> 0:14:46.615 +So they have like coded a lot of the operations +very low resource, not low resource, low level + +0:14:46.615 --> 0:14:49.973 +on the directly on the QDAC kernels in. + +0:14:50.110 --> 0:15:00.948 +So the same attention network is typically +more efficient in that type of algorithm. + +0:15:00.880 --> 0:15:02.474 +Than in in any other. + +0:15:03.323 --> 0:15:13.105 +Of course, it might be other disadvantages, +so if you're a little worker or have worked + +0:15:13.105 --> 0:15:15.106 +in the practical. + +0:15:15.255 --> 0:15:22.604 +Because it's normally easier to understand, +easier to change, and so on, but there is again + +0:15:22.604 --> 0:15:23.323 +a train. + +0:15:23.483 --> 0:15:29.440 +You have to think about, do you want to include +this into my study or comparison or not? + +0:15:29.440 --> 0:15:36.468 +Should it be like I compare different implementations +and I also find the most efficient implementation? + +0:15:36.468 --> 0:15:39.145 +Or is it only about the pure algorithm? + +0:15:42.742 --> 0:15:50.355 +Yeah, when building these systems there is +a different trade-off to do. 
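
As a small illustration of the batch-processing point above, sorting sentences by length before batching keeps padding inside each batch to a minimum; the original order is remembered so the translations can be put back in place. `translate_batch` in the usage comment is a hypothetical function.

```python
def length_sorted_batches(sentences, batch_size):
    """Yield (original_indices, batch) pairs with sentences of similar length."""
    order = sorted(range(len(sentences)), key=lambda i: len(sentences[i]))
    for start in range(0, len(order), batch_size):
        idx = order[start:start + batch_size]
        yield idx, [sentences[i] for i in idx]

# translations = [None] * len(sents)
# for idx, batch in length_sorted_batches(sents, 32):
#     for i, out in zip(idx, translate_batch(batch)):   # hypothetical batched MT call
#         translations[i] = out
```
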
+ +0:15:50.850 --> 0:15:56.555 +So there's one of the traders between memory +and throughput, so how many words can generate + +0:15:56.555 --> 0:15:57.299 +per second. + +0:15:57.557 --> 0:16:03.351 +So typically you can easily like increase +your scruple by increasing the batch size. + +0:16:03.643 --> 0:16:06.899 +So that means you are translating more sentences +in parallel. + +0:16:07.107 --> 0:16:09.241 +And gypsies are very good at that stuff. + +0:16:09.349 --> 0:16:15.161 +It should translate one sentence or one hundred +sentences, not the same time, but its. + +0:16:15.115 --> 0:16:20.784 +Rough are very similar because they are at +this efficient metrics multiplication so that + +0:16:20.784 --> 0:16:24.415 +you can do the same operation on all sentences +parallel. + +0:16:24.415 --> 0:16:30.148 +So typically that means if you increase your +benchmark you can do more things in parallel + +0:16:30.148 --> 0:16:31.995 +and you will translate more. + +0:16:31.952 --> 0:16:33.370 +Second. + +0:16:33.653 --> 0:16:43.312 +On the other hand, with this advantage, of +course you will need higher badge sizes and + +0:16:43.312 --> 0:16:44.755 +more memory. + +0:16:44.965 --> 0:16:56.452 +To begin with, the other problem is that you +have such big models that you can only translate + +0:16:56.452 --> 0:16:59.141 +with lower bed sizes. + +0:16:59.119 --> 0:17:08.466 +If you are running out of memory with translating, +one idea to go on that is to decrease your. + +0:17:13.453 --> 0:17:24.456 +Then there is the thing about quality in Screwport, +of course, and before it's like larger models, + +0:17:24.456 --> 0:17:28.124 +but in generally higher quality. + +0:17:28.124 --> 0:17:31.902 +The first one is always this way. + +0:17:32.092 --> 0:17:38.709 +Course: Not always larger model helps you +have over fitting at some point, but in generally. + +0:17:43.883 --> 0:17:52.901 +And with this a bit on this training and testing +thing we had before. + +0:17:53.113 --> 0:17:58.455 +So it wears all the difference between training +and testing, and for the encoder and decoder. + +0:17:58.798 --> 0:18:06.992 +So if we are looking at what mentioned before +at training time, we have a source sentence + +0:18:06.992 --> 0:18:17.183 +here: And how this is processed on a is not +the attention here. + +0:18:17.183 --> 0:18:21.836 +That's a tubical transformer. + +0:18:22.162 --> 0:18:31.626 +And how we can do that on a is that we can +paralyze the ear ever since. + +0:18:31.626 --> 0:18:40.422 +The first thing to know is: So that is, of +course, not in all cases. + +0:18:40.422 --> 0:18:49.184 +We'll later talk about speech translation +where we might want to translate. + +0:18:49.389 --> 0:18:56.172 +Without the general case in, it's like you +have the full sentence you want to translate. + +0:18:56.416 --> 0:19:02.053 +So the important thing is we are here everything +available on the source side. + +0:19:03.323 --> 0:19:13.524 +And then this was one of the big advantages +that you can remember back of transformer. + +0:19:13.524 --> 0:19:15.752 +There are several. + +0:19:16.156 --> 0:19:25.229 +But the other one is now that we can calculate +the full layer. + +0:19:25.645 --> 0:19:29.318 +There is no dependency between this and this +state or this and this state. + +0:19:29.749 --> 0:19:36.662 +So we always did like here to calculate the +key value and query, and based on that you + +0:19:36.662 --> 0:19:37.536 +calculate. 
+ +0:19:37.937 --> 0:19:46.616 +Which means we can do all these calculations +here in parallel and in parallel. + +0:19:48.028 --> 0:19:55.967 +And there, of course, is this very efficiency +because again for GPS it's too bigly possible + +0:19:55.967 --> 0:20:00.887 +to do these things in parallel and one after +each other. + +0:20:01.421 --> 0:20:10.311 +And then we can also for each layer one by +one, and then we calculate here the encoder. + +0:20:10.790 --> 0:20:21.921 +In training now an important thing is that +for the decoder we have the full sentence available + +0:20:21.921 --> 0:20:28.365 +because we know this is the target we should +generate. + +0:20:29.649 --> 0:20:33.526 +We have models now in a different way. + +0:20:33.526 --> 0:20:38.297 +This hidden state is only on the previous +ones. + +0:20:38.598 --> 0:20:51.887 +And the first thing here depends only on this +information, so you see if you remember we + +0:20:51.887 --> 0:20:56.665 +had this masked self-attention. + +0:20:56.896 --> 0:21:04.117 +So that means, of course, we can only calculate +the decoder once the encoder is done, but that's. + +0:21:04.444 --> 0:21:06.656 +Percent can calculate the end quarter. + +0:21:06.656 --> 0:21:08.925 +Then we can calculate here the decoder. + +0:21:09.569 --> 0:21:25.566 +But again in training we have x, y and that +is available so we can calculate everything + +0:21:25.566 --> 0:21:27.929 +in parallel. + +0:21:28.368 --> 0:21:40.941 +So the interesting thing or advantage of transformer +is in training. + +0:21:40.941 --> 0:21:46.408 +We can do it for the decoder. + +0:21:46.866 --> 0:21:54.457 +That means you will have more calculations +because you can only calculate one layer at + +0:21:54.457 --> 0:22:02.310 +a time, but for example the length which is +too bigly quite long or doesn't really matter + +0:22:02.310 --> 0:22:03.270 +that much. + +0:22:05.665 --> 0:22:10.704 +However, in testing this situation is different. + +0:22:10.704 --> 0:22:13.276 +In testing we only have. + +0:22:13.713 --> 0:22:20.622 +So this means we start with a sense: We don't +know the full sentence yet because we ought + +0:22:20.622 --> 0:22:29.063 +to regularly generate that so for the encoder +we have the same here but for the decoder. + +0:22:29.409 --> 0:22:39.598 +In this case we only have the first and the +second instinct, but only for all states in + +0:22:39.598 --> 0:22:40.756 +parallel. + +0:22:41.101 --> 0:22:51.752 +And then we can do the next step for y because +we are putting our most probable one. + +0:22:51.752 --> 0:22:58.643 +We do greedy search or beam search, but you +cannot do. + +0:23:03.663 --> 0:23:16.838 +Yes, so if we are interesting in making things +more efficient for testing, which we see, for + +0:23:16.838 --> 0:23:22.363 +example in the scenario of really our. + +0:23:22.642 --> 0:23:34.286 +It makes sense that we think about our architecture +and that we are currently working on attention + +0:23:34.286 --> 0:23:35.933 +based models. + +0:23:36.096 --> 0:23:44.150 +The decoder there is some of the most time +spent testing and testing. + +0:23:44.150 --> 0:23:47.142 +It's similar, but during. + +0:23:47.167 --> 0:23:50.248 +Nothing about beam search. + +0:23:50.248 --> 0:23:59.833 +It might be even more complicated because +in beam search you have to try different. + +0:24:02.762 --> 0:24:15.140 +So the question is what can you now do in +order to make your model more efficient and + +0:24:15.140 --> 0:24:21.905 +better in translation in these types of cases? 
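
To make the training/testing difference above concrete: at test time the target is produced one token at a time, so the decoder is re-run for every new position, whereas in training all target positions are handled in one parallel masked pass. A minimal greedy-decoding sketch, where `model.encode` and `model.decode_step` are an assumed interface rather than a specific toolkit's API:

```python
import torch

def greedy_decode(model, src, bos_id, eos_id, max_len=256):
    """Sequential (auto-regressive) inference: each step depends on all
    previously generated tokens, so the loop cannot be parallelized."""
    enc = model.encode(src)                    # fully parallel over source positions
    ys = [bos_id]
    for _ in range(max_len):
        logits = model.decode_step(enc, torch.tensor([ys]))  # uses the whole prefix
        next_id = int(logits[0, -1].argmax())
        ys.append(next_id)
        if next_id == eos_id:
            break
    return ys
```
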
+ +0:24:24.604 --> 0:24:30.178 +And the one thing is to look into the encoded +decoder trailer. + +0:24:30.690 --> 0:24:43.898 +And then until now we typically assume that +the depth of the encoder and the depth of the + +0:24:43.898 --> 0:24:48.154 +decoder is roughly the same. + +0:24:48.268 --> 0:24:55.553 +So if you haven't thought about it, you just +take what is running well. + +0:24:55.553 --> 0:24:57.678 +You would try to do. + +0:24:58.018 --> 0:25:04.148 +However, we saw now that there is a quite +big challenge and the runtime is a lot longer + +0:25:04.148 --> 0:25:04.914 +than here. + +0:25:05.425 --> 0:25:14.018 +The question is also the case for the calculations, +or do we have there the same issue that we + +0:25:14.018 --> 0:25:21.887 +only get the good quality if we are having +high and high, so we know that making these + +0:25:21.887 --> 0:25:25.415 +more depths is increasing our quality. + +0:25:25.425 --> 0:25:31.920 +But what we haven't talked about is really +important that we increase the depth the same + +0:25:31.920 --> 0:25:32.285 +way. + +0:25:32.552 --> 0:25:41.815 +So what we can put instead also do is something +like this where you have a deep encoder and + +0:25:41.815 --> 0:25:42.923 +a shallow. + +0:25:43.163 --> 0:25:57.386 +So that would be that you, for example, have +instead of having layers on the encoder, and + +0:25:57.386 --> 0:25:59.757 +layers on the. + +0:26:00.080 --> 0:26:10.469 +So in this case the overall depth from start +to end would be similar and so hopefully. + +0:26:11.471 --> 0:26:21.662 +But we could a lot more things hear parallelized, +and hear what is costly at the end during decoding + +0:26:21.662 --> 0:26:22.973 +the decoder. + +0:26:22.973 --> 0:26:29.330 +Because that does change in an outer regressive +way, there we. + +0:26:31.411 --> 0:26:33.727 +And that that can be analyzed. + +0:26:33.727 --> 0:26:38.734 +So here is some examples: Where people have +done all this. + +0:26:39.019 --> 0:26:55.710 +So here it's mainly interested on the orange +things, which is auto-regressive about the + +0:26:55.710 --> 0:26:57.607 +speed up. + +0:26:57.717 --> 0:27:15.031 +You have the system, so agree is not exactly +the same, but it's similar. + +0:27:15.055 --> 0:27:23.004 +It's always the case if you look at speed +up. + +0:27:23.004 --> 0:27:31.644 +Think they put a speed of so that's the baseline. + +0:27:31.771 --> 0:27:35.348 +So between and times as fast. + +0:27:35.348 --> 0:27:42.621 +If you switch from a system to where you have +layers in the. + +0:27:42.782 --> 0:27:52.309 +You see that although you have slightly more +parameters, more calculations are also roughly + +0:27:52.309 --> 0:28:00.283 +the same, but you can speed out because now +during testing you can paralyze. + +0:28:02.182 --> 0:28:09.754 +The other thing is that you're speeding up, +but if you look at the performance it's similar, + +0:28:09.754 --> 0:28:13.500 +so sometimes you improve, sometimes you lose. + +0:28:13.500 --> 0:28:20.421 +There's a bit of losing English to Romania, +but in general the quality is very slow. + +0:28:20.680 --> 0:28:30.343 +So you see that you can keep a similar performance +while improving your speed by just having different. + +0:28:30.470 --> 0:28:34.903 +And you also see the encoder layers from speed. + +0:28:34.903 --> 0:28:38.136 +They don't really metal that much. + +0:28:38.136 --> 0:28:38.690 +Most. 
+ +0:28:38.979 --> 0:28:50.319 +Because if you compare the 12th system to +the 6th system you have a lower performance + +0:28:50.319 --> 0:28:57.309 +with 6th and colder layers but the speed is +similar. + +0:28:57.897 --> 0:29:02.233 +And see the huge decrease is it maybe due +to a lack of data. + +0:29:03.743 --> 0:29:11.899 +Good idea would say it's not the case. + +0:29:11.899 --> 0:29:23.191 +Romanian English should have the same number +of data. + +0:29:24.224 --> 0:29:31.184 +Maybe it's just that something in that language. + +0:29:31.184 --> 0:29:40.702 +If you generate Romanian maybe they need more +target dependencies. + +0:29:42.882 --> 0:29:46.263 +The Wine's the Eye Also Don't Know Any Sex +People Want To. + +0:29:47.887 --> 0:29:49.034 +There could be yeah the. + +0:29:49.889 --> 0:29:58.962 +As the maybe if you go from like a movie sphere +to a hybrid sphere, you can: It's very much + +0:29:58.962 --> 0:30:12.492 +easier to expand the vocabulary to English, +but it must be the vocabulary. + +0:30:13.333 --> 0:30:21.147 +Have to check, but would assume that in this +case the system is not retrained, but it's + +0:30:21.147 --> 0:30:22.391 +trained with. + +0:30:22.902 --> 0:30:30.213 +And that's why I was assuming that they have +the same, but maybe you'll write that in this + +0:30:30.213 --> 0:30:35.595 +piece, for example, if they were pre-trained, +the decoder English. + +0:30:36.096 --> 0:30:43.733 +But don't remember exactly if they do something +like that, but that could be a good. + +0:30:45.325 --> 0:30:52.457 +So this is some of the most easy way to speed +up. + +0:30:52.457 --> 0:31:01.443 +You just switch to hyperparameters, not to +implement anything. + +0:31:02.722 --> 0:31:08.367 +Of course, there's other ways of doing that. + +0:31:08.367 --> 0:31:11.880 +We'll look into two things. + +0:31:11.880 --> 0:31:16.521 +The other thing is the architecture. + +0:31:16.796 --> 0:31:28.154 +We are now at some of the baselines that we +are doing. + +0:31:28.488 --> 0:31:39.978 +However, in translation in the decoder side, +it might not be the best solution. + +0:31:39.978 --> 0:31:41.845 +There is no. + +0:31:42.222 --> 0:31:47.130 +So we can use different types of architectures, +also in the encoder and the. + +0:31:47.747 --> 0:31:52.475 +And there's two ways of what you could do +different, or there's more ways. + +0:31:52.912 --> 0:31:54.825 +We will look into two todays. + +0:31:54.825 --> 0:31:58.842 +The one is average attention, which is a very +simple solution. + +0:31:59.419 --> 0:32:01.464 +You can do as it says. + +0:32:01.464 --> 0:32:04.577 +It's not really attending anymore. + +0:32:04.577 --> 0:32:08.757 +It's just like equal attendance to everything. + +0:32:09.249 --> 0:32:23.422 +And the other idea, which is currently done +in most systems which are optimized to efficiency, + +0:32:23.422 --> 0:32:24.913 +is we're. + +0:32:25.065 --> 0:32:32.623 +But on the decoder side we are then not using +transformer or self attention, but we are using + +0:32:32.623 --> 0:32:39.700 +recurrent neural network because they are the +disadvantage of recurrent neural network. + +0:32:39.799 --> 0:32:48.353 +And then the recurrent is normally easier +to calculate because it only depends on inputs, + +0:32:48.353 --> 0:32:49.684 +the input on. + +0:32:51.931 --> 0:33:02.190 +So what is the difference between decoding +and why is the tension maybe not sufficient + +0:33:02.190 --> 0:33:03.841 +for decoding? 
+ +0:33:04.204 --> 0:33:14.390 +If we want to populate the new state, we only +have to look at the input and the previous + +0:33:14.390 --> 0:33:15.649 +state, so. + +0:33:16.136 --> 0:33:19.029 +We are more conditional here networks. + +0:33:19.029 --> 0:33:19.994 +We have the. + +0:33:19.980 --> 0:33:31.291 +Dependency to a fixed number of previous ones, +but that's rarely used for decoding. + +0:33:31.291 --> 0:33:39.774 +In contrast, in transformer we have this large +dependency, so. + +0:33:40.000 --> 0:33:52.760 +So from t minus one to y t so that is somehow +and mainly not very efficient in this way mean + +0:33:52.760 --> 0:33:56.053 +it's very good because. + +0:33:56.276 --> 0:34:03.543 +However, the disadvantage is that we also +have to do all these calculations, so if we + +0:34:03.543 --> 0:34:10.895 +more view from the point of view of efficient +calculation, this might not be the best. + +0:34:11.471 --> 0:34:20.517 +So the question is, can we change our architecture +to keep some of the advantages but make things + +0:34:20.517 --> 0:34:21.994 +more efficient? + +0:34:24.284 --> 0:34:31.131 +The one idea is what is called the average +attention, and the interesting thing is this + +0:34:31.131 --> 0:34:32.610 +work surprisingly. + +0:34:33.013 --> 0:34:38.917 +So the only idea what you're doing is doing +the decoder. + +0:34:38.917 --> 0:34:42.646 +You're not doing attention anymore. + +0:34:42.646 --> 0:34:46.790 +The attention weights are all the same. + +0:34:47.027 --> 0:35:00.723 +So you don't calculate with query and key +the different weights, and then you just take + +0:35:00.723 --> 0:35:03.058 +equal weights. + +0:35:03.283 --> 0:35:07.585 +So here would be one third from this, one +third from this, and one third. + +0:35:09.009 --> 0:35:14.719 +And while it is sufficient you can now do +precalculation and things get more efficient. + +0:35:15.195 --> 0:35:18.803 +So first go the formula that's maybe not directed +here. + +0:35:18.979 --> 0:35:38.712 +So the difference here is that your new hint +stage is the sum of all the hint states, then. + +0:35:38.678 --> 0:35:40.844 +So here would be with this. + +0:35:40.844 --> 0:35:45.022 +It would be one third of this plus one third +of this. + +0:35:46.566 --> 0:35:57.162 +But if you calculate it this way, it's not +yet being more efficient because you still + +0:35:57.162 --> 0:36:01.844 +have to sum over here all the hidden. + +0:36:04.524 --> 0:36:22.932 +But you can not easily speed up these things +by having an in between value, which is just + +0:36:22.932 --> 0:36:24.568 +always. + +0:36:25.585 --> 0:36:30.057 +If you take this as ten to one, you take this +one class this one. + +0:36:30.350 --> 0:36:36.739 +Because this one then was before this, and +this one was this, so in the end. + +0:36:37.377 --> 0:36:49.545 +So now this one is not the final one in order +to get the final one to do the average. + +0:36:49.545 --> 0:36:50.111 +So. + +0:36:50.430 --> 0:37:00.264 +But then if you do this calculation with speed +up you can do it with a fixed number of steps. + +0:37:00.180 --> 0:37:11.300 +Instead of the sun which depends on age, so +you only have to do calculations to calculate + +0:37:11.300 --> 0:37:12.535 +this one. + +0:37:12.732 --> 0:37:21.718 +Can you do the lakes and the lakes? + +0:37:21.718 --> 0:37:32.701 +For example, light bulb here now takes and. + +0:37:32.993 --> 0:37:38.762 +That's a very good point and that's why this +is now in the image. 
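
The running-sum trick being discussed here is easiest to see in code: keep the unnormalized cumulative sum (the tilde value) and divide by the position, so every decoding step costs a constant amount of work instead of re-summing over all previous states. A minimal sketch, assuming the decoder inputs are vectors (e.g. tensors) that support addition and division:

```python
def average_attention_states(decoder_inputs):
    """Cumulative-average 'attention': equal weight 1/t on every previous position,
    computed incrementally with a running sum."""
    states = []
    running_sum = None          # the unnormalized (tilde) value
    for t, y in enumerate(decoder_inputs, start=1):
        running_sum = y if running_sum is None else running_sum + y
        states.append(running_sum / t)
    return states
```
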
+ +0:37:38.762 --> 0:37:44.531 +It's not very good so this is the one with +tilder and the tilder. + +0:37:44.884 --> 0:37:57.895 +So this one is just the sum of these two, +because this is just this one. + +0:37:58.238 --> 0:38:08.956 +So the sum of this is exactly as the sum of +these, and the sum of these is the sum of here. + +0:38:08.956 --> 0:38:15.131 +So you only do the sum in here, and the multiplying. + +0:38:15.255 --> 0:38:22.145 +So what you can mainly do here is you can +do it more mathematically. + +0:38:22.145 --> 0:38:31.531 +You can know this by tea taking out of the +sum, and then you can calculate the sum different. + +0:38:36.256 --> 0:38:42.443 +That maybe looks a bit weird and simple, so +we were all talking about this great attention + +0:38:42.443 --> 0:38:47.882 +that we can focus on different parts, and a +bit surprising on this work is now. + +0:38:47.882 --> 0:38:53.321 +In the end it might also work well without +really putting and just doing equal. + +0:38:53.954 --> 0:38:56.164 +Mean it's not that easy. + +0:38:56.376 --> 0:38:58.261 +It's like sometimes this is working. + +0:38:58.261 --> 0:39:00.451 +There's also report weight work that well. + +0:39:01.481 --> 0:39:05.848 +But I think it's an interesting way and it +maybe shows that a lot of. + +0:39:05.805 --> 0:39:10.624 +Things in the self or in the transformer paper +which are more put as like yet. + +0:39:10.624 --> 0:39:15.930 +These are some hyperpermetheuss around it, +like that you do the layer norm in between, + +0:39:15.930 --> 0:39:21.785 +and that you do a feat forward before, and +things like that, that these are also all important, + +0:39:21.785 --> 0:39:25.566 +and that the right set up around that is also +very important. + +0:39:28.969 --> 0:39:38.598 +The other thing you can do in the end is not +completely different from this one. + +0:39:38.598 --> 0:39:42.521 +It's just like a very different. + +0:39:42.942 --> 0:39:54.338 +And that is a recurrent network which also +has this type of highway connection that can + +0:39:54.338 --> 0:40:01.330 +ignore the recurrent unit and directly put +the input. + +0:40:01.561 --> 0:40:10.770 +It's not really adding out, but if you see +the hitting step is your input, but what you + +0:40:10.770 --> 0:40:15.480 +can do is somehow directly go to the output. + +0:40:17.077 --> 0:40:28.390 +These are the four components of the simple +return unit, and the unit is motivated by GIS + +0:40:28.390 --> 0:40:33.418 +and by LCMs, which we have seen before. + +0:40:33.513 --> 0:40:43.633 +And that has proven to be very good for iron +ends, which allows you to have a gate on your. + +0:40:44.164 --> 0:40:48.186 +In this thing we have two gates, the reset +gate and the forget gate. + +0:40:48.768 --> 0:40:57.334 +So first we have the general structure which +has a cell state. + +0:40:57.334 --> 0:41:01.277 +Here we have the cell state. + +0:41:01.361 --> 0:41:09.661 +And then this goes next, and we always get +the different cell states over the times that. + +0:41:10.030 --> 0:41:11.448 +This Is the South Stand. + +0:41:11.771 --> 0:41:16.518 +How do we now calculate that just assume we +have an initial cell safe here? + +0:41:17.017 --> 0:41:19.670 +But the first thing is we're doing the forget +game. + +0:41:20.060 --> 0:41:34.774 +The forgetting models should the new cell +state mainly depend on the previous cell state + +0:41:34.774 --> 0:41:40.065 +or should it depend on our age. + +0:41:40.000 --> 0:41:41.356 +Like Add to Them. 
+ +0:41:41.621 --> 0:41:42.877 +How can we model that? + +0:41:44.024 --> 0:41:45.599 +First we were at a cocktail. + +0:41:45.945 --> 0:41:52.151 +The forget gait is depending on minus one. + +0:41:52.151 --> 0:41:56.480 +You also see here the former. + +0:41:57.057 --> 0:42:01.963 +So we are multiplying both the cell state +and our input. + +0:42:01.963 --> 0:42:04.890 +With some weights we are getting. + +0:42:05.105 --> 0:42:08.472 +We are putting some Bay Inspector and then +we are doing Sigma Weed on that. + +0:42:08.868 --> 0:42:13.452 +So in the end we have numbers between zero +and one saying for each dimension. + +0:42:13.853 --> 0:42:22.041 +Like how much if it's near to zero we will +mainly use the new input. + +0:42:22.041 --> 0:42:31.890 +If it's near to one we will keep the input +and ignore the input at this dimension. + +0:42:33.313 --> 0:42:40.173 +And by this motivation we can then create +here the new sound state, and here you see + +0:42:40.173 --> 0:42:41.141 +the formal. + +0:42:41.601 --> 0:42:55.048 +So you take your foot back gate and multiply +it with your class. + +0:42:55.048 --> 0:43:00.427 +So if my was around then. + +0:43:00.800 --> 0:43:07.405 +In the other case, when the value was others, +that's what you added. + +0:43:07.405 --> 0:43:10.946 +Then you're adding a transformation. + +0:43:11.351 --> 0:43:24.284 +So if this value was maybe zero then you're +putting most of the information from inputting. + +0:43:25.065 --> 0:43:26.947 +Is already your element? + +0:43:26.947 --> 0:43:30.561 +The only question is now based on your element. + +0:43:30.561 --> 0:43:32.067 +What is the output? + +0:43:33.253 --> 0:43:47.951 +And there you have another opportunity so +you can either take the output or instead you + +0:43:47.951 --> 0:43:50.957 +prefer the input. + +0:43:52.612 --> 0:43:58.166 +So is the value also the same for the recept +game and the forget game. + +0:43:58.166 --> 0:43:59.417 +Yes, the movie. + +0:44:00.900 --> 0:44:10.004 +Yes exactly so the matrices are different +and therefore it can be and that should be + +0:44:10.004 --> 0:44:16.323 +and maybe there is sometimes you want to have +information. + +0:44:16.636 --> 0:44:23.843 +So here again we have this vector with values +between zero and which says controlling how + +0:44:23.843 --> 0:44:25.205 +the information. + +0:44:25.505 --> 0:44:36.459 +And then the output is calculated here similar +to a cell stage, but again input is from. + +0:44:36.536 --> 0:44:45.714 +So either the reset gate decides should give +what is currently stored in there, or. + +0:44:46.346 --> 0:44:58.647 +So it's not exactly as the thing we had before, +with the residual connections where we added + +0:44:58.647 --> 0:45:01.293 +up, but here we do. + +0:45:04.224 --> 0:45:08.472 +This is the general idea of a simple recurrent +neural network. + +0:45:08.472 --> 0:45:13.125 +Then we will now look at how we can make things +even more efficient. + +0:45:13.125 --> 0:45:17.104 +But first do you have more questions on how +it is working? + +0:45:23.063 --> 0:45:38.799 +Now these calculations are a bit where things +get more efficient because this somehow. + +0:45:38.718 --> 0:45:43.177 +It depends on all the other damage for the +second one also. + +0:45:43.423 --> 0:45:48.904 +Because if you do a matrix multiplication +with a vector like for the output vector, each + +0:45:48.904 --> 0:45:52.353 +diameter of the output vector depends on all +the other. 
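
A sketch of the simple recurrent unit with the gates just described, written in the element-wise form that the lecture turns to next: the matrix multiplications touch only the input sequence and can be precomputed in parallel over time, while the recurrence itself uses only element-wise products, so each dimension of the cell state depends only on itself. The exact parameterization (e.g. vector weights `vf`, `vr` on the cell state) is an assumption for illustration and differs in detail from published SRU implementations.

```python
import torch

def sru_layer(X, Wf, vf, bf, Wr, vr, br, W):
    """X: (T, d) inputs; W*, W: (d, d) matrices; vf, vr, bf, br: (d,) vectors."""
    # precompute everything that does not depend on the recurrence (parallel over t)
    Fx, Rx, Ux = X @ Wf, X @ Wr, X @ W
    c = torch.zeros(X.shape[1])
    outputs = []
    for t in range(X.shape[0]):
        f = torch.sigmoid(Fx[t] + vf * c + bf)   # forget gate, element-wise in c
        c = f * c + (1 - f) * Ux[t]              # new cell state
        r = torch.sigmoid(Rx[t] + vr * c + br)   # reset gate
        h = r * c + (1 - r) * X[t]               # highway: mix cell state and raw input
        outputs.append(h)
    return torch.stack(outputs), c
```
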
+ +0:45:52.973 --> 0:46:06.561 +The cell state here depends because this one +is used here, and somehow the first dimension + +0:46:06.561 --> 0:46:11.340 +of the cell state only depends. + +0:46:11.931 --> 0:46:17.973 +In order to make that, of course, is sometimes +again making things less paralyzeable if things + +0:46:17.973 --> 0:46:18.481 +depend. + +0:46:19.359 --> 0:46:35.122 +Can easily make that different by changing +from the metric product to not a vector. + +0:46:35.295 --> 0:46:51.459 +So you do first, just like inside here, you +take like the first dimension, my second dimension. + +0:46:52.032 --> 0:46:53.772 +Is, of course, narrow. + +0:46:53.772 --> 0:46:59.294 +This should be reset or this should be because +it should be a different. + +0:46:59.899 --> 0:47:12.053 +Now the first dimension only depends on the +first dimension, so you don't have dependencies + +0:47:12.053 --> 0:47:16.148 +any longer between dimensions. + +0:47:18.078 --> 0:47:25.692 +Maybe it gets a bit clearer if you see about +it in this way, so what we have to do now. + +0:47:25.966 --> 0:47:31.911 +First, we have to do a metrics multiplication +on to gather and to get the. + +0:47:32.292 --> 0:47:38.041 +And then we only have the element wise operations +where we take this output. + +0:47:38.041 --> 0:47:38.713 +We take. + +0:47:39.179 --> 0:47:42.978 +Minus one and our original. + +0:47:42.978 --> 0:47:52.748 +Here we only have elemental abrasions which +can be optimally paralyzed. + +0:47:53.273 --> 0:48:07.603 +So here we have additional paralyzed things +across the dimension and don't have to do that. + +0:48:09.929 --> 0:48:24.255 +Yeah, but this you can do like in parallel +again for all xts. + +0:48:24.544 --> 0:48:33.014 +Here you can't do it in parallel, but you +only have to do it on each seat, and then you + +0:48:33.014 --> 0:48:34.650 +can parallelize. + +0:48:35.495 --> 0:48:39.190 +But this maybe for the dimension. + +0:48:39.190 --> 0:48:42.124 +Maybe it's also important. + +0:48:42.124 --> 0:48:46.037 +I don't know if they have tried it. + +0:48:46.037 --> 0:48:55.383 +I assume it's not only for dimension reduction, +but it's hard because you can easily. + +0:49:01.001 --> 0:49:08.164 +People have even like made the second thing +even more easy. + +0:49:08.164 --> 0:49:10.313 +So there is this. + +0:49:10.313 --> 0:49:17.954 +This is how we have the highway connections +in the transformer. + +0:49:17.954 --> 0:49:20.699 +Then it's like you do. + +0:49:20.780 --> 0:49:24.789 +So that is like how things are put together +as a transformer. + +0:49:25.125 --> 0:49:39.960 +And that is a similar and simple recurring +neural network where you do exactly the same + +0:49:39.960 --> 0:49:44.512 +for the so you don't have. + +0:49:46.326 --> 0:49:47.503 +This type of things. + +0:49:49.149 --> 0:50:01.196 +And with this we are at the end of how to +make efficient architectures before we go to + +0:50:01.196 --> 0:50:02.580 +the next. + +0:50:13.013 --> 0:50:24.424 +Between the ink or the trader and the architectures +there is a next technique which is used in + +0:50:24.424 --> 0:50:28.988 +nearly all deburning very successful. + +0:50:29.449 --> 0:50:43.463 +So the idea is can we extract the knowledge +from a large network into a smaller one, but + +0:50:43.463 --> 0:50:45.983 +it's similarly. + +0:50:47.907 --> 0:50:53.217 +And the nice thing is that this really works, +and it may be very, very surprising. 
+ +0:50:53.673 --> 0:51:03.000 +So the idea is that we have a large straw +model which we train for long, and the question + +0:51:03.000 --> 0:51:07.871 +is: Can that help us to train a smaller model? + +0:51:08.148 --> 0:51:16.296 +So can what we refer to as teacher model tell +us better to build a small student model than + +0:51:16.296 --> 0:51:17.005 +before. + +0:51:17.257 --> 0:51:27.371 +So what we're before in it as a student model, +we learn from the data and that is how we train + +0:51:27.371 --> 0:51:28.755 +our systems. + +0:51:29.249 --> 0:51:37.949 +The question is: Can we train this small model +better if we are not only learning from the + +0:51:37.949 --> 0:51:46.649 +data, but we are also learning from a large +model which has been trained maybe in the same + +0:51:46.649 --> 0:51:47.222 +data? + +0:51:47.667 --> 0:51:55.564 +So that you have then in the end a smaller +model that is somehow better performing than. + +0:51:55.895 --> 0:51:59.828 +And maybe that's on the first view. + +0:51:59.739 --> 0:52:05.396 +Very very surprising because it has seen the +same data so it should have learned the same + +0:52:05.396 --> 0:52:11.053 +so the baseline model trained only on the data +and the student teacher knowledge to still + +0:52:11.053 --> 0:52:11.682 +model it. + +0:52:11.682 --> 0:52:17.401 +They all have seen only this data because +your teacher modeling was also trained typically + +0:52:17.401 --> 0:52:19.161 +only on this model however. + +0:52:20.580 --> 0:52:30.071 +It has by now shown that by many ways the +model trained in the teacher and analysis framework + +0:52:30.071 --> 0:52:32.293 +is performing better. + +0:52:33.473 --> 0:52:40.971 +A bit of an explanation when we see how that +works. + +0:52:40.971 --> 0:52:46.161 +There's different ways of doing it. + +0:52:46.161 --> 0:52:47.171 +Maybe. + +0:52:47.567 --> 0:52:51.501 +So how does it work? + +0:52:51.501 --> 0:53:04.802 +This is our student network, the normal one, +some type of new network. + +0:53:04.802 --> 0:53:06.113 +We're. + +0:53:06.586 --> 0:53:17.050 +So we are training the model to predict the +same thing as we are doing that by calculating. + +0:53:17.437 --> 0:53:23.173 +The cross angry loss was defined in a way +where saying all the probabilities for the + +0:53:23.173 --> 0:53:25.332 +correct word should be as high. + +0:53:25.745 --> 0:53:32.207 +So you are calculating your alphabet probabilities +always, and each time step you have an alphabet + +0:53:32.207 --> 0:53:33.055 +probability. + +0:53:33.055 --> 0:53:38.669 +What is the most probable in the next word +and your training signal is put as much of + +0:53:38.669 --> 0:53:43.368 +your probability mass to the correct word to +the word that is there in. + +0:53:43.903 --> 0:53:51.367 +And this is the chief by this cross entry +loss, which says with some of the all training + +0:53:51.367 --> 0:53:58.664 +examples of all positions, with some of the +full vocabulary, and then this one is this + +0:53:58.664 --> 0:54:03.947 +one that this current word is the case word +in the vocabulary. + +0:54:04.204 --> 0:54:11.339 +And then we take here the lock for the ability +of that, so what we made me do is: We have + +0:54:11.339 --> 0:54:27.313 +this metric here, so each position of your +vocabulary size. + +0:54:27.507 --> 0:54:38.656 +In the end what you just do is some of these +three lock probabilities, and then you want + +0:54:38.656 --> 0:54:40.785 +to have as much. 
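
Before moving on, here is a minimal sketch of the word-level distillation loss just described: the usual cross-entropy against the ground-truth words is interpolated with a cross-entropy against the teacher's soft output distribution. The interpolation weight `alpha` and the flattened shapes are assumptions for illustration.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, targets, alpha=0.5):
    """student_logits, teacher_logits: (batch*len, vocab); targets: (batch*len,)."""
    hard = F.cross_entropy(student_logits, targets)
    teacher_probs = F.softmax(teacher_logits, dim=-1)
    soft = -(teacher_probs * F.log_softmax(student_logits, dim=-1)).sum(-1).mean()
    return alpha * hard + (1 - alpha) * soft
```
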
+ +0:54:41.041 --> 0:54:54.614 +So although this is a thumb over this metric +here, in the end of each dimension you. + +0:54:54.794 --> 0:55:06.366 +So that is a normal cross end to be lost that +we have discussed at the very beginning of + +0:55:06.366 --> 0:55:07.016 +how. + +0:55:08.068 --> 0:55:15.132 +So what can we do differently in the teacher +network? + +0:55:15.132 --> 0:55:23.374 +We also have a teacher network which is trained +on large data. + +0:55:24.224 --> 0:55:35.957 +And of course this distribution might be better +than the one from the small model because it's. + +0:55:36.456 --> 0:55:40.941 +So in this case we have now the training signal +from the teacher network. + +0:55:41.441 --> 0:55:46.262 +And it's the same way as we had before. + +0:55:46.262 --> 0:55:56.507 +The only difference is we're training not +the ground truths per ability distribution + +0:55:56.507 --> 0:55:59.159 +year, which is sharp. + +0:55:59.299 --> 0:56:11.303 +That's also a probability, so this word has +a high probability, but have some probability. + +0:56:12.612 --> 0:56:19.577 +And that is the main difference. + +0:56:19.577 --> 0:56:30.341 +Typically you do like the interpretation of +these. + +0:56:33.213 --> 0:56:38.669 +Because there's more information contained +in the distribution than in the front booth, + +0:56:38.669 --> 0:56:44.187 +because it encodes more information about the +language, because language always has more + +0:56:44.187 --> 0:56:47.907 +options to put alone, that's the same sentence +yes exactly. + +0:56:47.907 --> 0:56:53.114 +So there's ambiguity in there that is encoded +hopefully very well in the complaint. + +0:56:53.513 --> 0:56:57.257 +Trade you two networks so better than a student +network you have in there from your learner. + +0:56:57.537 --> 0:57:05.961 +So maybe often there's only one correct word, +but it might be two or three, and then all + +0:57:05.961 --> 0:57:10.505 +of these three have a probability distribution. + +0:57:10.590 --> 0:57:21.242 +And then is the main advantage or one explanation +of why it's better to train from the. + +0:57:21.361 --> 0:57:32.652 +Of course, it's good to also keep the signal +in there because then you can prevent it because + +0:57:32.652 --> 0:57:33.493 +crazy. + +0:57:37.017 --> 0:57:49.466 +Any more questions on the first type of knowledge +distillation, also distribution changes. + +0:57:50.550 --> 0:58:02.202 +Coming around again, this would put it a bit +different, so this is not a solution to maintenance + +0:58:02.202 --> 0:58:04.244 +or distribution. + +0:58:04.744 --> 0:58:12.680 +But don't think it's performing worse than +only doing the ground tours because they also. + +0:58:13.113 --> 0:58:21.254 +So it's more like it's not improving you would +assume it's similarly helping you, but. + +0:58:21.481 --> 0:58:28.145 +Of course, if you now have a teacher, maybe +you have no danger on your target to Maine, + +0:58:28.145 --> 0:58:28.524 +but. + +0:58:28.888 --> 0:58:39.895 +Then you can use this one which is not the +ground truth but helpful to learn better for + +0:58:39.895 --> 0:58:42.147 +the distribution. + +0:58:46.326 --> 0:58:57.012 +The second idea is to do sequence level knowledge +distillation, so what we have in this case + +0:58:57.012 --> 0:59:02.757 +is we have looked at each position independently. + +0:59:03.423 --> 0:59:05.436 +Mean, we do that often. + +0:59:05.436 --> 0:59:10.972 +We are not generating a lot of sequences, +but that has a problem. 
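As a tiny illustration of the point made above, that the teacher's distribution carries more information than the one-hot reference because several target words can be valid, consider the following toy numbers (invented for the example):

```python
# One-hot reference vs. a teacher's soft distribution over a few candidate words.
hard_target   = {"danke": 1.0, "vielen": 0.0, "dank": 0.0, "haus": 0.0}
teacher_probs = {"danke": 0.55, "vielen": 0.25, "dank": 0.15, "haus": 0.05}
# Training towards teacher_probs tells the student that "vielen"/"dank" are also
# plausible continuations here, which the one-hot reference alone cannot express.
```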
+ +0:59:10.972 --> 0:59:13.992 +We have this propagation of errors. + +0:59:13.992 --> 0:59:16.760 +We start with one area and then. + +0:59:17.237 --> 0:59:27.419 +So if we are doing word-level knowledge dissolution, +we are treating each word in the sentence independently. + +0:59:28.008 --> 0:59:32.091 +So we are not trying to like somewhat model +the dependency between. + +0:59:32.932 --> 0:59:47.480 +We can try to do that by sequence level knowledge +dissolution, but the problem is, of course,. + +0:59:47.847 --> 0:59:53.478 +So we can that for each position we can get +a distribution over all the words at this. + +0:59:53.793 --> 1:00:05.305 +But if we want to have a distribution of all +possible target sentences, that's not possible + +1:00:05.305 --> 1:00:06.431 +because. + +1:00:08.508 --> 1:00:15.940 +Area, so we can then again do a bit of a heck +on that. + +1:00:15.940 --> 1:00:23.238 +If we can't have a distribution of all sentences, +it. + +1:00:23.843 --> 1:00:30.764 +So what we can't do is you can not use the +teacher network and sample different translations. + +1:00:31.931 --> 1:00:39.327 +And now we can do different ways to train +them. + +1:00:39.327 --> 1:00:49.343 +We can use them as their probability, the +easiest one to assume. + +1:00:50.050 --> 1:00:56.373 +So what that ends to is that we're taking +our teacher network, we're generating some + +1:00:56.373 --> 1:01:01.135 +translations, and these ones we're using as +additional trading. + +1:01:01.781 --> 1:01:11.382 +Then we have mainly done this sequence level +because the teacher network takes us. + +1:01:11.382 --> 1:01:17.513 +These are all probable translations of the +sentence. + +1:01:26.286 --> 1:01:34.673 +And then you can do a bit of a yeah, and you +can try to better make a bit of an interpolated + +1:01:34.673 --> 1:01:36.206 +version of that. + +1:01:36.716 --> 1:01:42.802 +So what people have also done is like subsequent +level interpolations. + +1:01:42.802 --> 1:01:52.819 +You generate here several translations: But +then you don't use all of them. + +1:01:52.819 --> 1:02:00.658 +You do some metrics on which of these ones. + +1:02:01.021 --> 1:02:12.056 +So it's a bit more training on this brown +chose which might be improbable or unreachable + +1:02:12.056 --> 1:02:16.520 +because we can generate everything. + +1:02:16.676 --> 1:02:23.378 +And we are giving it an easier solution which +is also good quality and training of that. + +1:02:23.703 --> 1:02:32.602 +So you're not training it on a very difficult +solution, but you're training it on an easier + +1:02:32.602 --> 1:02:33.570 +solution. + +1:02:36.356 --> 1:02:38.494 +Any More Questions to This. + +1:02:40.260 --> 1:02:41.557 +Yeah. + +1:02:41.461 --> 1:02:44.296 +Good. + +1:02:43.843 --> 1:03:01.642 +Is to look at the vocabulary, so the problem +is we have seen that vocabulary calculations + +1:03:01.642 --> 1:03:06.784 +are often very presuming. + +1:03:09.789 --> 1:03:19.805 +The thing is that most of the vocabulary is +not needed for each sentence, so in each sentence. + +1:03:20.280 --> 1:03:28.219 +The question is: Can we somehow easily precalculate, +which words are probable to occur in the sentence, + +1:03:28.219 --> 1:03:30.967 +and then only calculate these ones? + +1:03:31.691 --> 1:03:34.912 +And this can be done so. + +1:03:34.912 --> 1:03:43.932 +For example, if you have sentenced card, it's +probably not happening. + +1:03:44.164 --> 1:03:48.701 +So what you can try to do is to limit your +vocabulary. 
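A minimal sketch of the sequence-level knowledge distillation pipeline described above: decode the source side of the training data with the teacher and use the teacher's translations as training targets for the student. `teacher_translate` is a placeholder for any trained autoregressive model; beam size and filtering are left as simple assumptions.

```python
# Sequence-level KD data generation (illustrative).
def build_sequence_kd_data(source_sentences, teacher_translate, beam_size=5):
    distilled = []
    for src in source_sentences:
        # Take the teacher's best hypothesis; one could also generate several
        # candidates and select among them with a metric, as mentioned above.
        hyp = teacher_translate(src, beam_size=beam_size)
        distilled.append((src, hyp))
    return distilled

# The student is then trained on `distilled` exactly like on normal parallel data.
```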
+ +1:03:48.701 --> 1:03:51.093 +You're considering for each. + +1:03:51.151 --> 1:04:04.693 +So you're no longer taking the full vocabulary +as possible output, but you're restricting. + +1:04:06.426 --> 1:04:18.275 +That typically works is that we limit it by +the most frequent words we always take because + +1:04:18.275 --> 1:04:23.613 +these are not so easy to align to words. + +1:04:23.964 --> 1:04:32.241 +To take the most treatment taggin' words and +then work that often aligns with one of the + +1:04:32.241 --> 1:04:32.985 +source. + +1:04:33.473 --> 1:04:46.770 +So for each source word you calculate the +word alignment on your training data, and then + +1:04:46.770 --> 1:04:51.700 +you calculate which words occur. + +1:04:52.352 --> 1:04:57.680 +And then for decoding you build this union +of maybe the source word list that other. + +1:04:59.960 --> 1:05:02.145 +Are like for each source work. + +1:05:02.145 --> 1:05:08.773 +One of the most frequent translations of these +source words, for example for each source work + +1:05:08.773 --> 1:05:13.003 +like in the most frequent ones, and then the +most frequent. + +1:05:13.193 --> 1:05:24.333 +In total, if you have short sentences, you +have a lot less words, so in most cases it's + +1:05:24.333 --> 1:05:26.232 +not more than. + +1:05:26.546 --> 1:05:33.957 +And so you have dramatically reduced your +vocabulary, and thereby can also fax a depot. + +1:05:35.495 --> 1:05:43.757 +That easy does anybody see what is challenging +here and why that might not always need. + +1:05:47.687 --> 1:05:54.448 +The performance is not why this might not. + +1:05:54.448 --> 1:06:01.838 +If you implement it, it might not be a strong. + +1:06:01.941 --> 1:06:06.053 +You have to store this list. + +1:06:06.053 --> 1:06:14.135 +You have to burn the union and of course your +safe time. + +1:06:14.554 --> 1:06:21.920 +The second thing the vocabulary is used in +our last step, so we have the hidden state, + +1:06:21.920 --> 1:06:23.868 +and then we calculate. + +1:06:24.284 --> 1:06:29.610 +Now we are not longer calculating them for +all output words, but for a subset of them. + +1:06:30.430 --> 1:06:35.613 +However, this metric multiplication is typically +parallelized with the perfect but good. + +1:06:35.956 --> 1:06:46.937 +But if you not only calculate some of them, +if you're not modeling it right, it will take + +1:06:46.937 --> 1:06:52.794 +as long as before because of the nature of +the. + +1:06:56.776 --> 1:07:07.997 +Here for beam search there's some ideas of +course you can go back to greedy search because + +1:07:07.997 --> 1:07:10.833 +that's more efficient. + +1:07:11.651 --> 1:07:18.347 +And better quality, and you can buffer some +states in between, so how much buffering it's + +1:07:18.347 --> 1:07:22.216 +again this tradeoff between calculation and +memory. + +1:07:25.125 --> 1:07:41.236 +Then at the end of today what we want to look +into is one last type of new machine translation + +1:07:41.236 --> 1:07:42.932 +approach. + +1:07:43.403 --> 1:07:53.621 +And the idea is what we've already seen in +our first two steps is that this ultra aggressive + +1:07:53.621 --> 1:07:57.246 +park is taking community coding. + +1:07:57.557 --> 1:08:04.461 +Can process everything in parallel, but we +are always taking the most probable and then. + +1:08:05.905 --> 1:08:10.476 +The question is: Do we really need to do that? + +1:08:10.476 --> 1:08:14.074 +Therefore, there is a bunch of work. + +1:08:14.074 --> 1:08:16.602 +Can we do it differently? 
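The vocabulary selection idea above can be sketched as follows: for each input sentence, restrict the output softmax to the globally most frequent target words plus the most likely translations (from word alignments) of each source word. The dictionaries used here are placeholders; in practice they come from alignment counts on the training data.

```python
# Per-sentence vocabulary shortlist (illustrative sketch).
def build_shortlist(source_tokens, frequent_targets, translation_table, k=10):
    shortlist = set(frequent_targets)               # e.g. the top few thousand target words
    for tok in source_tokens:
        shortlist.update(translation_table.get(tok, [])[:k])
    return sorted(shortlist)

# At decoding time the output projection and softmax are computed only over
# `shortlist`, which is usually far smaller than the full vocabulary; whether this
# actually saves time depends on how well the restricted matmul is implemented.
```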
+ +1:08:16.602 --> 1:08:19.616 +Can we generate a full target? + +1:08:20.160 --> 1:08:29.417 +We'll see it's not that easy and there's still +an open debate whether this is really faster + +1:08:29.417 --> 1:08:31.832 +and quality, but think. + +1:08:32.712 --> 1:08:45.594 +So, as said, what we have done is our encoder +decoder where we can process our encoder color, + +1:08:45.594 --> 1:08:50.527 +and then the output always depends. + +1:08:50.410 --> 1:08:54.709 +We generate the output and then we have to +put it here the wide because then everything + +1:08:54.709 --> 1:08:56.565 +depends on the purpose of the output. + +1:08:56.916 --> 1:09:10.464 +This is what is referred to as an outer-regressive +model and nearly outs speech generation and + +1:09:10.464 --> 1:09:16.739 +language generation or works in this outer. + +1:09:18.318 --> 1:09:21.132 +So the motivation is, can we do that more +efficiently? + +1:09:21.361 --> 1:09:31.694 +And can we somehow process all target words +in parallel? + +1:09:31.694 --> 1:09:41.302 +So instead of doing it one by one, we are +inputting. + +1:09:45.105 --> 1:09:46.726 +So how does it work? + +1:09:46.726 --> 1:09:50.587 +So let's first have a basic auto regressive +mode. + +1:09:50.810 --> 1:09:53.551 +So the encoder looks as it is before. + +1:09:53.551 --> 1:09:58.310 +That's maybe not surprising because here we +know we can paralyze. + +1:09:58.618 --> 1:10:04.592 +So we have put in here our ink holder and +generated the ink stash, so that's exactly + +1:10:04.592 --> 1:10:05.295 +the same. + +1:10:05.845 --> 1:10:16.229 +However, now we need to do one more thing: +One challenge is what we had before and that's + +1:10:16.229 --> 1:10:26.799 +a challenge of natural language generation +like machine translation. + +1:10:32.672 --> 1:10:38.447 +We generate until we generate this out of +end of center stock, but if we now generate + +1:10:38.447 --> 1:10:44.625 +everything at once that's no longer possible, +so we cannot generate as long because we only + +1:10:44.625 --> 1:10:45.632 +generated one. + +1:10:46.206 --> 1:10:58.321 +So the question is how can we now determine +how long the sequence is, and we can also accelerate. + +1:11:00.000 --> 1:11:06.384 +Yes, but there would be one idea, and there +is other work which tries to do that. + +1:11:06.806 --> 1:11:15.702 +However, in here there's some work already +done before and maybe you remember we had the + +1:11:15.702 --> 1:11:20.900 +IBM models and there was this concept of fertility. + +1:11:21.241 --> 1:11:26.299 +The concept of fertility is means like for +one saucepan, and how many target pores does + +1:11:26.299 --> 1:11:27.104 +it translate? + +1:11:27.847 --> 1:11:34.805 +And exactly that we try to do here, and that +means we are calculating like at the top we + +1:11:34.805 --> 1:11:36.134 +are calculating. + +1:11:36.396 --> 1:11:42.045 +So it says word is translated into word. + +1:11:42.045 --> 1:11:54.171 +Word might be translated into words into, +so we're trying to predict in how many words. + +1:11:55.935 --> 1:12:10.314 +And then the end of the anchor, so this is +like a length estimation. + +1:12:10.314 --> 1:12:15.523 +You can do it otherwise. + +1:12:16.236 --> 1:12:24.526 +You initialize your decoder input and we know +it's good with word embeddings so we're trying + +1:12:24.526 --> 1:12:28.627 +to do the same thing and what people then do. + +1:12:28.627 --> 1:12:35.224 +They initialize it again with word embedding +but in the frequency of the. 
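To make the contrast above explicit, here is a schematic sketch of autoregressive versus non-autoregressive decoding. `decoder_step` and `decoder` are placeholders for whatever model is used; only the control flow matters here.

```python
# Autoregressive: one token at a time, each step feeds on the previous prediction.
# Non-autoregressive: all target positions predicted in one parallel pass.
import torch

def autoregressive_decode(decoder_step, enc_state, bos_id, eos_id, max_len=50):
    ys = [bos_id]
    for _ in range(max_len):                      # sequential: step t needs step t-1
        next_id = decoder_step(enc_state, ys)     # placeholder returning a token id
        ys.append(next_id)
        if next_id == eos_id:
            break
    return ys

def non_autoregressive_decode(decoder, enc_state, dec_inputs):
    logits = decoder(enc_state, dec_inputs)       # (tgt_len, vocab) in one pass
    return logits.argmax(dim=-1).tolist()         # every position predicted independently
```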
+ +1:12:35.315 --> 1:12:36.460 +So we have the cartilage. + +1:12:36.896 --> 1:12:47.816 +So one has two, so twice the is and then one +is, so that is then our initialization. + +1:12:48.208 --> 1:12:57.151 +In other words, if you don't predict fertilities +but predict lengths, you can just initialize + +1:12:57.151 --> 1:12:57.912 +second. + +1:12:58.438 --> 1:13:07.788 +This often works a bit better, but that's +the other. + +1:13:07.788 --> 1:13:16.432 +Now you have everything in training and testing. + +1:13:16.656 --> 1:13:18.621 +This is all available at once. + +1:13:20.280 --> 1:13:31.752 +Then we can generate everything in parallel, +so we have the decoder stack, and that is now + +1:13:31.752 --> 1:13:33.139 +as before. + +1:13:35.395 --> 1:13:41.555 +And then we're doing the translation predictions +here on top of it in order to do. + +1:13:43.083 --> 1:13:59.821 +And then we are predicting here the target +words and once predicted, and that is the basic + +1:13:59.821 --> 1:14:00.924 +idea. + +1:14:01.241 --> 1:14:08.171 +Machine translation: Where the idea is, we +don't have to do one by one what we're. + +1:14:10.210 --> 1:14:13.900 +So this looks really, really, really great. + +1:14:13.900 --> 1:14:20.358 +On the first view there's one challenge with +this, and this is the baseline. + +1:14:20.358 --> 1:14:27.571 +Of course there's some improvements, but in +general the quality is often significant. + +1:14:28.068 --> 1:14:32.075 +So here you see the baseline models. + +1:14:32.075 --> 1:14:38.466 +You have a loss of ten blue points or something +like that. + +1:14:38.878 --> 1:14:40.230 +So why does it change? + +1:14:40.230 --> 1:14:41.640 +So why is it happening? + +1:14:43.903 --> 1:14:56.250 +If you look at the errors there is repetitive +tokens, so you have like or things like that. + +1:14:56.536 --> 1:15:01.995 +Broken senses or influent senses, so that +exactly where algebra aggressive models are + +1:15:01.995 --> 1:15:04.851 +very good, we say that's a bit of a problem. + +1:15:04.851 --> 1:15:07.390 +They generate very fluid transcription. + +1:15:07.387 --> 1:15:10.898 +Translation: Sometimes there doesn't have +to do anything with the input. + +1:15:11.411 --> 1:15:14.047 +But generally it really looks always very +fluid. + +1:15:14.995 --> 1:15:20.865 +Here exactly the opposite, so the problem +is that we don't have really fluid translation. + +1:15:21.421 --> 1:15:26.123 +And that is mainly due to the challenge that +we have this independent assumption. + +1:15:26.646 --> 1:15:35.873 +So in this case, the probability of Y of the +second position is independent of the probability + +1:15:35.873 --> 1:15:40.632 +of X, so we don't know what was there generated. + +1:15:40.632 --> 1:15:43.740 +We're just generating it there. + +1:15:43.964 --> 1:15:55.439 +You can see it also in a bit of examples. + +1:15:55.439 --> 1:16:03.636 +You can over-panelize shifts. + +1:16:04.024 --> 1:16:10.566 +And the problem is this is already an improvement +again, but this is also similar to. + +1:16:11.071 --> 1:16:19.900 +So you can, for example, translate heeded +back, or maybe you could also translate it + +1:16:19.900 --> 1:16:31.105 +with: But on their feeling down in feeling +down, if the first position thinks of their + +1:16:31.105 --> 1:16:34.594 +feeling done and the second. + +1:16:35.075 --> 1:16:42.908 +So each position here and that is one of the +main issues here doesn't know what the other. 
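The fertility-based initialization described above can be sketched very compactly: each source embedding is copied as many times as its predicted fertility, and the result is used as the decoder input, which also fixes the target length. Shapes and numbers are illustrative.

```python
import torch

def init_decoder_inputs(src_embeddings, fertilities):
    """src_embeddings: (src_len, dim); fertilities: (src_len,) non-negative ints."""
    return torch.repeat_interleave(src_embeddings, fertilities, dim=0)

src = torch.randn(3, 4)                 # 3 source words
fert = torch.tensor([2, 0, 1])          # word 1 -> 2 target words, word 2 -> 0, word 3 -> 1
dec_in = init_decoder_inputs(src, fert)
print(dec_in.shape)                     # torch.Size([3, 4]) -> predicted target length 3
```

Predicting a single target length and copying/initializing the decoder inputs accordingly, as mentioned above, is the simpler alternative to per-word fertilities.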
+ +1:16:43.243 --> 1:16:53.846 +And for example, if you are translating something +with, you can often translate things in two + +1:16:53.846 --> 1:16:58.471 +ways: German with a different agreement. + +1:16:58.999 --> 1:17:02.058 +And then here where you have to decide do +a used jet. + +1:17:02.162 --> 1:17:05.460 +Interpretator: It doesn't know which word +it has to select. + +1:17:06.086 --> 1:17:14.789 +Mean, of course, it knows a hidden state, +but in the end you have a liability distribution. + +1:17:16.256 --> 1:17:20.026 +And that is the important thing in the outer +regressive month. + +1:17:20.026 --> 1:17:24.335 +You know that because you have put it in you +here, you don't know that. + +1:17:24.335 --> 1:17:29.660 +If it's equal probable here to two, you don't +Know Which Is Selected, and of course that + +1:17:29.660 --> 1:17:32.832 +depends on what should be the latest traction +under. + +1:17:33.333 --> 1:17:39.554 +Yep, that's the undershift, and we're going +to last last the next time. + +1:17:39.554 --> 1:17:39.986 +Yes. + +1:17:40.840 --> 1:17:44.935 +Doesn't this also appear in and like now we're +talking about physical training? + +1:17:46.586 --> 1:17:48.412 +The thing is in the auto regress. + +1:17:48.412 --> 1:17:50.183 +If you give it the correct one,. + +1:17:50.450 --> 1:17:55.827 +So if you predict here comma what the reference +is feeling then you tell the model here. + +1:17:55.827 --> 1:17:59.573 +The last one was feeling and then it knows +it has to be done. + +1:17:59.573 --> 1:18:04.044 +But here it doesn't know that because it doesn't +get as input as a right. + +1:18:04.204 --> 1:18:24.286 +Yes, that's a bit depending on what. + +1:18:24.204 --> 1:18:27.973 +But in training, of course, you just try to +make the highest one the current one. + +1:18:31.751 --> 1:18:38.181 +So what you can do is things like CDC loss +which can adjust for this. + +1:18:38.181 --> 1:18:42.866 +So then you can also have this shifted correction. + +1:18:42.866 --> 1:18:50.582 +If you're doing this type of correction in +the CDC loss you don't get full penalty. + +1:18:50.930 --> 1:18:58.486 +Just shifted by one, so it's a bit of a different +loss, which is mainly used in, but. + +1:19:00.040 --> 1:19:03.412 +It can be used in order to address this problem. + +1:19:04.504 --> 1:19:13.844 +The other problem is that outer regressively +we have the label buyers that tries to disimmigrate. + +1:19:13.844 --> 1:19:20.515 +That's the example did before was if you translate +thank you to Dung. + +1:19:20.460 --> 1:19:31.925 +And then it might end up because it learns +in the first position and the second also. + +1:19:32.492 --> 1:19:43.201 +In order to prevent that, it would be helpful +for one output, only one output, so that makes + +1:19:43.201 --> 1:19:47.002 +the system already better learn. + +1:19:47.227 --> 1:19:53.867 +Might be that for slightly different inputs +you have different outputs, but for the same. + +1:19:54.714 --> 1:19:57.467 +That we can luckily very easily solve. + +1:19:59.119 --> 1:19:59.908 +And it's done. + +1:19:59.908 --> 1:20:04.116 +We just learned the technique about it, which +is called knowledge distillation. + +1:20:04.985 --> 1:20:13.398 +So what we can do and the easiest solution +to prove your non-autoregressive model is to + +1:20:13.398 --> 1:20:16.457 +train an auto regressive model. + +1:20:16.457 --> 1:20:22.958 +Then you decode your whole training gamer +with this model and then. 
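One way to relax the strict position-by-position penalty mentioned above is a CTC-style loss, which allows the output to be shifted and aligned flexibly (with blanks) against the reference. The sketch below uses PyTorch's built-in `CTCLoss`; all shapes and sizes are invented for the example, and this is only one of several possible loss choices.

```python
import torch
import torch.nn.functional as F

T, N, V = 10, 2, 50                        # decoder length, batch size, vocab (blank = 0)
logits = torch.randn(T, N, V)
log_probs = F.log_softmax(logits, dim=-1)  # CTCLoss expects log-probabilities (T, N, V)
targets = torch.randint(1, V, (N, 6))      # reference tokens (no blanks)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 6, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=0)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())
```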
+ +1:20:23.603 --> 1:20:27.078 +While the main advantage of that is that this +is more consistent,. + +1:20:27.407 --> 1:20:33.995 +So for the same input you always have the +same output. + +1:20:33.995 --> 1:20:41.901 +So you have to make your training data more +consistent and learn. + +1:20:42.482 --> 1:20:54.471 +So there is another advantage of knowledge +distillation and that advantage is you have + +1:20:54.471 --> 1:20:59.156 +more consistent training signals. + +1:21:04.884 --> 1:21:10.630 +There's another to make the things more easy +at the beginning. + +1:21:10.630 --> 1:21:16.467 +There's this plants model, black model where +you do more masks. + +1:21:16.756 --> 1:21:26.080 +So during training, especially at the beginning, +you give some correct solutions at the beginning. + +1:21:28.468 --> 1:21:38.407 +And there is this tokens at a time, so the +idea is to establish other regressive training. + +1:21:40.000 --> 1:21:50.049 +And some targets are open, so you always predict +only like first auto regression is K. + +1:21:50.049 --> 1:21:59.174 +It puts one, so you always have one input +and one output, then you do partial. + +1:21:59.699 --> 1:22:05.825 +So in that way you can slowly learn what is +a good and what is a bad answer. + +1:22:08.528 --> 1:22:10.862 +It doesn't sound very impressive. + +1:22:10.862 --> 1:22:12.578 +Don't contact me anyway. + +1:22:12.578 --> 1:22:15.323 +Go all over your training data several. + +1:22:15.875 --> 1:22:20.655 +You can even switch in between. + +1:22:20.655 --> 1:22:29.318 +There is a homework on this thing where you +try to start. + +1:22:31.271 --> 1:22:41.563 +You have to learn so there's a whole work +on that so this is often happening and it doesn't + +1:22:41.563 --> 1:22:46.598 +mean it's less efficient but still it helps. + +1:22:49.389 --> 1:22:57.979 +For later maybe here are some examples of +how much things help. + +1:22:57.979 --> 1:23:04.958 +Maybe one point here is that it's really important. + +1:23:05.365 --> 1:23:13.787 +Here's the translation performance and speed. + +1:23:13.787 --> 1:23:24.407 +One point which is a point is if you compare +researchers. + +1:23:24.784 --> 1:23:33.880 +So yeah, if you're compared to one very weak +baseline transformer even with beam search, + +1:23:33.880 --> 1:23:40.522 +then you're ten times slower than a very strong +auto regressive. + +1:23:40.961 --> 1:23:48.620 +If you make a strong baseline then it's going +down to depending on times and here like: You + +1:23:48.620 --> 1:23:53.454 +have a lot of different speed ups. + +1:23:53.454 --> 1:24:03.261 +Generally, it makes a strong baseline and +not very simple transformer. + +1:24:07.407 --> 1:24:20.010 +Yeah, with this one last thing that you can +do to speed up things and also reduce your + +1:24:20.010 --> 1:24:25.950 +memory is what is called half precision. + +1:24:26.326 --> 1:24:29.139 +And especially for decoding issues for training. + +1:24:29.139 --> 1:24:31.148 +Sometimes it also gets less stale. + +1:24:32.592 --> 1:24:45.184 +With this we close nearly wait a bit, so what +you should remember is that efficient machine + +1:24:45.184 --> 1:24:46.963 +translation. + +1:24:47.007 --> 1:24:51.939 +We have, for example, looked at knowledge +distillation. + +1:24:51.939 --> 1:24:55.991 +We have looked at non auto regressive models. + +1:24:55.991 --> 1:24:57.665 +We have different. + +1:24:58.898 --> 1:25:02.383 +For today and then only requests. 
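A minimal sketch of the half-precision point above, for decoding: cast the computation to FP16 so the matrix multiplications use less memory and typically run faster. `model` and `batch` are placeholders; as noted above, training in reduced precision needs more care (mixed precision with loss scaling) because it can become unstable.

```python
import torch

def decode_half_precision(model, batch, device="cuda"):
    model = model.to(device).eval()
    # autocast runs the forward pass in FP16 where it is safe to do so
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
        return model(batch.to(device))
```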
+ +1:25:02.383 --> 1:25:08.430 +So if you haven't done so, please fill out +the evaluation. + +1:25:08.388 --> 1:25:20.127 +So now if you have done so think then you +should have and with the online people hopefully. + +1:25:20.320 --> 1:25:29.758 +Only possibility to tell us what things are +good and what not the only one but the most + +1:25:29.758 --> 1:25:30.937 +efficient. + +1:25:31.851 --> 1:25:35.871 +So think of all the students doing it in this +case okay and then thank. + diff --git a/demo_data/lectures/Lecture-14-27.06.2023/video.mp4 b/demo_data/lectures/Lecture-14-27.06.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..619ac097e1b75180f907c3eedba64168b00e6341 --- /dev/null +++ b/demo_data/lectures/Lecture-14-27.06.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59f384b3137c89cb3f00f2020badb6eb5ff6de5043bd9e015adab92072e27e62 +size 113488295 diff --git a/demo_data/lectures/Lecture-15-11.07.2023/English.vtt b/demo_data/lectures/Lecture-15-11.07.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..72161b73f83bf0d8de577ff3f8d7aac6c011381e --- /dev/null +++ b/demo_data/lectures/Lecture-15-11.07.2023/English.vtt @@ -0,0 +1,2279 @@ +WEBVTT + +0:00:00.060 --> 0:00:07.762 +OK good so today's lecture is on on supervised +machines and stations so what you have seen + +0:00:07.762 --> 0:00:13.518 +so far is different techniques are on supervised +and MP so you are. + +0:00:13.593 --> 0:00:18.552 +Data right so let's say in English coppers +you are one file and then in German you have + +0:00:18.552 --> 0:00:23.454 +another file which is sentence to sentence +la and then you try to build systems around + +0:00:23.454 --> 0:00:23.679 +it. + +0:00:24.324 --> 0:00:30.130 +But what's different about this lecture is +that you assume that you have no final data + +0:00:30.130 --> 0:00:30.663 +at all. + +0:00:30.663 --> 0:00:37.137 +You only have monolingual data and the question +is how can we build systems to translate between + +0:00:37.137 --> 0:00:39.405 +these two languages right and so. + +0:00:39.359 --> 0:00:44.658 +This is a bit more realistic scenario because +you have so many languages in the world. + +0:00:44.658 --> 0:00:50.323 +You cannot expect to have parallel data between +all the two languages and so, but in typical + +0:00:50.323 --> 0:00:55.623 +cases you have newspapers and so on, which +is like monolingual files, and the question + +0:00:55.623 --> 0:00:57.998 +is can we build something around them? + +0:00:59.980 --> 0:01:01.651 +They like said for today. + +0:01:01.651 --> 0:01:05.893 +First we'll start up with the interactions, +so why do we need it? + +0:01:05.893 --> 0:01:11.614 +and also some infusion on how these models +work before going into the technical details. + +0:01:11.614 --> 0:01:17.335 +I want to also go through an example,, which +kind of gives you more understanding on how + +0:01:17.335 --> 0:01:19.263 +people came into more elders. + +0:01:20.820 --> 0:01:23.905 +Then the rest of the lecture is going to be +two parts. + +0:01:23.905 --> 0:01:26.092 +One is we're going to translate words. + +0:01:26.092 --> 0:01:30.018 +We're not going to care about how can we translate +the full sentence. + +0:01:30.018 --> 0:01:35.177 +But given to monolingual files, how can we +get a dictionary basically, which is much easier + +0:01:35.177 --> 0:01:37.813 +than generating something in a sentence level? 
+ +0:01:38.698 --> 0:01:43.533 +Then we're going to go into the Edwards case, +which is the unsupervised sentence type solution. + +0:01:44.204 --> 0:01:50.201 +And here what you'll see is what are the training +objectives which are quite different than the + +0:01:50.201 --> 0:01:55.699 +word translation and also where it doesn't +but because this is also quite important and + +0:01:55.699 --> 0:02:01.384 +it's one of the reasons why unsupervised does +not use anymore because the limitations kind + +0:02:01.384 --> 0:02:03.946 +of go away from the realistic use cases. + +0:02:04.504 --> 0:02:06.922 +And then that leads to the marketing world +model. + +0:02:06.922 --> 0:02:07.115 +So. + +0:02:07.807 --> 0:02:12.915 +People are trying to do to build systems for +languages that will not have any parallel data. + +0:02:12.915 --> 0:02:17.693 +Is use multilingual models and combine with +these training objectives to get better at + +0:02:17.693 --> 0:02:17.913 +it. + +0:02:17.913 --> 0:02:18.132 +So. + +0:02:18.658 --> 0:02:24.396 +People are not trying to build bilingual systems +currently for unsupervised arm translation, + +0:02:24.396 --> 0:02:30.011 +but I think it's good to know how they came +to hear this point and what they're doing now. + +0:02:30.090 --> 0:02:34.687 +You also see some patterns overlapping which +people are using. + +0:02:36.916 --> 0:02:41.642 +So as you said before, and you probably hear +it multiple times now is that we have seven + +0:02:41.642 --> 0:02:43.076 +thousand languages around. + +0:02:43.903 --> 0:02:49.460 +Can be different dialects in someone, so it's +quite hard to distinguish what's the language, + +0:02:49.460 --> 0:02:54.957 +but you can typically approximate that seven +thousand and that leads to twenty five million + +0:02:54.957 --> 0:02:59.318 +pairs, which is the obvious reason why we do +not have any parallel data. + +0:03:00.560 --> 0:03:06.386 +So you want to build an empty system for all +possible language pests and the question is + +0:03:06.386 --> 0:03:07.172 +how can we? + +0:03:08.648 --> 0:03:13.325 +The typical use case, but there are actually +quite few interesting use cases than what you + +0:03:13.325 --> 0:03:14.045 +would expect. + +0:03:14.614 --> 0:03:20.508 +One is the animal languages, which is the +real thing that's happening right now with. + +0:03:20.780 --> 0:03:26.250 +The dog but with dolphins and so on, but I +couldn't find a picture that could show this, + +0:03:26.250 --> 0:03:31.659 +but if you are interested in stuff like this +you can check out the website where people + +0:03:31.659 --> 0:03:34.916 +are actually trying to understand how animals +speak. + +0:03:35.135 --> 0:03:37.356 +It's Also a Bit More About. + +0:03:37.297 --> 0:03:44.124 +Knowing what the animals want to say but may +not die dead but still people are trying to + +0:03:44.124 --> 0:03:44.661 +do it. + +0:03:45.825 --> 0:03:50.689 +More realistic thing that's happening is the +translation of programming languages. + +0:03:51.371 --> 0:03:56.963 +And so this is quite a quite good scenario +for entrepreneurs and empty is that you have + +0:03:56.963 --> 0:04:02.556 +a lot of code available online right in C + ++ and in Python and the question is how can + +0:04:02.556 --> 0:04:08.402 +we translate by just looking at the code alone +and no parallel functions and so on and this + +0:04:08.402 --> 0:04:10.754 +is actually quite good right now so. 
+ +0:04:12.032 --> 0:04:16.111 +See how these techniques were applied to do +the programming translation. + +0:04:18.258 --> 0:04:23.882 +And then you can also think of language as +something that is quite common so you can take + +0:04:23.882 --> 0:04:24.194 +off. + +0:04:24.194 --> 0:04:29.631 +Think of formal sentences in English as one +language and informal sentences in English + +0:04:29.631 --> 0:04:35.442 +as another language and then learn the kind +to stay between them and then it kind of becomes + +0:04:35.442 --> 0:04:37.379 +a style plan for a problem so. + +0:04:38.358 --> 0:04:43.042 +Although it's translation, you can consider +different characteristics of a language and + +0:04:43.042 --> 0:04:46.875 +then separate them as two different languages +and then try to map them. + +0:04:46.875 --> 0:04:52.038 +So it's not only about languages, but you +can also do quite cool things by using unsophisticated + +0:04:52.038 --> 0:04:54.327 +techniques, which are quite possible also. + +0:04:56.256 --> 0:04:56.990 +I am so. + +0:04:56.990 --> 0:05:04.335 +This is kind of TV modeling for many of the +use cases that we have for ours, ours and MD. + +0:05:04.335 --> 0:05:11.842 +But before we go into the modeling of these +systems, what I want you to do is look at these + +0:05:11.842 --> 0:05:12.413 +dummy. + +0:05:13.813 --> 0:05:19.720 +We have text and language one, text and language +two right, and nobody knows what these languages + +0:05:19.720 --> 0:05:20.082 +mean. + +0:05:20.082 --> 0:05:23.758 +They completely are made up right, and the +question is also. + +0:05:23.758 --> 0:05:29.364 +They're not parallel lines, so the first line +here and the first line is not a line, they're + +0:05:29.364 --> 0:05:30.810 +just monolingual files. + +0:05:32.052 --> 0:05:38.281 +And now think about how can you translate +the word M1 from language one to language two, + +0:05:38.281 --> 0:05:41.851 +and this kind of you see how we try to model +this. + +0:05:42.983 --> 0:05:47.966 +Would take your time and then think of how +can you translate more into language two? + +0:06:41.321 --> 0:06:45.589 +About the model, if you ask somebody who doesn't +know anything about machine translation right, + +0:06:45.589 --> 0:06:47.411 +and then you ask them to translate more. + +0:07:01.201 --> 0:07:10.027 +But it's also not quite easy if you think +of the way that I made this example is relatively + +0:07:10.027 --> 0:07:10.986 +easy, so. + +0:07:11.431 --> 0:07:17.963 +Basically, the first two sentences are these +two: A, B, C is E, and G cured up the U, V + +0:07:17.963 --> 0:07:21.841 +is L, A, A, C, S, and S, on and this is used +towards the German. + +0:07:22.662 --> 0:07:25.241 +And then when you join these two words, it's. + +0:07:25.205 --> 0:07:32.445 +English German the third line and the last +line, and then the fourth line is the first + +0:07:32.445 --> 0:07:38.521 +line, so German language, English, and then +speak English, speak German. + +0:07:38.578 --> 0:07:44.393 +So this is how I made made up the example +and what the intuition here is that you assume + +0:07:44.393 --> 0:07:50.535 +that the languages have a fundamental structure +right and it's the same across all languages. + +0:07:51.211 --> 0:07:57.727 +Doesn't matter what language you are thinking +of words kind of you have in the same way join + +0:07:57.727 --> 0:07:59.829 +together is the same way and. 
+ +0:07:59.779 --> 0:08:06.065 +And plasma sign thinks the same way but this +is not a realistic assumption for sure but + +0:08:06.065 --> 0:08:12.636 +it's actually a decent one to make and if you +can think of this like if you can assume this + +0:08:12.636 --> 0:08:16.207 +then we can model systems in an unsupervised +way. + +0:08:16.396 --> 0:08:22.743 +So this is the intuition that I want to give, +and you can see that whenever assumptions fail, + +0:08:22.743 --> 0:08:23.958 +the systems fail. + +0:08:23.958 --> 0:08:29.832 +So in practice whenever we go far away from +these assumptions, the systems try to more + +0:08:29.832 --> 0:08:30.778 +time to fail. + +0:08:33.753 --> 0:08:39.711 +So the example that I gave was actually perfect +mapping right, so it never really sticks bad. + +0:08:39.711 --> 0:08:45.353 +They have the same number of words, same sentence +structure, perfect mapping, and so on. + +0:08:45.353 --> 0:08:50.994 +This doesn't happen, but let's assume that +this happens and try to see how we can moral. + +0:08:53.493 --> 0:09:01.061 +Okay, now let's go a bit more formal, so what +you want to do is unsupervise word translation. + +0:09:01.901 --> 0:09:08.773 +Here the task is that we have input data as +monolingual data, so a bunch of sentences in + +0:09:08.773 --> 0:09:15.876 +one file and a bunch of sentences another file +in two different languages, and the question + +0:09:15.876 --> 0:09:18.655 +is how can we get a bilingual word? + +0:09:19.559 --> 0:09:25.134 +So if you look at the picture you see that +it's just kind of projected down into two dimension + +0:09:25.134 --> 0:09:30.358 +planes, but it's basically when you map them +into a plot you see that the words that are + +0:09:30.358 --> 0:09:35.874 +parallel are closer together, and the question +is how can we do it just looking at two files? + +0:09:36.816 --> 0:09:42.502 +And you can say that what we want to basically +do is create a dictionary in the end given + +0:09:42.502 --> 0:09:43.260 +two fights. + +0:09:43.260 --> 0:09:45.408 +So this is the task that we want. + +0:09:46.606 --> 0:09:52.262 +And the first step on how we do this is to +learn word vectors, and this chicken is whatever + +0:09:52.262 --> 0:09:56.257 +techniques that you have seen before, but to +work glow or so on. + +0:09:56.856 --> 0:10:00.699 +So you take a monolingual data and try to +learn word embeddings. + +0:10:02.002 --> 0:10:07.675 +Then you plot them into a graph, and then +typically what you would see is that they're + +0:10:07.675 --> 0:10:08.979 +not aligned at all. + +0:10:08.979 --> 0:10:14.717 +One word space is somewhere, and one word +space is somewhere else, and this is what you + +0:10:14.717 --> 0:10:18.043 +would typically expect to see in the in the +image. + +0:10:19.659 --> 0:10:23.525 +Now our assumption was that both lines we +just have the same. + +0:10:23.563 --> 0:10:28.520 +Culture and so that we can use this information +to learn the mapping between these two spaces. + +0:10:30.130 --> 0:10:37.085 +So before how we do it, I think this is quite +famous already, and everybody knows it a bit + +0:10:37.085 --> 0:10:41.824 +more is that we're emitting capture semantic +relations right. + +0:10:41.824 --> 0:10:48.244 +So the distance between man and woman is approximately +the same as king and prince. 
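The first step described above, learning monolingual word embeddings separately for each language, could look like the following sketch (here with gensim's word2vec; any embedding method works). The tiny corpora are placeholders.

```python
from gensim.models import Word2Vec

en_sentences = [["the", "king", "speaks"], ["the", "queen", "speaks"]]
de_sentences = [["der", "könig", "spricht"], ["die", "königin", "spricht"]]

emb_en = Word2Vec(sentences=en_sentences, vector_size=50, min_count=1).wv
emb_de = Word2Vec(sentences=de_sentences, vector_size=50, min_count=1).wv
# emb_en["king"] and emb_de["könig"] now live in two *separate* spaces;
# the next step is to learn a mapping W that aligns them.
```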
+ +0:10:48.888 --> 0:10:54.620 +It's also for world dances, country capital +and so on, so there are some relationships + +0:10:54.620 --> 0:11:00.286 +happening in the word emmering space, which +is quite clear for at least one language. + +0:11:03.143 --> 0:11:08.082 +Now if you think of this, let's say of the +English word embryng. + +0:11:08.082 --> 0:11:14.769 +Let's say of German word embryng and the way +the King Keene Man woman organized is same + +0:11:14.769 --> 0:11:17.733 +as the German translation of his word. + +0:11:17.998 --> 0:11:23.336 +This is the main idea is that although they +are somewhere else, the relationship is the + +0:11:23.336 --> 0:11:28.008 +same between the both languages and we can +use this to to learn the mapping. + +0:11:31.811 --> 0:11:35.716 +'S not only for these poor words where it +happens for all the words in the language, + +0:11:35.716 --> 0:11:37.783 +and so we can use this to to learn the math. + +0:11:39.179 --> 0:11:43.828 +This is the main idea is that both emittings +have a similar shape. + +0:11:43.828 --> 0:11:48.477 +It's only that they're just not aligned and +so you go to the here. + +0:11:48.477 --> 0:11:50.906 +They kind of have a similar shape. + +0:11:50.906 --> 0:11:57.221 +They're just in some different spaces and +what you need to do is to map them into a common + +0:11:57.221 --> 0:11:57.707 +space. + +0:12:06.086 --> 0:12:12.393 +The w, such that if it multiplied w with x, +they both become. + +0:12:35.335 --> 0:12:41.097 +That's true, but there are also many works +that have the relationship right, and we hope + +0:12:41.097 --> 0:12:43.817 +that this is enough to learn the mapping. + +0:12:43.817 --> 0:12:49.838 +So there's always going to be a bit of noise, +as in how when we align them they're not going + +0:12:49.838 --> 0:12:51.716 +to be exactly the same, but. + +0:12:51.671 --> 0:12:57.293 +What you can expect is that there are these +main works that allow us to learn the mapping, + +0:12:57.293 --> 0:13:02.791 +so it's not going to be perfect, but it's an +approximation that we make to to see how it + +0:13:02.791 --> 0:13:04.521 +works and then practice it. + +0:13:04.521 --> 0:13:10.081 +Also, it's not that the fact that women do +not have any relationship does not affect that + +0:13:10.081 --> 0:13:10.452 +much. + +0:13:10.550 --> 0:13:15.429 +A lot of words usually have, so it kind of +works out in practice. + +0:13:22.242 --> 0:13:34.248 +I have not heard about it, but if you want +to say something about it, I would be interested, + +0:13:34.248 --> 0:13:37.346 +but we can do it later. + +0:13:41.281 --> 0:13:44.133 +Usual case: This is supervised. + +0:13:45.205 --> 0:13:49.484 +First way to do a supervised work translation +where we have a dictionary right and that we + +0:13:49.484 --> 0:13:53.764 +can use that to learn the mapping, but in our +case we assume that we have nothing right so + +0:13:53.764 --> 0:13:55.222 +we only have monolingual data. + +0:13:56.136 --> 0:14:03.126 +Then we need unsupervised planning to figure +out W, and we're going to use guns to to find + +0:14:03.126 --> 0:14:06.122 +W, and it's quite a nice way to do it. + +0:14:08.248 --> 0:14:15.393 +So just before I go on how we use it to use +case, I'm going to go briefly on gas right, + +0:14:15.393 --> 0:14:19.940 +so we have two components: generator and discriminator. 
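For contrast with the unsupervised GAN approach that follows, here is a hedged sketch of the supervised variant mentioned above: given a seed dictionary of word pairs, an orthogonal mapping W has a closed-form (Procrustes) solution via SVD. X and Y are the embedding matrices of the dictionary's source and target words; the synthetic test data is only there to show the shapes.

```python
import numpy as np

def procrustes_mapping(X, Y):
    """X, Y: (n_pairs, dim) embeddings of dictionary entries. Returns W with W @ x ~ y."""
    U, _, Vt = np.linalg.svd(Y.T @ X)
    return U @ Vt

X = np.random.randn(100, 50)
W_true = np.linalg.qr(np.random.randn(50, 50))[0]   # synthetic orthogonal mapping
Y = X @ W_true.T
W = procrustes_mapping(X, Y)
print(np.allclose(W @ X[0], Y[0], atol=1e-6))        # True on this synthetic data
```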
+ +0:14:21.441 --> 0:14:27.052 +Gen data tries to generate something obviously, +and the discriminator tries to see if it's + +0:14:27.052 --> 0:14:30.752 +real data or something that is generated by +the generation. + +0:14:31.371 --> 0:14:37.038 +And there's like this two player game where +the winner decides to fool and the winner decides + +0:14:37.038 --> 0:14:41.862 +to market food and they try to build these +two components and try to learn WWE. + +0:14:43.483 --> 0:14:53.163 +Okay, so let's say we have two languages, +X and Y right, so the X language has N words + +0:14:53.163 --> 0:14:56.167 +with numbering dimensions. + +0:14:56.496 --> 0:14:59.498 +So what I'm reading is matrix is peak or something. + +0:14:59.498 --> 0:15:02.211 +Then we have target language why with m words. + +0:15:02.211 --> 0:15:06.944 +I'm also the same amount of things I mentioned +and then we have a matrix peak or. + +0:15:07.927 --> 0:15:13.784 +Basically what you're going to do is use word +to work and learn our word embedded. + +0:15:14.995 --> 0:15:23.134 +Now we have these X Mrings, Y Mrings, and +what you want to know is W, such that W X and + +0:15:23.134 --> 0:15:24.336 +Y are align. + +0:15:29.209 --> 0:15:35.489 +With guns you have two steps, one is a discriminative +step and one is the the mapping step and the + +0:15:35.489 --> 0:15:41.135 +discriminative step is to see if the embeddings +are from the source or mapped embedding. + +0:15:41.135 --> 0:15:44.688 +So it's going to be much scary when I go to +the figure. + +0:15:46.306 --> 0:15:50.041 +So we have a monolingual documents with two +different languages. + +0:15:50.041 --> 0:15:54.522 +From here we get our source language ambients +target language ambients right. + +0:15:54.522 --> 0:15:57.855 +Then we randomly initialize the transformation +metrics W. + +0:16:00.040 --> 0:16:06.377 +Then we have the discriminator which tries +to see if it's WX or Y, so it needs to know + +0:16:06.377 --> 0:16:13.735 +that this is a mapped one and this is the original +language, and so if you look at the lost function + +0:16:13.735 --> 0:16:20.072 +here, it's basically that source is one given +WX, so this is from the source language. + +0:16:23.543 --> 0:16:27.339 +Which means it's the target language em yeah. + +0:16:27.339 --> 0:16:34.436 +It's just like my figure is not that great, +but you can assume that they are totally. + +0:16:40.260 --> 0:16:43.027 +So this is the kind of the lost function. + +0:16:43.027 --> 0:16:46.386 +We have N source words, M target words, and +so on. + +0:16:46.386 --> 0:16:52.381 +So that's why you have one by M, one by M, +and the discriminator is to just see if they're + +0:16:52.381 --> 0:16:55.741 +mapped or they're from the original target +number. + +0:16:57.317 --> 0:17:04.024 +And then we have the mapping step where we +train W to fool the the discriminators. + +0:17:04.564 --> 0:17:10.243 +So here it's the same way, but what you're +going to just do is inverse the loss function. + +0:17:10.243 --> 0:17:15.859 +So now we freeze the discriminators, so it's +important to note that in the previous sect + +0:17:15.859 --> 0:17:20.843 +we freezed the transformation matrix, and here +we freezed your discriminators. 
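The two alternating steps described above can be sketched in PyTorch as follows: (1) train a discriminator to tell mapped source embeddings W·x from target embeddings y, with W held fixed; (2) update only W so that the discriminator predicts the wrong labels. Dimensions, architectures and learning rates are placeholders; real recipes add further tricks (label smoothing, keeping W near-orthogonal, etc.).

```python
import torch
import torch.nn as nn

d = 50
W = nn.Linear(d, d, bias=False)                                        # the mapping
D = nn.Sequential(nn.Linear(d, 128), nn.ReLU(), nn.Linear(128, 1))     # discriminator
opt_w = torch.optim.SGD(W.parameters(), lr=0.1)
opt_d = torch.optim.SGD(D.parameters(), lr=0.1)
bce = nn.BCEWithLogitsLoss()

def train_step(x_batch, y_batch):
    # (1) discriminator step: label mapped source as 1, target as 0 (W is frozen here).
    with torch.no_grad():
        wx = W(x_batch)
    d_loss = bce(D(wx), torch.ones(len(wx), 1)) + \
             bce(D(y_batch), torch.zeros(len(y_batch), 1))
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # (2) mapping step: update only W so the discriminator predicts the *wrong* label.
    w_loss = bce(D(W(x_batch)), torch.zeros(len(x_batch), 1))
    opt_w.zero_grad(); w_loss.backward(); opt_w.step()
    return d_loss.item(), w_loss.item()
```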
+ +0:17:22.482 --> 0:17:28.912 +And now it's to fool the discriminated rights, +so it should predict that the source is zero + +0:17:28.912 --> 0:17:35.271 +given the map numbering, and the source is +one given the target numbering, which is wrong, + +0:17:35.271 --> 0:17:37.787 +which is why we're attaining the W. + +0:17:39.439 --> 0:17:46.261 +Any questions on this okay so then how do +we know when to stop? + +0:17:46.261 --> 0:17:55.854 +We just train until we reach convergence right +and then we have our W hopefully train and + +0:17:55.854 --> 0:17:59.265 +map them into an airline space. + +0:18:02.222 --> 0:18:07.097 +The question is how can we evaluate this mapping? + +0:18:07.097 --> 0:18:13.923 +Does anybody know what we can use to mapping +or evaluate the mapping? + +0:18:13.923 --> 0:18:15.873 +How good is a word? + +0:18:28.969 --> 0:18:33.538 +We use as I said we use a dictionary, at least +in the end. + +0:18:33.538 --> 0:18:40.199 +We need a dictionary to evaluate, so this +is our only final, so we aren't using it at + +0:18:40.199 --> 0:18:42.600 +all in attaining data and the. + +0:18:43.223 --> 0:18:49.681 +Is one is to check what's the position for +our dictionary, just that. + +0:18:50.650 --> 0:18:52.813 +The first nearest neighbor and see if it's +there on. + +0:18:53.573 --> 0:18:56.855 +But this is quite strict because there's a +lot of noise in the emitting space right. + +0:18:57.657 --> 0:19:03.114 +Not always your first neighbor is going to +be the translation, so what people also report + +0:19:03.114 --> 0:19:05.055 +is precision at file and so on. + +0:19:05.055 --> 0:19:10.209 +So you take the finerest neighbors and see +if the translation is in there and so on. + +0:19:10.209 --> 0:19:15.545 +So the more you increase, the more likely +that there is a translation because where I'm + +0:19:15.545 --> 0:19:16.697 +being quite noisy. + +0:19:19.239 --> 0:19:25.924 +What's interesting is that people have used +dictionary to to learn word translation, but + +0:19:25.924 --> 0:19:32.985 +the way of doing this is much better than using +a dictionary, so somehow our assumption helps + +0:19:32.985 --> 0:19:36.591 +us to to build better than a supervised system. + +0:19:39.099 --> 0:19:42.985 +So as you see on the top you have a question +at one five ten. + +0:19:42.985 --> 0:19:47.309 +These are the typical numbers that you report +for world translation. + +0:19:48.868 --> 0:19:55.996 +But guns are usually quite tricky to to train, +and it does not converge on on language based, + +0:19:55.996 --> 0:20:02.820 +and this kind of goes back to a assumption +that they kind of behave in the same structure + +0:20:02.820 --> 0:20:03.351 +right. + +0:20:03.351 --> 0:20:07.142 +But if you take a language like English and +some. + +0:20:07.087 --> 0:20:12.203 +Other languages are almost very lotus, so +it's quite different from English and so on. + +0:20:12.203 --> 0:20:13.673 +Then I've one language,. + +0:20:13.673 --> 0:20:18.789 +So whenever whenever our assumption fails, +these unsupervised techniques always do not + +0:20:18.789 --> 0:20:21.199 +converge or just give really bad scores. + +0:20:22.162 --> 0:20:27.083 +And so the fact is that the monolingual embryons +for distant languages are too far. + +0:20:27.083 --> 0:20:30.949 +They do not share the same structure, and +so they do not convert. 
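The evaluation described above (precision at 1, 5, 10 against a test dictionary) can be sketched like this: map each source embedding with W, retrieve its k nearest target neighbours by cosine similarity, and count how often the reference translation is among them. Inputs are placeholders.

```python
import numpy as np

def precision_at_k(W, src_emb, tgt_emb, dictionary, k=5):
    """src_emb/tgt_emb: dict word -> vector; dictionary: list of (src_word, tgt_word)."""
    tgt_words = list(tgt_emb)
    T = np.stack([tgt_emb[w] for w in tgt_words])
    T = T / np.linalg.norm(T, axis=1, keepdims=True)     # normalise for cosine similarity
    hits = 0
    for s, t in dictionary:
        q = W @ src_emb[s]
        q = q / np.linalg.norm(q)
        top = np.argsort(-(T @ q))[:k]                    # k nearest neighbours
        hits += t in {tgt_words[i] for i in top}
    return hits / len(dictionary)
```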
+ +0:20:32.452 --> 0:20:39.380 +And so I just want to mention that there is +a better retrieval technique than the nearest + +0:20:39.380 --> 0:20:41.458 +neighbor, which is called. + +0:20:42.882 --> 0:20:46.975 +But it's more advanced than mathematical, +so I didn't want to go in it now. + +0:20:46.975 --> 0:20:51.822 +But if your interest is in some quite good +retrieval segments, you can just look at these + +0:20:51.822 --> 0:20:53.006 +if you're interested. + +0:20:55.615 --> 0:20:59.241 +Okay, so this is about the the word translation. + +0:20:59.241 --> 0:21:02.276 +Does anybody have any questions of cure? + +0:21:06.246 --> 0:21:07.501 +Was the worst answer? + +0:21:07.501 --> 0:21:12.580 +It was a bit easier than a sentence right, +so you just assume that there's a mapping and + +0:21:12.580 --> 0:21:14.577 +then you try to learn the mapping. + +0:21:14.577 --> 0:21:19.656 +But now it's a bit more difficult because +you need to jump at stuff also, which is quite + +0:21:19.656 --> 0:21:20.797 +much more trickier. + +0:21:22.622 --> 0:21:28.512 +Task here is that we have our input as manually +well data for both languages as before, but + +0:21:28.512 --> 0:21:34.017 +now what we want to do is instead of translating +word by word we want to do sentence. + +0:21:37.377 --> 0:21:44.002 +We have word of work now and so on to learn +word amber inks, but sentence amber inks are + +0:21:44.002 --> 0:21:50.627 +actually not the site powered often, at least +when people try to work on Answer Voice M, + +0:21:50.627 --> 0:21:51.445 +E, before. + +0:21:52.632 --> 0:21:54.008 +Now they're a bit okay. + +0:21:54.008 --> 0:21:59.054 +I mean, as you've seen in the practice on +where we used places, they were quite decent. + +0:21:59.054 --> 0:22:03.011 +But then it's also the case on which data +it's trained on and so on. + +0:22:03.011 --> 0:22:03.240 +So. + +0:22:04.164 --> 0:22:09.666 +Sentence embedings are definitely much more +harder to get than were embedings, so this + +0:22:09.666 --> 0:22:13.776 +is a bit more complicated than the task that +you've seen before. + +0:22:16.476 --> 0:22:18.701 +Before we go into how U. + +0:22:18.701 --> 0:22:18.968 +N. + +0:22:18.968 --> 0:22:19.235 +M. + +0:22:19.235 --> 0:22:19.502 +T. + +0:22:19.502 --> 0:22:24.485 +Works, so this is your typical supervised +system right. + +0:22:24.485 --> 0:22:29.558 +So we have parallel data source sentence target +centers. + +0:22:29.558 --> 0:22:31.160 +We have a source. + +0:22:31.471 --> 0:22:36.709 +We have a target decoder and then we try to +minimize the cross center pillar on this viral + +0:22:36.709 --> 0:22:37.054 +data. + +0:22:37.157 --> 0:22:39.818 +And this is how we train our typical system. + +0:22:43.583 --> 0:22:49.506 +But now we do not have any parallel data, +and so the intuition here is that if we can + +0:22:49.506 --> 0:22:55.429 +learn language independent representations +at the end quota outputs, then we can pass + +0:22:55.429 --> 0:22:58.046 +it along to the decoder that we want. + +0:22:58.718 --> 0:23:03.809 +It's going to get more clear in the future, +but I'm trying to give a bit more intuition + +0:23:03.809 --> 0:23:07.164 +before I'm going to show you all the planning +objectives. 
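For reference, the supervised baseline recapped above boils down to the following training step: encode the source, run the decoder with teacher forcing on the parallel target, and minimise cross-entropy. `encoder` and `decoder` are placeholders; the unsupervised objectives in the next part reuse exactly this loss, only with synthetic sentence pairs.

```python
import torch
import torch.nn.functional as F

def supervised_step(encoder, decoder, src, tgt_in, tgt_out, pad_id=0):
    """src: (batch, src_len); tgt_in/tgt_out: target shifted by one position; token ids."""
    enc = encoder(src)                              # (batch, src_len, dim)
    logits = decoder(tgt_in, enc)                   # (batch, tgt_len, vocab)
    return F.cross_entropy(logits.transpose(1, 2), tgt_out, ignore_index=pad_id)
```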
+ +0:23:08.688 --> 0:23:15.252 +So I assume that we have these different encoders +right, so it's not only two, you have a bunch + +0:23:15.252 --> 0:23:21.405 +of different source language encoders, a bunch +of different target language decoders, and + +0:23:21.405 --> 0:23:26.054 +also I assume that the encoder is in the same +representation space. + +0:23:26.706 --> 0:23:31.932 +If you give a sentence in English and the +same sentence in German, the embeddings are + +0:23:31.932 --> 0:23:38.313 +quite the same, so like the muddling when embeddings +die right, and so then what we can do is, depending + +0:23:38.313 --> 0:23:42.202 +on the language we want, pass it to the the +appropriate decode. + +0:23:42.682 --> 0:23:50.141 +And so the kind of goal here is to find out +a way to create language independent representations + +0:23:50.141 --> 0:23:52.909 +and then pass it to the decodement. + +0:23:54.975 --> 0:23:59.714 +Just keep in mind that you're trying to do +language independent for some reason, but it's + +0:23:59.714 --> 0:24:02.294 +going to be more clear once we see how it works. + +0:24:05.585 --> 0:24:12.845 +So in total we have three objectives that +we're going to try to train in our systems, + +0:24:12.845 --> 0:24:16.981 +so this is and all of them use monolingual +data. + +0:24:17.697 --> 0:24:19.559 +So there's no pilot data at all. + +0:24:19.559 --> 0:24:24.469 +The first one is denoising water encoding, +so it's more like you add noise to noise to + +0:24:24.469 --> 0:24:27.403 +the sentence, and then they construct the original. + +0:24:28.388 --> 0:24:34.276 +Then we have the on the flyby translation, +so this is where you take a sentence, generate + +0:24:34.276 --> 0:24:39.902 +a translation, and then learn the the word +smarting, which I'm going to show pictures + +0:24:39.902 --> 0:24:45.725 +stated, and then we have an adverse serial +planning to do learn the language independent + +0:24:45.725 --> 0:24:46.772 +representation. + +0:24:47.427 --> 0:24:52.148 +So somehow we'll fill in these three tasks +or retain on these three tasks. + +0:24:52.148 --> 0:24:54.728 +We somehow get an answer to President M. + +0:24:54.728 --> 0:24:54.917 +T. + +0:24:56.856 --> 0:25:02.964 +OK, so first we're going to do is denoising +what I'm cutting right, so as I said we add + +0:25:02.964 --> 0:25:06.295 +noise to the sentence, so we take our sentence. + +0:25:06.826 --> 0:25:09.709 +And then there are different ways to add noise. + +0:25:09.709 --> 0:25:11.511 +You can shuffle words around. + +0:25:11.511 --> 0:25:12.712 +You can drop words. + +0:25:12.712 --> 0:25:18.298 +Do whatever you want to do as long as there's +enough information to reconstruct the original + +0:25:18.298 --> 0:25:18.898 +sentence. + +0:25:19.719 --> 0:25:25.051 +And then we assume that the nicest one and +the original one are parallel data and train + +0:25:25.051 --> 0:25:26.687 +similar to the supervised. + +0:25:28.168 --> 0:25:30.354 +So we have a source sentence. + +0:25:30.354 --> 0:25:32.540 +We have a noisy source right. + +0:25:32.540 --> 0:25:37.130 +So here what basically happened is that the +word got shuffled. + +0:25:37.130 --> 0:25:39.097 +One word is dropped right. + +0:25:39.097 --> 0:25:41.356 +So this was a noise of source. + +0:25:41.356 --> 0:25:47.039 +And then we treat the noise of source and +source as a sentence bed basically. + +0:25:49.009 --> 0:25:53.874 +Way retainers optimizing the cross entropy +loss similar to. 
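A minimal sketch of the noise function for the denoising objective described above: drop some words and shuffle the rest locally, then train the model to reconstruct the original sentence from the noisy one. The drop probability and shuffle window are illustrative choices.

```python
import random

def add_noise(tokens, drop_prob=0.1, shuffle_window=3):
    kept = [t for t in tokens if random.random() > drop_prob] or tokens[:1]
    # local shuffle: each position may move by at most `shuffle_window` steps
    keys = [i + random.uniform(0, shuffle_window) for i in range(len(kept))]
    return [t for _, t in sorted(zip(keys, kept), key=lambda p: p[0])]

sent = "the cat sat on the mat".split()
print(add_noise(sent))   # e.g. ['cat', 'the', 'on', 'sat', 'mat']
# Training pair: (add_noise(sent), sent) — treated like parallel data.
```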
+ +0:25:57.978 --> 0:26:03.211 +Basically a picture to show what's happening +and we have the nice resources. + +0:26:03.163 --> 0:26:09.210 +Now is the target and then we have the reconstructed +original source and original tag and since + +0:26:09.210 --> 0:26:14.817 +the languages are different we have our source +hand coded target and coded source coded. + +0:26:17.317 --> 0:26:20.202 +And for this task we only need monolingual +data. + +0:26:20.202 --> 0:26:25.267 +We don't need any pedal data because it's +just taking a sentence and shuffling it and + +0:26:25.267 --> 0:26:27.446 +reconstructing the the original one. + +0:26:28.848 --> 0:26:31.058 +And we are four different blocks. + +0:26:31.058 --> 0:26:36.841 +This is kind of very important to keep in +mind on how we change these connections later. + +0:26:41.121 --> 0:26:49.093 +Then this is more like the mathematical formulation +where you predict source given the noisy. + +0:26:52.492 --> 0:26:55.090 +So that was the nursing water encoding. + +0:26:55.090 --> 0:26:58.403 +The second step is on the flight back translation. + +0:26:59.479 --> 0:27:06.386 +So what we do is, we put our model inference +mode right, we take a source of sentences, + +0:27:06.386 --> 0:27:09.447 +and we generate a translation pattern. + +0:27:09.829 --> 0:27:18.534 +It might be completely wrong or maybe partially +correct or so on, but we assume that the moral + +0:27:18.534 --> 0:27:20.091 +knows of it and. + +0:27:20.680 --> 0:27:25.779 +Tend rate: T head right and then what we do +is assume that T head or not assume but T head + +0:27:25.779 --> 0:27:27.572 +and S are sentence space right. + +0:27:27.572 --> 0:27:29.925 +That's how we can handle the translation. + +0:27:30.530 --> 0:27:38.824 +So we train a supervised system on this sentence +bed, so we do inference and then build a reverse + +0:27:38.824 --> 0:27:39.924 +translation. + +0:27:42.442 --> 0:27:49.495 +Are both more concrete, so we have a false +sentence right, then we chamber the translation, + +0:27:49.495 --> 0:27:55.091 +then we give the general translation as an +input and try to predict the. + +0:27:58.378 --> 0:28:03.500 +This is how we would do in practice right, +so not before the source encoder was connected + +0:28:03.500 --> 0:28:08.907 +to the source decoder, but now we interchanged +connections, so the source encoder is connected + +0:28:08.907 --> 0:28:10.216 +to the target decoder. + +0:28:10.216 --> 0:28:13.290 +The target encoder is turned into the source +decoder. + +0:28:13.974 --> 0:28:20.747 +And given s we get t-hat and given t we get +s-hat, so this is the first time. + +0:28:21.661 --> 0:28:24.022 +On the second time step, what you're going +to do is reverse. + +0:28:24.664 --> 0:28:32.625 +So as that is here, t hat is here, and given +s hat we are trying to predict t, and given + +0:28:32.625 --> 0:28:34.503 +t hat we are trying. + +0:28:36.636 --> 0:28:39.386 +Is this clear you have any questions on? + +0:28:45.405 --> 0:28:50.823 +Bit more mathematically, we try to play the +class, give and take and so it's always the + +0:28:50.823 --> 0:28:53.963 +supervised NMP technique that we are trying +to do. + +0:28:53.963 --> 0:28:59.689 +But you're trying to create this synthetic +pass that kind of helpers to build an unsurprised + +0:28:59.689 --> 0:29:00.181 +system. 
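One on-the-fly back-translation step as described above, sketched with placeholder model objects (`model_s2t`, `model_t2s` stand for the two encoder/decoder pairings; their interfaces are invented for the example): translate a monolingual batch with the current model in inference mode, then treat (translation, original) as a sentence pair and train the reverse direction on it.

```python
import torch

def backtranslation_step(model_s2t, model_t2s, src_batch, loss_fn, optimizer):
    with torch.no_grad():
        t_hat = model_s2t.generate(src_batch)   # synthetic target; may be partly wrong
    # supervised-style update: predict the original source given the synthetic target
    logits = model_t2s(t_hat, src_batch)        # decoder teacher-forced on the original source
    loss = loss_fn(logits, src_batch)
    optimizer.zero_grad(); loss.backward(); optimizer.step()
    return loss.item()
```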
+
+0:29:02.362 --> 0:29:08.611
+Now also what maybe you can see here is that
+if the source encoder and target encoder are
+
+0:29:08.611 --> 0:29:14.718
+language independent, we can always switch
+the connections and get the translations.
+
+0:29:14.718 --> 0:29:21.252
+That's why it was important to find a way
+to generate language independent representations.
+
+0:29:21.441 --> 0:29:26.476
+And the way we try to force this language
+independence is the GAN step.
+
+0:29:27.627 --> 0:29:34.851
+So the third step, which kind of combines all of
+them, is where we try to use a GAN to make the
+
+0:29:34.851 --> 0:29:37.959
+encoder output language independent.
+
+0:29:37.959 --> 0:29:42.831
+So here it's the same picture but from a different
+paper.
+
+0:29:42.831 --> 0:29:43.167
+So.
+
+0:29:43.343 --> 0:29:48.888
+We have X source and X target, which is monolingual
+data.
+
+0:29:48.888 --> 0:29:50.182
+We add noise.
+
+0:29:50.690 --> 0:29:54.736
+Then we encode it using the source and the
+target encoders right.
+
+0:29:54.736 --> 0:29:58.292
+Then we get the latent space Z source and
+Z target right.
+
+0:29:58.292 --> 0:30:03.503
+Then we decode and try to reconstruct the
+original one and this is the auto encoding
+
+0:30:03.503 --> 0:30:08.469
+loss which takes the X source which is the
+original one and then the
+
+0:30:08.468 --> 0:30:09.834
+predicted output.
+
+0:30:09.834 --> 0:30:16.740
+So the lower part is the auto encoding step,
+while the GAN part sits in between, on the encoder
+
+0:30:16.740 --> 0:30:24.102
+outputs, and here we have a discriminator
+which tries to predict which language the latent
+
+0:30:24.102 --> 0:30:25.241
+space is from.
+
+0:30:26.466 --> 0:30:33.782
+So given Z source it has to predict that the
+representation is from the source language, and
+
+0:30:33.782 --> 0:30:39.961
+given Z target it has to predict that the representation
+is from the target language.
+
+0:30:40.520 --> 0:30:45.135
+And our encoder outputs are kind of the training
+data right now, and then we have a separate
+
+0:30:45.135 --> 0:30:49.803
+network, the discriminator, which tries to predict
+which language the latent spaces are from.
+
+0:30:53.393 --> 0:30:57.611
+And then this one is when we combine the GAN
+with the other training steps.
+
+0:30:57.611 --> 0:31:02.767
+Then we had an on-the-fly back-translation
+step right, and so here what we're trying to
+
+0:31:02.767 --> 0:31:03.001
+do.
+
+0:31:03.863 --> 0:31:07.260
+Is the same, basically just exactly the same.
+
+0:31:07.260 --> 0:31:12.946
+But when we are doing the training, we add
+the adversarial loss here, so.
+
+0:31:13.893 --> 0:31:20.762
+We take our X source, generate an intermediate
+translation, so Y target and Y source right?
+
+0:31:20.762 --> 0:31:27.342
+This is the previous time step, and then we
+have to encode the new sentences and basically
+
+0:31:27.342 --> 0:31:32.764
+make them language independent or train to
+make them language independent.
+
+0:31:33.974 --> 0:31:43.502
+And then the hope is that now if we do this
+using monolingual data alone we can just switch
+
+0:31:43.502 --> 0:31:47.852
+connections and then get our translation.
+
+0:31:47.852 --> 0:31:49.613
+So the scale of.
+
+0:31:54.574 --> 0:32:03.749
+And so as I said before, GANs are quite good
+for vision right, so this is kind of like the
+
+0:32:03.749 --> 0:32:11.312
+CycleGAN approach that you might have seen
+in any computer vision course.
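A minimal sketch of the adversarial (GAN) step: a small discriminator is trained to guess which language a latent vector came from, while the encoders get the flipped-label loss so that their outputs become indistinguishable. This assumes PyTorch and uses random tensors as stand-ins for the encoder outputs; it only illustrates the two losses, not the full training loop.

```python
import torch
import torch.nn as nn

dim = 512
discriminator = nn.Sequential(nn.Linear(dim, 256), nn.ReLU(), nn.Linear(256, 2))
xent = nn.CrossEntropyLoss()

# stand-ins for encoder outputs (one latent vector per sentence)
z_src = torch.randn(8, dim, requires_grad=True)   # batch from the source language
z_tgt = torch.randn(8, dim, requires_grad=True)   # batch from the target language
z = torch.cat([z_src, z_tgt], dim=0)
lang = torch.cat([torch.zeros(8, dtype=torch.long),   # 0 = source language
                  torch.ones(8, dtype=torch.long)])   # 1 = target language

# discriminator loss: predict the true language of each latent vector
d_loss = xent(discriminator(z), lang)

# adversarial loss for the encoders: fool the discriminator (flipped labels)
adv_loss = xent(discriminator(z), 1 - lang)

# during training, d_loss updates the discriminator and adv_loss updates the encoders
```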
+ +0:32:11.911 --> 0:32:19.055 +Somehow protect that place at least not as +promising as for merchants, and so people. + +0:32:19.055 --> 0:32:23.706 +What they did is to enforce this language +independence. + +0:32:25.045 --> 0:32:31.226 +They try to use a shared encoder instead of +having these different encoders right, and + +0:32:31.226 --> 0:32:37.835 +so this is basically the same painting objectives +as before, but what you're going to do now + +0:32:37.835 --> 0:32:43.874 +is learn cross language language and then use +the single encoder for both languages. + +0:32:44.104 --> 0:32:49.795 +And this kind also forces them to be in the +same space, and then you can choose whichever + +0:32:49.795 --> 0:32:50.934 +decoder you want. + +0:32:52.552 --> 0:32:58.047 +You can use guns or you can just use a shared +encoder and type to build your unsupervised + +0:32:58.047 --> 0:32:58.779 +MTT system. + +0:33:08.488 --> 0:33:09.808 +These are now the. + +0:33:09.808 --> 0:33:15.991 +The enhancements that you can do on top of +your unsavoizant system is one you can create + +0:33:15.991 --> 0:33:16.686 +a shared. + +0:33:18.098 --> 0:33:22.358 +On top of the shared encoder you can ask are +your guns lost or whatever so there's a lot + +0:33:22.358 --> 0:33:22.550 +of. + +0:33:24.164 --> 0:33:29.726 +The other thing that is more relevant right +now is that you can create parallel data by + +0:33:29.726 --> 0:33:35.478 +word to word translation right because you +know how to do all supervised word translation. + +0:33:36.376 --> 0:33:40.548 +First step is to create parallel data, assuming +that word translations are quite good. + +0:33:41.361 --> 0:33:47.162 +And then you claim a supervised and empty +model on these more likely wrong model data, + +0:33:47.162 --> 0:33:50.163 +but somehow gives you a good starting point. + +0:33:50.163 --> 0:33:56.098 +So you build your supervised and empty system +on the word translation data, and then you + +0:33:56.098 --> 0:33:59.966 +initialize it before you're doing unsupervised +and empty. + +0:34:00.260 --> 0:34:05.810 +And the hope is that when you're doing the +back pain installation, it's a good starting + +0:34:05.810 --> 0:34:11.234 +point, but it's one technique that you can +do to to improve your anthropoids and the. + +0:34:17.097 --> 0:34:25.879 +In the previous case we had: The way we know +when to stop was to see comedians on the gun + +0:34:25.879 --> 0:34:26.485 +training. + +0:34:26.485 --> 0:34:28.849 +Actually, all we want to do is when W. + +0:34:28.849 --> 0:34:32.062 +Comedians, which is quite easy to know when +to stop. + +0:34:32.062 --> 0:34:37.517 +But in a realistic case, we don't have any +parallel data right, so there's no validation. + +0:34:37.517 --> 0:34:42.002 +Or I mean, we might have test data in the +end, but there's no validation. + +0:34:43.703 --> 0:34:48.826 +How will we tune our hyper parameters in this +case because it's not really there's nothing + +0:34:48.826 --> 0:34:49.445 +for us to? + +0:34:50.130 --> 0:34:53.326 +Or the gold data in a sense like so. + +0:34:53.326 --> 0:35:01.187 +How do you think we can evaluate such systems +or how can we tune hyper parameters in this? + +0:35:11.711 --> 0:35:17.089 +So what you're going to do is use the back +translation technique. + +0:35:17.089 --> 0:35:24.340 +It's like a common technique where you have +nothing okay that is to use back translation + +0:35:24.340 --> 0:35:26.947 +somehow and what you can do is. 
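The word-by-word initialization mentioned a bit earlier in this passage (replace every word of a monolingual sentence by its unsupervised word translation and pre-train an MT system on the resulting noisy pairs) can be illustrated with a dictionary lookup. The tiny dictionary below is invented purely for the example.

```python
# hypothetical word-translation dictionary, e.g. induced with unsupervised
# cross-lingual embeddings; unknown words are simply copied
word_dict = {"das": "the", "haus": "house", "ist": "is", "groß": "big"}

def word_by_word(sentence, dictionary):
    return " ".join(dictionary.get(w, w) for w in sentence.lower().split())

mono_src = ["Das Haus ist groß"]
synthetic_pairs = [(s, word_by_word(s, word_dict)) for s in mono_src]
print(synthetic_pairs)
# -> [('Das Haus ist groß', 'the house is big')]
# These noisy pairs are only a starting point; the unsupervised training
# (denoising + back-translation) then runs on top of this initialization.
```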
+ +0:35:26.947 --> 0:35:31.673 +The main idea is validate on how good the +reconstruction. + +0:35:32.152 --> 0:35:37.534 +So the idea is that if you have a good system +then the intermediate translation is quite + +0:35:37.534 --> 0:35:39.287 +good and going back is easy. + +0:35:39.287 --> 0:35:44.669 +But if it's just noise that you generate in +the forward step then it's really hard to go + +0:35:44.669 --> 0:35:46.967 +back, which is kind of the main idea. + +0:35:48.148 --> 0:35:53.706 +So the way it works is that we take a source +sentence, we generate a translation in target + +0:35:53.706 --> 0:35:59.082 +language right, and then again can state the +generated sentence and compare it with the + +0:35:59.082 --> 0:36:01.342 +original one, and if they're closer. + +0:36:01.841 --> 0:36:09.745 +It means that we have a good system, and if +they are far this is kind of like an unsupervised + +0:36:09.745 --> 0:36:10.334 +grade. + +0:36:17.397 --> 0:36:21.863 +As far as the amount of data that you need. + +0:36:23.083 --> 0:36:27.995 +This was like the first initial resistance +on on these systems is that you had. + +0:36:27.995 --> 0:36:32.108 +They wanted to do English and French and they +had fifteen million. + +0:36:32.108 --> 0:36:38.003 +There was fifteen million more linguist sentences +so it's quite a lot and they were able to get + +0:36:38.003 --> 0:36:40.581 +thirty two blue on these kinds of setups. + +0:36:41.721 --> 0:36:47.580 +But unsurprisingly if you have zero point +one million pilot sentences you get the same + +0:36:47.580 --> 0:36:48.455 +performance. + +0:36:48.748 --> 0:36:50.357 +So it's a lot of training. + +0:36:50.357 --> 0:36:55.960 +It's a lot of monolingual data, but monolingual +data is relatively easy to obtain is the fact + +0:36:55.960 --> 0:37:01.264 +that the training is also quite longer than +the supervised system, but it's unsupervised + +0:37:01.264 --> 0:37:04.303 +so it's kind of the trade off that you are +making. + +0:37:07.367 --> 0:37:13.101 +The other thing to note is that it's English +and French, which is very close to our exemptions. + +0:37:13.101 --> 0:37:18.237 +Also, the monolingual data that they took +are kind of from similar domains and so on. + +0:37:18.638 --> 0:37:27.564 +So that's why they're able to build such a +good system, but you'll see later that it fails. + +0:37:36.256 --> 0:37:46.888 +Voice, and so mean what people usually do +is first build a system right using whatever + +0:37:46.888 --> 0:37:48.110 +parallel. + +0:37:48.608 --> 0:37:55.864 +Then they use monolingual data and do back +translation, so this is always being the standard + +0:37:55.864 --> 0:38:04.478 +way to to improve, and what people have seen +is that: You don't even need zero point one + +0:38:04.478 --> 0:38:05.360 +million right. + +0:38:05.360 --> 0:38:10.706 +You just need like ten thousand or so on and +then you do the monolingual back time station + +0:38:10.706 --> 0:38:12.175 +and you're still better. + +0:38:12.175 --> 0:38:13.291 +The answer is why. + +0:38:13.833 --> 0:38:19.534 +The question is it's really worth trying to +to do this or maybe it's always better to find + +0:38:19.534 --> 0:38:20.787 +some parallel data. + +0:38:20.787 --> 0:38:26.113 +I'll expand a bit of money on getting few +parallel data and then use it to start and + +0:38:26.113 --> 0:38:27.804 +find to build your system. + +0:38:27.804 --> 0:38:33.756 +So it was kind of the understanding that billing +wool and spoiled systems are not that really. 
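The unsupervised validation criterion described at the start of this passage, translate forward, translate back, and compare the reconstruction with the original, might look roughly like this. The two translate functions are placeholders for the two directions of the system, and `sacrebleu` is assumed to be installed.

```python
import sacrebleu

def round_trip_score(mono_sentences, translate_fwd, translate_bwd):
    """Unsupervised validation: higher round-trip BLEU ~ better system.

    translate_fwd / translate_bwd are placeholders for the two
    directions of the (unsupervised) model."""
    forward = [translate_fwd(s) for s in mono_sentences]     # src -> tgt
    reconstructed = [translate_bwd(t) for t in forward]      # tgt -> src
    return sacrebleu.corpus_bleu(reconstructed, [mono_sentences]).score

# e.g. pick hyper-parameters or a checkpoint without any parallel data:
# best = max(checkpoints, key=lambda m: round_trip_score(dev_mono, m.fwd, m.bwd))
```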
+ +0:38:50.710 --> 0:38:54.347 +The thing is that with unlabeled data. + +0:38:57.297 --> 0:39:05.488 +Not in an obtaining signal, so when we are +starting basically what we want to do is first + +0:39:05.488 --> 0:39:13.224 +get a good translation system and then use +an unlabeled monolingual data to improve. + +0:39:13.613 --> 0:39:15.015 +But if you start from U. + +0:39:15.015 --> 0:39:15.183 +N. + +0:39:15.183 --> 0:39:20.396 +Empty our model might be really bad like it +would be somewhere translating completely wrong. + +0:39:20.760 --> 0:39:26.721 +And then when you find your unlabeled data, +it basically might be harming, or maybe the + +0:39:26.721 --> 0:39:28.685 +same as supervised applause. + +0:39:28.685 --> 0:39:35.322 +So the thing is, I hope, by fine tuning on +labeled data as first is to get a good initialization. + +0:39:35.835 --> 0:39:38.404 +And then use the unsupervised techniques to +get better. + +0:39:38.818 --> 0:39:42.385 +But if your starting point is really bad then +it's not. + +0:39:45.185 --> 0:39:47.324 +Year so as we said before. + +0:39:47.324 --> 0:39:52.475 +This is kind of like the self supervised training +usually works. + +0:39:52.475 --> 0:39:54.773 +First we have parallel data. + +0:39:56.456 --> 0:39:58.062 +Source language is X. + +0:39:58.062 --> 0:39:59.668 +Target language is Y. + +0:39:59.668 --> 0:40:06.018 +In the end we want a system that does X to +Y, not Y to X, but first we want to train a + +0:40:06.018 --> 0:40:10.543 +backward model as it is Y to X, so target language +to source. + +0:40:11.691 --> 0:40:17.353 +Then we take our moonlighting will target +sentences, use our backward model to generate + +0:40:17.353 --> 0:40:21.471 +synthetic source, and then we join them with +our original data. + +0:40:21.471 --> 0:40:27.583 +So now we have this noisy input, but always +the gold output, which is kind of really important + +0:40:27.583 --> 0:40:29.513 +when you're doing backpaints. + +0:40:30.410 --> 0:40:36.992 +And then you can coordinate these big data +and then you can train your X to Y cholesterol + +0:40:36.992 --> 0:40:44.159 +system and then you can always do this in multiple +steps and usually three, four steps which kind + +0:40:44.159 --> 0:40:48.401 +of improves always and then finally get your +best system. + +0:40:49.029 --> 0:40:54.844 +The point that I'm trying to make is that +although answers and MPs the scores that I've + +0:40:54.844 --> 0:41:00.659 +shown before were quite good, you probably +can get the same performance with with fifty + +0:41:00.659 --> 0:41:06.474 +thousand sentences, and also the languages +that they've shown are quite similar and the + +0:41:06.474 --> 0:41:08.654 +texts were from the same domain. + +0:41:14.354 --> 0:41:21.494 +So any questions on u n m t ok yeah. + +0:41:22.322 --> 0:41:28.982 +So after this fact that temperature was already +better than than empty, what people have tried + +0:41:28.982 --> 0:41:34.660 +is to use this idea of multilinguality as you +have seen in the previous lecture. + +0:41:34.660 --> 0:41:41.040 +The question is how can we do this knowledge +transfer from high resource language to lower + +0:41:41.040 --> 0:41:42.232 +source language? + +0:41:44.484 --> 0:41:51.074 +One way to promote this language independent +representations is to share the encoder and + +0:41:51.074 --> 0:41:57.960 +decoder for all languages, all their available +languages, and that kind of hopefully enables + +0:41:57.960 --> 0:42:00.034 +the the knowledge transfer. 
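Before the lecture moves on to multilinguality: the standard back-translation recipe described just above (train a backward model, translate target-side monolingual data into synthetic source, concatenate with the gold parallel data, retrain the forward model, and repeat for a few rounds) can be sketched as follows. `train_model` and the `.translate` method are placeholders, not a real toolkit API.

```python
def iterative_back_translation(parallel, mono_tgt, train_model, rounds=3):
    """Iterative back-translation (sketch).

    parallel    : list of (src, tgt) gold pairs
    mono_tgt    : list of target-language monolingual sentences
    train_model : placeholder; train_model(pairs, direction) returns a model
                  with a .translate(sentence) method
    """
    forward = None
    for _ in range(rounds):
        # 1) backward model: target -> source
        backward = train_model([(t, s) for s, t in parallel], direction="tgt->src")
        # 2) synthetic source for the monolingual target data
        synthetic = [(backward.translate(t), t) for t in mono_tgt]
        # 3) noisy input, gold output: concatenate and retrain the forward model
        forward = train_model(parallel + synthetic, direction="src->tgt")
    return forward
```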
+ +0:42:03.323 --> 0:42:08.605 +When we're doing multilinguality, the two +questions we need to to think of is how does + +0:42:08.605 --> 0:42:09.698 +the encoder know? + +0:42:09.698 --> 0:42:14.495 +How does the encoder encoder know which language +that we're dealing with that? + +0:42:15.635 --> 0:42:20.715 +You already might have known the answer also, +and the second question is how can we promote + +0:42:20.715 --> 0:42:24.139 +the encoder to generate language independent +representations? + +0:42:25.045 --> 0:42:32.580 +By solving these two problems we can take +help of high resource languages to do unsupervised + +0:42:32.580 --> 0:42:33.714 +translations. + +0:42:34.134 --> 0:42:40.997 +Typical example would be you want to do unsurpressed +between English and Dutch right, but you are + +0:42:40.997 --> 0:42:47.369 +parallel data between English and German, so +the question is can we use this parallel data + +0:42:47.369 --> 0:42:51.501 +to help building an unsurpressed betweenEnglish +and Dutch? + +0:42:56.296 --> 0:43:01.240 +For the first one we try to take help of language +embeddings for tokens, and this kind of is + +0:43:01.240 --> 0:43:05.758 +a straightforward way to know to tell them +well which language they're dealing with. + +0:43:06.466 --> 0:43:11.993 +And for the second one we're going to look +at some pre training objectives which are also + +0:43:11.993 --> 0:43:17.703 +kind of unsupervised so we need monolingual +data mostly and this kind of helps us to promote + +0:43:17.703 --> 0:43:20.221 +the language independent representation. + +0:43:23.463 --> 0:43:29.954 +So the first three things more that we'll +look at is excel, which is quite famous if + +0:43:29.954 --> 0:43:32.168 +you haven't heard of it yet. + +0:43:32.552 --> 0:43:40.577 +And: The way it works is that it's basically +a transformer encoder right, so it's like the + +0:43:40.577 --> 0:43:42.391 +just the encoder module. + +0:43:42.391 --> 0:43:44.496 +No, there's no decoder here. + +0:43:44.884 --> 0:43:51.481 +And what we're trying to do is mask two tokens +in a sequence and try to predict these mask + +0:43:51.481 --> 0:43:52.061 +tokens. + +0:43:52.061 --> 0:43:55.467 +So I quickly called us mask language modeling. + +0:43:55.996 --> 0:44:05.419 +Typical language modeling that you see is +the Danish language modeling where you predict + +0:44:05.419 --> 0:44:08.278 +the next token in English. + +0:44:08.278 --> 0:44:11.136 +Then we have the position. + +0:44:11.871 --> 0:44:18.774 +Then we have the token embellings, and then +here we have the mass token, and then we have + +0:44:18.774 --> 0:44:22.378 +the transformer encoder blocks to predict the. + +0:44:24.344 --> 0:44:30.552 +To do this for all languages using the same +tang somewhere encoded and this kind of helps + +0:44:30.552 --> 0:44:36.760 +us to push the the sentence and bearings or +the output of the encoded into a common space + +0:44:36.760 --> 0:44:37.726 +per multiple. + +0:44:42.782 --> 0:44:49.294 +So first we train an MLM on both source, both +source and target language sites, and then + +0:44:49.294 --> 0:44:54.928 +we use it as a starting point for the encoded +and decoded for a UNMP system. + +0:44:55.475 --> 0:45:03.175 +So we take a monolingual data, build a mass +language model on both source and target languages, + +0:45:03.175 --> 0:45:07.346 +and then read it to be or initialize that in +the U. + +0:45:07.346 --> 0:45:07.586 +N. + +0:45:07.586 --> 0:45:07.827 +P. + +0:45:07.827 --> 0:45:08.068 +C. 
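The masked language modelling objective described here, mask some tokens and let a shared Transformer encoder predict them for all languages, can be illustrated with a small masking function. The 15% masking rate and the `[MASK]` symbol follow common practice and are assumptions, not taken from the lecture.

```python
import random

def mask_tokens(tokens, mask_prob=0.15, mask_symbol="[MASK]", seed=None):
    """BERT/XLM-style masking: returns the corrupted input plus the
    positions and original tokens the encoder has to predict."""
    rng = random.Random(seed)
    corrupted, targets = [], {}
    for i, tok in enumerate(tokens):
        if rng.random() < mask_prob:
            corrupted.append(mask_symbol)
            targets[i] = tok          # loss is computed only at masked positions
        else:
            corrupted.append(tok)
    return corrupted, targets

print(mask_tokens("the same encoder is shared across all languages".split(), seed=1))
```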
+ +0:45:09.009 --> 0:45:14.629 +Here we look at two languages, but you can +also do it with one hundred languages once. + +0:45:14.629 --> 0:45:20.185 +So they're retain checkpoints that you can +use, which are quite which have seen quite + +0:45:20.185 --> 0:45:21.671 +a lot of data and use. + +0:45:21.671 --> 0:45:24.449 +It always has a starting point for your U. + +0:45:24.449 --> 0:45:24.643 +N. + +0:45:24.643 --> 0:45:27.291 +MP system, which in practice works well. + +0:45:31.491 --> 0:45:36.759 +This detail is that since this is an encoder +block only, and your U. + +0:45:36.759 --> 0:45:36.988 +N. + +0:45:36.988 --> 0:45:37.217 +M. + +0:45:37.217 --> 0:45:37.446 +T. + +0:45:37.446 --> 0:45:40.347 +System is encodered, decodered right. + +0:45:40.347 --> 0:45:47.524 +So there's this cross attention that's missing, +but you can always branch like that randomly. + +0:45:47.524 --> 0:45:48.364 +It's fine. + +0:45:48.508 --> 0:45:53.077 +Not everything is initialized, but it's still +decent. + +0:45:56.056 --> 0:46:02.141 +Then we have the other one is M by plane, +and here you see that this kind of builds on + +0:46:02.141 --> 0:46:07.597 +the the unsupervised training objector, which +is the realizing auto encoding. + +0:46:08.128 --> 0:46:14.337 +So what they do is they say that we don't +even need to do the gun outback translation, + +0:46:14.337 --> 0:46:17.406 +but you can do it later, but pre training. + +0:46:17.406 --> 0:46:24.258 +We just do do doing doing doing water inputting +on all different languages, and that also gives + +0:46:24.258 --> 0:46:32.660 +you: Out of the box good performance, so what +we basically have here is the transformer encoded. + +0:46:34.334 --> 0:46:37.726 +You are trying to generate a reconstructed +sequence. + +0:46:37.726 --> 0:46:38.942 +You need a tickle. + +0:46:39.899 --> 0:46:42.022 +So we gave an input sentence. + +0:46:42.022 --> 0:46:48.180 +We tried to predict the masked tokens from +the or we tried to reconstruct the original + +0:46:48.180 --> 0:46:52.496 +sentence from the input segments, which was +corrupted right. + +0:46:52.496 --> 0:46:57.167 +So this is the same denoting objective that +you have seen before. + +0:46:58.418 --> 0:46:59.737 +This is for English. + +0:46:59.737 --> 0:47:04.195 +I think this is for Japanese and then once +we do it for all languages. + +0:47:04.195 --> 0:47:09.596 +I mean they have this difference on twenty +five, fifty or so on and then you can find + +0:47:09.596 --> 0:47:11.794 +you on your sentence and document. + +0:47:13.073 --> 0:47:20.454 +And so what they is this for the supervised +techniques, but you can also use this as initializations + +0:47:20.454 --> 0:47:25.058 +for unsupervised buildup on that which also +in practice works. + +0:47:30.790 --> 0:47:36.136 +Then we have these, so still now we kind of +didn't see the the states benefit from the + +0:47:36.136 --> 0:47:38.840 +high resource language right, so as I said. + +0:47:38.878 --> 0:47:44.994 +Why you can use English as something for English +to Dutch, and if you want a new Catalan, you + +0:47:44.994 --> 0:47:46.751 +can use English to French. + +0:47:48.408 --> 0:47:55.866 +One typical way to do this is to use favorite +translation lights or you take the. + +0:47:55.795 --> 0:48:01.114 +So here it's finished two weeks so you take +your time say from finish to English English + +0:48:01.114 --> 0:48:03.743 +two weeks and then you get the translation. 
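Pivot translation, as in the Finnish-to-Greek example just mentioned, is simply the composition of two systems through a pivot language such as English. A sketch with placeholder translation functions standing in for the two trained systems:

```python
def pivot_translate(sentence, src_to_pivot, pivot_to_tgt):
    """Translate via a pivot language (e.g. fi -> en -> el).

    src_to_pivot / pivot_to_tgt are placeholders for two MT systems."""
    pivot = src_to_pivot(sentence)   # e.g. Finnish -> English
    return pivot_to_tgt(pivot)       # e.g. English -> Greek

# Whether pivoting beats a direct (zero-shot or unsupervised) system depends on
# the data situation and on how much information is lost in the pivot step.
```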
+ +0:48:04.344 --> 0:48:10.094 +What's important is that you have these different +techniques and you can always think of which + +0:48:10.094 --> 0:48:12.333 +one to use given the data situation. + +0:48:12.333 --> 0:48:18.023 +So if it was like finish to Greek maybe it's +pivotal better because you might get good finish + +0:48:18.023 --> 0:48:20.020 +to English and English to Greek. + +0:48:20.860 --> 0:48:23.255 +Sometimes it also depends on the language +pair. + +0:48:23.255 --> 0:48:27.595 +There might be some information loss and so +on, so there are quite a few variables you + +0:48:27.595 --> 0:48:30.039 +need to think of and decide which system to +use. + +0:48:32.752 --> 0:48:39.654 +Then there's a zero shot, which probably also +I've seen in the multilingual course, and how + +0:48:39.654 --> 0:48:45.505 +if you can improve the language independence +then your zero shot gets better. + +0:48:45.505 --> 0:48:52.107 +So maybe if you use the multilingual models +and do zero shot directly, it's quite good. + +0:48:53.093 --> 0:48:58.524 +Thought we have zero shots per word, and then +we have the answer to voice translation where + +0:48:58.524 --> 0:49:00.059 +we can calculate between. + +0:49:00.600 --> 0:49:02.762 +Just when there is no battle today. + +0:49:06.686 --> 0:49:07.565 +Is to solve. + +0:49:07.565 --> 0:49:11.959 +So sometimes what we have seen so far is that +we basically have. + +0:49:15.255 --> 0:49:16.754 +To do from looking at it. + +0:49:16.836 --> 0:49:19.307 +These two files alone you can create a dictionary. + +0:49:19.699 --> 0:49:26.773 +Can build an unsupervised entry system, not +always, but if the domains are similar in the + +0:49:26.773 --> 0:49:28.895 +languages, that's similar. + +0:49:28.895 --> 0:49:36.283 +But if there are distant languages, then the +unsupervised texting doesn't usually work really + +0:49:36.283 --> 0:49:36.755 +well. + +0:49:37.617 --> 0:49:40.297 +What um. + +0:49:40.720 --> 0:49:46.338 +Would be is that if you can get some paddle +data from somewhere or do bitex mining that + +0:49:46.338 --> 0:49:51.892 +we have seen in the in the laser practicum +then you can use that as to initialize your + +0:49:51.892 --> 0:49:57.829 +system and then try and accept a semi supervised +energy system and that would be better than + +0:49:57.829 --> 0:50:00.063 +just building an unsupervised and. + +0:50:00.820 --> 0:50:06.546 +With that as the end. + +0:50:07.207 --> 0:50:08.797 +Quickly could be. + +0:50:16.236 --> 0:50:25.070 +In common, they can catch the worst because +the thing about finding a language is: And + +0:50:25.070 --> 0:50:34.874 +there's another joy in playing these games, +almost in the middle of a game, and she's a + +0:50:34.874 --> 0:50:40.111 +characteristic too, and she is a global waver. + +0:50:56.916 --> 0:51:03.798 +Next talk inside and this somehow gives them +many abilities, not only translation but other + +0:51:03.798 --> 0:51:08.062 +than that there are quite a few things that +they can do. + +0:51:10.590 --> 0:51:17.706 +But the translation in itself usually doesn't +really work really well if you build a system + +0:51:17.706 --> 0:51:20.878 +from your specific system for your case. + +0:51:22.162 --> 0:51:27.924 +I would guess that it's usually better than +the LLM, but you can always adapt the LLM to + +0:51:27.924 --> 0:51:31.355 +the task that you want, and then it could be +better. + +0:51:32.152 --> 0:51:37.849 +A little amount of the box might not be the +best choice for your task force. 
+ +0:51:37.849 --> 0:51:44.138 +For me, I'm working on new air translation, +so it's more about translating software. + +0:51:45.065 --> 0:51:50.451 +And it's quite often each domain as well, +and if use the LLM out of the box, they're + +0:51:50.451 --> 0:51:53.937 +actually quite bad compared to the systems +that built. + +0:51:54.414 --> 0:51:56.736 +But you can do these different techniques +like prompting. + +0:51:57.437 --> 0:52:03.442 +This is what people usually do is heart prompting +where they give similar translation pairs in + +0:52:03.442 --> 0:52:08.941 +the prompt and then ask it to translate and +then that kind of improves the performance + +0:52:08.941 --> 0:52:09.383 +a lot. + +0:52:09.383 --> 0:52:15.135 +So there are different techniques that you +can do to adapt your eye lens and then it might + +0:52:15.135 --> 0:52:16.399 +be better than the. + +0:52:16.376 --> 0:52:17.742 +Task a fixed system. + +0:52:18.418 --> 0:52:22.857 +But if you're looking for niche things, I +don't think error limbs are that good. + +0:52:22.857 --> 0:52:26.309 +But if you want to do to do, let's say, unplugged +translation. + +0:52:26.309 --> 0:52:30.036 +In this case you can never be sure that they +haven't seen the data. + +0:52:30.036 --> 0:52:35.077 +First of all is that if you see the data in +that language or not, and if they're panthetic, + +0:52:35.077 --> 0:52:36.831 +they probably did see the data. + +0:52:40.360 --> 0:53:00.276 +I feel like they have pretty good understanding +of each million people. + +0:53:04.784 --> 0:53:09.059 +Depends on the language, but I'm pretty surprised +that it works on a lotus language. + +0:53:09.059 --> 0:53:11.121 +I would expect it to work on German and. + +0:53:11.972 --> 0:53:13.633 +But if you take a lot of first language,. + +0:53:14.474 --> 0:53:20.973 +Don't think it works, and also there are quite +a few papers where they've already showed that + +0:53:20.973 --> 0:53:27.610 +if you build a system yourself or build a typical +way to build a system, it's quite better than + +0:53:27.610 --> 0:53:29.338 +the bit better than the. + +0:53:29.549 --> 0:53:34.883 +But you can always do things with limbs to +get better, but then I'm probably. + +0:53:37.557 --> 0:53:39.539 +Anymore. + +0:53:41.421 --> 0:53:47.461 +So if not then we're going to end the lecture +here and then on Thursday we're going to have + +0:53:47.461 --> 0:53:51.597 +documented empty which is also run by me so +thanks for coming. + diff --git a/demo_data/lectures/Lecture-15-11.07.2023/video.mp4 b/demo_data/lectures/Lecture-15-11.07.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..52d7c21a294e05ca4dc9cdcad0415fb9aec48fa8 --- /dev/null +++ b/demo_data/lectures/Lecture-15-11.07.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62985057e3dfdb7c34a3ef8e74a9b52e9529b2a974ff62438c617e6d699b5a89 +size 81272567 diff --git a/demo_data/lectures/Lecture-18-18.07.2023/English.vtt b/demo_data/lectures/Lecture-18-18.07.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..76516f32273e4a4b40566dbf931027cddab91c05 --- /dev/null +++ b/demo_data/lectures/Lecture-18-18.07.2023/English.vtt @@ -0,0 +1,2732 @@ +WEBVTT + +0:00:01.541 --> 0:00:06.926 +Okay, so we'll come back to today's lecture. + +0:00:08.528 --> 0:00:23.334 +We want to talk about is speech translation, +so we'll have two lectures in this week about + +0:00:23.334 --> 0:00:26.589 +speech translation. 
+ +0:00:27.087 --> 0:00:36.456 +And so in the last week we'll have some exercise +and repetition. + +0:00:36.456 --> 0:00:46.690 +We want to look at what is now to do when +we want to translate speech. + +0:00:46.946 --> 0:00:55.675 +So we want to address the specific challenges +that occur when we switch from translating + +0:00:55.675 --> 0:00:56.754 +to speech. + +0:00:57.697 --> 0:01:13.303 +Today we will look at the more general picture +out and build the systems. + +0:01:13.493 --> 0:01:23.645 +And then secondly an end approach where we +are going to put in audio and generate. + +0:01:24.224 --> 0:01:41.439 +Which are the main dominant systems which +are used in research and commercial systems. + +0:01:43.523 --> 0:01:56.879 +More general, what is the general task of +speech translation that is shown here? + +0:01:56.879 --> 0:02:01.826 +The idea is we have a speech. + +0:02:02.202 --> 0:02:12.838 +Then we want to have a system which takes +this audio and then translates it into another + +0:02:12.838 --> 0:02:14.033 +language. + +0:02:15.095 --> 0:02:20.694 +Then it's no longer as clear the output modality. + +0:02:20.694 --> 0:02:33.153 +In contrast, for humans we can typically have: +So you can either have more textual translation, + +0:02:33.153 --> 0:02:37.917 +then you have subtitles, and the. + +0:02:38.538 --> 0:02:57.010 +Are you want to have it also in audio like +it's done for human interpretation? + +0:02:57.417 --> 0:03:03.922 +See there is not the one best solution, so +all of this one is always better. + +0:03:03.922 --> 0:03:09.413 +It heavily depends on what is the use of what +the people prefer. + +0:03:09.929 --> 0:03:14.950 +For example, you can think of if you know +a bit the source of language, but you're a + +0:03:14.950 --> 0:03:17.549 +bit unsure and don't understand everything. + +0:03:17.549 --> 0:03:23.161 +They may texture it out for this pattern because +you can direct your gear to what was said and + +0:03:23.161 --> 0:03:26.705 +only if you're unsure you check down with your +translation. + +0:03:27.727 --> 0:03:33.511 +Are another things that might be preferable +to have a complete spoken of. + +0:03:34.794 --> 0:03:48.727 +So there are both ones for a long time in +automatic systems focused mainly on text output. + +0:03:48.727 --> 0:04:06.711 +In most cases: But of course you can always +hand them to text to speech systems which generates + +0:04:06.711 --> 0:04:09.960 +audio from that. + +0:04:12.772 --> 0:04:14.494 +Why should we care about that? + +0:04:14.494 --> 0:04:15.771 +Why should we do that? + +0:04:17.737 --> 0:04:24.141 +There is the nice thing that yeah, with a +globalized world, we are able to now interact + +0:04:24.141 --> 0:04:25.888 +with a lot more people. + +0:04:25.888 --> 0:04:29.235 +You can do some conferences around the world. + +0:04:29.235 --> 0:04:31.564 +We can travel around the world. + +0:04:31.671 --> 0:04:37.802 +We can by Internet watch movies from all over +the world and watch TV from all over the world. + +0:04:38.618 --> 0:04:47.812 +However, there is still this barrier that +is mainly to watch videos, either in English + +0:04:47.812 --> 0:04:49.715 +or in a language. + +0:04:50.250 --> 0:05:00.622 +So what is currently happening in order to +reach a large audience is that everybody. + +0:05:00.820 --> 0:05:07.300 +So if we are going, for example, to a conferences, +these are international conferences. 
+ +0:05:08.368 --> 0:05:22.412 +However, everybody will then speak English +since that is some of the common language that + +0:05:22.412 --> 0:05:26.001 +everybody understands. + +0:05:26.686 --> 0:05:32.929 +So on the other hand, we cannot like have +human interpreters like they ever work. + +0:05:32.892 --> 0:05:37.797 +You have that maybe in the European Parliament +or in important business meetings. + +0:05:38.078 --> 0:05:47.151 +But this is relatively expensive, and so the +question is, can we enable communication in + +0:05:47.151 --> 0:05:53.675 +your mother-in-law without having to have human +interpretation? + +0:05:54.134 --> 0:06:04.321 +And there like speech translation can be helpful +in order to help you bridge this gap. + +0:06:06.726 --> 0:06:22.507 +In this case, there are different scenarios +of how you can apply speech translation. + +0:06:22.422 --> 0:06:29.282 +That's typically more interactive than we +are talking about text translation. + +0:06:29.282 --> 0:06:32.800 +Text translation is most commonly used. + +0:06:33.153 --> 0:06:41.637 +Course: Nowadays there's things like chat +and so on where it could also be interactive. + +0:06:42.082 --> 0:06:48.299 +In contrast to speech translation, that is +less static, so there is different ways of + +0:06:48.299 --> 0:06:48.660 +how. + +0:06:49.149 --> 0:07:00.544 +The one scenario is what is called a translation +where you first get an input, then you translate + +0:07:00.544 --> 0:07:03.799 +this fixed input, and then. + +0:07:04.944 --> 0:07:12.823 +With me, which means you have always like +fixed, yeah fixed challenges which you need + +0:07:12.823 --> 0:07:14.105 +to translate. + +0:07:14.274 --> 0:07:25.093 +You don't need to like beat your mind what +are the boundaries where there's an end. + +0:07:25.405 --> 0:07:31.023 +Also, there is no overlapping. + +0:07:31.023 --> 0:07:42.983 +There is always a one-person sentence that +is getting translated. + +0:07:43.443 --> 0:07:51.181 +Of course, this has a disadvantage that it +makes the conversation a lot longer because + +0:07:51.181 --> 0:07:55.184 +you always have only speech and translation. + +0:07:57.077 --> 0:08:03.780 +For example, if you would use that for a presentation +there would be yeah quite get quite long, if + +0:08:03.780 --> 0:08:09.738 +I would just imagine you sitting here in the +lecture I would say three sentences that I + +0:08:09.738 --> 0:08:15.765 +would wait for this interpreter to translate +it, then I would say the next two sentences + +0:08:15.765 --> 0:08:16.103 +and. + +0:08:16.676 --> 0:08:28.170 +That is why in these situations, for example, +if you have a direct conversation with a patient, + +0:08:28.170 --> 0:08:28.888 +then. + +0:08:29.209 --> 0:08:32.733 +But still there it's too big to be taking +them very long. + +0:08:33.473 --> 0:08:42.335 +And that's why there's also the research on +simultaneous translation, where the idea is + +0:08:42.335 --> 0:08:43.644 +in parallel. + +0:08:43.964 --> 0:08:46.179 +That Is the Dining for Human. + +0:08:46.126 --> 0:08:52.429 +Interpretation like if you think of things +like the European Parliament where they of + +0:08:52.429 --> 0:08:59.099 +course not only speak always one sentence but +are just giving their speech and in parallel + +0:08:59.099 --> 0:09:04.157 +human interpreters are translating the speech +into another language. 
+ +0:09:04.985 --> 0:09:12.733 +The same thing is interesting for automatic +speech translation where we in parallel generate + +0:09:12.733 --> 0:09:13.817 +translation. + +0:09:15.415 --> 0:09:32.271 +The challenges then, of course, are that we +need to segment our speech into somehow's chunks. + +0:09:32.152 --> 0:09:34.903 +We just looked for the dots we saw. + +0:09:34.903 --> 0:09:38.648 +There are some challenges that we have to +check. + +0:09:38.648 --> 0:09:41.017 +The Doctor may not understand. + +0:09:41.201 --> 0:09:47.478 +But in generally getting sentence boundary +sentences is not a really research question. + +0:09:47.647 --> 0:09:51.668 +While in speech translation, this is not that +easy. + +0:09:51.952 --> 0:10:05.908 +Either getting that in the audio is difficult +because it's not like we typically do breaks + +0:10:05.908 --> 0:10:09.742 +when there's a sentence. + +0:10:10.150 --> 0:10:17.432 +And even if you then see the transcript and +would have to add the punctuation, this is + +0:10:17.432 --> 0:10:18.101 +not as. + +0:10:20.340 --> 0:10:25.942 +Another question is how many speakers we have +here. + +0:10:25.942 --> 0:10:31.759 +In presentations you have more like a single +speaker. + +0:10:31.931 --> 0:10:40.186 +That is normally easier from the part of audio +processing, so in general in speech translation. + +0:10:40.460 --> 0:10:49.308 +You can have different challenges and they +can be of different components. + +0:10:49.308 --> 0:10:57.132 +In addition to translation, you have: And +if you're not going, for example, the magical + +0:10:57.132 --> 0:11:00.378 +speaker, there are significantly additional +challenges. + +0:11:00.720 --> 0:11:10.313 +So we as humans we are very good in filtering +out noises, or if two people speak in parallel + +0:11:10.313 --> 0:11:15.058 +to like separate these two speakers and hear. + +0:11:15.495 --> 0:11:28.300 +However, if you want to do that with automatic +systems that is very challenging so that you + +0:11:28.300 --> 0:11:33.172 +can separate the speakers so that. + +0:11:33.453 --> 0:11:41.284 +For the more of you have this multi-speaker +scenario, typically it's also less well prepared. + +0:11:41.721 --> 0:11:45.807 +So you're getting very, we'll talk about the +spontaneous effects. + +0:11:46.186 --> 0:11:53.541 +So people like will stop in the middle of +the sentence, they change their sentence, and + +0:11:53.541 --> 0:12:01.481 +so on, and like filtering these, these fluences +out of the text and working with them is often + +0:12:01.481 --> 0:12:02.986 +very challenging. + +0:12:05.565 --> 0:12:09.144 +So these are all additional challenges when +you have multiples. + +0:12:10.330 --> 0:12:19.995 +Then there's a question of an online or offline +system, sometimes textbook station. + +0:12:19.995 --> 0:12:21.836 +We also mainly. + +0:12:21.962 --> 0:12:36.507 +That means you can take the whole text and +you can translate it in a badge. + +0:12:37.337 --> 0:12:44.344 +However, for speech translation there's also +several scenarios where this is the case. + +0:12:44.344 --> 0:12:51.513 +For example, when you're translating a movie, +it's not only that you don't have to do it + +0:12:51.513 --> 0:12:54.735 +live, but you can take the whole movie. + +0:12:55.215 --> 0:13:05.473 +However, there is also a lot of situations +where you don't have this opportunity like + +0:13:05.473 --> 0:13:06.785 +or sports. 
+ +0:13:07.247 --> 0:13:13.963 +And you don't want to like first like let +around a sports event and then like show in + +0:13:13.963 --> 0:13:19.117 +the game three hours later then there is not +really any interest. + +0:13:19.399 --> 0:13:31.118 +So you have to do it live, and so we have +the additional challenge of translating the + +0:13:31.118 --> 0:13:32.208 +system. + +0:13:32.412 --> 0:13:42.108 +There are still things on the one end of course. + +0:13:42.108 --> 0:13:49.627 +It needs to be real time translation. + +0:13:49.869 --> 0:13:54.153 +It's taking longer, then you're getting more +and more and more delayed. + +0:13:55.495 --> 0:14:05.245 +So it maybe seems simple, but there have been +research systems which are undertime slower + +0:14:05.245 --> 0:14:07.628 +than real time or so. + +0:14:07.628 --> 0:14:15.103 +If you want to show what is possible with +the best current systems,. + +0:14:16.596 --> 0:14:18.477 +But that isn't even not enough. + +0:14:18.918 --> 0:14:29.593 +The other question: You can have a system +which is even like several times real time. + +0:14:29.509 --> 0:14:33.382 +In less than one second, it might still be +not useful. + +0:14:33.382 --> 0:14:39.648 +Then the question is like the latency, so +how much time has passed since you can produce + +0:14:39.648 --> 0:14:39.930 +an. + +0:14:40.120 --> 0:14:45.814 +It might be that in average you can like concress +it, but you still can't do it directly. + +0:14:45.814 --> 0:14:51.571 +You need to do it after, or you need to have +the full context of thirty seconds before you + +0:14:51.571 --> 0:14:55.178 +can output something, and then you have a large +latency. + +0:14:55.335 --> 0:15:05.871 +So it can be that do it as fast as it is produced, +but have to wait until the food. + +0:15:06.426 --> 0:15:13.772 +So we'll look into that on Thursday how we +can then generate translations that are having + +0:15:13.772 --> 0:15:14.996 +a low latency. + +0:15:15.155 --> 0:15:21.587 +You can imagine, for example, in German that +it's maybe quite challenging since the word + +0:15:21.587 --> 0:15:23.466 +is often like at the end. + +0:15:23.466 --> 0:15:30.115 +If you're using perfect, like in harbor and +so on, and then in English you have to directly + +0:15:30.115 --> 0:15:30.983 +produce it. + +0:15:31.311 --> 0:15:38.757 +So if you really want to have no context you +might need to wait until the end of the sentence. + +0:15:41.021 --> 0:15:45.920 +Besides that, of course, offline and it gives +you more additional help. + +0:15:45.920 --> 0:15:52.044 +I think last week you talked about context +based systems that typically have context from + +0:15:52.044 --> 0:15:55.583 +maybe from the past but maybe also from the +future. + +0:15:55.595 --> 0:16:02.923 +Then, of course, you cannot use anything from +the future in this case, but you can use it. + +0:16:07.407 --> 0:16:24.813 +Finally, there is a thing about how you want +to present it to the audience in automatic + +0:16:24.813 --> 0:16:27.384 +translation. + +0:16:27.507 --> 0:16:31.361 +There is also the thing that you want to do. + +0:16:31.361 --> 0:16:35.300 +All your outfits are running like the system. + +0:16:35.996 --> 0:16:36.990 +Top of it. + +0:16:36.990 --> 0:16:44.314 +Then they answered questions: How should it +be spoken so you can do things like. + +0:16:46.586 --> 0:16:52.507 +Voice cloning so that it's like even the same +voice than the original speaker. 
+ +0:16:53.994 --> 0:16:59.081 +And if you do text or dubbing then there might +be additional constraints. + +0:16:59.081 --> 0:17:05.729 +So if you think about subtitles: And they +should be readable, and we are too big to speak + +0:17:05.729 --> 0:17:07.957 +faster than you can maybe read. + +0:17:08.908 --> 0:17:14.239 +So you might need to shorten your text. + +0:17:14.239 --> 0:17:20.235 +People say that a subtitle can be two lines. + +0:17:20.235 --> 0:17:26.099 +Each line can be this number of characters. + +0:17:26.346 --> 0:17:31.753 +So you cannot like if you have too long text, +we might need to shorten that to do that. + +0:17:32.052 --> 0:17:48.272 +Similarly, if you think about dubbing, if +you want to produce dubbing voice, then the + +0:17:48.272 --> 0:17:50.158 +original. + +0:17:51.691 --> 0:17:59.294 +Here is another problem that we have different +settings like a more formal setting and let's + +0:17:59.294 --> 0:18:00.602 +have different. + +0:18:00.860 --> 0:18:09.775 +If you think about the United Nations maybe +you want more former things and between friends + +0:18:09.775 --> 0:18:14.911 +maybe that former and there are languages which +use. + +0:18:15.355 --> 0:18:21.867 +That is sure that is an important research +question. + +0:18:21.867 --> 0:18:28.010 +To do that would more think of it more generally. + +0:18:28.308 --> 0:18:32.902 +That's important in text translation. + +0:18:32.902 --> 0:18:41.001 +If you translate a letter to your boss, it +should sound different. + +0:18:42.202 --> 0:18:53.718 +So there is a question of how you can do this +style work on how you can do that. + +0:18:53.718 --> 0:19:00.542 +For example, if you can specify that you might. + +0:19:00.460 --> 0:19:10.954 +So you can tax the center or generate an informal +style because, as you correctly said, this + +0:19:10.954 --> 0:19:16.709 +is especially challenging again in the situations. + +0:19:16.856 --> 0:19:20.111 +Of course, there are ways of like being formal +or less formal. + +0:19:20.500 --> 0:19:24.846 +But it's not like as clear as you do it, for +example, in German where you have the twin + +0:19:24.846 --> 0:19:24.994 +C. + +0:19:25.165 --> 0:19:26.855 +So there is no one to own mapping. + +0:19:27.287 --> 0:19:34.269 +If you want to make that sure you can build +a system which generates different styles in + +0:19:34.269 --> 0:19:38.662 +the output, so yeah that's definitely also +a challenge. + +0:19:38.662 --> 0:19:43.762 +It just may be not mentioned here because +it's not specific now. + +0:19:44.524 --> 0:19:54.029 +Generally, of course, these are all challenges +in how to customize and adapt systems to use + +0:19:54.029 --> 0:19:56.199 +cases with specific. + +0:20:00.360 --> 0:20:11.020 +Speech translation has been done for quite +a while and it's maybe not surprising it started + +0:20:11.020 --> 0:20:13.569 +with more simple use. + +0:20:13.793 --> 0:20:24.557 +So people first started to look into, for +example, limited to main translations. + +0:20:24.557 --> 0:20:33.726 +The tourist was typically application if you're +going to a new city. + +0:20:34.834 --> 0:20:44.028 +Then there are several open things of doing +open domain translation, especially people. + +0:20:44.204 --> 0:20:51.957 +Like where there's a lot of data so you could +build systems which are more open to main, + +0:20:51.957 --> 0:20:55.790 +but of course it's still a bit restrictive. + +0:20:55.790 --> 0:20:59.101 +It's true in the European Parliament. 
+ +0:20:59.101 --> 0:21:01.888 +People talk about anything but. + +0:21:02.162 --> 0:21:04.820 +And so it's not completely used for everything. + +0:21:05.165 --> 0:21:11.545 +Nowadays we've seen this technology in a lot +of different situations guess you ought. + +0:21:11.731 --> 0:21:17.899 +Use it so there is some basic technologies +where you can use them already. + +0:21:18.218 --> 0:21:33.599 +There is still a lot of open questions going +from if you are going to really spontaneous + +0:21:33.599 --> 0:21:35.327 +meetings. + +0:21:35.655 --> 0:21:41.437 +Then these systems typically work good for +like some languages where we have a lot of + +0:21:41.437 --> 0:21:42.109 +friendly. + +0:21:42.742 --> 0:21:48.475 +But if we want to go for really low resource +data then things are often challenging. + +0:21:48.448 --> 0:22:02.294 +Last week we had a workshop on spoken language +translation and there is a low-resource data + +0:22:02.294 --> 0:22:05.756 +track which is dialed. + +0:22:05.986 --> 0:22:06.925 +And so on. + +0:22:06.925 --> 0:22:14.699 +All these languages can still then have significantly +lower performance than for a higher. + +0:22:17.057 --> 0:22:20.126 +So how does this work? + +0:22:20.126 --> 0:22:31.614 +If we want to do speech translation, there's +like three basic technology: So on the one + +0:22:31.614 --> 0:22:40.908 +hand, it's automatic speech recognition where +automatic speech recognition normally transacts + +0:22:40.908 --> 0:22:41.600 +audio. + +0:22:42.822 --> 0:22:58.289 +Then what we talked about here is machine +translation, which takes input and translates + +0:22:58.289 --> 0:23:01.276 +into the target. + +0:23:02.642 --> 0:23:11.244 +And the very simple model now, if you think +about it, is of course the similar combination. + +0:23:11.451 --> 0:23:14.740 +We have solved all these parts in a salt bedrock. + +0:23:14.975 --> 0:23:31.470 +We are working on all these problems there, +so if we want to do a speech transition, maybe. + +0:23:31.331 --> 0:23:35.058 +Such problems we just put all these combinations +together. + +0:23:35.335 --> 0:23:45.130 +And then you get what you have as a cascading +system, which first is so you take your audio. + +0:23:45.045 --> 0:23:59.288 +To take this as input and generate the output, +and then you take this text output, put it + +0:23:59.288 --> 0:24:00.238 +into. + +0:24:00.640 --> 0:24:05.782 +So in that way you have now. + +0:24:08.008 --> 0:24:18.483 +Have now a solution for generating doing speech +translation for these types of systems, and + +0:24:18.483 --> 0:24:20.874 +this type is called. + +0:24:21.681 --> 0:24:28.303 +It is still often reaching state of the art, +however it has benefits and disadvantages. + +0:24:28.668 --> 0:24:41.709 +So the one big benefit is we have independent +components and some of that is nice. + +0:24:41.709 --> 0:24:48.465 +So if there are great ideas put into your. + +0:24:48.788 --> 0:24:57.172 +And then some other times people develop a +new good way of how to improve. + +0:24:57.172 --> 0:25:00.972 +You can also take this model and. + +0:25:01.381 --> 0:25:07.639 +So you can leverage improvements from all +the different communities in order to adapt. + +0:25:08.288 --> 0:25:18.391 +Furthermore, we would like to see, since all +of them is learning, that the biggest advantage + +0:25:18.391 --> 0:25:23.932 +is that we have training data for each individual. 
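The cascade described in this passage, an ASR component whose text output is handed to an MT component, is in its simplest form just function composition, with a segmentation / punctuation step in between; that middle step is exactly the adaptation problem discussed in what follows. The `asr`, `segmenter` and `mt` callables below are placeholders, not any concrete toolkit.

```python
def cascaded_speech_translation(audio, asr, segmenter, mt):
    """Cascaded ST (sketch): audio -> transcript -> sentence-like units -> translation.

    asr(audio)       -> lower-cased, unpunctuated transcript   (placeholder)
    segmenter(text)  -> list of punctuated sentence-like units (placeholder)
    mt(sentence)     -> translated sentence                    (placeholder)
    """
    transcript = asr(audio)
    sentences = segmenter(transcript)   # re-case / re-punctuate / re-segment
    return [mt(s) for s in sentences]
```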
+ +0:25:24.164 --> 0:25:34.045 +So there's a lot less training data where +you have the English audio, so it's easy to + +0:25:34.045 --> 0:25:34.849 +train. + +0:25:36.636 --> 0:25:48.595 +Now am a one that we will focus on when talking +about the cascaded approach is that often it. + +0:25:48.928 --> 0:25:58.049 +So you need to adapt each component a bit +so that it's adapting to its input and. + +0:25:58.278 --> 0:26:07.840 +So we'll focus there especially on how to +combine and since said the main focus is: So + +0:26:07.840 --> 0:26:18.589 +if you would directly use an output that might +not work as perfect as you would,. + +0:26:18.918 --> 0:26:33.467 +So a major challenge when building a cascade +of speech translation systems is how can we + +0:26:33.467 --> 0:26:38.862 +adapt these systems and how can? + +0:26:41.681 --> 0:26:43.918 +So why, why is this the kick? + +0:26:44.164 --> 0:26:49.183 +So it would look quite nice. + +0:26:49.183 --> 0:26:54.722 +It seems to be very reasonable. + +0:26:54.722 --> 0:26:58.356 +You have some audio. + +0:26:58.356 --> 0:27:03.376 +You put it into your system. + +0:27:04.965 --> 0:27:23.759 +However, this is a bit which for thinking +because if you speak what you speak is more. + +0:27:23.984 --> 0:27:29.513 +And especially all that rarely have punctuations +in there, and while the anti-system. + +0:27:29.629 --> 0:27:43.247 +They assume, of course, that it's a full sentence, +that you don't have there some. + +0:27:43.523 --> 0:27:55.087 +So we see we want to get this bridge between +the output and the input, and we might need + +0:27:55.087 --> 0:27:56.646 +additional. + +0:27:58.778 --> 0:28:05.287 +And that is typically what is referred to +as re-case and re-piculation system. + +0:28:05.445 --> 0:28:15.045 +So the idea is that you might be good to have +something like an adapter here in between, + +0:28:15.045 --> 0:28:20.007 +which really tries to adapt the speech input. + +0:28:20.260 --> 0:28:28.809 +That can be at different levels, but it might +be even more rephrasing. + +0:28:29.569 --> 0:28:40.620 +If you think of the sentence, if you have +false starts, then when speaking you sometimes + +0:28:40.620 --> 0:28:41.986 +assume oh. + +0:28:41.901 --> 0:28:52.224 +You restart it, then you might want to delete +that because if you read it you don't want + +0:28:52.224 --> 0:28:52.688 +to. + +0:28:56.096 --> 0:28:57.911 +Why is this yeah? + +0:28:57.911 --> 0:29:01.442 +The case in punctuation important. + +0:29:02.622 --> 0:29:17.875 +One important thing is directly for the challenge +is when speak is just a continuous stream of + +0:29:17.875 --> 0:29:18.999 +words. + +0:29:19.079 --> 0:29:27.422 +Then just speaking and punctuation marks, +and so on are all notes are there in natural. + +0:29:27.507 --> 0:29:30.281 +However, they are of course important. + +0:29:30.410 --> 0:29:33.877 +They are first of all very important for readability. + +0:29:34.174 --> 0:29:41.296 +If you have once read a text without characterization +marks, you need more time to process it. + +0:29:41.861 --> 0:29:47.375 +They're sometimes even semantically important. + +0:29:47.375 --> 0:29:52.890 +There's a list for grandpa and big difference. + +0:29:53.553 --> 0:30:00.089 +And so this, of course, with humans as well, +it'd be easy to distinguish by again doing + +0:30:00.089 --> 0:30:01.426 +it automatically. + +0:30:01.426 --> 0:30:06.180 +It's more typically and finally, in our case, +if we want to do. 
+ +0:30:06.386 --> 0:30:13.672 +We are assuming normally sentence wise, so +we always enter out system which is like one + +0:30:13.672 --> 0:30:16.238 +sentence by the next sentence. + +0:30:16.736 --> 0:30:26.058 +If you want to do speech translation of a +continuous stream, then of course what are + +0:30:26.058 --> 0:30:26.716 +your. + +0:30:28.168 --> 0:30:39.095 +And the easiest and most straightforward situation +is, of course, if you have a continuously. + +0:30:39.239 --> 0:30:51.686 +And if it generates your calculation marks, +it's easy to separate your text into sentences. + +0:30:52.032 --> 0:31:09.157 +So we can again reuse our system and thereby +have a normal anti-system on this continuous. + +0:31:14.174 --> 0:31:21.708 +These are a bit older numbers, but they show +you a bit also how important all that is. + +0:31:21.861 --> 0:31:31.719 +So this was so the best is if you do insurance +transcript you get roughly a blue score of. + +0:31:32.112 --> 0:31:47.678 +If you have as it is with some air based length +segmentation, then you get something like. + +0:31:47.907 --> 0:31:57.707 +If you then use the segments correctly as +it's done from the reference, you get one blue + +0:31:57.707 --> 0:32:01.010 +point and another blue point. + +0:32:01.201 --> 0:32:08.085 +So you see that you have been total like nearly +two blue points just by having the correct + +0:32:08.085 --> 0:32:09.144 +segmentation. + +0:32:10.050 --> 0:32:21.178 +This shows you that it's important to estimate +as good a segmentation because even if you + +0:32:21.178 --> 0:32:25.629 +still have the same arrows in your. + +0:32:27.147 --> 0:32:35.718 +Is to be into this movement, which is also +not as unusual as we do in translation. + +0:32:36.736 --> 0:32:40.495 +So this is done by looking at the reference. + +0:32:40.495 --> 0:32:48.097 +It should show you how much these scores are +done to just analyze how important are these. + +0:32:48.097 --> 0:32:55.699 +So you take the A's R transcript and you look +at the reference and it's only done for the. + +0:32:55.635 --> 0:33:01.720 +If we have optimal punctuations, if our model +is as good and optimal, so as a reference we + +0:33:01.720 --> 0:33:15.602 +could: But of course this is not how we can +do it in reality because we don't have access + +0:33:15.602 --> 0:33:16.990 +to that. + +0:33:17.657 --> 0:33:24.044 +Because one would invade you okay, why should +we do that? + +0:33:24.044 --> 0:33:28.778 +If we have the optimal then it's possible. + +0:33:31.011 --> 0:33:40.060 +And yeah, that is why a typical system does +not only yeah depend on if our key component. + +0:33:40.280 --> 0:33:56.468 +But in between you have this segmentation +in there in order to have more input and. + +0:33:56.496 --> 0:34:01.595 +You can also prefer often this invariability +over the average study. + +0:34:04.164 --> 0:34:19.708 +So the task of segmentation is to re-segment +the text into what is called sentence like + +0:34:19.708 --> 0:34:24.300 +unit, so you also assign. + +0:34:24.444 --> 0:34:39.421 +That is more a traditional thing because for +a long time case information was not provided. + +0:34:39.879 --> 0:34:50.355 +So there was any good ASR system which directly +provides you with case information and this + +0:34:50.355 --> 0:34:52.746 +may not be any more. + +0:34:56.296 --> 0:35:12.060 +How that can be done is you can have three +different approaches because that was some + +0:35:12.060 --> 0:35:16.459 +of the most common one. 
+ +0:35:17.097 --> 0:35:23.579 +Course: That is not the only thing you can +do. + +0:35:23.579 --> 0:35:30.888 +You can also try to train the data to generate +that. + +0:35:31.891 --> 0:35:41.324 +On the other hand, that is of course more +challenging. + +0:35:41.324 --> 0:35:47.498 +You need some type of segmentation. + +0:35:48.028 --> 0:35:59.382 +Mean, of course, you can easily remove and +capture information from your data and then + +0:35:59.382 --> 0:36:05.515 +play a system which does non-case to non-case. + +0:36:05.945 --> 0:36:15.751 +You can also, of course, try to combine these +two into one so that you directly translate + +0:36:15.751 --> 0:36:17.386 +from non-case. + +0:36:17.817 --> 0:36:24.722 +What is more happening by now is that you +also try to provide these to that you provide. + +0:36:24.704 --> 0:36:35.267 +The ASR is a segmentation directly get these +information in there. + +0:36:35.267 --> 0:36:45.462 +The systems that combine the A's and A's are: +Yes, there is a valid rule. + +0:36:45.462 --> 0:36:51.187 +What we come later to today is that you do +audio to text in the target language. + +0:36:51.187 --> 0:36:54.932 +That is what is referred to as an end to end +system. + +0:36:54.932 --> 0:36:59.738 +So it's directly and this is still more often +done for text output. + +0:36:59.738 --> 0:37:03.414 +But there is also end to end system which +directly. + +0:37:03.683 --> 0:37:09.109 +There you have additional challenges by how +to even measure if things are correct or not. + +0:37:09.089 --> 0:37:10.522 +Mean for text. + +0:37:10.522 --> 0:37:18.073 +You can mention, in other words, that for +audio the audio signal is even more. + +0:37:18.318 --> 0:37:27.156 +That's why it's currently mostly speech to +text, but that is one single system, but of + +0:37:27.156 --> 0:37:27.969 +course. + +0:37:32.492 --> 0:37:35.605 +Yeah, how can you do that? + +0:37:35.605 --> 0:37:45.075 +You can do adding these calculation information: +Will look into three systems. + +0:37:45.075 --> 0:37:53.131 +You can do that as a sequence labeling problem +or as a monolingual. + +0:37:54.534 --> 0:37:57.145 +Let's have a little bit of a series. + +0:37:57.145 --> 0:37:59.545 +This was some of the first ideas. + +0:37:59.545 --> 0:38:04.626 +There's the idea where you try to do it mainly +based on language model. + +0:38:04.626 --> 0:38:11.471 +So how probable is that there is a punctuation +that was done with like old style engram language + +0:38:11.471 --> 0:38:12.883 +models to visually. + +0:38:13.073 --> 0:38:24.687 +So you can, for example, if you have a program +language model to calculate the score of Hello, + +0:38:24.687 --> 0:38:25.787 +how are? + +0:38:25.725 --> 0:38:33.615 +And then you compare this probability and +take the one which has the highest probability. + +0:38:33.615 --> 0:38:39.927 +You might have something like if you have +very long pauses, you anyway. + +0:38:40.340 --> 0:38:51.953 +So this is a very easy model, which only calculates +some language model probabilities, and however + +0:38:51.953 --> 0:39:00.023 +the advantages of course are: And then, of +course, in general, so what we will look into + +0:39:00.023 --> 0:39:06.249 +here is that maybe interesting is that most +of the systems, also the advance, are really + +0:39:06.249 --> 0:39:08.698 +mainly focused purely on the text. 
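The sequence-labeling view of punctuation restoration just described, one label per token saying which punctuation mark (if any) follows it, can be shown with a toy example. A real system would put a token-classification head on a pretrained encoder such as BERT; here the label sequence is simply assumed, to illustrate how predictions are turned back into punctuated text.

```python
# labels, one per input token; "O" = no punctuation after this token
LABELS = ["O", ",", ".", "?"]

def apply_labels(tokens, labels):
    """Turn token-level punctuation labels back into punctuated text."""
    return " ".join(tok + ("" if lab == "O" else lab)
                    for tok, lab in zip(tokens, labels))

tokens = "hello how are you i am fine".split()
labels = ["O", "O", "O", "?", "O", "O", "."]   # what a trained tagger might predict
print(apply_labels(tokens, labels))
# -> "hello how are you? i am fine."
# (re-casing can be handled the same way, with an extra set of labels)
```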
+ +0:39:09.289 --> 0:39:19.237 +If you think about how to insert punctuation +marks, maybe your first idea would have been + +0:39:19.237 --> 0:39:22.553 +we can use pause information. + +0:39:23.964 --> 0:39:30.065 +But however interestingly most systems that +use are really focusing on the text. + +0:39:31.151 --> 0:39:34.493 +There are several reasons. + +0:39:34.493 --> 0:39:44.147 +One is that it's easier to get training data +so you only need pure text data. + +0:39:46.806 --> 0:40:03.221 +The next way you can do it is you can make +it as a secret labeling tax or something like + +0:40:03.221 --> 0:40:04.328 +that. + +0:40:04.464 --> 0:40:11.734 +Then you have how there is nothing in you, +and there is a. + +0:40:11.651 --> 0:40:15.015 +A question. + +0:40:15.315 --> 0:40:31.443 +So you have the number of labels, the number +of punctuation symbols you have for the basic + +0:40:31.443 --> 0:40:32.329 +one. + +0:40:32.892 --> 0:40:44.074 +Typically nowadays it would use something +like bird, and then you can train a sister. + +0:40:48.168 --> 0:40:59.259 +Any questions to that then it would probably +be no contrary, you know, or not. + +0:41:00.480 --> 0:41:03.221 +Yeah, you have definitely a labeled imbalance. + +0:41:04.304 --> 0:41:12.405 +Think that works relatively well and haven't +seen that. + +0:41:12.405 --> 0:41:21.085 +It's not a completely crazy label, maybe twenty +times more. + +0:41:21.561 --> 0:41:29.636 +It can and especially for the more rare things +mean, the more rare things is question marks. + +0:41:30.670 --> 0:41:43.877 +At least for question marks you have typically +very strong indicator words. + +0:41:47.627 --> 0:42:03.321 +And then what was done for quite a long time +can we know how to do machine translation? + +0:42:04.504 --> 0:42:12.640 +So the idea is, can we just translate non +punctuated English into punctuated English + +0:42:12.640 --> 0:42:14.650 +and do it correctly? + +0:42:15.855 --> 0:42:25.344 +So what you need is something like this type +of data where the source doesn't have punctuation. + +0:42:25.845 --> 0:42:30.641 +Course: A year is already done. + +0:42:30.641 --> 0:42:36.486 +You have to make it a bit challenging. + +0:42:41.661 --> 0:42:44.550 +Yeah, that is true. + +0:42:44.550 --> 0:42:55.237 +If you think about the normal trained age, +you have to do one thing more. + +0:42:55.237 --> 0:43:00.724 +Is it otherwise difficult to predict? + +0:43:05.745 --> 0:43:09.277 +Here it's already this already looks different +than normal training data. + +0:43:09.277 --> 0:43:09.897 +What is the. + +0:43:10.350 --> 0:43:15.305 +People want to use this transcript of speech. + +0:43:15.305 --> 0:43:19.507 +We'll probably go to our text editors. + +0:43:19.419 --> 0:43:25.906 +Yes, that is all already quite too difficult. + +0:43:26.346 --> 0:43:33.528 +Mean, that's making things a lot better with +the first and easiest thing is you have to + +0:43:33.528 --> 0:43:35.895 +randomly cut your sentences. + +0:43:35.895 --> 0:43:43.321 +So if you take just me normally we have one +sentence per line and if you take this as your + +0:43:43.321 --> 0:43:44.545 +training data. + +0:43:44.924 --> 0:43:47.857 +And that is, of course, not very helpful. + +0:43:48.208 --> 0:44:01.169 +So in order to build the training corpus for +doing punctuation you randomly cut your sentences + +0:44:01.169 --> 0:44:08.264 +and then you can remove all your punctuation +marks. 
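[Editor's note: the passage above explains how training data for punctuation restoration is built: sentences are joined into a stream, cut at random positions so that segment boundaries no longer coincide with sentence boundaries, and punctuation (and typically casing) is removed on the source side. A minimal sketch under those assumptions; the length limits and the stripped character set are illustrative.]

```python
import random
import re

def make_training_pairs(sentences, min_len=5, max_len=30):
    """Build (source, target) pairs for punctuation restoration.

    Sentences are concatenated and cut at random points, so segments do
    not align with sentence boundaries.  The source side is lower-cased
    and stripped of punctuation; the target keeps the original marks.
    """
    words = " ".join(sentences).split()
    pairs, i = [], 0
    while i < len(words):
        j = i + random.randint(min_len, max_len)
        target = " ".join(words[i:j])
        source = re.sub(r"[.,!?;:]", "", target).lower()
        pairs.append((source, target))
        i = j
    return pairs
```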
+ +0:44:08.528 --> 0:44:21.598 +Because of course there is no longer to do +when you have some random segments in your + +0:44:21.598 --> 0:44:22.814 +system. + +0:44:25.065 --> 0:44:37.984 +And then you can, for example, if you then +have generated your punctuation marks before + +0:44:37.984 --> 0:44:41.067 +going to the system. + +0:44:41.221 --> 0:44:54.122 +And that is an important thing, which we like +to see is more challenging for end systems. + +0:44:54.122 --> 0:45:00.143 +We can change the segmentation, so maybe. + +0:45:00.040 --> 0:45:06.417 +You can, then if you're combining these things +you can change the segmentation here, so. + +0:45:06.406 --> 0:45:18.178 +While you have ten new ten segments in your, +you might only have five ones in your anymore. + +0:45:18.178 --> 0:45:18.946 +Then. + +0:45:19.259 --> 0:45:33.172 +Which might be more useful or helpful in because +you have to reorder things and so on. + +0:45:33.273 --> 0:45:43.994 +And if you think of the wrong segmentation +then you cannot reorder things from the beginning + +0:45:43.994 --> 0:45:47.222 +to the end of the sentence. + +0:45:49.749 --> 0:45:58.006 +Okay, so much about segmentation do you have +any more questions about that? + +0:46:02.522 --> 0:46:21.299 +Then there is one additional thing you can +do, and that is when we refer to the idea. + +0:46:21.701 --> 0:46:29.356 +And when you get input there might be some +arrows in there, so it might not be perfect. + +0:46:29.889 --> 0:46:36.322 +So the question is, can we adapt to that? + +0:46:36.322 --> 0:46:45.358 +And can the system be improved by saying that +it can some. + +0:46:45.265 --> 0:46:50.591 +So that is as aware that before there is a. + +0:46:50.490 --> 0:46:55.449 +Their arm might not be the best one. + +0:46:55.935 --> 0:47:01.961 +There are different ways of dealing with them. + +0:47:01.961 --> 0:47:08.116 +You can use a best list but several best lists. + +0:47:08.408 --> 0:47:16.711 +So the idea is that you're not only telling +the system this is the transcript, but here + +0:47:16.711 --> 0:47:18.692 +I'm not going to be. + +0:47:19.419 --> 0:47:30.748 +Or that you can try to make it more robust +towards arrows from an system so that. + +0:47:32.612 --> 0:47:48.657 +Interesting what is often done is hope convince +you it might be a good idea to deal. + +0:47:48.868 --> 0:47:57.777 +The interesting thing is if you're looking +into a lot of systems, this is often ignored, + +0:47:57.777 --> 0:48:04.784 +so they are not adapting their T-system to +this type of A-S-R system. + +0:48:05.345 --> 0:48:15.232 +So it's not really doing any handling of Arab, +and the interesting thing is often works as + +0:48:15.232 --> 0:48:15.884 +good. + +0:48:16.516 --> 0:48:23.836 +And one reason is, of course, one reason is +if the ASR system does not arrow up to like + +0:48:23.836 --> 0:48:31.654 +a challenging situation, and then the antisystem +is really for the antisystem hard to detect. + +0:48:31.931 --> 0:48:39.375 +If it would be easy for the system to detect +the error you would integrate this information + +0:48:39.375 --> 0:48:45.404 +into: That is not always the case, but that +of course makes it a bit challenging, and that's + +0:48:45.404 --> 0:48:49.762 +why there is a lot of systems where it's not +explicitly handled how to deal with. + +0:48:52.912 --> 0:49:06.412 +But of course it might be good, so one thing +is you can give him a best list and you can + +0:49:06.412 --> 0:49:09.901 +translate every entry. 
+ +0:49:10.410 --> 0:49:17.705 +And then you have two scores like the anti-probability +and the square probability. + +0:49:18.058 --> 0:49:25.695 +Combine them and then generate or output the +output from what has the best combined. + +0:49:26.366 --> 0:49:29.891 +And then it might no longer be the best. + +0:49:29.891 --> 0:49:38.144 +It might like we had a bean search, so this +has the best score, but this has a better combined. + +0:49:39.059 --> 0:49:46.557 +The problem sometimes works, but the problem +is that the anti-system might then tend to + +0:49:46.557 --> 0:49:52.777 +just translate not the correct sentence but +the one easier to translate. + +0:49:53.693 --> 0:50:03.639 +You can also generate a more compact representation +of this invest in it by having this type of + +0:50:03.639 --> 0:50:04.467 +graphs. + +0:50:05.285 --> 0:50:22.952 +Lettices: So then you could like try to do +a graph to text translation so you can translate. + +0:50:22.802 --> 0:50:26.582 +Where like all possibilities, by the way our +systems are invented. + +0:50:26.906 --> 0:50:31.485 +So it can be like a hostage, a conference +with some programs. + +0:50:31.591 --> 0:50:35.296 +So the highest probability is here. + +0:50:35.296 --> 0:50:41.984 +Conference is being recorded, but there are +other possibilities. + +0:50:42.302 --> 0:50:53.054 +And you can take all of this information out +there with your probabilities. + +0:50:59.980 --> 0:51:07.614 +But we'll see this type of arrow propagation +that if you have an error that this might then + +0:51:07.614 --> 0:51:15.165 +propagate to, and t errors is one of the main +reasons why people looked into other ways of + +0:51:15.165 --> 0:51:17.240 +doing it and not having. + +0:51:19.219 --> 0:51:28.050 +By generally a cascaded combination, as we've +seen it, it has several advantages: The biggest + +0:51:28.050 --> 0:51:42.674 +maybe is the data availability so we can train +systems for the different components. + +0:51:42.822 --> 0:51:47.228 +So you can train your individual components +on relatively large stages. + +0:51:47.667 --> 0:51:58.207 +A modular system where you can improve each +individual model and if there's new development + +0:51:58.207 --> 0:52:01.415 +and models you can improve. + +0:52:01.861 --> 0:52:11.280 +There are several advantages, but of course +there are also some disadvantages: The most + +0:52:11.280 --> 0:52:19.522 +common thing is that there is what is referred +to as arrow propagation. + +0:52:19.522 --> 0:52:28.222 +If the arrow is arrow, probably your output +will then directly do an arrow. + +0:52:28.868 --> 0:52:41.740 +Typically it's like if there's an error in +the system, it's easier to like ignore by a + +0:52:41.740 --> 0:52:46.474 +quantity scale than the output. + +0:52:46.967 --> 0:52:49.785 +What do that mean? + +0:52:49.785 --> 0:53:01.209 +It's complicated, so if you have German, the +ASR does the Arab, and instead. + +0:53:01.101 --> 0:53:05.976 +Then most probably you'll ignore it or you'll +still know what it was said. + +0:53:05.976 --> 0:53:11.827 +Maybe you even don't notice because you'll +fastly read over it and don't see that there's + +0:53:11.827 --> 0:53:12.997 +one letter wrong. + +0:53:13.673 --> 0:53:25.291 +However, if you translate this one in an English +sentence about speeches, there's something + +0:53:25.291 --> 0:53:26.933 +about wines. 
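[Editor's note: the n-best handling described above translates every ASR hypothesis and then picks the output whose combined ASR + MT score is best. A small sketch of that rescoring loop; `translate_with_score` is a hypothetical stand-in for whatever MT decoder is available, and the linear interpolation weight is an assumption.]

```python
def rescore_nbest(nbest, translate_with_score, weight=0.5):
    """Pick the translation with the best combined ASR + MT score.

    nbest: list of (transcript, asr_logprob) pairs from the ASR system.
    translate_with_score: returns (translation, mt_logprob) for a given
    transcript (placeholder for the actual MT system).
    """
    best = None
    for transcript, asr_score in nbest:
        translation, mt_score = translate_with_score(transcript)
        combined = weight * asr_score + (1.0 - weight) * mt_score
        if best is None or combined > best[0]:
            best = (combined, transcript, translation)
    return best  # (combined_score, chosen_transcript, translation)
```

As the lecture notes, this can backfire: the combination may prefer a hypothesis that is easier to translate rather than the one that was actually said.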
+ +0:53:27.367 --> 0:53:37.238 +So it's a lot easier typically to read over +like arrows in the than reading over them in + +0:53:37.238 --> 0:53:38.569 +the speech. + +0:53:40.120 --> 0:53:45.863 +But there is additional challenges in in cascaded +systems. + +0:53:46.066 --> 0:53:52.667 +So secondly we have seen that we optimize +each component individually so you have a separate + +0:53:52.667 --> 0:53:59.055 +optimization and that doesn't mean that the +overall performance is really the best at the + +0:53:59.055 --> 0:53:59.410 +end. + +0:53:59.899 --> 0:54:07.945 +And we have tried to do that by already saying +yes. + +0:54:07.945 --> 0:54:17.692 +You need to adapt them a bit to work good +together, but still. + +0:54:20.280 --> 0:54:24.185 +Secondly, like that, there's a computational +complexity. + +0:54:24.185 --> 0:54:30.351 +You always need to run an ASR system and an +MTT system, and especially if you think about + +0:54:30.351 --> 0:54:32.886 +it, it should be fast and real time. + +0:54:32.886 --> 0:54:37.065 +It's challenging to always run two systems +and not a single. + +0:54:38.038 --> 0:54:45.245 +And one final thing which you might have not +directly thought of, but most of the world's + +0:54:45.245 --> 0:54:47.407 +languages do not have any. + +0:54:48.108 --> 0:55:01.942 +So if you have a language which doesn't have +any script, then of course if you want to translate + +0:55:01.942 --> 0:55:05.507 +it you cannot first use. + +0:55:05.905 --> 0:55:13.705 +So in order to do this, the pressure was mentioned +before ready. + +0:55:13.705 --> 0:55:24.264 +Build somehow a system which takes the audio +and directly generates text in the target. + +0:55:26.006 --> 0:55:41.935 +And there is quite big opportunity for that +because before that there was very different + +0:55:41.935 --> 0:55:44.082 +technology. + +0:55:44.644 --> 0:55:55.421 +However, since we are using neuromachine translation +encoded decoder models, the interesting thing + +0:55:55.421 --> 0:56:00.429 +is that we are using very similar technology. + +0:56:00.360 --> 0:56:06.047 +It's like in both cases very similar architecture. + +0:56:06.047 --> 0:56:09.280 +The main difference is once. + +0:56:09.649 --> 0:56:17.143 +But generally how it's done is very similar, +and therefore of course it might be put everything + +0:56:17.143 --> 0:56:22.140 +together, and that is what is referred to as +end-to-end speech. + +0:56:22.502 --> 0:56:31.411 +So that means we're having one large neural +network and decoded voice system, but we put + +0:56:31.411 --> 0:56:34.914 +an audio in one language and then. + +0:56:36.196 --> 0:56:43.106 +We can then have a system which directly does +the full process. + +0:56:43.106 --> 0:56:46.454 +We don't have to care anymore. + +0:56:48.048 --> 0:57:02.615 +So if you think of it as before, so we have +this decoder, and that's the two separate. + +0:57:02.615 --> 0:57:04.792 +We have the. + +0:57:05.085 --> 0:57:18.044 +And instead of going via the discrete text +representation in the Suez language, we can + +0:57:18.044 --> 0:57:21.470 +go via the continuous. + +0:57:21.681 --> 0:57:26.027 +Of course, they hope it's by not doing this +discrimination in between. + +0:57:26.146 --> 0:57:30.275 +We don't have a problem at doing errors. + +0:57:30.275 --> 0:57:32.793 +We can only cover later. + +0:57:32.772 --> 0:57:47.849 +But we can encode here the variability or +so that we have and then only define the decision. + +0:57:51.711 --> 0:57:54.525 +And so. 
+ +0:57:54.274 --> 0:58:02.253 +What we're doing is we're having very similar +technique. + +0:58:02.253 --> 0:58:12.192 +We're having still the decoder model where +we're coming from the main. + +0:58:12.552 --> 0:58:24.098 +Instead of getting discrete tokens in there +as we have subwords, we always encoded that + +0:58:24.098 --> 0:58:26.197 +in one pattern. + +0:58:26.846 --> 0:58:42.505 +The problem is that this is in continuous, +so we have to check how we can work with continuous + +0:58:42.505 --> 0:58:43.988 +signals. + +0:58:47.627 --> 0:58:55.166 +Mean, the first thing in your system is when +you do your disc freeze and code it. + +0:59:02.402 --> 0:59:03.888 +A newer machine translation. + +0:59:03.888 --> 0:59:05.067 +You're getting a word. + +0:59:05.067 --> 0:59:06.297 +It's one hot, some not. + +0:59:21.421 --> 0:59:24.678 +The first layer of the machine translation. + +0:59:27.287 --> 0:59:36.147 +Yes, you do the word embedding, so then you +have a continuous thing. + +0:59:36.147 --> 0:59:40.128 +So if you know get continuous. + +0:59:40.961 --> 0:59:46.316 +Deal with it the same way, so we'll see not +a big of a challenge. + +0:59:46.316 --> 0:59:48.669 +What is more challenging is. + +0:59:49.349 --> 1:00:04.498 +So the audio signal is ten times longer or +so, like more time steps you have. + +1:00:04.764 --> 1:00:10.332 +And so that is, of course, any challenge how +we can deal with this type of long sequence. + +1:00:11.171 --> 1:00:13.055 +The advantage is a bit. + +1:00:13.055 --> 1:00:17.922 +The long sequence is only at the input and +not at the output. + +1:00:17.922 --> 1:00:24.988 +So when you remember for the efficiency, for +example, like a long sequence are especially + +1:00:24.988 --> 1:00:29.227 +challenging in the decoder, but also for the +encoder. + +1:00:31.371 --> 1:00:33.595 +So how it is this? + +1:00:33.595 --> 1:00:40.617 +How can we process audio into an speech translation +system? + +1:00:41.501 --> 1:00:51.856 +And you can follow mainly what is done in +an system, so you have the audio signal. + +1:00:52.172 --> 1:00:59.135 +Then you measure your amplitude at every time +step. + +1:00:59.135 --> 1:01:04.358 +It's typically something like killing. + +1:01:04.384 --> 1:01:13.893 +And then you're doing this, this windowing, +so that you get a signal of a length twenty + +1:01:13.893 --> 1:01:22.430 +to thirty seconds, and you have all these windowings +so that you measure them. + +1:01:22.342 --> 1:01:32.260 +A simple gear, and then you look at these +time signals of seconds. + +1:01:32.432 --> 1:01:36.920 +So in the end then it is ten seconds, ten +million seconds. + +1:01:36.920 --> 1:01:39.735 +You have for every ten milliseconds. + +1:01:40.000 --> 1:01:48.309 +Some type of representation which type of +representation you can generate from that, + +1:01:48.309 --> 1:01:49.286 +but that. + +1:01:49.649 --> 1:02:06.919 +So instead of having no letter or word, you +have no representations for every 10mm of your + +1:02:06.919 --> 1:02:08.437 +system. + +1:02:08.688 --> 1:02:13.372 +How we record that now your thirty second +window here there is different ways. + +1:02:16.176 --> 1:02:31.891 +Was a traditional way of how people have done +that from an audio signal what frequencies + +1:02:31.891 --> 1:02:34.010 +are in the. + +1:02:34.114 --> 1:02:44.143 +So to do that you can do this malfrequency, +capsule co-pression so you can use gear transformations. + +1:02:44.324 --> 1:02:47.031 +Which frequencies are there? 
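[Editor's note: the audio front-end described above windows the signal (roughly 25 ms windows, shifted by 10 ms) and computes mel-frequency cepstral features with Fourier transforms, so that there is one feature vector per 10 ms. A short sketch with librosa; the file path and the 16 kHz assumption are illustrative.]

```python
import librosa

# Assumed: a 16 kHz mono recording (the path is illustrative).
wav, sr = librosa.load("talk.wav", sr=16000)

# 25 ms analysis windows shifted by 10 ms -> one feature vector per 10 ms.
mfcc = librosa.feature.mfcc(
    y=wav, sr=sr, n_mfcc=13,
    n_fft=int(0.025 * sr), hop_length=int(0.010 * sr),
)
print(mfcc.shape)  # (13, number_of_10ms_frames)
```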
+ +1:02:47.031 --> 1:02:53.566 +You know that the letters are different by +the different frequencies. + +1:02:53.813 --> 1:03:04.243 +And then if you're doing that, use the matte +to covers for your window we have before. + +1:03:04.624 --> 1:03:14.550 +So for each of these windows: You will calculate +what frequencies in there and then get features + +1:03:14.550 --> 1:03:20.059 +for this window and features for this window. + +1:03:19.980 --> 1:03:28.028 +These are the frequencies that occur there +and that help you to model which letters are + +1:03:28.028 --> 1:03:28.760 +spoken. + +1:03:31.611 --> 1:03:43.544 +More recently, instead of doing the traditional +signal processing, you can also replace that + +1:03:43.544 --> 1:03:45.853 +by deep learning. + +1:03:46.126 --> 1:03:56.406 +So that we are using a self-supervised approach +from language model to generate features that + +1:03:56.406 --> 1:03:58.047 +describe what. + +1:03:58.358 --> 1:03:59.821 +So you have your. + +1:03:59.759 --> 1:04:07.392 +All your signal again, and then for each child +to do your convolutional neural networks to + +1:04:07.392 --> 1:04:07.811 +get. + +1:04:07.807 --> 1:04:23.699 +First representation here is a transformer +network here, and in the end it's similar to + +1:04:23.699 --> 1:04:25.866 +a language. + +1:04:25.705 --> 1:04:30.238 +And you tried to predict what was referenced +here. + +1:04:30.670 --> 1:04:42.122 +So that is in a way similar that you also +try to learn a good representation of all these + +1:04:42.122 --> 1:04:51.608 +audio signals by predicting: And then you don't +do the signal processing base, but have this + +1:04:51.608 --> 1:04:52.717 +way to make. + +1:04:52.812 --> 1:04:59.430 +But in all the things that you have to remember +what is most important for you, and to end + +1:04:59.430 --> 1:05:05.902 +system is, of course, that you in the end get +for every minute ten milliseconds, you get + +1:05:05.902 --> 1:05:11.283 +a representation of this audio signal, which +is again a vector, and that. + +1:05:11.331 --> 1:05:15.365 +And then you can use your normal encoder to +code your model to do this research. + +1:05:21.861 --> 1:05:32.694 +So that is all which directly has to be changed, +and then you can build your first base. + +1:05:33.213 --> 1:05:37.167 +You do the audio processing. + +1:05:37.167 --> 1:05:49.166 +You of course need data which is like Audio +and English and Text in German and then you + +1:05:49.166 --> 1:05:50.666 +can train. + +1:05:53.333 --> 1:05:57.854 +And interestingly, it works at the beginning. + +1:05:57.854 --> 1:06:03.261 +The systems were maybe a bit worse, but we +saw really. + +1:06:03.964 --> 1:06:11.803 +This is like from the biggest workshop where +people like compared different systems. + +1:06:11.751 --> 1:06:17.795 +Special challenge on comparing Cascaded to +end to end systems and you see two thousand + +1:06:17.795 --> 1:06:18.767 +and eighteen. + +1:06:18.767 --> 1:06:25.089 +We had quite a huge gap between the Cascaded +and end to end systems and then it got nearer + +1:06:25.089 --> 1:06:27.937 +and earlier in starting in two thousand. + +1:06:27.907 --> 1:06:33.619 +Twenty the performance was mainly the same, +so there was no clear difference anymore. + +1:06:34.014 --> 1:06:42.774 +So this is, of course, writing a bit of hope +saying if we better learn how to build these + +1:06:42.774 --> 1:06:47.544 +internal systems, they might really fall better. + +1:06:49.549 --> 1:06:52.346 +However, a bit. 
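[Editor's note: the alternative front-end described above replaces signal processing with self-supervised wav2vec-style features (a convolutional encoder plus a transformer trained with a masked-prediction objective). A sketch of extracting such frame-level representations with the Hugging Face transformers library; the checkpoint name is an assumption, any wav2vec 2.0 model with the same interface would do, and the zero waveform only stands in for real audio.]

```python
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

name = "facebook/wav2vec2-base-960h"          # assumed public checkpoint
extractor = Wav2Vec2FeatureExtractor.from_pretrained(name)
model = Wav2Vec2Model.from_pretrained(name)

waveform = [0.0] * 16000                       # replace with real 16 kHz audio
inputs = extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state  # (1, frames, 768)
print(hidden.shape)
```

These vectors then feed the normal encoder-decoder translation model, exactly as the per-10 ms signal-processing features would.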
+ +1:06:52.452 --> 1:06:59.018 +This satisfying this is how this all continues, +and this is not only in two thousand and twenty + +1:06:59.018 --> 1:07:04.216 +one, but even nowadays we can say there is +no clear performance difference. + +1:07:04.216 --> 1:07:10.919 +It's not like the one model is better than +the other, but we are seeing very similar performance. + +1:07:11.391 --> 1:07:19.413 +So the question is what is the difference? + +1:07:19.413 --> 1:07:29.115 +Of course, this can only be achieved by new +tricks. + +1:07:30.570 --> 1:07:35.658 +Yes and no, that's what we will mainly look +into now. + +1:07:35.658 --> 1:07:39.333 +How can we make use of other types of. + +1:07:39.359 --> 1:07:53.236 +In that case you can achieve some performance +by using different types of training so you + +1:07:53.236 --> 1:07:55.549 +can also make. + +1:07:55.855 --> 1:08:04.961 +So if you are training or preparing the systems +only on very small corpora where you have as + +1:08:04.961 --> 1:08:10.248 +much data than you have for the individual +ones then. + +1:08:10.550 --> 1:08:22.288 +So that is the biggest challenge of an end +system that you have small corpora and therefore. + +1:08:24.404 --> 1:08:30.479 +Of course, there is several advantages so +you can give access to the audio information. + +1:08:30.750 --> 1:08:42.046 +So that's, for example, interesting if you +think about it, you might not have modeled + +1:08:42.046 --> 1:08:45.198 +everything in the text. + +1:08:45.198 --> 1:08:50.321 +So remember when we talk about biases. + +1:08:50.230 --> 1:08:55.448 +Male or female, and that of course is not +in the text any more, but in the audio signal + +1:08:55.448 --> 1:08:56.515 +it's still there. + +1:08:58.078 --> 1:09:03.108 +It also allows you to talk about that on Thursday +when you talk about latency. + +1:09:03.108 --> 1:09:08.902 +You have a bit better chance if you do an +end to end system to get a lower latency because + +1:09:08.902 --> 1:09:14.377 +you only have one system and you don't have +two systems which might have to wait for. + +1:09:14.934 --> 1:09:20.046 +And having one system might be also a bit +easier management. + +1:09:20.046 --> 1:09:23.146 +See that two systems work and so on. + +1:09:26.346 --> 1:09:41.149 +The biggest challenge of end systems is the +data, so as you correctly pointed out, typically + +1:09:41.149 --> 1:09:42.741 +there is. + +1:09:43.123 --> 1:09:45.829 +There is some data for Ted. + +1:09:45.829 --> 1:09:47.472 +People did that. + +1:09:47.472 --> 1:09:52.789 +They took the English audio with all the translations. + +1:09:53.273 --> 1:10:02.423 +But in January there is a lot less so we'll +look into how you can use other data sources. + +1:10:05.305 --> 1:10:10.950 +And secondly, the second challenge is that +we have to deal with audio. + +1:10:11.431 --> 1:10:22.163 +For example, in input length, and therefore +it's also important to handle this in your + +1:10:22.163 --> 1:10:27.590 +network and maybe have dedicated solutions. + +1:10:31.831 --> 1:10:40.265 +So in general we have this challenge that +we have a lot of text and translation and audio + +1:10:40.265 --> 1:10:43.076 +transcript data by quite few. + +1:10:43.643 --> 1:10:50.844 +So what can we do in one trick? + +1:10:50.844 --> 1:11:00.745 +You already know a bit from other research. 
+ +1:11:02.302 --> 1:11:14.325 +Exactly so what you can do is you can, for +example, use to take a power locust, generate + +1:11:14.325 --> 1:11:19.594 +an audio of a Suez language, and then. + +1:11:21.341 --> 1:11:33.780 +There has been a bit motivated by what we +have seen in Beck translation, which was very + +1:11:33.780 --> 1:11:35.476 +successful. + +1:11:38.758 --> 1:11:54.080 +However, it's a bit more challenging because +it is often very different from real audience. + +1:11:54.314 --> 1:12:07.131 +So often if you build a system only trained +on, but then generalized to real audio data + +1:12:07.131 --> 1:12:10.335 +is quite challenging. + +1:12:10.910 --> 1:12:20.927 +And therefore here the synthetic data generation +is significantly more challenging than when. + +1:12:20.981 --> 1:12:27.071 +Because if you read a text, it's maybe bad +translation. + +1:12:27.071 --> 1:12:33.161 +It's hard, but it's a real text or a text +generated by. + +1:12:35.835 --> 1:12:42.885 +But it's a valid solution, and for example +we use that also for say current systems. + +1:12:43.923 --> 1:12:53.336 +Of course you can also do a bit of forward +translation that is done so that you take data. + +1:12:53.773 --> 1:13:02.587 +But then the problem is that your reference +is not always correct, and you remember when + +1:13:02.587 --> 1:13:08.727 +we talked about back translation, it's a bit +of an advantage. + +1:13:09.229 --> 1:13:11.930 +But both can be done and both have been done. + +1:13:12.212 --> 1:13:20.277 +So you can think about this picture again. + +1:13:20.277 --> 1:13:30.217 +You can take this data and generate the audio +to it. + +1:13:30.750 --> 1:13:37.938 +However, it is only synthetic of what can +be used for the voice handling technology for: + +1:13:40.240 --> 1:13:47.153 +But you have not, I mean, yet you get text +to speech, but the voice cloning would need + +1:13:47.153 --> 1:13:47.868 +a voice. + +1:13:47.868 --> 1:13:53.112 +You can use, of course, and then it's nothing +else than a normal. + +1:13:54.594 --> 1:14:03.210 +But still think there are better than both, +but there are some characteristics of that + +1:14:03.210 --> 1:14:05.784 +which is quite different. + +1:14:07.327 --> 1:14:09.341 +But yeah, it's getting better. + +1:14:09.341 --> 1:14:13.498 +That is definitely true, and then this might +get more and more. + +1:14:16.596 --> 1:14:21.885 +Here make sure it's a good person and our +own systems because we try to train and. + +1:14:21.881 --> 1:14:24.356 +And it's like a feedback mood. + +1:14:24.356 --> 1:14:28.668 +There's anything like the Dutch English model +that's. + +1:14:28.648 --> 1:14:33.081 +Yeah, you of course need a decent amount of +real data. + +1:14:33.081 --> 1:14:40.255 +But I mean, as I said, so there is always +an advantage if you have this synthetics thing + +1:14:40.255 --> 1:14:44.044 +only on the input side and not on the outside. + +1:14:44.464 --> 1:14:47.444 +That you at least always generate correct +outcomes. + +1:14:48.688 --> 1:14:54.599 +That's different in a language case because +they have input and the output and it's not + +1:14:54.599 --> 1:14:55.002 +like. + +1:14:58.618 --> 1:15:15.815 +The other idea is to integrate additional +sources so you can have more model sharing. + +1:15:16.376 --> 1:15:23.301 +But you can use these components also in the +system. + +1:15:23.301 --> 1:15:28.659 +Typically the text decoder and the text. 
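[Editor's note: the synthetic-data idea above generates audio for the source side of an existing parallel text corpus with TTS, so only the input is synthetic while the target text stays human-made. A rough sketch of that pipeline; `synthesize` is a hypothetical placeholder for whatever TTS system is available, and the paths and corpus format are illustrative.]

```python
import os
import soundfile as sf

def build_synthetic_st_corpus(parallel_pairs, synthesize, out_dir="synth"):
    """parallel_pairs: iterable of (source_text, target_text).

    For each pair, synthesize audio for the *source* text so that the
    resulting (audio, target_text) pair can train an end-to-end model.
    """
    os.makedirs(out_dir, exist_ok=True)
    examples = []
    for i, (src, tgt) in enumerate(parallel_pairs):
        waveform, sample_rate = synthesize(src)      # hypothetical TTS call
        path = f"{out_dir}/utt{i:06d}.wav"
        sf.write(path, waveform, sample_rate)
        examples.append({"audio": path, "translation": tgt})
    return examples
```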
+ +1:15:29.169 --> 1:15:41.845 +And so the other way of languaging is to join +a train or somehow train all these tasks. + +1:15:43.403 --> 1:15:54.467 +The first and easy thing to do is multi task +training so the idea is you take these components + +1:15:54.467 --> 1:16:02.038 +and train these two components and train the +speech translation. + +1:16:02.362 --> 1:16:13.086 +So then, for example, all your encoders used +by the speech translation system can also gain + +1:16:13.086 --> 1:16:14.951 +from the large. + +1:16:14.975 --> 1:16:24.048 +So everything can gain a bit of emphasis, +but it can partly gain in there quite a bit. + +1:16:27.407 --> 1:16:39.920 +The other idea is to do it in a pre-training +phase. + +1:16:40.080 --> 1:16:50.414 +And then you take the end coder and the text +decoder and trade your model on that. + +1:16:54.774 --> 1:17:04.895 +Finally, there is also what is referred to +as knowledge distillation, so there you have + +1:17:04.895 --> 1:17:11.566 +to remember if you learn from a probability +distribution. + +1:17:11.771 --> 1:17:24.371 +So what you can do then is you have your system +and if you then have your audio and text input + +1:17:24.371 --> 1:17:26.759 +you can use your. + +1:17:27.087 --> 1:17:32.699 +And then get a more rich signal that you'll +not only know this is the word, but you have + +1:17:32.699 --> 1:17:33.456 +a complete. + +1:17:34.394 --> 1:17:41.979 +Example is typically also done because, of +course, if you have ski data, it still begins + +1:17:41.979 --> 1:17:49.735 +that you don't only have source language audio +and target language text, but then you also + +1:17:49.735 --> 1:17:52.377 +have the source language text. + +1:17:53.833 --> 1:18:00.996 +Get a good idea of the text editor and the +artist design. + +1:18:00.996 --> 1:18:15.888 +Now have to be aligned so that: Otherwise +they wouldn't be able to determine which degree + +1:18:15.888 --> 1:18:17.922 +they'd be. + +1:18:18.178 --> 1:18:25.603 +What you've been doing in non-stasilation +is you run your MP and then you get your probability + +1:18:25.603 --> 1:18:32.716 +distribution for all the words and you use +that to train and that is not only more helpful + +1:18:32.716 --> 1:18:34.592 +than only getting back. + +1:18:35.915 --> 1:18:44.427 +You can, of course, use the same decoder to +be even similar. + +1:18:44.427 --> 1:18:49.729 +Otherwise you don't have exactly the. + +1:18:52.832 --> 1:19:03.515 +Is a good point making these tools, and generally +in all these cases it's good to have more similar + +1:19:03.515 --> 1:19:05.331 +representations. + +1:19:05.331 --> 1:19:07.253 +You can transfer. + +1:19:07.607 --> 1:19:23.743 +If you hear your representation to give from +the audio encoder and the text encoder are + +1:19:23.743 --> 1:19:27.410 +more similar, then. + +1:19:30.130 --> 1:19:39.980 +So here you have your text encoder in the +target language and you can train it on large + +1:19:39.980 --> 1:19:40.652 +data. + +1:19:41.341 --> 1:19:45.994 +But of course you want to benefit also for +this task because that's what your most interested. + +1:19:46.846 --> 1:19:59.665 +Of course, the most benefit for this task +is if these two representations you give are + +1:19:59.665 --> 1:20:01.728 +more similar. 
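[Editor's note: the knowledge-distillation idea above trains the end-to-end model against the full output distribution of a text MT teacher (which requires audio / transcript / translation triplets so the teacher can be run on the transcript). A generic PyTorch sketch of such a loss; the temperature scaling is a common add-on rather than something prescribed in the lecture.]

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=1.0):
    """KL divergence between the speech-translation student's and the
    text-MT teacher's next-token distributions.

    Both tensors have shape (batch, target_len, vocab); they line up
    because both models predict the same target sequence.
    """
    t = temperature
    teacher_probs = F.softmax(teacher_logits / t, dim=-1)
    student_logprobs = F.log_softmax(student_logits / t, dim=-1)
    return F.kl_div(student_logprobs, teacher_probs,
                    reduction="batchmean") * t * t
```

Sharing the same target vocabulary (ideally the same decoder) between teacher and student makes these distributions directly comparable, which is the point made above.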
+ +1:20:02.222 --> 1:20:10.583 +Therefore, it's interesting to look into how +can we make these two representations as similar + +1:20:10.583 --> 1:20:20.929 +as: The hope is that in the end you can't even +do something like zero shot transfer, but while + +1:20:20.929 --> 1:20:25.950 +you only learn this one you can also deal with. + +1:20:30.830 --> 1:20:40.257 +So what you can do is you can look at these +two representations. + +1:20:40.257 --> 1:20:42.867 +So once the text. + +1:20:43.003 --> 1:20:51.184 +And you can either put them into the text +decoder to the encoder. + +1:20:51.184 --> 1:20:53.539 +We have seen both. + +1:20:53.539 --> 1:21:03.738 +You can think: If you want to build an A's +and to insist on you can either take the audio + +1:21:03.738 --> 1:21:06.575 +encoder and see how deep. + +1:21:08.748 --> 1:21:21.915 +However, you have these two representations +and you want to make them more similar. + +1:21:21.915 --> 1:21:23.640 +One thing. + +1:21:23.863 --> 1:21:32.797 +Here we have, like you said, for every ten +million seconds we have a representation. + +1:21:35.335 --> 1:21:46.085 +So what people may have done, for example, +is to remove redundant information so you can: + +1:21:46.366 --> 1:21:56.403 +So you can use your system to put India based +on letter or words and then average over the + +1:21:56.403 --> 1:21:58.388 +words or letters. + +1:21:59.179 --> 1:22:07.965 +So that the number of representations from +the encoder is the same as you would get from. + +1:22:12.692 --> 1:22:20.919 +Okay, that much to data do have any more questions +first about that. + +1:22:27.207 --> 1:22:36.787 +Then we'll finish with the audience assessing +and highlight a bit while this is challenging, + +1:22:36.787 --> 1:22:52.891 +so here's: One test here has one thousand eight +hundred sentences, so there are words or characters. + +1:22:53.954 --> 1:22:59.336 +If you look how many all your features, so +how many samples there is like one point five + +1:22:59.336 --> 1:22:59.880 +million. + +1:23:00.200 --> 1:23:10.681 +So you have ten times more pizzas than you +have characters, and then again five times + +1:23:10.681 --> 1:23:11.413 +more. + +1:23:11.811 --> 1:23:23.934 +So you have the sequence leg of the audio +as long as you have for words, and that is + +1:23:23.934 --> 1:23:25.788 +a challenge. + +1:23:26.086 --> 1:23:34.935 +So the question is what can you do to make +the sequins a bit shorter and not have this? + +1:23:38.458 --> 1:23:48.466 +The one thing is you can try to reduce the +dimensional entity in your encounter. + +1:23:48.466 --> 1:23:50.814 +There's different. + +1:23:50.991 --> 1:24:04.302 +So, for example, you can just sum up always +over some or you can do a congregation. + +1:24:04.804 --> 1:24:12.045 +Are you a linear projectile or you even take +not every feature but only every fifth or something? + +1:24:12.492 --> 1:24:23.660 +So this way you can very easily reduce your +number of features in there, and there has + +1:24:23.660 --> 1:24:25.713 +been different. + +1:24:26.306 --> 1:24:38.310 +There's also what you can do with things like +a convolutional layer. + +1:24:38.310 --> 1:24:43.877 +If you skip over what you can,. + +1:24:47.327 --> 1:24:55.539 +And then, in addition to the audio, the other +problem is higher variability. + +1:24:55.539 --> 1:25:04.957 +So if you have a text you can: But there are +very different ways of saying that you can + +1:25:04.957 --> 1:25:09.867 +distinguish whether say a sentence or your +voice. 
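[Editor's note: the length-mismatch discussion above lists several ways to shrink the audio-encoder sequence (averaging over words or letters, summing, convolutions, or simply pooling every n-th frame). The sketch below shows only the simplest fixed-factor variant, averaging consecutive frames; the factor is an illustrative choice.]

```python
import torch
import torch.nn.functional as F

def downsample_frames(frames: torch.Tensor, factor: int = 4) -> torch.Tensor:
    """Average every `factor` consecutive encoder frames.

    frames: (batch, time, dim) audio-encoder output (~one frame per 10 ms).
    Returns (batch, time // factor, dim), shortening the sequence that the
    decoder or a shared text encoder has to attend over.
    """
    x = frames.transpose(1, 2)                     # (batch, dim, time)
    x = F.avg_pool1d(x, kernel_size=factor, stride=factor)
    return x.transpose(1, 2)                       # (batch, time/factor, dim)
```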
+ +1:25:10.510 --> 1:25:21.224 +That of course makes it more challenging because +now you get different inputs and while they + +1:25:21.224 --> 1:25:22.837 +were in text. + +1:25:23.263 --> 1:25:32.360 +So that makes especially for limited data +things more challenging and you want to somehow + +1:25:32.360 --> 1:25:35.796 +learn that this is not important. + +1:25:36.076 --> 1:25:39.944 +So there is the idea again okay. + +1:25:39.944 --> 1:25:47.564 +Can we doing some type of data augmentation +to better deal with? + +1:25:48.908 --> 1:25:55.735 +And again people can mainly use what has been +done in and try to do the same things. + +1:25:56.276 --> 1:26:02.937 +You can try to do a bit of noise and speech +perturbation so playing the audio like slower + +1:26:02.937 --> 1:26:08.563 +and a bit faster to get more samples then and +you can train on all of them. + +1:26:08.563 --> 1:26:14.928 +What is very important and very successful +recently is what is called Spektr augment. + +1:26:15.235 --> 1:26:25.882 +The idea is that you directly work on all +your features and you can try to last them + +1:26:25.882 --> 1:26:29.014 +and that gives you more. + +1:26:29.469 --> 1:26:41.717 +What do they mean with masking so this is +your audio feature and then there is different? + +1:26:41.962 --> 1:26:47.252 +You can do what is referred to as mask and +a time masking. + +1:26:47.252 --> 1:26:50.480 +That means you just set some masks. + +1:26:50.730 --> 1:26:58.003 +And since then you should be still able to +to deal with it because you can normally. + +1:26:57.937 --> 1:27:05.840 +Also without that you are getting more robust +and not and you can handle that because then + +1:27:05.840 --> 1:27:10.877 +many symbols which have different time look +more similar. + +1:27:11.931 --> 1:27:22.719 +You are not only doing that for time masking +but also for frequency masking so that if you + +1:27:22.719 --> 1:27:30.188 +have here the frequency channels you mask a +frequency channel. + +1:27:30.090 --> 1:27:33.089 +Thereby being able to better recognize these +things. + +1:27:35.695 --> 1:27:43.698 +This we have had an overview of the two main +approaches for speech translation that is on + +1:27:43.698 --> 1:27:51.523 +the one hand cascaded speech translation and +on the other hand we talked about advanced + +1:27:51.523 --> 1:27:53.302 +speech translation. + +1:27:53.273 --> 1:28:02.080 +It's like how to combine things and what they +work together for end speech translations. + +1:28:02.362 --> 1:28:06.581 +Here was data challenges and a bit about long +circuits. + +1:28:07.747 --> 1:28:09.304 +We have any more questions. + +1:28:11.451 --> 1:28:19.974 +Can you really describe the change in cascading +from translation to text to speech because + +1:28:19.974 --> 1:28:22.315 +thought the translation. + +1:28:25.745 --> 1:28:30.201 +Yes, so mean that works again the easiest +thing. + +1:28:30.201 --> 1:28:33.021 +What of course is challenging? + +1:28:33.021 --> 1:28:40.751 +What can be challenging is how to make that +more lively and like that pronunciation? + +1:28:40.680 --> 1:28:47.369 +And yeah, which things are put more important, +how to put things like that into. + +1:28:47.627 --> 1:28:53.866 +In the normal text, otherwise it would sound +very monotone. + +1:28:53.866 --> 1:28:57.401 +You want to add this information. + +1:28:58.498 --> 1:29:02.656 +That is maybe one thing to make it a bit more +emotional. + +1:29:02.656 --> 1:29:04.917 +That is maybe one thing which. 
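[Editor's note: the SpecAugment discussion above masks random time spans and frequency bands of the input features so the model becomes robust to variability. A short sketch with torchaudio's masking transforms; the mask parameters and the 80-bin log-mel shape are illustrative values, not settings from the lecture.]

```python
import torch
import torchaudio.transforms as T

freq_mask = T.FrequencyMasking(freq_mask_param=27)
time_mask = T.TimeMasking(time_mask_param=100)

def spec_augment(features: torch.Tensor) -> torch.Tensor:
    """features: (batch, freq_bins, time_frames) log-mel features."""
    out = freq_mask(features)   # zero out a random band of frequency channels
    out = time_mask(out)        # zero out a random span of time frames
    return out

dummy = torch.randn(1, 80, 1000)    # 80 mel bins, 10 s at 10 ms per frame
print(spec_augment(dummy).shape)    # same shape, masked contents
```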
+ +1:29:05.305 --> 1:29:13.448 +But you are right there and out of the box. + +1:29:13.448 --> 1:29:20.665 +If you have everything works decently. + +1:29:20.800 --> 1:29:30.507 +Still, especially if you have a very monotone +voice, so think these are quite some open challenges. + +1:29:30.750 --> 1:29:35.898 +Maybe another open challenge is that it's +not so much for the end product, but for the + +1:29:35.898 --> 1:29:37.732 +development is very important. + +1:29:37.732 --> 1:29:40.099 +It's very hard to evaluate the quality. + +1:29:40.740 --> 1:29:48.143 +So you cannot doubt that there is a way about +most systems are currently evaluated by human + +1:29:48.143 --> 1:29:49.109 +evaluation. + +1:29:49.589 --> 1:29:54.474 +So you cannot try hundreds of things and run +your blue score and get this score. + +1:29:54.975 --> 1:30:00.609 +So therefore no means very important to have +some type of evaluation metric and that is + +1:30:00.609 --> 1:30:01.825 +quite challenging. + +1:30:08.768 --> 1:30:15.550 +And thanks for listening, and we'll have the +second part of speech translation on search. + diff --git a/demo_data/lectures/Lecture-18-18.07.2023/video.mp4 b/demo_data/lectures/Lecture-18-18.07.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..de2564aeea63e400d78a392be11cd555b8ab8238 --- /dev/null +++ b/demo_data/lectures/Lecture-18-18.07.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7158cf58687ceeb69cae55cb9786cecc77ea95e9afcc0b29251b8b9cfe54cdb5 +size 125329284 diff --git a/demo_data/lectures/Lecture-19-21.07.2023/English.vtt b/demo_data/lectures/Lecture-19-21.07.2023/English.vtt new file mode 100644 index 0000000000000000000000000000000000000000..45d4033b5bc26247eda29e267ce6608c97fad005 --- /dev/null +++ b/demo_data/lectures/Lecture-19-21.07.2023/English.vtt @@ -0,0 +1,2853 @@ +WEBVTT + +0:00:01.121 --> 0:00:14.214 +Okay, so welcome to today's lecture, on Tuesday +we started to talk about speech translation. + +0:00:14.634 --> 0:00:27.037 +And the idea is hopefully an idea of the basic +ideas we have in speech translation, the two + +0:00:27.037 --> 0:00:29.464 +major approaches. + +0:00:29.829 --> 0:00:41.459 +And the other one is the end system where +we have one large system which is everything + +0:00:41.459 --> 0:00:42.796 +together. + +0:00:43.643 --> 0:00:58.459 +Until now we mainly focus on text output that +we'll see today, but you can extend these ideas + +0:00:58.459 --> 0:01:01.138 +to other speech. + +0:01:01.441 --> 0:01:08.592 +But since it's also like a machine translation +lecture, you of course mainly focus a bit on + +0:01:08.592 --> 0:01:10.768 +the translation challenges. + +0:01:12.172 --> 0:01:25.045 +And what is the main focus of today's lecture +is to look into why that is challenging speech + +0:01:25.045 --> 0:01:26.845 +translation. + +0:01:27.627 --> 0:01:33.901 +So a bit more focus on what is now really +the difference to all you and how we can address. + +0:01:34.254 --> 0:01:39.703 +We'll start there by with the segmentation +problem. + +0:01:39.703 --> 0:01:45.990 +We had that already of bits, but especially +for end-to-end. + +0:01:46.386 --> 0:01:57.253 +So the problem is that until now it was easy +to segment the input into sentences and then + +0:01:57.253 --> 0:02:01.842 +translate each sentence individually. 
+ +0:02:02.442 --> 0:02:17.561 +When you're now translating audio, the challenge +is that you have just a sequence of audio input + +0:02:17.561 --> 0:02:20.055 +and there's no. + +0:02:21.401 --> 0:02:27.834 +So you have this difference that your audio +is a continuous stream, but the text is typically + +0:02:27.834 --> 0:02:28.930 +sentence based. + +0:02:28.930 --> 0:02:31.667 +So how can you match this gap in there? + +0:02:31.667 --> 0:02:37.690 +We'll see that is really essential, and if +you're not using a decent good system there, + +0:02:37.690 --> 0:02:41.249 +then you can lose a lot of quality and performance. + +0:02:41.641 --> 0:02:44.267 +That is what also meant before. + +0:02:44.267 --> 0:02:51.734 +So if you have a more complex system out of +several units, it's really essential that they + +0:02:51.734 --> 0:02:56.658 +all work together and it's very easy to lose +significantly. + +0:02:57.497 --> 0:03:13.029 +The second challenge we'll talk about is disfluencies, +so the style of speaking is very different + +0:03:13.029 --> 0:03:14.773 +from text. + +0:03:15.135 --> 0:03:24.727 +So if you translate or TedTalks, that's normally +very good speakers. + +0:03:24.727 --> 0:03:30.149 +They will give you a very fluent text. + +0:03:30.670 --> 0:03:36.692 +When you want to translate a lecture, it might +be more difficult or rednested. + +0:03:37.097 --> 0:03:39.242 +Mean people are not well that well. + +0:03:39.242 --> 0:03:42.281 +They should be prepared in giving the lecture +and. + +0:03:42.362 --> 0:03:48.241 +But it's not that I mean, typically a lecture +will have like rehearsal like five times before + +0:03:48.241 --> 0:03:52.682 +he is giving this lecture, and then like will +it completely be fluent? + +0:03:52.682 --> 0:03:56.122 +He might at some point notice all this is +not perfect. + +0:03:56.122 --> 0:04:00.062 +I want to rephrase, and he'll have to sing +during the lecture. + +0:04:00.300 --> 0:04:04.049 +Might be also good that he's thinking, so +he's not going too fast and things like. + +0:04:05.305 --> 0:04:07.933 +If you then go to the other extreme, it's +more meetings. + +0:04:08.208 --> 0:04:15.430 +If you have a lively discussion, of course, +people will interrupt, they will restart, they + +0:04:15.430 --> 0:04:22.971 +will think while they speak, and you know that +sometimes you tell people first think and speak + +0:04:22.971 --> 0:04:26.225 +because they are changing their opinion. + +0:04:26.606 --> 0:04:31.346 +So the question of how can you deal with this? + +0:04:31.346 --> 0:04:37.498 +And there again it might be solutions for +that, or at least. + +0:04:39.759 --> 0:04:46.557 +Then for the output we will look into simultaneous +translation that is at least not very important + +0:04:46.557 --> 0:04:47.175 +in text. + +0:04:47.175 --> 0:04:53.699 +There might be some cases but normally you +have all text available and then you're translating + +0:04:53.699 --> 0:04:54.042 +and. + +0:04:54.394 --> 0:05:09.220 +While for speech translation, since it's often +a life interaction, then of course it's important. + +0:05:09.149 --> 0:05:12.378 +Otherwise it's hard to follow. + +0:05:12.378 --> 0:05:19.463 +You see what said five minutes ago and the +slide is not as helpful. + +0:05:19.739 --> 0:05:35.627 +You have to wait very long before you can +answer because you have to first wait for what + +0:05:35.627 --> 0:05:39.197 +is happening there. + +0:05:40.660 --> 0:05:46.177 +And finally, we can talk a bit about presentation. 
+ +0:05:46.177 --> 0:05:54.722 +For example, mentioned that if you're generating +subtitles, it's not possible. + +0:05:54.854 --> 0:06:01.110 +So in professional subtitles there are clear +rules. + +0:06:01.110 --> 0:06:05.681 +Subtitle has to be shown for seconds. + +0:06:05.681 --> 0:06:08.929 +It's maximum of two lines. + +0:06:09.549 --> 0:06:13.156 +Because otherwise it's getting too long, it's +not able to read it anymore, and so. + +0:06:13.613 --> 0:06:19.826 +So if you want to achieve that, of course, +you might have to adjust and select what you + +0:06:19.826 --> 0:06:20.390 +really. + +0:06:23.203 --> 0:06:28.393 +The first date starts with the segmentation. + +0:06:28.393 --> 0:06:36.351 +On the one end it's an issue while training, +on the other hand it's. + +0:06:38.678 --> 0:06:47.781 +What is the problem so when we train it's +relatively easy to separate our data into sentence + +0:06:47.781 --> 0:06:48.466 +level. + +0:06:48.808 --> 0:07:02.241 +So if you have your example, you have the +audio and the text, then you typically know + +0:07:02.241 --> 0:07:07.083 +that this sentence is aligned. + +0:07:07.627 --> 0:07:16.702 +You can use these time information to cut +your audio and then you can train and then. + +0:07:18.018 --> 0:07:31.775 +Because what we need for an enchilada model +is to be an output chart, in this case an audio + +0:07:31.775 --> 0:07:32.822 +chart. + +0:07:33.133 --> 0:07:38.551 +And even if this is a long speech, it's easy +then since we have this time information to + +0:07:38.551 --> 0:07:39.159 +separate. + +0:07:39.579 --> 0:07:43.866 +But we are using therefore, of course, the +target side information. + +0:07:45.865 --> 0:07:47.949 +The problem is now in runtime. + +0:07:47.949 --> 0:07:49.427 +This is not possible. + +0:07:49.427 --> 0:07:55.341 +Here we can do that based on the calculation +marks and the sentence segmentation on the + +0:07:55.341 --> 0:07:57.962 +target side because that is splitting. + +0:07:57.962 --> 0:08:02.129 +But during transcript, during translation +it is not possible. + +0:08:02.442 --> 0:08:10.288 +Because there is just a long audio signal, +and of course if you have your test data to + +0:08:10.288 --> 0:08:15.193 +split it into: That has been done for some +experience. + +0:08:15.193 --> 0:08:22.840 +It's fine, but it's not a realistic scenario +because if you really apply it in real world, + +0:08:22.840 --> 0:08:25.949 +we won't have a manual segmentation. + +0:08:26.266 --> 0:08:31.838 +If a human has to do that then he can do the +translation so you want to have a full automatic + +0:08:31.838 --> 0:08:32.431 +pipeline. + +0:08:32.993 --> 0:08:38.343 +So the question is how can we deal with this +type of you know? + +0:09:09.309 --> 0:09:20.232 +So the question is how can we deal with this +time of situation and how can we segment the + +0:09:20.232 --> 0:09:23.024 +audio into some units? + +0:09:23.863 --> 0:09:32.495 +And here is one further really big advantage +of a cascaded sauce: Because how is this done + +0:09:32.495 --> 0:09:34.259 +in a cascade of systems? + +0:09:34.259 --> 0:09:38.494 +We are splitting the audio with some features +we are doing. + +0:09:38.494 --> 0:09:42.094 +We can use similar ones which we'll discuss +later. + +0:09:42.094 --> 0:09:43.929 +Then we run against chin. + +0:09:43.929 --> 0:09:48.799 +We have the transcript and then we can do +what we talked last about. + +0:09:49.069 --> 0:10:02.260 +So if you have this is an audio signal and +the training data it was good. 
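[Editor's note: the training-time segmentation described above uses the timing information aligned with the target sentences to cut the long recording into sentence-level audio chunks. A minimal sketch of that cutting step; the 16 kHz rate and the (start, end, text) tuple format are assumptions, e.g. as read from subtitle/VTT timestamps.]

```python
import librosa

def cut_audio_by_segments(wav_path, segments, sr=16000):
    """Cut a long recording into sentence-level training examples.

    segments: list of (start_sec, end_sec, target_text) from the aligned
    training data.  This target-side timing exists at training time but
    not at test time, which is why automatic segmentation is needed.
    """
    wav, sr = librosa.load(wav_path, sr=sr)
    examples = []
    for start, end, text in segments:
        chunk = wav[int(start * sr):int(end * sr)]
        examples.append({"audio": chunk, "target": text})
    return examples
```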
+ +0:10:02.822 --> 0:10:07.951 +So here we have a big advantage. + +0:10:07.951 --> 0:10:16.809 +We can use a different segmentation for the +and for the. + +0:10:16.809 --> 0:10:21.316 +Why is that a big advantage? + +0:10:23.303 --> 0:10:34.067 +Will say for a team task is more important +because we can then do the sentence transformation. + +0:10:34.955 --> 0:10:37.603 +See and Yeah, We Can Do the Same Thing. + +0:10:37.717 --> 0:10:40.226 +To save us, why is it not as important for +us? + +0:10:40.226 --> 0:10:40.814 +Are maybe. + +0:10:43.363 --> 0:10:48.589 +We don't need that much context. + +0:10:48.589 --> 0:11:01.099 +We only try to restrict the word, but the +context to consider is mainly small. + +0:11:03.283 --> 0:11:11.419 +Would agree with it in more context, but there +is one more important: its. + +0:11:11.651 --> 0:11:16.764 +The is monotone, so there's no reordering. + +0:11:16.764 --> 0:11:22.472 +The second part of the signal is no reordering. + +0:11:22.472 --> 0:11:23.542 +We have. + +0:11:23.683 --> 0:11:29.147 +And of course if we are doing that we cannot +really order across boundaries between segments. + +0:11:29.549 --> 0:11:37.491 +It might be challenging if we split the words +so that it's not perfect for so that. + +0:11:37.637 --> 0:11:40.846 +But we need to do quite long range reordering. + +0:11:40.846 --> 0:11:47.058 +If you think about the German where the work +has moved, and now the English work is in one + +0:11:47.058 --> 0:11:50.198 +part, but the end of the sentence is another. + +0:11:50.670 --> 0:11:59.427 +And of course this advantage we have now here +that if we have a segment we have. + +0:12:01.441 --> 0:12:08.817 +And that this segmentation is important. + +0:12:08.817 --> 0:12:15.294 +Here are some motivations for that. + +0:12:15.675 --> 0:12:25.325 +What you are doing is you are taking the reference +text and you are segmenting. + +0:12:26.326 --> 0:12:30.991 +And then, of course, your segments are exactly +yeah cute. + +0:12:31.471 --> 0:12:42.980 +If you're now using different segmentation +strategies, you're using significantly in blue + +0:12:42.980 --> 0:12:44.004 +points. + +0:12:44.004 --> 0:12:50.398 +If the segmentation is bad, you have a lot +worse. + +0:12:52.312 --> 0:13:10.323 +And interesting, here you ought to see how +it was a human, but people have in a competition. + +0:13:10.450 --> 0:13:22.996 +You can see that by working on the segmentation +and using better segmentation you can improve + +0:13:22.996 --> 0:13:25.398 +your performance. + +0:13:26.006 --> 0:13:29.932 +So it's really essential. + +0:13:29.932 --> 0:13:41.712 +One other interesting thing is if you're looking +into the difference between. + +0:13:42.082 --> 0:13:49.145 +So it really seems to be more important to +have a good segmentation for our cascaded system. + +0:13:49.109 --> 0:13:56.248 +For an intra-end system because there you +can't re-segment while it is less important + +0:13:56.248 --> 0:13:58.157 +for a cascaded system. + +0:13:58.157 --> 0:14:05.048 +Of course, it's still important, but the difference +between the two segmentations. + +0:14:06.466 --> 0:14:18.391 +It was a shared task some years ago like it's +just one system from different. + +0:14:22.122 --> 0:14:31.934 +So the question is how can we deal with this +in speech translation and what people look + +0:14:31.934 --> 0:14:32.604 +into? + +0:14:32.752 --> 0:14:48.360 +Now we want to use different techniques to +split the audio signal into segments. 
+ +0:14:48.848 --> 0:14:54.413 +You have the disadvantage that you can't change +it. + +0:14:54.413 --> 0:15:00.407 +Therefore, some of the quality might be more +important. + +0:15:00.660 --> 0:15:15.678 +But in both cases, of course, the A's are +better if you have a good segmentation. + +0:15:17.197 --> 0:15:23.149 +So any idea, how would you have this task +now split this audio? + +0:15:23.149 --> 0:15:26.219 +What type of tool would you use? + +0:15:28.648 --> 0:15:41.513 +The fuse was a new network to segment half +for instance supervise. + +0:15:41.962 --> 0:15:44.693 +Yes, that's exactly already the better system. + +0:15:44.693 --> 0:15:50.390 +So for long time people have done more simple +things because we'll come to that a bit challenging + +0:15:50.390 --> 0:15:52.250 +as creating or having the data. + +0:15:53.193 --> 0:16:00.438 +The first thing is you use some tool out of +the box like voice activity detection which + +0:16:00.438 --> 0:16:07.189 +has been there as a whole research field so +people find when somebody's speaking. + +0:16:07.647 --> 0:16:14.952 +And then you use that in this different threshold +you always have the ability that somebody's + +0:16:14.952 --> 0:16:16.273 +speaking or not. + +0:16:17.217 --> 0:16:19.889 +Then you split your signal. + +0:16:19.889 --> 0:16:26.762 +It will not be perfect, but you transcribe +or translate each component. + +0:16:28.508 --> 0:16:39.337 +But as you see, a supervised classification +task is even better, and that is now the most + +0:16:39.337 --> 0:16:40.781 +common use. + +0:16:41.441 --> 0:16:49.909 +The supervisor is doing that as a supervisor +classification and then you'll try to use this + +0:16:49.909 --> 0:16:50.462 +type. + +0:16:50.810 --> 0:16:53.217 +We're going into a bit more detail on how +to do that. + +0:16:53.633 --> 0:17:01.354 +So what you need to do first is, of course, +you have to have some labels whether this is + +0:17:01.354 --> 0:17:03.089 +an end of sentence. + +0:17:03.363 --> 0:17:10.588 +You do that by using the alignment between +the segments and the audio. + +0:17:10.588 --> 0:17:12.013 +You have the. + +0:17:12.212 --> 0:17:15.365 +The two people have not for each word, so +these tank steps. + +0:17:15.365 --> 0:17:16.889 +This word is said this time. + +0:17:17.157 --> 0:17:27.935 +This word is said by what you typically have +from this time to time to time. + +0:17:27.935 --> 0:17:34.654 +We have the second segment, the second segment. + +0:17:35.195 --> 0:17:39.051 +Which also used to trade for example your +advanced system and everything. + +0:17:41.661 --> 0:17:53.715 +Based on that you can label each frame in +there so if you have a green or blue that is + +0:17:53.715 --> 0:17:57.455 +our speech segment so you. + +0:17:58.618 --> 0:18:05.690 +And these labels will then later help you, +but you extract exactly these types of. + +0:18:07.067 --> 0:18:08.917 +There's one big challenge. + +0:18:08.917 --> 0:18:15.152 +If you have two sentences which are directly +connected to each other, then if you're doing + +0:18:15.152 --> 0:18:18.715 +this labeling, you would not have a break in +later. + +0:18:18.715 --> 0:18:23.512 +If you tried to extract that, there should +be something great or not. + +0:18:23.943 --> 0:18:31.955 +So what you typically do is in the last frame. + +0:18:31.955 --> 0:18:41.331 +You mark as outside, although it's not really +outside. + +0:18:43.463 --> 0:18:46.882 +Yes, I guess you could also do that in more +of a below check. 
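[Editor's note: the supervised segmentation approach above labels every frame as inside or outside speech using the aligned segment times, and marks the last frame of each segment as outside so that directly adjacent sentences still get a boundary. A minimal NumPy sketch; the 20 ms frame length is an assumption for illustration.]

```python
import numpy as np

def frame_labels(segments, total_sec, frame_sec=0.02):
    """Per-frame speech/non-speech labels for training a segmenter.

    segments: list of (start_sec, end_sec) speech spans from the aligned
    training data.  Frames inside a span get label 1, everything else 0;
    the last frame of each span is forced to 0 as an artificial break.
    """
    n_frames = int(total_sec / frame_sec)
    labels = np.zeros(n_frames, dtype=np.int64)
    for start, end in segments:
        first = int(start / frame_sec)
        last = min(int(end / frame_sec), n_frames)
        labels[first:last] = 1
        labels[last - 1] = 0    # artificial break at the segment end
    return labels
```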
+ +0:18:46.882 --> 0:18:48.702 +I mean, this is the most simple. + +0:18:48.702 --> 0:18:51.514 +It's like inside outside, so it's related +to that. + +0:18:51.514 --> 0:18:54.988 +Of course, you could have an extra startup +segment, and so on. + +0:18:54.988 --> 0:18:57.469 +I guess this is just to make it more simple. + +0:18:57.469 --> 0:19:00.226 +You only have two labels, not a street classroom. + +0:19:00.226 --> 0:19:02.377 +But yeah, you could do similar things. + +0:19:12.432 --> 0:19:20.460 +Has caused down the roads to problems because +it could be an important part of a segment + +0:19:20.460 --> 0:19:24.429 +which has some meaning and we do something. + +0:19:24.429 --> 0:19:28.398 +The good thing is frames are normally very. + +0:19:28.688 --> 0:19:37.586 +Like some milliseconds, so normally if you +remove some milliseconds you can still understand + +0:19:37.586 --> 0:19:38.734 +everything. + +0:19:38.918 --> 0:19:46.999 +Mean the speech signal is very repetitive, +and so you have information a lot of times. + +0:19:47.387 --> 0:19:50.730 +That's why we talked along there last time +they could try to shrink the steak and. + +0:19:51.031 --> 0:20:00.995 +If you now have a short sequence where there +is like which would be removed and that's not + +0:20:00.995 --> 0:20:01.871 +really. + +0:20:02.162 --> 0:20:06.585 +Yeah, but it's not a full letter is missing. + +0:20:06.585 --> 0:20:11.009 +It's like only the last ending of the vocal. + +0:20:11.751 --> 0:20:15.369 +Think it doesn't really happen. + +0:20:15.369 --> 0:20:23.056 +We have our audio signal and we have these +gags that are not above. + +0:20:23.883 --> 0:20:29.288 +With this blue rectangulars the inside speech +segment and with the guess it's all set yes. + +0:20:29.669 --> 0:20:35.736 +So then you have the full signal and you're +meaning now labeling your task as a blue or + +0:20:35.736 --> 0:20:36.977 +white prediction. + +0:20:36.977 --> 0:20:39.252 +So that is your prediction task. + +0:20:39.252 --> 0:20:44.973 +You have the audio signal only and your prediction +task is like label one or zero. + +0:20:45.305 --> 0:20:55.585 +Once you do that then based on this labeling +you can extract each segment again like each + +0:20:55.585 --> 0:20:58.212 +consecutive blue area. + +0:20:58.798 --> 0:21:05.198 +See then removed maybe the non-speaking part +already and duo speech translation only on + +0:21:05.198 --> 0:21:05.998 +the parts. + +0:21:06.786 --> 0:21:19.768 +Which is good because the training would have +done similarly. + +0:21:20.120 --> 0:21:26.842 +So on the noise in between you never saw in +the training, so it's good to throw it away. + +0:21:29.649 --> 0:21:34.930 +One challenge, of course, is now if you're +doing that, what is your input? + +0:21:34.930 --> 0:21:40.704 +You cannot do the sequence labeling normally +on the whole talk, so it's too long. + +0:21:40.704 --> 0:21:46.759 +So if you're doing this prediction of the +label, you also have a window for which you + +0:21:46.759 --> 0:21:48.238 +do the segmentation. + +0:21:48.788 --> 0:21:54.515 +And that's the bedline we have in the punctuation +prediction. + +0:21:54.515 --> 0:22:00.426 +If we don't have good borders, random splits +are normally good. + +0:22:00.426 --> 0:22:03.936 +So what we do now is split the audio. + +0:22:04.344 --> 0:22:09.134 +So that would be our input, and then the part +three would be our labels. 
+ +0:22:09.269 --> 0:22:15.606 +This green would be the input and here we +want, for example, blue labels and then white. + +0:22:16.036 --> 0:22:20.360 +Here only do labors and here at the beginning +why maybe at the end why. + +0:22:21.401 --> 0:22:28.924 +So thereby you have now a fixed window always +for which you're doing than this task of predicting. + +0:22:33.954 --> 0:22:43.914 +How you build your classifier that is based +again. + +0:22:43.914 --> 0:22:52.507 +We had this wave to be mentioned last week. + +0:22:52.752 --> 0:23:00.599 +So in training you use labels to say whether +it's in speech or outside speech. + +0:23:01.681 --> 0:23:17.740 +Inference: You give them always the chance +and then predict whether this part like each + +0:23:17.740 --> 0:23:20.843 +label is afraid. + +0:23:23.143 --> 0:23:29.511 +Bit more complicated, so one challenge is +if you randomly split off cognition, losing + +0:23:29.511 --> 0:23:32.028 +your context for the first brain. + +0:23:32.028 --> 0:23:38.692 +It might be very hard to predict whether this +is now in or out of, and also for the last. + +0:23:39.980 --> 0:23:48.449 +You often need a bit of context whether this +is audio or not, and at the beginning. + +0:23:49.249 --> 0:23:59.563 +So what you do is you put the audio in twice. + +0:23:59.563 --> 0:24:08.532 +You want to do it with splits and then. + +0:24:08.788 --> 0:24:15.996 +It is shown you have shifted the two offsets, +so one is predicted with the other offset. + +0:24:16.416 --> 0:24:23.647 +And then averaging the probabilities so that +at each time you have, at least for one of + +0:24:23.647 --> 0:24:25.127 +the predictions,. + +0:24:25.265 --> 0:24:36.326 +Because at the end of the second it might +be very hard to predict whether this is now + +0:24:36.326 --> 0:24:39.027 +speech or nonspeech. + +0:24:39.939 --> 0:24:47.956 +Think it is a high parameter, but you are +not optimizing it, so you just take two shifts. + +0:24:48.328 --> 0:24:54.636 +Of course try a lot of different shifts and +so on. + +0:24:54.636 --> 0:24:59.707 +The thing is it's mainly a problem here. + +0:24:59.707 --> 0:25:04.407 +If you don't do two outsets you have. + +0:25:05.105 --> 0:25:14.761 +You could get better by doing that, but would +be skeptical if it really matters, and also + +0:25:14.761 --> 0:25:18.946 +have not seen any experience in doing. + +0:25:19.159 --> 0:25:27.629 +Guess you're already good, you have maybe +some arrows in there and you're getting. + +0:25:31.191 --> 0:25:37.824 +So with this you have your segmentation. + +0:25:37.824 --> 0:25:44.296 +However, there is a problem in between. + +0:25:44.296 --> 0:25:49.150 +Once the model is wrong then. + +0:25:49.789 --> 0:26:01.755 +The normal thing would be the first thing +that you take some threshold and that you always + +0:26:01.755 --> 0:26:05.436 +label everything in speech. + +0:26:06.006 --> 0:26:19.368 +The problem is when you are just doing this +one threshold that you might have. + +0:26:19.339 --> 0:26:23.954 +Those are the challenges. + +0:26:23.954 --> 0:26:31.232 +Short segments mean you have no context. + +0:26:31.232 --> 0:26:35.492 +The policy will be bad. + +0:26:37.077 --> 0:26:48.954 +Therefore, people use this probabilistic divided +cocker algorithm, so the main idea is start + +0:26:48.954 --> 0:26:56.744 +with the whole segment, and now you split the +whole segment. 
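The shifted-window inference described above can be sketched as follows: the audio is scored twice, once with windows starting at zero and once shifted by half a window, and the per-frame probabilities of the two passes are averaged, so every frame is far from a window border in at least one pass. `predict_window` is a stand-in for the actual classifier (for example a wav2vec-based model) and is an assumption.

import numpy as np

def predict_with_offsets(frames, predict_window, window=3000):
    """Score the audio in fixed windows with two offsets and average the probabilities."""
    n = len(frames)
    averaged = np.zeros(n)
    for offset in (0, window // 2):
        probs = np.zeros(n)
        for start in range(-offset, n, window):
            lo, hi = max(start, 0), min(start + window, n)
            probs[lo:hi] = predict_window(frames[lo:hi])   # per-frame speech probability
        averaged += probs / 2.0
    return averaged

# Demo with a dummy "classifier" that just thresholds frame energy:
frames = np.random.randn(10000) ** 2
print(predict_with_offsets(frames, lambda x: (x > 1.0).astype(float))[:5])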
+ +0:26:57.397 --> 0:27:09.842 +Then you split there and then you continue +until each segment is smaller than the maximum + +0:27:09.842 --> 0:27:10.949 +length. + +0:27:11.431 --> 0:27:23.161 +But you can ignore some splits, and if you +split one segment into two parts you first + +0:27:23.161 --> 0:27:23.980 +trim. + +0:27:24.064 --> 0:27:40.197 +So normally it's not only one signal position, +it's a longer area of non-voice, so you try + +0:27:40.197 --> 0:27:43.921 +to find this longer. + +0:27:43.943 --> 0:27:51.403 +Now your large segment is split into two smaller +segments. + +0:27:51.403 --> 0:27:56.082 +Now you are checking these segments. + +0:27:56.296 --> 0:28:04.683 +So if they are very, very short, it might +be good not to spin at this point because you're + +0:28:04.683 --> 0:28:05.697 +ending up. + +0:28:06.006 --> 0:28:09.631 +And this way you continue all the time, and +then hopefully you'll have a good stretch. + +0:28:10.090 --> 0:28:19.225 +So, of course, there's one challenge with +this approach: if you think about it later, + +0:28:19.225 --> 0:28:20.606 +low latency. + +0:28:25.405 --> 0:28:31.555 +So in this case you have to have the full +audio available. + +0:28:32.132 --> 0:28:38.112 +So you cannot continuously do that mean if +you would do it just always. + +0:28:38.112 --> 0:28:45.588 +If the probability is higher you split but +in this case you try to find a global optimal. + +0:28:46.706 --> 0:28:49.134 +A heuristic body. + +0:28:49.134 --> 0:28:58.170 +You find a global solution for your whole +tar and not a local one. + +0:28:58.170 --> 0:29:02.216 +Where's the system most sure? + +0:29:02.802 --> 0:29:12.467 +So that's a bit of a challenge here, but the +advantage of course is that in the end you + +0:29:12.467 --> 0:29:14.444 +have no segments. + +0:29:17.817 --> 0:29:23.716 +Any more questions like this. + +0:29:23.716 --> 0:29:36.693 +Then the next thing is we also need to evaluate +in this scenario. + +0:29:37.097 --> 0:29:44.349 +So know machine translation is quite a long +way. + +0:29:44.349 --> 0:29:55.303 +History now was the beginning of the semester, +but hope you can remember. + +0:29:55.675 --> 0:30:09.214 +Might be with blue score, might be with comment +or similar, but you need to have. + +0:30:10.310 --> 0:30:22.335 +But this assumes that you have this one-to-one +match, so you always have an output and machine + +0:30:22.335 --> 0:30:26.132 +translation, which is nicely. + +0:30:26.506 --> 0:30:34.845 +So then it might be that our output has four +segments, while our reference output has only + +0:30:34.845 --> 0:30:35.487 +three. + +0:30:36.756 --> 0:30:40.649 +And now is, of course, questionable like what +should we compare in our metric. + +0:30:44.704 --> 0:30:53.087 +So it's no longer directly possible to directly +do that because what should you compare? + +0:30:53.413 --> 0:31:00.214 +Just have four segments there and three segments +there, and of course it seems to be that. + +0:31:00.920 --> 0:31:06.373 +The first one it likes to the first one when +you see I can't speak Spanish, but you're an + +0:31:06.373 --> 0:31:09.099 +audience of the guests who is already there. + +0:31:09.099 --> 0:31:14.491 +So even like just a woman, the blue comparing +wouldn't work, so you need to do something + +0:31:14.491 --> 0:31:17.157 +about that to take this type of evaluation. + +0:31:19.019 --> 0:31:21.727 +Still any suggestions what you could do. 
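Going back to the divide-and-conquer segmentation described just before the evaluation discussion, a compact sketch: start from the whole recording, recursively split at the frame with the lowest speech probability, stop once every piece is below a maximum length, and refuse cuts that would create very short segments. All thresholds here are illustrative assumptions.

import numpy as np

def divide_and_conquer(speech_prob, max_len=3000, min_len=200):
    """Recursively split at the least speech-like frame until all segments fit max_len."""
    segments = []

    def split(lo, hi):
        if hi - lo <= max_len:
            segments.append((lo, hi))
            return
        inner = speech_prob[lo + min_len:hi - min_len]
        if len(inner) == 0:                          # splitting would create tiny pieces
            segments.append((lo, hi))
            return
        cut = lo + min_len + int(np.argmin(inner))   # most silence-like position
        split(lo, cut)
        split(cut, hi)

    split(0, len(speech_prob))
    return segments

probs = np.abs(np.sin(np.linspace(0, 20, 10000)))    # fake per-frame speech probabilities
print(divide_and_conquer(probs)[:5])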
+ +0:31:25.925 --> 0:31:44.702 +How can you calculate a blue score because +you don't have one you want to see? + +0:31:45.925 --> 0:31:49.365 +Here you put another layer which spies to +add in the second. + +0:31:51.491 --> 0:31:56.979 +It's even not aligning only, but that's one +solution, so you need to align and resign. + +0:31:57.177 --> 0:32:06.886 +Because even if you have no alignment so this +to this and this to that you see that it's + +0:32:06.886 --> 0:32:12.341 +not good because the audio would compare to +that. + +0:32:13.453 --> 0:32:16.967 +That we'll discuss is even one simpler solution. + +0:32:16.967 --> 0:32:19.119 +Yes, it's a simpler solution. + +0:32:19.119 --> 0:32:23.135 +It's called document based blue or something +like that. + +0:32:23.135 --> 0:32:25.717 +So you just take the full document. + +0:32:26.566 --> 0:32:32.630 +For some matrix it's good and it's not clear +how good it is to the other, but there might + +0:32:32.630 --> 0:32:32.900 +be. + +0:32:33.393 --> 0:32:36.454 +Think of more simple metrics like blue. + +0:32:36.454 --> 0:32:40.356 +Do you have any idea what could be a disadvantage? + +0:32:49.249 --> 0:32:56.616 +Blue is matching ingrams so you start with +the original. + +0:32:56.616 --> 0:33:01.270 +You check how many ingrams in here. + +0:33:01.901 --> 0:33:11.233 +If you're not doing that on the full document, +you can also match grams from year to year. + +0:33:11.751 --> 0:33:15.680 +So you can match things very far away. + +0:33:15.680 --> 0:33:21.321 +Start doing translation and you just randomly +randomly. + +0:33:22.142 --> 0:33:27.938 +And that, of course, could be a bit of a disadvantage +or like is a problem, and therefore people + +0:33:27.938 --> 0:33:29.910 +also look into the segmentation. + +0:33:29.910 --> 0:33:34.690 +But I've recently seen some things, so document +levels tours are also normally. + +0:33:34.690 --> 0:33:39.949 +If you have a relatively high quality system +or state of the art, then they also have a + +0:33:39.949 --> 0:33:41.801 +good correlation of the human. + +0:33:46.546 --> 0:33:59.241 +So how are we doing that so we are putting +end of sentence boundaries in there and then. + +0:33:59.179 --> 0:34:07.486 +Alignment based on a similar Livingston distance, +so at a distance between our output and the + +0:34:07.486 --> 0:34:09.077 +reference output. + +0:34:09.449 --> 0:34:13.061 +And here is our boundary. + +0:34:13.061 --> 0:34:23.482 +We map the boundary based on the alignment, +so in Lithuania you only have. + +0:34:23.803 --> 0:34:36.036 +And then, like all the words that are before, +it might be since there is not a random. + +0:34:36.336 --> 0:34:44.890 +Mean it should be, but it can happen things +like that, and it's not clear where. + +0:34:44.965 --> 0:34:49.727 +At the break, however, they are typically +not that bad because they are words which are + +0:34:49.727 --> 0:34:52.270 +not matching between reference and hypothesis. + +0:34:52.270 --> 0:34:56.870 +So normally it doesn't really matter that +much because they are anyway not matching. + +0:34:57.657 --> 0:35:05.888 +And then you take the mule as a T output and +use that to calculate your metric. + +0:35:05.888 --> 0:35:12.575 +Then it's again a perfect alignment for which +you can calculate. + +0:35:14.714 --> 0:35:19.229 +Any idea you could do it the other way around. + +0:35:19.229 --> 0:35:23.359 +You could resigment your reference to the. + +0:35:29.309 --> 0:35:30.368 +Which one would you select? 
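The boundary-mapping step above ("Livingston distance" is presumably Levenshtein distance) can be sketched like this: align the flattened hypothesis to the flattened reference at the token level and cut the hypothesis wherever a reference segment boundary falls, which is the idea behind resegmentation tools such as mwerSegmenter. difflib is used here as a convenient stand-in for a proper edit-distance alignment, which is an assumption.

import difflib

def resegment(hyp_tokens, ref_segments):
    """Cut the flat hypothesis into len(ref_segments) pieces following the reference."""
    ref_tokens = [tok for seg in ref_segments for tok in seg]
    matcher = difflib.SequenceMatcher(a=ref_tokens, b=hyp_tokens, autojunk=False)
    ref2hyp = [None] * (len(ref_tokens) + 1)
    for block in matcher.get_matching_blocks():        # aligned stretches of tokens
        for k in range(block.size + 1):
            if ref2hyp[block.a + k] is None:
                ref2hyp[block.a + k] = block.b + k
    last = 0                                           # fill unaligned positions
    for i, v in enumerate(ref2hyp):
        last = v if v is not None else last
        ref2hyp[i] = last
    bounds, pos = [0], 0
    for seg in ref_segments[:-1]:
        pos += len(seg)
        bounds.append(ref2hyp[pos])                    # reference boundary mapped into hyp
    bounds.append(len(hyp_tokens))
    return [hyp_tokens[bounds[i]:bounds[i + 1]] for i in range(len(ref_segments))]

hyp = "hello world how are you today".split()
ref = [["hello", "world"], ["how", "are", "you", "doing", "today"]]
print(resegment(hyp, ref))   # [['hello', 'world'], ['how', 'are', 'you', 'today']]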
+ +0:35:34.214 --> 0:35:43.979 +I think segmenting the assertive also is much +more natural because the reference sentence + +0:35:43.979 --> 0:35:46.474 +is the fixed solution. + +0:35:47.007 --> 0:35:52.947 +Yes, that's the right motivation if you do +think about blue or so. + +0:35:52.947 --> 0:35:57.646 +Additionally important if you change your +reference. + +0:35:57.857 --> 0:36:07.175 +You might have a different number of diagrams +or diagrams because the sentences are different + +0:36:07.175 --> 0:36:08.067 +lengths. + +0:36:08.068 --> 0:36:15.347 +Here your five system, you're always comparing +it to the same system, and you don't compare + +0:36:15.347 --> 0:36:16.455 +to different. + +0:36:16.736 --> 0:36:22.317 +The only different base of segmentation, but +still it could make some do. + +0:36:25.645 --> 0:36:38.974 +Good, that's all about sentence segmentation, +then a bit about disfluencies and what there + +0:36:38.974 --> 0:36:40.146 +really. + +0:36:42.182 --> 0:36:51.138 +So as said in daily life, you're not speaking +like very nice full sentences every. + +0:36:51.471 --> 0:36:53.420 +He was speaking powerful sentences. + +0:36:53.420 --> 0:36:54.448 +We do repetitions. + +0:36:54.834 --> 0:37:00.915 +It's especially if it's more interactive, +so in meetings, phone calls and so on. + +0:37:00.915 --> 0:37:04.519 +If you have multiple speakers, they also break. + +0:37:04.724 --> 0:37:16.651 +Each other, and then if you keep them, they +are harder to translate because most of your + +0:37:16.651 --> 0:37:17.991 +training. + +0:37:18.278 --> 0:37:30.449 +It's also very difficult to read, so we'll +have some examples there to transcribe everything + +0:37:30.449 --> 0:37:32.543 +as it was said. + +0:37:33.473 --> 0:37:36.555 +What type of things are there? + +0:37:37.717 --> 0:37:42.942 +So you have all these pillow works. + +0:37:42.942 --> 0:37:47.442 +These are very easy to remove. + +0:37:47.442 --> 0:37:52.957 +You can just use regular expressions. + +0:37:53.433 --> 0:38:00.139 +Is getting more difficult with some other +type of filler works. + +0:38:00.139 --> 0:38:03.387 +In German you have this or in. + +0:38:04.024 --> 0:38:08.473 +And these ones you cannot just remove by regular +expression. + +0:38:08.473 --> 0:38:15.039 +You shouldn't remove all yacht from a text +because it might be very important information + +0:38:15.039 --> 0:38:15.768 +for well. + +0:38:15.715 --> 0:38:19.995 +It may be not as important as you are, but +still it might be very important. + +0:38:20.300 --> 0:38:24.215 +So just removing them is there already more +difficult. + +0:38:26.586 --> 0:38:29.162 +Then you have these repetitions. + +0:38:29.162 --> 0:38:32.596 +You have something like mean saw him there. + +0:38:32.596 --> 0:38:33.611 +There was a. + +0:38:34.334 --> 0:38:41.001 +And while for the first one that might be +very easy to remove because you just look for + +0:38:41.001 --> 0:38:47.821 +double, the thing is that the repetition might +not be exactly the same, so there is there + +0:38:47.821 --> 0:38:48.199 +was. + +0:38:48.199 --> 0:38:54.109 +So there is already getting a bit more complicated, +of course still possible. + +0:38:54.614 --> 0:39:01.929 +You can remove Denver so the real sense would +be like to have a ticket to Houston. + +0:39:02.882 --> 0:39:13.327 +But there the detection, of course, is getting +more challenging as you want to get rid of. 
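The "easy" filler case mentioned above can be handled with a regular expression; the filler list below is an illustrative assumption, and fillers that can also carry meaning (like the German particles discussed in the lecture) should not be stripped this bluntly.

import re

FILLERS = r"\b(?:uh+|um+|uhm+|erm?|mhm+)\b"

def strip_fillers(text):
    cleaned = re.sub(FILLERS, "", text, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", cleaned).strip()

print(strip_fillers("I uh want to um book a ticket"))   # "I want to book a ticket"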
+ +0:39:13.893 --> 0:39:21.699 +You don't have the data, of course, which +makes all the tasks harder, but you probably + +0:39:21.699 --> 0:39:22.507 +want to. + +0:39:22.507 --> 0:39:24.840 +That's really meaningful. + +0:39:24.840 --> 0:39:26.185 +Current isn't. + +0:39:26.185 --> 0:39:31.120 +That is now a really good point and it's really +there. + +0:39:31.051 --> 0:39:34.785 +The thing about what is your final task? + +0:39:35.155 --> 0:39:45.526 +If you want to have a transcript reading it, +I'm not sure if we have another example. + +0:39:45.845 --> 0:39:54.171 +So there it's nicer if you have a clean transfer +and if you see subtitles in, they're also not + +0:39:54.171 --> 0:39:56.625 +having all the repetitions. + +0:39:56.625 --> 0:40:03.811 +It's the nice way to shorten but also getting +the structure you cannot even make. + +0:40:04.064 --> 0:40:11.407 +So in this situation, of course, they might +give you information. + +0:40:11.407 --> 0:40:14.745 +There is a lot of stuttering. + +0:40:15.015 --> 0:40:22.835 +So in this case agree it might be helpful +in some way, but meaning reading all the disfluencies + +0:40:22.835 --> 0:40:25.198 +is getting really difficult. + +0:40:25.198 --> 0:40:28.049 +If you have the next one, we have. + +0:40:28.308 --> 0:40:31.630 +That's a very long text. + +0:40:31.630 --> 0:40:35.883 +You need a bit of time to pass. + +0:40:35.883 --> 0:40:39.472 +This one is not important. + +0:40:40.480 --> 0:40:48.461 +It might be nice if you can start reading +from here. + +0:40:48.461 --> 0:40:52.074 +Let's have a look here. + +0:40:52.074 --> 0:40:54.785 +Try to read this. + +0:40:57.297 --> 0:41:02.725 +You can understand it, but think you need +a bit of time to really understand what was. + +0:41:11.711 --> 0:41:21.480 +And now we have the same text, but you have +highlighted in bold, and not only read the + +0:41:21.480 --> 0:41:22.154 +bold. + +0:41:23.984 --> 0:41:25.995 +And ignore everything which is not bold. + +0:41:30.250 --> 0:41:49.121 +Would assume it's easier to read just the +book part more faster and more faster. + +0:41:50.750 --> 0:41:57.626 +Yeah, it might be, but I'm not sure we have +a master thesis of that. + +0:41:57.626 --> 0:41:59.619 +If seen my videos,. + +0:42:00.000 --> 0:42:09.875 +Of the recordings, I also have it more likely +that it's like a fluent speak and I'm not like + +0:42:09.875 --> 0:42:12.318 +doing the hesitations. + +0:42:12.652 --> 0:42:23.764 +Don't know if somebody else has looked into +the Cusera video, but notice that. + +0:42:25.005 --> 0:42:31.879 +For these videos spoke every minute, three +times or something, and then people were there + +0:42:31.879 --> 0:42:35.011 +and cutting things and making hopefully. + +0:42:35.635 --> 0:42:42.445 +And therefore if you want to more achieve +that, of course, no longer exactly what was + +0:42:42.445 --> 0:42:50.206 +happening, but if it more looks like a professional +video, then you would have to do that and cut + +0:42:50.206 --> 0:42:50.998 +that out. + +0:42:50.998 --> 0:42:53.532 +But yeah, there are definitely. + +0:42:55.996 --> 0:42:59.008 +We're also going to do this thing again. + +0:42:59.008 --> 0:43:02.315 +First turn is like I'm going to have a very. + +0:43:02.422 --> 0:43:07.449 +Which in the end they start to slow down just +without feeling as though they're. + +0:43:07.407 --> 0:43:10.212 +It's a good point for the next. + +0:43:10.212 --> 0:43:13.631 +There is not the one perfect solution. 
+ +0:43:13.631 --> 0:43:20.732 +There's some work on destruction removal, +but of course there's also disability. + +0:43:20.732 --> 0:43:27.394 +Removal is not that easy, so do you just remove +that's in order everywhere. + +0:43:27.607 --> 0:43:29.708 +But how much like cleaning do you do? + +0:43:29.708 --> 0:43:31.366 +It's more a continuous thing. + +0:43:31.811 --> 0:43:38.211 +Is it more really you only remove stuff or +are you also into rephrasing and here is only + +0:43:38.211 --> 0:43:38.930 +removing? + +0:43:39.279 --> 0:43:41.664 +But maybe you want to rephrase it. + +0:43:41.664 --> 0:43:43.231 +That's hearing better. + +0:43:43.503 --> 0:43:49.185 +So then it's going into what people are doing +in style transfer. + +0:43:49.185 --> 0:43:52.419 +We are going from a speech style to. + +0:43:52.872 --> 0:44:07.632 +So there is more continuum, and of course +Airconditioner is not the perfect solution, + +0:44:07.632 --> 0:44:10.722 +but exactly what. + +0:44:15.615 --> 0:44:19.005 +Yeah, we're challenging. + +0:44:19.005 --> 0:44:30.258 +You have examples where the direct copy is +not as hard or is not exactly the same. + +0:44:30.258 --> 0:44:35.410 +That is, of course, more challenging. + +0:44:41.861 --> 0:44:49.889 +If it's getting really mean why it's so challenging, +if it's really spontaneous even for the speaker, + +0:44:49.889 --> 0:44:55.634 +you need maybe even the video to really get +that and at least the audio. + +0:45:01.841 --> 0:45:06.025 +Yeah what it also depends on. + +0:45:06.626 --> 0:45:15.253 +The purpose, of course, and very important +thing is the easiest tasks just to removing. + +0:45:15.675 --> 0:45:25.841 +Of course you have to be very careful because +if you remove some of the not, it's normally + +0:45:25.841 --> 0:45:26.958 +not much. + +0:45:27.227 --> 0:45:33.176 +But if you remove too much, of course, that's +very, very bad because you're losing important. + +0:45:33.653 --> 0:45:46.176 +And this might be even more challenging if +you think about rarer and unseen works. + +0:45:46.226 --> 0:45:56.532 +So when doing this removal, it's important +to be careful and normally more conservative. + +0:46:03.083 --> 0:46:15.096 +Of course, also you have to again see if you're +doing that now in a two step approach, not + +0:46:15.096 --> 0:46:17.076 +an end to end. + +0:46:17.076 --> 0:46:20.772 +So first you need a remote. + +0:46:21.501 --> 0:46:30.230 +But you have to somehow sing it in the whole +type line. + +0:46:30.230 --> 0:46:36.932 +If you learn text or remove disfluencies,. + +0:46:36.796 --> 0:46:44.070 +But it might be that the ASR system is outputing +something else or that it's more of an ASR + +0:46:44.070 --> 0:46:44.623 +error. + +0:46:44.864 --> 0:46:46.756 +So um. + +0:46:46.506 --> 0:46:52.248 +Just for example, if you do it based on language +modeling scores, it might be that you're just + +0:46:52.248 --> 0:46:57.568 +the language modeling score because the has +done some errors, so you really have to see + +0:46:57.568 --> 0:46:59.079 +the combination of that. + +0:46:59.419 --> 0:47:04.285 +And for example, we had like partial words. + +0:47:04.285 --> 0:47:06.496 +They are like some. + +0:47:06.496 --> 0:47:08.819 +We didn't have that. + +0:47:08.908 --> 0:47:18.248 +So these feelings cannot be that you start +in the middle of the world and then you switch + +0:47:18.248 --> 0:47:19.182 +because. + +0:47:19.499 --> 0:47:23.214 +And of course, in text in perfect transcript, +that's very easy to recognize. 
+ +0:47:23.214 --> 0:47:24.372 +That's not a real word. + +0:47:24.904 --> 0:47:37.198 +However, when you really do it into an system, +he will normally detect some type of word because + +0:47:37.198 --> 0:47:40.747 +he only can help the words. + +0:47:50.050 --> 0:48:03.450 +Example: We should think so if you have this +in the transcript it's easy to detect as a + +0:48:03.450 --> 0:48:05.277 +disgusting. + +0:48:05.986 --> 0:48:11.619 +And then, of course, it's more challenging +in a real world example where you have. + +0:48:12.492 --> 0:48:29.840 +Now to the approaches one thing is to really +put it in between so you put your A's system. + +0:48:31.391 --> 0:48:45.139 +So what your task is like, so you have this +text and the outputs in this text. + +0:48:45.565 --> 0:48:49.605 +There is different formulations of that. + +0:48:49.605 --> 0:48:54.533 +You might not be able to do everything like +that. + +0:48:55.195 --> 0:49:10.852 +Or do you also allow, for example, rephrasing +for reordering so in text you might have the + +0:49:10.852 --> 0:49:13.605 +word correctly. + +0:49:13.513 --> 0:49:24.201 +But the easiest thing is you only do it more +like removing, so some things can be removed. + +0:49:29.049 --> 0:49:34.508 +Any ideas how to do that this is output. + +0:49:34.508 --> 0:49:41.034 +You have training data so we have training +data. + +0:49:47.507 --> 0:49:55.869 +To put in with the spoon you can eat it even +after it is out, but after the machine has. + +0:50:00.000 --> 0:50:05.511 +Was wearing rocks, so you have not just the +shoes you remove but wearing them as input, + +0:50:05.511 --> 0:50:07.578 +as disfluent text and as output. + +0:50:07.578 --> 0:50:09.207 +It should be fueled text. + +0:50:09.207 --> 0:50:15.219 +It can be before or after recycling as you +said, but you have this type of task, so technically + +0:50:15.219 --> 0:50:20.042 +how would you address this type of task when +you have to solve this type of. + +0:50:24.364 --> 0:50:26.181 +That's exactly so. + +0:50:26.181 --> 0:50:28.859 +That's one way of doing it. + +0:50:28.859 --> 0:50:33.068 +It's a translation task and you train your. + +0:50:33.913 --> 0:50:34.683 +Can do. + +0:50:34.683 --> 0:50:42.865 +Then, of course, the bit of the challenge +is that you automatically allow rephrasing + +0:50:42.865 --> 0:50:43.539 +stuff. + +0:50:43.943 --> 0:50:52.240 +Which of the one end is good so you have more +opportunities but it might be also a bad thing + +0:50:52.240 --> 0:50:58.307 +because if you have more opportunities you +have more opportunities. + +0:51:01.041 --> 0:51:08.300 +If you want to prevent that, it can also do +more simple labeling, so for each word your + +0:51:08.300 --> 0:51:10.693 +label should not be removed. + +0:51:12.132 --> 0:51:17.658 +People have also been looked into parsley. + +0:51:17.658 --> 0:51:29.097 +You remember maybe the past trees at the beginning +like the structure because the ideas. + +0:51:29.649 --> 0:51:45.779 +There's also more unsupervised approaches +where you then phrase it as a style transfer + +0:51:45.779 --> 0:51:46.892 +task. + +0:51:50.310 --> 0:51:58.601 +At the last point since we have that yes, +it has also been done in an end-to-end fashion + +0:51:58.601 --> 0:52:06.519 +so that it's really you have as input the audio +signal and output you have than the. + +0:52:06.446 --> 0:52:10.750 +The text, without influence, is a clearly +clear text. + +0:52:11.131 --> 0:52:19.069 +You model every single total, which of course +has a big advantage. 
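The labelling formulation mentioned above, a keep/remove decision per token with the cleaned text being just the kept tokens, can be sketched as follows; the toy labeller only flags immediate exact repetitions and is an assumption, since a real system would use a trained classifier to also catch inexact restarts like "there is, there was".

def label_disfluencies(tokens):
    """Toy labeller: mark a token for removal when it is immediately repeated."""
    labels = []
    for i, tok in enumerate(tokens):
        repeated = i + 1 < len(tokens) and tok.lower() == tokens[i + 1].lower()
        labels.append("remove" if repeated else "keep")
    return labels

def apply_labels(tokens, labels):
    return [t for t, l in zip(tokens, labels) if l == "keep"]

tokens = "I I want a a ticket to Houston".split()
print(apply_labels(tokens, label_disfluencies(tokens)))
# -> ['I', 'want', 'a', 'ticket', 'to', 'Houston']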
+ +0:52:19.069 --> 0:52:25.704 +You can use these paralinguistic features, +pauses, and. + +0:52:25.705 --> 0:52:34.091 +If you switch so you start something then +oh it doesn't work continue differently so. + +0:52:34.374 --> 0:52:42.689 +So you can easily use in a fashion while in +a cascade approach. + +0:52:42.689 --> 0:52:47.497 +As we saw there you have text input. + +0:52:49.990 --> 0:53:02.389 +But on the one end we have again, and in the +more extreme case the problem before was endless. + +0:53:02.389 --> 0:53:06.957 +Of course there is even less data. + +0:53:11.611 --> 0:53:12.837 +Good. + +0:53:12.837 --> 0:53:30.814 +This is all about the input to a very more +person, or maybe if you think about YouTube. + +0:53:32.752 --> 0:53:34.989 +Talk so this could use be very exciting. + +0:53:36.296 --> 0:53:42.016 +Is more viewed as style transferred. + +0:53:42.016 --> 0:53:53.147 +You can use ideas from machine translation +where you have one language. + +0:53:53.713 --> 0:53:57.193 +So there is ways of trying to do this type +of style transfer. + +0:53:57.637 --> 0:54:02.478 +Think is definitely also very promising to +make it more and more fluent in a business. + +0:54:03.223 --> 0:54:17.974 +Because one major issue about all the previous +ones is that you need training data and then + +0:54:17.974 --> 0:54:21.021 +you need training. + +0:54:21.381 --> 0:54:32.966 +So I mean, think that we are only really of +data that we have for English. + +0:54:32.966 --> 0:54:39.453 +Maybe there is a very few data in German. + +0:54:42.382 --> 0:54:49.722 +Okay, then let's talk about low latency speech. + +0:54:50.270 --> 0:55:05.158 +So the idea is if we are doing life translation +of a talker, so we want to start out. + +0:55:05.325 --> 0:55:23.010 +This is possible because there is typically +some kind of monotony in many languages. + +0:55:24.504 --> 0:55:29.765 +And this is also what, for example, human +interpreters are doing to have a really low + +0:55:29.765 --> 0:55:30.071 +leg. + +0:55:30.750 --> 0:55:34.393 +They are even going further. + +0:55:34.393 --> 0:55:40.926 +They guess what will be the ending of the +sentence. + +0:55:41.421 --> 0:55:51.120 +Then they can already continue, although it's +not sad it might be needed, but that is even + +0:55:51.120 --> 0:55:53.039 +more challenging. + +0:55:54.714 --> 0:55:58.014 +Why is it so difficult? + +0:55:58.014 --> 0:56:09.837 +There is this train of on the one end for +a and you want to have more context because + +0:56:09.837 --> 0:56:14.511 +we learn if we have more context. + +0:56:15.015 --> 0:56:24.033 +And therefore to have more contacts you have +to wait as long as possible. + +0:56:24.033 --> 0:56:27.689 +The best is to have the full. + +0:56:28.168 --> 0:56:35.244 +On the other hand, you want to have a low +latency for the user to wait to generate as + +0:56:35.244 --> 0:56:35.737 +soon. + +0:56:36.356 --> 0:56:47.149 +So if you're doing no situation you have to +find the best way to start in order to have + +0:56:47.149 --> 0:56:48.130 +a good. + +0:56:48.728 --> 0:56:52.296 +There's no longer the perfect solution. + +0:56:52.296 --> 0:56:56.845 +People will also evaluate what is the translation. + +0:56:57.657 --> 0:57:09.942 +While it's challenging in German to English, +German has this very nice thing where the prefix + +0:57:09.942 --> 0:57:16.607 +of the word can be put at the end of the sentence. 
+ +0:57:17.137 --> 0:57:24.201 +And you only know if the person registers +or cancels his station at the end of the center. + +0:57:24.985 --> 0:57:33.690 +So if you want to start the translation in +English you need to know at this point is the. + +0:57:35.275 --> 0:57:39.993 +So you would have to wait until the end of +the year. + +0:57:39.993 --> 0:57:42.931 +That's not really what you want. + +0:57:43.843 --> 0:57:45.795 +What happened. + +0:57:47.207 --> 0:58:12.550 +Other solutions of doing that are: Have been +motivating like how we can do that subject + +0:58:12.550 --> 0:58:15.957 +object or subject work. + +0:58:16.496 --> 0:58:24.582 +In German it's not always subject, but there +are relative sentence where you have that, + +0:58:24.582 --> 0:58:25.777 +so it needs. + +0:58:28.808 --> 0:58:41.858 +How we can do that is, we'll look today into +three ways of doing that. + +0:58:41.858 --> 0:58:46.269 +The one is to mitigate. + +0:58:46.766 --> 0:58:54.824 +And then the IVAR idea is to do retranslating, +and there you can now use the text output. + +0:58:54.934 --> 0:59:02.302 +So the idea is you translate, and if you later +notice it was wrong then you can retranslate + +0:59:02.302 --> 0:59:03.343 +and correct. + +0:59:03.803 --> 0:59:14.383 +Or you can do what is called extremely coding, +so you can generically. + +0:59:17.237 --> 0:59:30.382 +Let's start with the optimization, so if you +have a sentence, it may reach a conference, + +0:59:30.382 --> 0:59:33.040 +and in this time. + +0:59:32.993 --> 0:59:39.592 +So you have a good translation quality while +still having low latency. + +0:59:39.699 --> 0:59:50.513 +You have an extra model which does your segmentation +before, but your aim is not to have a segmentation. + +0:59:50.470 --> 0:59:53.624 +But you can somehow measure in training data. + +0:59:53.624 --> 0:59:59.863 +If do these types of segment lengths, that's +my latency and that's my translation quality, + +0:59:59.863 --> 1:00:02.811 +and then you can try to search a good way. + +1:00:03.443 --> 1:00:20.188 +If you're doing that one, it's an extra component, +so you can use your system as it was. + +1:00:22.002 --> 1:00:28.373 +The other idea is to directly output the first +high processes always, so always when you have + +1:00:28.373 --> 1:00:34.201 +text or audio we translate, and if we then +have more context available we can update. + +1:00:35.015 --> 1:00:50.195 +So imagine before, if get an eye register +and there's a sentence continued, then. + +1:00:50.670 --> 1:00:54.298 +So you change the output. + +1:00:54.298 --> 1:01:07.414 +Of course, that might be also leading to bad +user experience if you always flicker and change + +1:01:07.414 --> 1:01:09.228 +your output. + +1:01:09.669 --> 1:01:15.329 +The bit like human interpreters also are able +to correct, so they're doing a more long text. + +1:01:15.329 --> 1:01:20.867 +If they are guessing how to continue to say +and then he's saying something different, they + +1:01:20.867 --> 1:01:22.510 +also have to correct them. + +1:01:22.510 --> 1:01:26.831 +So here, since it's not all you, we can even +change what we have said. + +1:01:26.831 --> 1:01:29.630 +Yes, that's exactly what we have implemented. + +1:01:31.431 --> 1:01:49.217 +So how that works is, we are aware, and then +we translate it, and if we get more input like + +1:01:49.217 --> 1:01:51.344 +you, then. + +1:01:51.711 --> 1:02:00.223 +And so we can always continue to do that and +improve the transcript that we have. 
+ +1:02:00.480 --> 1:02:07.729 +So in the end we have the lowest possible +latency because we always output what is possible. + +1:02:07.729 --> 1:02:14.784 +On the other hand, introducing a bit of a +new problem is: There's another challenge when + +1:02:14.784 --> 1:02:20.061 +we first used that this one was first used +for old and that it worked fine. + +1:02:20.061 --> 1:02:21.380 +You switch to NMT. + +1:02:21.380 --> 1:02:25.615 +You saw one problem that is even generating +more flickering. + +1:02:25.615 --> 1:02:28.878 +The problem is the normal machine translation. + +1:02:29.669 --> 1:02:35.414 +So implicitly learn all the output that always +ends with a dot, and it's always a full sentence. + +1:02:36.696 --> 1:02:42.466 +And this was even more important somewhere +in the model than really what is in the input. + +1:02:42.983 --> 1:02:55.910 +So if you give him a partial sentence, it +will still generate a full sentence. + +1:02:55.910 --> 1:02:58.201 +So encourage. + +1:02:58.298 --> 1:03:05.821 +It's like trying to just continue it somehow +to a full sentence and if it's doing better + +1:03:05.821 --> 1:03:10.555 +guessing stuff then you have to even have more +changes. + +1:03:10.890 --> 1:03:23.944 +So here we have a trained mismatch and that's +maybe more a general important thing that the + +1:03:23.944 --> 1:03:28.910 +modem might learn a bit different. + +1:03:29.289 --> 1:03:32.636 +It's always ending with a dog, so you don't +just guess something in general. + +1:03:33.053 --> 1:03:35.415 +So we have your trained test mismatch. + +1:03:38.918 --> 1:03:41.248 +And we have a trained test message. + +1:03:41.248 --> 1:03:43.708 +What is the best way to address that? + +1:03:46.526 --> 1:03:51.934 +That's exactly the right, so we have to like +train also on that. + +1:03:52.692 --> 1:03:55.503 +The problem is for particle sentences. + +1:03:55.503 --> 1:03:59.611 +There's not training data, so it's hard to +find all our. + +1:04:00.580 --> 1:04:06.531 +Hi, I'm ransom quite easy to generate artificial +pottery scent or at least for the source. + +1:04:06.926 --> 1:04:15.367 +So you just take, you take all the prefixes +of the source data. + +1:04:17.017 --> 1:04:22.794 +On the problem of course, with a bit what +do you know lying? + +1:04:22.794 --> 1:04:30.845 +If you have a sentence, I encourage all of +what should be the right target for that. + +1:04:31.491 --> 1:04:45.381 +And the constraints on the one hand, it should +be as long as possible, so you always have + +1:04:45.381 --> 1:04:47.541 +a long delay. + +1:04:47.687 --> 1:04:55.556 +On the other hand, it should be also a suspect +of the previous ones, and it should be not + +1:04:55.556 --> 1:04:57.304 +too much inventing. + +1:04:58.758 --> 1:05:02.170 +A very easy solution works fine. + +1:05:02.170 --> 1:05:05.478 +You can just do a length space. + +1:05:05.478 --> 1:05:09.612 +You also take two thirds of the target. + +1:05:10.070 --> 1:05:19.626 +His learning then implicitly to guess a bit +if you think about the beginning of example. + +1:05:20.000 --> 1:05:30.287 +This one, if you do two sorts like half, in +this case the target would be eye register. + +1:05:30.510 --> 1:05:39.289 +So you're doing a bit of implicit guessing, +and if it's getting wrong you have rewriting, + +1:05:39.289 --> 1:05:43.581 +but you're doing a good amount of guessing. + +1:05:49.849 --> 1:05:53.950 +In addition, this would be like how it looks +like if it was like. 
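A sketch of building such artificial prefix pairs: a random source prefix is paired with a proportionally truncated target, and full sentences are mixed in so the model still learns how to finish complete inputs. The exact truncation rule and mixing ratio are assumptions.

import random

def prefix_pair(src_tokens, tgt_tokens, keep_full_prob=0.5, rng=random):
    """Return either the full pair or a random source prefix with a proportional target prefix."""
    if rng.random() < keep_full_prob:
        return src_tokens, tgt_tokens
    cut = rng.randint(1, len(src_tokens))                  # random source prefix length
    tgt_cut = max(1, round(cut / len(src_tokens) * len(tgt_tokens)))
    return src_tokens[:cut], tgt_tokens[:tgt_cut]

random.seed(0)
src = "ich melde mich für den kurs an".split()             # "I register for the course"
tgt = "i register for the course".split()
for _ in range(3):
    s, t = prefix_pair(src, tgt, keep_full_prob=0.3)
    print(" ".join(s), "->", " ".join(t))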
+ +1:05:53.950 --> 1:05:58.300 +If it wasn't a housing game, then the target +could be something like. + +1:05:58.979 --> 1:06:02.513 +One problem is that you just do that this +way. + +1:06:02.513 --> 1:06:04.619 +It's most of your training. + +1:06:05.245 --> 1:06:11.983 +And in the end you're interested in the overall +translation quality, so for full sentence. + +1:06:11.983 --> 1:06:19.017 +So if you train on that, it will mainly learn +how to translate prefixes because ninety percent + +1:06:19.017 --> 1:06:21.535 +or more of your data is prefixed. + +1:06:22.202 --> 1:06:31.636 +That's why we'll see that it's better to do +like a ratio. + +1:06:31.636 --> 1:06:39.281 +So half your training data are full sentences. + +1:06:39.759 --> 1:06:47.693 +Because if you're doing this well you see +that for every word prefix and only one sentence. + +1:06:48.048 --> 1:06:52.252 +You also see that nicely here here are both. + +1:06:52.252 --> 1:06:56.549 +This is the blue scores and you see the bass. + +1:06:58.518 --> 1:06:59.618 +Is this one? + +1:06:59.618 --> 1:07:03.343 +It has a good quality because it's trained. + +1:07:03.343 --> 1:07:11.385 +If you know, train with all the partial sentences +is more focusing on how to translate partial + +1:07:11.385 --> 1:07:12.316 +sentences. + +1:07:12.752 --> 1:07:17.840 +Because all the partial sentences will at +some point be removed, because at the end you + +1:07:17.840 --> 1:07:18.996 +translate the full. + +1:07:20.520 --> 1:07:24.079 +There's many tasks to read, but you have the +same performances. + +1:07:24.504 --> 1:07:26.938 +On the other hand, you see here the other +problem. + +1:07:26.938 --> 1:07:28.656 +This is how many words got updated. + +1:07:29.009 --> 1:07:31.579 +You want to have as few updates as possible. + +1:07:31.579 --> 1:07:34.891 +Updates need to remove things which are once +being shown. + +1:07:35.255 --> 1:07:40.538 +This is quite high for the baseline. + +1:07:40.538 --> 1:07:50.533 +If you know the partials that are going down, +they should be removed. + +1:07:51.151 --> 1:07:58.648 +And then for moody tasks you have a bit like +the best note of swim. + +1:08:02.722 --> 1:08:05.296 +Any more questions to this type of. + +1:08:09.309 --> 1:08:20.760 +The last thing is that you want to do an extremely. + +1:08:21.541 --> 1:08:23.345 +Again, it's a bit implication. + +1:08:23.345 --> 1:08:25.323 +Scenario is what you really want. + +1:08:25.323 --> 1:08:30.211 +As you said, we sometimes use this updating, +and for text output it'd be very nice. + +1:08:30.211 --> 1:08:35.273 +But imagine if you want to audio output, of +course you can't change it anymore because + +1:08:35.273 --> 1:08:37.891 +on one side you cannot change what was said. + +1:08:37.891 --> 1:08:40.858 +So in this time you more need like a fixed +output. + +1:08:41.121 --> 1:08:47.440 +And then the style of street decoding is interesting. + +1:08:47.440 --> 1:08:55.631 +Where you, for example, get sourced, the seagullins +are so stoked in. + +1:08:55.631 --> 1:09:00.897 +Then you decide oh, now it's better to wait. + +1:09:01.041 --> 1:09:14.643 +So you somehow need to have this type of additional +information. + +1:09:15.295 --> 1:09:23.074 +Here you have to decide should know I'll put +a token or should wait for my and feel. + +1:09:26.546 --> 1:09:32.649 +So you have to do this additional labels like +weight, weight, output, output, wage and so + +1:09:32.649 --> 1:09:32.920 +on. 
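The wait/output decisions described here can be written as a small streaming loop in which a policy chooses at each step between reading one more source token and committing one more target token; the example policy waits for k source tokens and then alternates, which is the simple fixed schedule the lecture turns to next, and `translate_step` is a stand-in model call, not the lecture's system.

def stream_decode(source_stream, policy, translate_step, max_len=100):
    src, out = [], []
    stream = iter(source_stream)
    exhausted = False
    while len(out) < max_len:
        if not exhausted and policy(src, out) == "WAIT":
            try:
                src.append(next(stream))            # read one more source token
            except StopIteration:
                exhausted = True
        else:
            token = translate_step(src, out)        # commit one more target token
            if token == "</s>":
                break
            out.append(token)
    return out

# Example policy: wait until k source tokens are read, then alternate read/write.
wait_k_policy = lambda src, out, k=3: "WAIT" if len(src) < len(out) + k else "OUTPUT"
# Toy "model" that simply copies the source token k positions behind, then stops.
toy = lambda src, out: src[len(out)] if len(out) < len(src) else "</s>"
print(stream_decode("wir sehen uns morgen".split(), wait_k_policy, toy))
# -> ['wir', 'sehen', 'uns', 'morgen']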
+ +1:09:33.453 --> 1:09:38.481 +There are different ways of doing that. + +1:09:38.481 --> 1:09:45.771 +You can have an additional model that does +this decision. + +1:09:46.166 --> 1:09:53.669 +And then have a higher quality or better to +continue and then have a lower latency in this + +1:09:53.669 --> 1:09:54.576 +different. + +1:09:55.215 --> 1:09:59.241 +Surprisingly, a very easy task also works, +sometimes quite good. + +1:10:03.043 --> 1:10:10.981 +And that is the so called way care policy +and the idea is there at least for text to + +1:10:10.981 --> 1:10:14.623 +text translation that is working well. + +1:10:14.623 --> 1:10:22.375 +It's like you wait for words and then you +always output one and like one for each. + +1:10:22.682 --> 1:10:28.908 +So your weight slow works at the beginning +of the sentence, and every time a new board + +1:10:28.908 --> 1:10:29.981 +is coming you. + +1:10:31.091 --> 1:10:39.459 +So you have the same times to beat as input, +so you're not legging more or less, but to + +1:10:39.459 --> 1:10:41.456 +have enough context. + +1:10:43.103 --> 1:10:49.283 +Of course this for example for the unmarried +will not solve it perfectly but if you have + +1:10:49.283 --> 1:10:55.395 +a bit of local reordering inside your token +that you can manage very well and then it's + +1:10:55.395 --> 1:10:57.687 +a very simple solution but it's. + +1:10:57.877 --> 1:11:00.481 +The other one was dynamic. + +1:11:00.481 --> 1:11:06.943 +Depending on the context you can decide how +long you want to wait. + +1:11:07.687 --> 1:11:21.506 +It also only works if you have a similar amount +of tokens, so if your target is very short + +1:11:21.506 --> 1:11:22.113 +of. + +1:11:22.722 --> 1:11:28.791 +That's why it's also more challenging for +audio input because the speaking rate is changing + +1:11:28.791 --> 1:11:29.517 +and so on. + +1:11:29.517 --> 1:11:35.586 +You would have to do something like I'll output +a word for every second a year or something + +1:11:35.586 --> 1:11:35.981 +like. + +1:11:36.636 --> 1:11:45.459 +The problem is that the audio speaking speed +is not like fixed but quite very, and therefore. + +1:11:50.170 --> 1:11:58.278 +Therefore, what you can also do is you can +use a similar solution than we had before with + +1:11:58.278 --> 1:11:59.809 +the resetteling. + +1:12:00.080 --> 1:12:02.904 +You remember we were re-decoded all the time. + +1:12:03.423 --> 1:12:12.253 +And you can do something similar in this case +except that you add something in that you're + +1:12:12.253 --> 1:12:16.813 +saying, oh, if I read it cold, I'm not always. + +1:12:16.736 --> 1:12:22.065 +Can decode as I want, but you can do this +target prefix decoding, so what you say is + +1:12:22.065 --> 1:12:23.883 +in your achievement section. + +1:12:23.883 --> 1:12:26.829 +You can easily say generate a translation +bus. + +1:12:27.007 --> 1:12:29.810 +The translation has to start with the prefix. + +1:12:31.251 --> 1:12:35.350 +How can you do that? + +1:12:39.839 --> 1:12:49.105 +In the decoder exactly you start, so if you +do beam search you select always the most probable. + +1:12:49.349 --> 1:12:57.867 +And now you say oh, I'm not selecting the +most perfect, but this is the fourth, so in + +1:12:57.867 --> 1:13:04.603 +the first step have to take this one, in the +second start decoding. + +1:13:04.884 --> 1:13:09.387 +And then you're making sure that your second +always starts with this prefix. 
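Forcing the committed prefix during re-decoding, as just described, can be sketched with a greedy decoder: during the first steps the chosen token is taken from the prefix, and free decoding only starts afterwards (in beam search one would instead keep only hypotheses consistent with the prefix). `next_token_scores` is a stand-in for one decoder step of the model and is an assumption.

def decode_with_prefix(source, next_token_scores, prefix, max_len=50):
    output = []
    for step in range(max_len):
        scores = next_token_scores(source, output)      # token -> score for the next position
        if step < len(prefix):
            token = prefix[step]                        # forced: keep the committed prefix
        else:
            token = max(scores, key=scores.get)         # free greedy decoding afterwards
        if token == "</s>":
            break
        output.append(token)
    return output

# Toy decoder steps, indexed by how many tokens have been produced so far:
steps = [{"all": 2.0, "alles": 1.0}, {"models": 2.0, "</s>": 1.0}, {"</s>": 2.0}]
toy = lambda src, out: steps[min(len(out), 2)]
print(decode_with_prefix(["alle", "modelle"], toy, prefix=["all"]))   # ['all', 'models']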
+ +1:13:10.350 --> 1:13:18.627 +And then you can use your immediate retranslation, +but you're no longer changing the output. + +1:13:19.099 --> 1:13:31.595 +Out as it works, so it may get a speech signal +and input, and it is not outputing any. + +1:13:32.212 --> 1:13:45.980 +So then if you got you get a translation maybe +and then you decide yes output. + +1:13:46.766 --> 1:13:54.250 +And then you're translating as one as two +as sweet as four, but now you say generate + +1:13:54.250 --> 1:13:55.483 +only outputs. + +1:13:55.935 --> 1:14:07.163 +And then you're translating and maybe you're +deciding on and now a good translation. + +1:14:07.163 --> 1:14:08.880 +Then you're. + +1:14:09.749 --> 1:14:29.984 +Yes, but don't get to worry about what the +effect is. + +1:14:30.050 --> 1:14:31.842 +We're generating your target text. + +1:14:32.892 --> 1:14:36.930 +But we're not always outputing the full target +text now. + +1:14:36.930 --> 1:14:43.729 +What we are having is we have here some strategy +to decide: Oh, is a system already sure enough + +1:14:43.729 --> 1:14:44.437 +about it? + +1:14:44.437 --> 1:14:49.395 +If it's sure enough and it has all the information, +we can output it. + +1:14:49.395 --> 1:14:50.741 +And then the next. + +1:14:51.291 --> 1:14:55.931 +If we say here sometimes with better not to +get output we won't output it already. + +1:14:57.777 --> 1:15:06.369 +And thereby the hope is in the uphill model +should not yet outcut a register because it + +1:15:06.369 --> 1:15:10.568 +doesn't mean no yet if it's a case or not. + +1:15:13.193 --> 1:15:18.056 +So what we have to discuss is what is a good +output strategy. + +1:15:18.658 --> 1:15:20.070 +So you could do. + +1:15:20.070 --> 1:15:23.806 +The output strategy could be something like. + +1:15:23.743 --> 1:15:39.871 +If you think of weight cape, this is an output +strategy here that you always input. + +1:15:40.220 --> 1:15:44.990 +Good, and you can view your weight in a similar +way as. + +1:15:45.265 --> 1:15:55.194 +But now, of course, we can also look at other +output strategies where it's more generic and + +1:15:55.194 --> 1:15:59.727 +it's deciding whether in some situations. + +1:16:01.121 --> 1:16:12.739 +And one thing that works quite well is referred +to as local agreement, and that means you're + +1:16:12.739 --> 1:16:13.738 +always. + +1:16:14.234 --> 1:16:26.978 +Then you're looking what is the same thing +between my current translation and the one + +1:16:26.978 --> 1:16:28.756 +did before. + +1:16:29.349 --> 1:16:31.201 +So let's do that again in six hours. + +1:16:31.891 --> 1:16:45.900 +So your input is a first audio segment and +your title text is all model trains. + +1:16:46.346 --> 1:16:53.231 +Then you're getting six opposites, one and +two, and this time the output is all models. + +1:16:54.694 --> 1:17:08.407 +You see trains are different, but both of +them agree that it's all so in those cases. + +1:17:09.209 --> 1:17:13.806 +So we can be hopefully a big show that really +starts with all. + +1:17:15.155 --> 1:17:22.604 +So now we say we're output all, so at this +time instead we'll output all, although before. + +1:17:23.543 --> 1:17:27.422 +We are getting one, two, three as input. + +1:17:27.422 --> 1:17:35.747 +This time we have a prefix, so now we are +only allowing translations to start with all. + +1:17:35.747 --> 1:17:42.937 +We cannot change that anymore, so we now need +to generate some translation. + +1:17:43.363 --> 1:17:46.323 +And then it can be that its now all models +are run. 
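The local-agreement strategy can be sketched as follows: retranslate the growing audio, compare the new hypothesis with the previous one, and commit only their longest common prefix, never revising what has already been shown. `translate` is a stand-in for the full offline model and is an assumption.

def longest_common_prefix(a, b):
    out = []
    for x, y in zip(a, b):
        if x != y:
            break
        out.append(x)
    return out

def local_agreement(chunks, translate):
    committed, previous = [], []
    for i in range(1, len(chunks) + 1):
        hyp = translate(chunks[:i], prefix=committed)    # retranslate the longer audio prefix
        agreed = longest_common_prefix(previous, hyp)
        if len(agreed) > len(committed):
            committed = agreed                           # commit only what both runs agree on
        previous = hyp
    return committed

# Toy demo: the hypothesis stabilises as more audio chunks arrive.
hyps = {1: ["all"], 2: ["all", "models", "are"], 3: ["all", "models", "are", "wrong"]}
print(local_agreement([1, 2, 3], lambda chunks, prefix: hyps[len(chunks)]))
# -> ['all', 'models', 'are']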
+ +1:17:47.927 --> 1:18:01.908 +Then we compare here and see this agrees on +all models so we can output all models. + +1:18:02.882 --> 1:18:07.356 +So this by we can dynamically decide is a +model is very anxious. + +1:18:07.356 --> 1:18:10.178 +We always talk with something different. + +1:18:11.231 --> 1:18:24.872 +Then it's, we'll wait longer, it's more for +the same thing, and hope we don't need to wait. + +1:18:30.430 --> 1:18:40.238 +Is it clear again that the signal wouldn't +be able to detect? + +1:18:43.203 --> 1:18:50.553 +The hope it is because if it's not sure of, +of course, it in this kind would have to switch + +1:18:50.553 --> 1:18:51.671 +all the time. + +1:18:56.176 --> 1:19:01.375 +So if it would be the first step to register +and the second time to cancel and they may + +1:19:01.375 --> 1:19:03.561 +register again, they wouldn't do it. + +1:19:03.561 --> 1:19:08.347 +Of course, it is very short because in register +a long time, then it can't deal. + +1:19:08.568 --> 1:19:23.410 +That's why there's two parameters that you +can use and which might be important, or how. + +1:19:23.763 --> 1:19:27.920 +So you do it like every one second, every +five seconds or something like that. + +1:19:28.648 --> 1:19:37.695 +Put it more often as your latency will be +because your weight is less long, but also + +1:19:37.695 --> 1:19:39.185 +you might do. + +1:19:40.400 --> 1:19:50.004 +So that is the one thing and the other thing +is for words you might do everywhere, but if + +1:19:50.004 --> 1:19:52.779 +you think about audio it. + +1:19:53.493 --> 1:20:04.287 +And the other question you can do like the +agreement, so the model is sure. + +1:20:04.287 --> 1:20:10.252 +If you say have to agree, then hopefully. + +1:20:10.650 --> 1:20:21.369 +What we saw is think there has been a really +normally good performance and otherwise your + +1:20:21.369 --> 1:20:22.441 +latency. + +1:20:22.963 --> 1:20:42.085 +Okay, we'll just make more tests and we'll +get the confidence. + +1:20:44.884 --> 1:20:47.596 +Have to completely agree with that. + +1:20:47.596 --> 1:20:53.018 +So when this was done, that was our first +idea of using the confidence. + +1:20:53.018 --> 1:21:00.248 +The problem is that currently that's my assumption +is that the modeling the model confidence is + +1:21:00.248 --> 1:21:03.939 +not that easy, and they are often overconfident. + +1:21:04.324 --> 1:21:17.121 +In the paper there is this type also where +you try to use the confidence in some way to + +1:21:17.121 --> 1:21:20.465 +decide the confidence. + +1:21:21.701 --> 1:21:26.825 +But that gave worse results, and that's why +we looked into that. + +1:21:27.087 --> 1:21:38.067 +So it's a very good idea think, but it seems +not to at least how it was implemented. + +1:21:38.959 --> 1:21:55.670 +There is one way that maybe goes in more direction, +which is very new. + +1:21:55.455 --> 1:22:02.743 +If this one, the last word is attending mainly +to the end of the audio. + +1:22:02.942 --> 1:22:04.934 +You might you should not output it yet. + +1:22:05.485 --> 1:22:15.539 +Because they might think there is something +more missing than you need to know, so they + +1:22:15.539 --> 1:22:24.678 +look at the attention and only output parts +which look to not the audio signal. + +1:22:25.045 --> 1:22:40.175 +So there is, of course, a lot of ways how +you can do it better or easier in some way. 
+ +1:22:41.901 --> 1:22:53.388 +Instead tries to predict the next word with +a large language model, and then for text translation + +1:22:53.388 --> 1:22:54.911 +you predict. + +1:22:55.215 --> 1:23:01.177 +Then you translate all of them and decide +if there is a change so you can even earlier + +1:23:01.177 --> 1:23:02.410 +do your decision. + +1:23:02.362 --> 1:23:08.714 +The idea is that if we continue and then this +will be to a change in the translation, then + +1:23:08.714 --> 1:23:10.320 +we should have opened. + +1:23:10.890 --> 1:23:18.302 +So it's more doing your estimate about possible +continuations of the source instead of looking + +1:23:18.302 --> 1:23:19.317 +at previous. + +1:23:23.783 --> 1:23:31.388 +All that works is a bit here like one example. + +1:23:31.388 --> 1:23:39.641 +It has a legacy baselines and you are not +putting. + +1:23:40.040 --> 1:23:47.041 +And you see in this case you have worse blood +scores here. + +1:23:47.041 --> 1:23:51.670 +For equal one you have better latency. + +1:23:52.032 --> 1:24:01.123 +The how to and how does anybody have an idea +of what could be challenging there or when? + +1:24:05.825 --> 1:24:20.132 +One problem of these models are hallucinations, +and often very long has a negative impact on. + +1:24:24.884 --> 1:24:30.869 +If you don't remove the last four words but +your model now starts to hallucinate and invent + +1:24:30.869 --> 1:24:37.438 +just a lot of new stuff then yeah you're removing +the last four words of that but if it has invented + +1:24:37.438 --> 1:24:41.406 +ten words and you're still outputting six of +these invented. + +1:24:41.982 --> 1:24:48.672 +Typically once it starts hallucination generating +some output, it's quite long, so then it's + +1:24:48.672 --> 1:24:50.902 +no longer enough to just hold. + +1:24:51.511 --> 1:24:57.695 +And then, of course, a bit better if you compare +to the previous ones. + +1:24:57.695 --> 1:25:01.528 +Their destinations are typically different. + +1:25:07.567 --> 1:25:25.939 +Yes, so we don't talk about the details, but +for outputs, for presentations, there's different + +1:25:25.939 --> 1:25:27.100 +ways. + +1:25:27.347 --> 1:25:36.047 +So you want to have maximum two lines, maximum +forty-two characters per line, and the reading + +1:25:36.047 --> 1:25:40.212 +speed is a maximum of twenty-one characters. + +1:25:40.981 --> 1:25:43.513 +How to Do That We Can Skip. + +1:25:43.463 --> 1:25:46.804 +Then you can generate something like that. + +1:25:46.886 --> 1:25:53.250 +Another challenge is, of course, that you +not only need to generate the translation, + +1:25:53.250 --> 1:25:59.614 +but for subtlyning you also want to generate +when to put breaks and what to display. + +1:25:59.619 --> 1:26:06.234 +Because it cannot be full sentences, as said +here, if you have like maximum twenty four + +1:26:06.234 --> 1:26:10.443 +characters per line, that's not always a full +sentence. + +1:26:10.443 --> 1:26:12.247 +So how can you make it? + +1:26:13.093 --> 1:26:16.253 +And then for speech there's not even a hint +of wisdom. + +1:26:18.398 --> 1:26:27.711 +So what we have done today is yeah, we looked +into maybe three challenges: We have this segmentation, + +1:26:27.711 --> 1:26:33.013 +which is a challenge both in evaluation and +in the decoder. + +1:26:33.013 --> 1:26:40.613 +We talked about disfluencies and we talked +about simultaneous translations and how to + +1:26:40.613 --> 1:26:42.911 +address these challenges. + +1:26:43.463 --> 1:26:45.507 +Any more questions. 
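The subtitle constraints quoted a little above (at most two lines, at most forty-two characters per line, and a reading speed of at most twenty-one characters, presumably per second) can be checked with a few lines of Python; the greedy line wrapping used here is an assumption.

import textwrap

MAX_LINES, MAX_CHARS_PER_LINE, MAX_CHARS_PER_SECOND = 2, 42, 21

def make_subtitle(text, start_sec, end_sec):
    lines = textwrap.wrap(text, width=MAX_CHARS_PER_LINE)
    chars_per_second = len(text) / max(end_sec - start_sec, 1e-6)
    ok = len(lines) <= MAX_LINES and chars_per_second <= MAX_CHARS_PER_SECOND
    return lines, chars_per_second, ok

print(make_subtitle("You have the disadvantage that you cannot change it afterwards.",
                    start_sec=888.8, end_sec=894.4))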
+ +1:26:48.408 --> 1:26:52.578 +Good then new content. + +1:26:52.578 --> 1:26:58.198 +We are done for this semester. + +1:26:58.198 --> 1:27:04.905 +You can keep your knowledge in that. + +1:27:04.744 --> 1:27:09.405 +Repetition where we can try to repeat a bit +what we've done all over the semester. + +1:27:10.010 --> 1:27:13.776 +Now prepare a bit of repetition to what think +is important. + +1:27:14.634 --> 1:27:21.441 +But of course is also the chance for you to +ask specific questions. + +1:27:21.441 --> 1:27:25.445 +It's not clear to me how things relate. + +1:27:25.745 --> 1:27:34.906 +So if you have any specific questions, please +come to me or send me an email or so, then + +1:27:34.906 --> 1:27:36.038 +I'm happy. + +1:27:36.396 --> 1:27:46.665 +If should focus on it really in depth, it +might be good not to come and send me an email + +1:27:46.665 --> 1:27:49.204 +on Wednesday evening. + diff --git a/demo_data/lectures/Lecture-19-21.07.2023/video.mp4 b/demo_data/lectures/Lecture-19-21.07.2023/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..aaafd6e6e4837c2e52290d72de355a88f09c9ef4 --- /dev/null +++ b/demo_data/lectures/Lecture-19-21.07.2023/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:627fd6a73ed6853821cd58c2fc9e938a7844998ed51c4163f2d0a4771dc5c156 +size 130103518 diff --git a/demo_data/nips-2021/25957/metadata.json b/demo_data/nips-2021/25957/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..edb8efdecd5452940ebc473f4a04bf578e37076b --- /dev/null +++ b/demo_data/nips-2021/25957/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Shared Independent Component Analysis for Multi-Subject Neuroimaging" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25957/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25957/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9e9965b65c750f47dce338f035500f2ac913d98 --- /dev/null +++ b/demo_data/nips-2021/25957/transcript_whisper_large-v2.txt @@ -0,0 +1,179 @@ +Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay. +I'm in the INRIA Paris et Alpes team and my supervisor is Bertrand Thirion. +Today I'll talk about shared independent component analysis for multi-subject neuroimaging. +This is a joint work with Pierre Abelin, Alexandre Grandfort, Bertrand Thirion and Anna Pouy-Varine. +First let us consider two sources that are emitting a signal that is recorded by two +sensors. +This can be seen as a simplified model of magnetoencephalography where brain sources +are recorded by magnetometers. +Because propagation time can be neglected, the signal recorded by the sensors can be +seen as a linear mixture of the signal emitted by the sources. +S is a set of sources that are assumed to be independent. +X are the recordings and A describes how the sources are mixed to produce the recordings. +At first sight this model may seem ill-defined because if we permute two columns in A and +permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing +matrix A' that describes X just as well as A and S. +And similarly if we scale the column of A by some constant, one column of A by some +constant and the corresponding source by the same constant, we'll also get an equivalent +description of X. +However, these scale and permutation indeterminacies are the only one if the sources contain at +most one Gaussian component. 
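A small numpy sketch of the two-source, two-sensor model just described, including the scale indeterminacy: rescaling a column of the mixing matrix A and the matching source leaves the recordings X = A S unchanged. The concrete numbers are illustrative assumptions.

import numpy as np

rng = np.random.RandomState(0)
S = rng.laplace(size=(2, 1000))               # two independent, non-Gaussian sources
A = np.array([[1.0, 0.5],
              [0.3, 2.0]])                    # how the sources project to the two sensors
X = A @ S                                     # what the two sensors record

# Scale indeterminacy: rescaling a column of A and the matching source leaves X unchanged.
A_rescaled = A * np.array([2.0, 1.0])
S_rescaled = S / np.array([[2.0], [1.0]])
print(np.allclose(X, A_rescaled @ S_rescaled))   # True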
+Let us consider the more general problem where you have multiple subjects that are exposed +to the same stimuli. +We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2, +and different noise levels, N1 and N2. +The interpretation is that they have shared sources because they have shared connective +processes. +They have different mixing matrices because they have different spatial topography. +And they have different noises because we want to model inter-subject variability. +This model is called group ICA. +There are many methods to provide a solution for the group ICA problem. +A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects +feature-wise and then perform a PCA, a principal component analysis, on the stacked data. +And therefore you obtain reduced data and apply independent component analysis on the +reduced data to obtain a set of sources. +Another formulation is introduced by Varoko in 2010 and is called K-NICA. +You just replace the principal component analysis with a multiset CCA, so a multiset canonical +correlation analysis, where you have to solve a generalized eigenvalue problem. +There are many different formulations of multiset CCA, but this one with a generalized eigenvalue +problem is the fastest to solve. +KNICA and Cut-ICA have a lot of advantages. +First, they are very fast to fit. +And second, they are simple to implement. +These are the two reasons why they are so popular in neuroimaging. +However, they do not optimize the proper likelihood. +So therefore they do not benefit from advantages of such estimators such as asymptotic efficiency. +There are a lot of other related work that do optimize the proper likelihood. +I want to mention the independent vector analysis, which is a very powerful framework introduced +by Li in 2008. +So unified approach of Guo in 2008 that we will also mention and talk about later. +The approach of Shen in 2015 that also allows to perform dimension reduction. +And the multi-view ICA that was introduced by our team last year. +I want to quickly say that it's not obvious to design a likelihood-based approach that +is tractable. +And with this example of the Gaussian mixture noisy ICA by Bermond and Cardozo, we'll see +that standard approach leads to intractable algorithms. +The model we take here is the same as the group ICA, but we assume that the noise is +Gaussian with the same variance for all subjects. +We'll also assume that the sources follow a Gaussian mixture model. +And we further assume that the weights of the Gaussian mixtures are known. +We can solve such model via expectation maximization. +And if we write the E-step, we'll get a closed form that involves a large sum. +Because of this large size, this sum, and therefore the M algorithm is intractable whenever +Q and K are large. +Our contribution is shared ICA, what we call Shikha for short, where the data of subject +i are assumed as a linear mixture of noisy sources, and the noise here is not on the +sensor, but on the sources. +The noise is Gaussian with a variance that can be different for each subject and different +for each component. +S are assumed to be independent, but in contrast to almost all existing work, some components +can be Gaussian. +We have a few blanket assumptions. +We assume that the data are centered, that the mixing metrics are invertible, that the +sources have identical variance, and that the number of subjects is greater than 3. +We have two algorithms to solve the Shikha model. 
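The stack-then-reduce pipeline described above (Calhoun-style group ICA: concatenate the subjects along the feature axis, reduce with PCA, run ICA on the reduced data) can be sketched with scikit-learn; the shapes and the use of FastICA here are illustrative assumptions, not the exact implementation used in the talk.

import numpy as np
from sklearn.decomposition import PCA, FastICA

def concat_group_ica(subject_data, n_components):
    """subject_data: list of arrays of shape (n_features_i, n_samples)."""
    stacked = np.vstack(subject_data)                       # stack all subjects feature-wise
    reduced = PCA(n_components=n_components).fit_transform(stacked.T)
    sources = FastICA(n_components=n_components, random_state=0).fit_transform(reduced)
    return sources.T                                        # (n_components, n_samples)

rng = np.random.RandomState(0)
true_sources = rng.laplace(size=(3, 500))
subjects = [rng.randn(10, 3) @ true_sources + 0.1 * rng.randn(10, 500) for _ in range(4)]
print(concat_group_ica(subjects, n_components=3).shape)     # (3, 500)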
+We have ShikhaJ, that is a FAS algorithm that is based on multiset CCA, and ShikhaML, a +maximum likelihood approach. +In Shikha, there are two ways to recover the parameters. +Either the source are non-Gaussian, in which case we can use classical ICA results to recover +the unmixing matrices. +When the components are Gaussian, then we need something else, and what we use here +is noise diversity. +When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix +and the noise covariance up to a permutation and sign indeterminacy. +Note that the noise diversity in Gaussian components is also a necessary condition. +If it does not hold, then Shikha cannot be identified. +Let us now focus on this theorem that is at the core of the ShikhaJ algorithm. +Namely it shows that we can solve group ICA with multiset CCA. +So assume the data follows the Shikha model, and consider the multiset CCA framed as a +generalized eigenvalue problem. +This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by +second-order statistics, and D is formed by the diagonal blocks in C. +And so if we solve this eigenvalue problem and take the first k leading eigenvectors, +we can recover the correct unmixing matrix from them, up to a permutation and a scaling. +And this can only be done if the k first eigenvalues are distinct. +Note that the distinct eigenvalue condition is also necessary. +If two eigenvalues are the same, then this adds the need to determine IC, and therefore +we cannot solve group IC. +Note also that the condition that some eigenvalues need to be distinct is stronger than the noise +diversity condition we have in the identifiability theorem. +And therefore we can exhibit an example which is identifiable, but on which multiset CCA +will fail. +And I refer you to the paper for more details on this. +So in our theorem, in order to recover the correct unmixing matrix, we need to have access +to the second-order statistics. +However, in practice, we only have access to them, up to some sampling noise. +And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in +the second-order statistics can lead to a high deviation of the recovered unmixing matrix. +Now to show this in practice, we take three subjects, two components, and noise covariance +matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon. +And we compare the solution of multiset CCA on the true covariance matrices and on the +perturbed covariance matrix, where the perturbation scale is given by delta. +And for different values of epsilon, 10-4, 10-3, 10-2, 10-1, we show how the performance +of the algorithm, so the M-ary distance between the true unmixing matrix and the estimated +unmixing matrix, varies when the perturbation scale increases. +And we see that when the eigengap is very close, so 10-4, the violet curve, then even +with a very small perturbation, you can get to a very bad M-ary distance. +So the black dashed curve is a performance of chance. +Luckily, there is a large gap between the k-th eigenvalues and the k plus 1. +This means that in practice, the span of the p-leading eigenvectors is approximately preserved. +We can recover the true unmixing matrix from the unmixing matrix estimated by multiset +CCA, just by multiplying by a matrix Q. +And in order to estimate Q, we make use of the fact that the unmixed data should have +a diagonal covariance. 
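The generalized eigenvalue formulation at the heart of ShikhaJ, described a few sentences back, can be sketched as follows. This is an illustrative reading of the description (matrices C and D built from second-order statistics, leading eigenvectors taken as per-subject unmixing blocks), not the reference implementation:

```python
import numpy as np
from scipy.linalg import eigh

def multiset_cca(X_list, n_components):
    """Generalized-eigenvalue form of multiset CCA, following the description above.

    X_list: list of (p, n_samples) arrays, one per subject.
    Returns one (n_components, p) block per subject, playing the role of an
    unmixing matrix up to permutation and scaling.
    """
    p = X_list[0].shape[0]
    X_all = np.vstack(X_list)               # (n_subjects * p, n_samples)
    C = np.cov(X_all)                       # all second-order statistics
    D = np.zeros_like(C)
    for i in range(len(X_list)):            # D keeps only the diagonal blocks of C
        sl = slice(i * p, (i + 1) * p)
        D[sl, sl] = C[sl, sl]
    eigvals, eigvecs = eigh(C, D)           # generalized eigenvalue problem (ascending order)
    W = eigvecs[:, ::-1][:, :n_components]  # k leading generalized eigenvectors
    return [W[i * p:(i + 1) * p].T for i in range(len(X_list))]
```

The result is only correct up to permutation and scaling; requiring the unmixed data to have a diagonal covariance is what supplies the correction matrix Q discussed next.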
+This leads us to a joint diagonalization problem that we can solve efficiently. +So if we take the experiments we've done on the previous slide, the results are still +shown here. +You can see the violet curves, and that is very sensitive to perturbation. +And so if we apply joint diagonalization, all these curves move, and they join the dashed +curve on the bottom. +And therefore, it's much better, because now the new curves that are represented by the +dashed line are less sensitive to perturbations. +So now we've obtained the correct unmixing matrix, but up to a scaling. +And so we need an additional step to find the correct scaling, and another one to find +the other parameter that is still unestimated, which are the noise covariance. +And luckily, it's very easy to find the noise covariance. +We can do this via an EM algorithm. +The E-step and the M-step are in closed form, and this yields a very fast algorithm. +But the Shikha-J is not a maximum likelihood estimator. +So now we will focus on Shikha-ML, which is our maximum likelihood estimator. +So I won't go too much into details on this, but we optimize this via an EM using a Gaussian +mixture assumption as a source. +We assume that the weights are known. +What I just want to showcase here is that the E-step of the algorithm, the one that +gives you the expectation of the sources given the data, and the variance of the sources +given the data, only involves the sum of size 2. +So previously we had a sum that had an exponential number of terms, and here we don't have that +anymore. +So the E-step is much faster than what we had before, and therefore the EM algorithm +here is tractable, whereas it was not the case before. +I first want to present our synthetic experiment where we generate data according to the Shikha-ML +and Shikha-J model. +In case A, we have only Gaussian components, but we have noise diversity, and therefore +methods that use noise diversity to recover the sources such as Shikha-ML and Shikha-J +perform best. +In the second case, we have only non-Gaussian components and no noise diversity, so methods +that use non-Gaussianity perform well such as Kana-ICA, Shikha-ML, or MultiView-ICA. +And the last case, half of the components are Gaussian with noise diversity, and the +other half are non-Gaussian but without noise diversity. +And in this case, only Shikha-ML is able to correctly recover the sources. +MV-ICA doesn't do that, but it's not as good as Shikha-ML. +Let us now talk about our experiments on real data. +We have this reconstruction experiment on fMRI data where subjects are exposed to a +naturalistic stimuli such as movie watching. +We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the +20% left of the movie, we compute the common sources, and from these common sources computed +using 80% of the subject, we try to reconstruct the data of the 20% left of the subject. +We compute the R2 score within regions of interest between the reconstructed data and +the true data, and plot them as a function of the number of components used. +As we see, Shikha-ML outperforms all of the methods. +As a take-home message, Shikha is a powerful framework to extract shared sources. +Shikha-J is a fast approach to fit the model, but it only uses second-order information. +In contrast, Shikha-ML is a bit slower, but is able to use non-gaussianity in addition +to second-order information. +In practice, Shikha-ML yields the best results. +The methods we've introduced work on reduced data. 
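The fMRI reconstruction experiment summarized above can be read as the following evaluation loop. The 80/20 subject split, the averaging of unmixed data into common sources, and the omission of the region-of-interest aggregation are assumptions made for this sketch, not details taken from the paper:

```python
import numpy as np
from sklearn.metrics import r2_score

def reconstruction_score(X_test, unmixings, train_frac=0.8):
    """Reconstruct left-out subjects from common sources on a held-out time window.

    X_test: list of (n_features, n_time) arrays per subject (held-out 20% of the movie)
    unmixings: per-subject unmixing matrices fit on the training window
    """
    n_subjects = len(X_test)
    n_train_subj = int(train_frac * n_subjects)
    # Common sources on the held-out window, from the "training" subjects only.
    S_common = np.mean([unmixings[i] @ X_test[i] for i in range(n_train_subj)], axis=0)
    scores = []
    for j in range(n_train_subj, n_subjects):
        A_j = np.linalg.pinv(unmixings[j])   # mixing matrix of a left-out subject
        X_rec = A_j @ S_common               # reconstructed held-out data
        scores.append(r2_score(X_test[j].T, X_rec.T))
    return float(np.mean(scores))
```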
+It would be interesting to know how to reduce the data so that they perform optimally. +Another way to improve our results would be to learn the density of the shared sources +in Shikha-ML instead of having them fixed. +Thanks for listening, and have a good day! diff --git a/demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..8dbcb7ec3364dc0397852d26ccc0511d35de9885 --- /dev/null +++ b/demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt @@ -0,0 +1,539 @@ +WEBVTT + +00:00.000 --> 00:14.000 +Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay. + +00:14.000 --> 00:18.480 +I'm in the INRIA Paris et Alpes team and my supervisor is Bertrand Thirion. + +00:18.480 --> 00:24.600 +Today I'll talk about shared independent component analysis for multi-subject neuroimaging. + +00:24.600 --> 00:31.400 +This is a joint work with Pierre Abelin, Alexandre Grandfort, Bertrand Thirion and Anna Pouy-Varine. + +00:31.400 --> 00:36.360 +First let us consider two sources that are emitting a signal that is recorded by two + +00:36.360 --> 00:37.360 +sensors. + +00:37.360 --> 00:43.120 +This can be seen as a simplified model of magnetoencephalography where brain sources + +00:43.120 --> 00:46.000 +are recorded by magnetometers. + +00:46.000 --> 00:50.200 +Because propagation time can be neglected, the signal recorded by the sensors can be + +00:50.200 --> 00:55.840 +seen as a linear mixture of the signal emitted by the sources. + +00:55.840 --> 00:59.600 +S is a set of sources that are assumed to be independent. + +00:59.600 --> 01:06.400 +X are the recordings and A describes how the sources are mixed to produce the recordings. + +01:06.400 --> 01:12.120 +At first sight this model may seem ill-defined because if we permute two columns in A and + +01:12.120 --> 01:19.600 +permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing + +01:19.600 --> 01:25.360 +matrix A' that describes X just as well as A and S. + +01:25.360 --> 01:30.360 +And similarly if we scale the column of A by some constant, one column of A by some + +01:30.360 --> 01:34.920 +constant and the corresponding source by the same constant, we'll also get an equivalent + +01:34.920 --> 01:35.920 +description of X. + +01:35.920 --> 01:44.840 +However, these scale and permutation indeterminacies are the only one if the sources contain at + +01:44.840 --> 01:46.840 +most one Gaussian component. + +01:46.840 --> 01:52.040 +Let us consider the more general problem where you have multiple subjects that are exposed + +01:52.040 --> 01:54.560 +to the same stimuli. + +01:54.560 --> 02:00.640 +We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2, + +02:00.640 --> 02:04.560 +and different noise levels, N1 and N2. + +02:04.560 --> 02:08.720 +The interpretation is that they have shared sources because they have shared connective + +02:08.720 --> 02:09.720 +processes. + +02:09.720 --> 02:15.120 +They have different mixing matrices because they have different spatial topography. + +02:15.120 --> 02:20.600 +And they have different noises because we want to model inter-subject variability. + +02:20.600 --> 02:22.480 +This model is called group ICA. + +02:22.480 --> 02:27.840 +There are many methods to provide a solution for the group ICA problem. 
+ +02:27.840 --> 02:34.560 +A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects + +02:34.560 --> 02:42.520 +feature-wise and then perform a PCA, a principal component analysis, on the stacked data. + +02:42.520 --> 02:47.520 +And therefore you obtain reduced data and apply independent component analysis on the + +02:47.520 --> 02:50.520 +reduced data to obtain a set of sources. + +02:50.520 --> 02:55.960 +Another formulation is introduced by Varoko in 2010 and is called K-NICA. + +02:55.960 --> 03:01.320 +You just replace the principal component analysis with a multiset CCA, so a multiset canonical + +03:01.320 --> 03:06.120 +correlation analysis, where you have to solve a generalized eigenvalue problem. + +03:06.120 --> 03:12.800 +There are many different formulations of multiset CCA, but this one with a generalized eigenvalue + +03:12.800 --> 03:15.560 +problem is the fastest to solve. + +03:15.560 --> 03:17.840 +KNICA and Cut-ICA have a lot of advantages. + +03:17.840 --> 03:21.000 +First, they are very fast to fit. + +03:21.000 --> 03:23.320 +And second, they are simple to implement. + +03:23.320 --> 03:26.920 +These are the two reasons why they are so popular in neuroimaging. + +03:26.920 --> 03:30.160 +However, they do not optimize the proper likelihood. + +03:30.160 --> 03:35.680 +So therefore they do not benefit from advantages of such estimators such as asymptotic efficiency. + +03:35.680 --> 03:41.480 +There are a lot of other related work that do optimize the proper likelihood. + +03:41.480 --> 03:46.240 +I want to mention the independent vector analysis, which is a very powerful framework introduced + +03:46.240 --> 03:48.760 +by Li in 2008. + +03:48.760 --> 03:54.560 +So unified approach of Guo in 2008 that we will also mention and talk about later. + +03:54.560 --> 04:01.040 +The approach of Shen in 2015 that also allows to perform dimension reduction. + +04:01.040 --> 04:08.320 +And the multi-view ICA that was introduced by our team last year. + +04:08.320 --> 04:15.200 +I want to quickly say that it's not obvious to design a likelihood-based approach that + +04:15.200 --> 04:17.400 +is tractable. + +04:17.400 --> 04:23.680 +And with this example of the Gaussian mixture noisy ICA by Bermond and Cardozo, we'll see + +04:23.680 --> 04:31.400 +that standard approach leads to intractable algorithms. + +04:31.400 --> 04:37.080 +The model we take here is the same as the group ICA, but we assume that the noise is + +04:37.080 --> 04:40.120 +Gaussian with the same variance for all subjects. + +04:40.120 --> 04:47.600 +We'll also assume that the sources follow a Gaussian mixture model. + +04:47.600 --> 04:53.040 +And we further assume that the weights of the Gaussian mixtures are known. + +04:53.040 --> 04:56.360 +We can solve such model via expectation maximization. + +04:56.360 --> 05:01.400 +And if we write the E-step, we'll get a closed form that involves a large sum. + +05:01.400 --> 05:09.040 +Because of this large size, this sum, and therefore the M algorithm is intractable whenever + +05:09.040 --> 05:11.600 +Q and K are large. + +05:11.600 --> 05:17.520 +Our contribution is shared ICA, what we call Shikha for short, where the data of subject + +05:17.520 --> 05:23.080 +i are assumed as a linear mixture of noisy sources, and the noise here is not on the + +05:23.080 --> 05:24.080 +sensor, but on the sources. 
+ +05:24.080 --> 05:30.000 +The noise is Gaussian with a variance that can be different for each subject and different + +05:30.000 --> 05:31.000 +for each component. + +05:31.000 --> 05:37.800 +S are assumed to be independent, but in contrast to almost all existing work, some components + +05:37.800 --> 05:38.800 +can be Gaussian. + +05:38.800 --> 05:41.600 +We have a few blanket assumptions. + +05:41.600 --> 05:45.840 +We assume that the data are centered, that the mixing metrics are invertible, that the + +05:45.840 --> 05:50.680 +sources have identical variance, and that the number of subjects is greater than 3. + +05:50.680 --> 05:54.000 +We have two algorithms to solve the Shikha model. + +05:54.000 --> 06:01.520 +We have ShikhaJ, that is a FAS algorithm that is based on multiset CCA, and ShikhaML, a + +06:01.520 --> 06:04.000 +maximum likelihood approach. + +06:04.000 --> 06:07.600 +In Shikha, there are two ways to recover the parameters. + +06:07.600 --> 06:12.880 +Either the source are non-Gaussian, in which case we can use classical ICA results to recover + +06:12.880 --> 06:15.720 +the unmixing matrices. + +06:15.720 --> 06:20.120 +When the components are Gaussian, then we need something else, and what we use here + +06:20.120 --> 06:22.480 +is noise diversity. + +06:22.480 --> 06:28.320 +When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix + +06:28.320 --> 06:34.120 +and the noise covariance up to a permutation and sign indeterminacy. + +06:34.120 --> 06:38.240 +Note that the noise diversity in Gaussian components is also a necessary condition. + +06:38.240 --> 06:42.680 +If it does not hold, then Shikha cannot be identified. + +06:42.680 --> 06:48.520 +Let us now focus on this theorem that is at the core of the ShikhaJ algorithm. + +06:48.520 --> 06:53.520 +Namely it shows that we can solve group ICA with multiset CCA. + +06:53.520 --> 06:58.880 +So assume the data follows the Shikha model, and consider the multiset CCA framed as a + +06:58.880 --> 07:00.920 +generalized eigenvalue problem. + +07:00.920 --> 07:08.080 +This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by + +07:08.080 --> 07:13.560 +second-order statistics, and D is formed by the diagonal blocks in C. + +07:13.560 --> 07:19.880 +And so if we solve this eigenvalue problem and take the first k leading eigenvectors, + +07:19.880 --> 07:26.520 +we can recover the correct unmixing matrix from them, up to a permutation and a scaling. + +07:26.520 --> 07:32.000 +And this can only be done if the k first eigenvalues are distinct. + +07:32.000 --> 07:34.320 +Note that the distinct eigenvalue condition is also necessary. + +07:34.320 --> 07:40.480 +If two eigenvalues are the same, then this adds the need to determine IC, and therefore + +07:40.480 --> 07:42.280 +we cannot solve group IC. + +07:42.280 --> 07:48.640 +Note also that the condition that some eigenvalues need to be distinct is stronger than the noise + +07:48.640 --> 07:54.080 +diversity condition we have in the identifiability theorem. + +07:54.080 --> 07:59.360 +And therefore we can exhibit an example which is identifiable, but on which multiset CCA + +07:59.360 --> 08:00.360 +will fail. + +08:00.360 --> 08:04.800 +And I refer you to the paper for more details on this. + +08:04.800 --> 08:10.160 +So in our theorem, in order to recover the correct unmixing matrix, we need to have access + +08:10.160 --> 08:12.480 +to the second-order statistics. 
+ +08:12.480 --> 08:18.860 +However, in practice, we only have access to them, up to some sampling noise. + +08:18.860 --> 08:24.520 +And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in + +08:24.520 --> 08:31.160 +the second-order statistics can lead to a high deviation of the recovered unmixing matrix. + +08:31.160 --> 08:38.080 +Now to show this in practice, we take three subjects, two components, and noise covariance + +08:38.080 --> 08:47.440 +matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon. + +08:47.440 --> 08:52.440 +And we compare the solution of multiset CCA on the true covariance matrices and on the + +08:52.440 --> 08:59.520 +perturbed covariance matrix, where the perturbation scale is given by delta. + +08:59.520 --> 09:07.240 +And for different values of epsilon, 10-4, 10-3, 10-2, 10-1, we show how the performance + +09:07.240 --> 09:14.720 +of the algorithm, so the M-ary distance between the true unmixing matrix and the estimated + +09:14.720 --> 09:20.880 +unmixing matrix, varies when the perturbation scale increases. + +09:20.880 --> 09:26.600 +And we see that when the eigengap is very close, so 10-4, the violet curve, then even + +09:26.600 --> 09:31.440 +with a very small perturbation, you can get to a very bad M-ary distance. + +09:31.440 --> 09:35.720 +So the black dashed curve is a performance of chance. + +09:35.720 --> 09:41.200 +Luckily, there is a large gap between the k-th eigenvalues and the k plus 1. + +09:41.200 --> 09:46.120 +This means that in practice, the span of the p-leading eigenvectors is approximately preserved. + +09:46.120 --> 09:53.600 +We can recover the true unmixing matrix from the unmixing matrix estimated by multiset + +09:53.600 --> 09:56.520 +CCA, just by multiplying by a matrix Q. + +09:56.520 --> 10:02.640 +And in order to estimate Q, we make use of the fact that the unmixed data should have + +10:02.640 --> 10:03.640 +a diagonal covariance. + +10:03.640 --> 10:09.680 +This leads us to a joint diagonalization problem that we can solve efficiently. + +10:09.680 --> 10:14.480 +So if we take the experiments we've done on the previous slide, the results are still + +10:14.480 --> 10:15.480 +shown here. + +10:15.480 --> 10:21.640 +You can see the violet curves, and that is very sensitive to perturbation. + +10:21.640 --> 10:29.360 +And so if we apply joint diagonalization, all these curves move, and they join the dashed + +10:29.360 --> 10:30.360 +curve on the bottom. + +10:30.360 --> 10:34.720 +And therefore, it's much better, because now the new curves that are represented by the + +10:34.720 --> 10:42.920 +dashed line are less sensitive to perturbations. + +10:42.920 --> 10:47.920 +So now we've obtained the correct unmixing matrix, but up to a scaling. + +10:47.920 --> 10:55.040 +And so we need an additional step to find the correct scaling, and another one to find + +10:55.040 --> 11:00.680 +the other parameter that is still unestimated, which are the noise covariance. + +11:00.680 --> 11:04.000 +And luckily, it's very easy to find the noise covariance. + +11:04.000 --> 11:06.280 +We can do this via an EM algorithm. + +11:06.280 --> 11:11.920 +The E-step and the M-step are in closed form, and this yields a very fast algorithm. + +11:11.920 --> 11:15.200 +But the Shikha-J is not a maximum likelihood estimator. + +11:15.200 --> 11:22.600 +So now we will focus on Shikha-ML, which is our maximum likelihood estimator. 
+ +11:22.600 --> 11:31.240 +So I won't go too much into details on this, but we optimize this via an EM using a Gaussian + +11:31.240 --> 11:33.480 +mixture assumption as a source. + +11:33.480 --> 11:35.960 +We assume that the weights are known. + +11:35.960 --> 11:41.480 +What I just want to showcase here is that the E-step of the algorithm, the one that + +11:41.480 --> 11:46.000 +gives you the expectation of the sources given the data, and the variance of the sources + +11:46.000 --> 11:50.760 +given the data, only involves the sum of size 2. + +11:50.760 --> 11:57.320 +So previously we had a sum that had an exponential number of terms, and here we don't have that + +11:57.320 --> 11:58.320 +anymore. + +11:58.320 --> 12:02.920 +So the E-step is much faster than what we had before, and therefore the EM algorithm + +12:02.920 --> 12:07.200 +here is tractable, whereas it was not the case before. + +12:07.200 --> 12:11.440 +I first want to present our synthetic experiment where we generate data according to the Shikha-ML + +12:11.440 --> 12:13.200 +and Shikha-J model. + +12:13.200 --> 12:18.560 +In case A, we have only Gaussian components, but we have noise diversity, and therefore + +12:18.560 --> 12:24.240 +methods that use noise diversity to recover the sources such as Shikha-ML and Shikha-J + +12:24.240 --> 12:25.240 +perform best. + +12:25.240 --> 12:34.000 +In the second case, we have only non-Gaussian components and no noise diversity, so methods + +12:34.000 --> 12:41.520 +that use non-Gaussianity perform well such as Kana-ICA, Shikha-ML, or MultiView-ICA. + +12:41.520 --> 12:45.200 +And the last case, half of the components are Gaussian with noise diversity, and the + +12:45.200 --> 12:49.000 +other half are non-Gaussian but without noise diversity. + +12:49.000 --> 12:53.000 +And in this case, only Shikha-ML is able to correctly recover the sources. + +12:53.000 --> 12:57.960 +MV-ICA doesn't do that, but it's not as good as Shikha-ML. + +12:57.960 --> 13:00.400 +Let us now talk about our experiments on real data. + +13:00.400 --> 13:05.080 +We have this reconstruction experiment on fMRI data where subjects are exposed to a + +13:05.080 --> 13:07.920 +naturalistic stimuli such as movie watching. + +13:07.920 --> 13:15.320 +We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the + +13:15.320 --> 13:22.320 +20% left of the movie, we compute the common sources, and from these common sources computed + +13:22.320 --> 13:28.800 +using 80% of the subject, we try to reconstruct the data of the 20% left of the subject. + +13:28.800 --> 13:33.880 +We compute the R2 score within regions of interest between the reconstructed data and + +13:33.880 --> 13:39.480 +the true data, and plot them as a function of the number of components used. + +13:39.480 --> 13:43.000 +As we see, Shikha-ML outperforms all of the methods. + +13:43.000 --> 13:47.400 +As a take-home message, Shikha is a powerful framework to extract shared sources. + +13:47.400 --> 13:52.840 +Shikha-J is a fast approach to fit the model, but it only uses second-order information. + +13:52.840 --> 13:58.800 +In contrast, Shikha-ML is a bit slower, but is able to use non-gaussianity in addition + +13:58.800 --> 14:00.960 +to second-order information. + +14:00.960 --> 14:03.840 +In practice, Shikha-ML yields the best results. + +14:03.840 --> 14:05.960 +The methods we've introduced work on reduced data. 
+ +14:05.960 --> 14:11.160 +It would be interesting to know how to reduce the data so that they perform optimally. + +14:11.160 --> 14:15.400 +Another way to improve our results would be to learn the density of the shared sources + +14:15.400 --> 14:19.480 +in Shikha-ML instead of having them fixed. + +14:19.480 --> 14:23.400 +Thanks for listening, and have a good day! + diff --git a/demo_data/nips-2021/25957/video.mp4 b/demo_data/nips-2021/25957/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..02345ba29207d38e6d02b38938dd08af2b362de5 --- /dev/null +++ b/demo_data/nips-2021/25957/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0539c1b965a157ce62df522fef5ea03cdec6198f5995fefa04cfddf947861fd +size 93633719 diff --git a/demo_data/nips-2021/25958/metadata.json b/demo_data/nips-2021/25958/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..df909178672cde75e788e99b32186f74bbb23849 --- /dev/null +++ b/demo_data/nips-2021/25958/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "ParK: Sound and Efficient Kernel Ridge Regression by Feature Space Partitions" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25958/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25958/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..96f3d0fe95b980e1be1da15cffe8e0d6687edf51 --- /dev/null +++ b/demo_data/nips-2021/25958/transcript_whisper_large-v2.txt @@ -0,0 +1,124 @@ +Hello everyone, I'm Luigi Carretino, and this is a joint work with Stefano Vigonia, +Daniele Calandriello, and Lorenzo Rosasco. +The problem that we study in this work is a standard regression problem, where we want +to estimate an unknown function f star given n pairs of points, x's and y's, and then +given n pairs of points, x's and y's, where y's are noisy evaluations of the functions +f star on the input points axis. +A well-established method to learn nonlinear functions is kernel ridge regression. +The basic idea is to map the input points into a higher dimensional space, where linear +relationships can be learned that then translate in nonlinear ones in the input space. +To formalize this, we can think about solving a standard empirical risk minimization problem +regularized over a spatial function which is a reproducing kernel Hilbert space. +Numerically speaking, the solution of this type of problem boils down to solving a linear +system. Particularly, we can see here that the linear system is going to be Kc equal +y, where K is the kernel matrix evaluated in all the pairs of points of the training +sets, c are the weights that we aim to learn, and y's are the output points. +We know that this method is optimal from a statistical point of view, but a drawback +is that it suffers from computational scalability. In fact, in terms of time complexity, if we +have n training points and we want to solve the linear system directly, we'll have to +invert the matrix K, and this will cost us n cubed in time. +Multiple ways of accelerating this process have been proposed over time. +The first one is to solve the methods iteratively instead of inverting directly the matrix K. +This allows us to only have matrix vector multiplications, and so the overall cost of +an iterative method to solve this linear system is going to be Tn squared. 
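To make the complexity discussion concrete, here is a small sketch contrasting the direct cubic-time solve with an iterative conjugate-gradient solve that only needs matrix-vector products. The talk writes the system as Kc = y; the sketch adds the usual ridge term, and all hyperparameters are arbitrary:

```python
import numpy as np
from scipy.sparse.linalg import LinearOperator, cg
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.default_rng(0)
n = 500
X = rng.uniform(-3, 3, size=(n, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=n)

lam = 1e-3
K = rbf_kernel(X, X, gamma=2.0)                     # kernel matrix on the training points

# Direct solve of (K + n*lam*I) c = y: cubic in n.
c_direct = np.linalg.solve(K + n * lam * np.eye(n), y)

# Iterative solve: T matrix-vector products, so roughly T * n^2 overall.
A = LinearOperator((n, n), matvec=lambda v: K @ v + n * lam * v)
c_iter, info = cg(A, y, maxiter=200)
print(info, np.max(np.abs(c_direct - c_iter)))      # info == 0 means CG reported convergence
```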
+Another method is the one known as sketching, where we can see this as subsampling the linear +system, in particular subsampling columns of this linear system, where we can take m +columns of the linear system uniformly at random to get a smaller one, and the cost +of this will be m squared n. +Another method instead is splitting. This allows us to divide the main problem into +many, in this case Q, subproblems, each one that can be solved independently and so +potentially can be distributed. So we can have a cost which boils down to n over Q to +the power of 3. +Combinations of these methods have been proposed in the literature. In particular, if +we combine iterating and sketching, we can get a solver that can solve the problem in +a time complexity of Tmn. +If instead we combine sketching and splitting, we can get a solver that can be computed +in m squared times n over Q. +And in this work, we try to blend all these techniques to derive a new algorithm, which +we will call PARC, that can achieve a time complexity of Tm times n over Q to the power +of 2. +So as we just said, in this work, we propose a new large-scale kernel regression solver +that combines the computational benefits of iteration, sketching, and splitting. +Notice, though, that these are approximation techniques and they may come at the cost of +accuracy. But we are able to show that this new algorithm is able to preserve generalization +under suitable partitions. +Now also notice that instead of general splitting, we are going to need to focus on a +particular type, which is the partitions. +So we introduce a new principal partition scheme for kernel methods. +We now look at the difference between data splitting and space partitioning. +Given a set of points, the procedure of splitting takes groups of points at random and assign +them to different splits or clusters. +In this picture, for example, we divide the points in four splits. +Partitioning instead divides the space in different cells, and then the points are implicitly +assigned to a particular cluster based on which cell they belong to. +Notice that with the splitting methods, we don't consider local information while we +perform the splitting, but we do when we perform partitioning. +Now, from this picture, the concept of partitioning a space seems pretty straightforward. +However, when you start considering high dimensional feature space, subtle problems can +appear. +So first, as a recap, remember that there are two important spaces to consider in our +regression problem. +The input space X with its input space features and the kernel space H with its input space +features, and the kernel space H, which potentially has many more implicit features. +Traditionally, partition methods are applied directly to the input space. +For example, a classical approach is to select a subset of points as centroids and then +partition the space in cells by assigning each portion of the space to the closest centroid, +which is called a Voronoi partition. +Since we are in the input space, closest here is defined according to a simple Euclidean +distance. +However, remember that our target function and our whole regression does not happen +directly on the input data space, but rather on the data mapped in the feature space. +And after we apply our feature map to the data, the concept of closest and the partition +can radically change. 
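The point that "closest" changes once distances are measured in the feature space can be made concrete: the kernel-induced squared distance needs only kernel evaluations. This is an illustrative helper, not PARC's centroid-selection routine:

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

def kernel_voronoi_labels(X, centroids, kernel="rbf", **kw):
    """Assign each point to the closest centroid in the kernel feature space,
    using ||phi(x) - phi(c)||^2 = k(x, x) - 2 k(x, c) + k(c, c)."""
    Kxc = pairwise_kernels(X, centroids, metric=kernel, **kw)
    kxx = pairwise_kernels(X, metric=kernel, **kw).diagonal()[:, None]
    kcc = pairwise_kernels(centroids, metric=kernel, **kw).diagonal()[None, :]
    return (kxx - 2.0 * Kxc + kcc).argmin(axis=1)

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
centroids = X[:4]
# The same centroids induce different cells under a cosine kernel than under an RBF kernel.
print(np.bincount(kernel_voronoi_labels(X, centroids, kernel="cosine")))
print(np.bincount(kernel_voronoi_labels(X, centroids, kernel="rbf", gamma=1.0)))
```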
For example, here on the right, we choose a kernel space associated with a cosine similarity
+and again plot how the centroids partition the input space, but this time we chose closest
+according to the new cosine distance.
+The resulting partition is very different from the Euclidean one as it captures the
+non-linearity of the kernel function.
+In the paper, we discuss how this difference can impact the regression and we identified
+sufficient conditions that the partition should satisfy in order to guarantee good generalization
+of the learning process.
+Crucially, we will see that these guarantees depend not on how the input space is partitioned,
+but rather on how the feature space is partitioned.
+As a consequence, for our PARC method, we focus on choosing centroids solely using the
+kernel version of the distance.
+We are now ready to present in more detail how the PARC algorithm works.
+First of all, PARC partitions the feature space into Q Voronoi cells, and the first thing
+to do is to identify the centroids in the feature space that allow us to describe the
+Voronoi cells.
+Then inside each Voronoi cell, we learn a local estimator using an iterated and sketched
+version of kernel ridge regression.
+And then at prediction time, when a new sample arrives, we can use the Q Voronoi cells
+to identify where the new sample falls.
+We use the local estimator corresponding to the Voronoi cell in which the new point falls.
+The generalization error of standard kernel ridge regression without partitioning can
+be upper bounded by two terms, a bias term and a variance term.
+In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
+term and a variance term.
+But this time, these two terms are weighted, and they are weighted by a certain quantity
+that depends on an angle theta, which is the minimum angle between all the subspaces of
+the partitions.
+For example, when all the subspaces are orthogonal between each other, we recover the exact same
+generalization error of standard kernel ridge regression.
+But we are also able to show that for angles which are small enough, we are able to obtain
+a generalization error which is of the same order as standard kernel ridge regression.
+These theoretical results suggest how to construct a good partition.
+So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
+between the Voronoi cells.
+And in particular, we use the Schur complement to measure the orthogonality between the Voronoi cells.
+Given all these ingredients, we are now able to measure the computational complexity of
+PARC, which has a time complexity that is the sum of two terms.
+A first term, q squared n log n, which is the cost of computing the centroids with the
+just mentioned procedure.
+And a second term, q squared n log n, which is the cost of computing the most expensive
+local estimator.
+Empirically, we performed experiments on data sets of millions and of billions of points,
+and we compared with the currently fastest global kernel methods and with some other
+splitting kernel methods.
+We can see that PARC is the only method that manages to match the accuracy of the global
+estimator.
+Thank you all for your attention.
+And thank you to the poster for all your questions and more details.
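Putting the pieces together, a toy partition-then-regress estimator in the spirit of the talk could look like the following. It is not the PARC code: centroids are chosen at random rather than greedily via the Schur complement, and scikit-learn's exact KernelRidge stands in for the iterated and sketched local solver:

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import pairwise_kernels

class SimplePartitionedKRR:
    """Toy partition-then-regress scheme (illustrative only, not PARC)."""

    def __init__(self, n_cells=4, gamma=1.0, alpha=1e-3, seed=0):
        self.n_cells, self.gamma, self.alpha, self.seed = n_cells, gamma, alpha, seed

    def _labels(self, X):
        # For the RBF kernel k(x, x) and k(c, c) are constant, so minimizing
        # -2 k(x, c) is the same as minimizing the feature-space distance.
        return (-2 * pairwise_kernels(X, self.centroids_, metric="rbf",
                                      gamma=self.gamma)).argmin(axis=1)

    def fit(self, X, y):
        rng = np.random.default_rng(self.seed)
        self.centroids_ = X[rng.choice(len(X), self.n_cells, replace=False)]
        labels = self._labels(X)
        self.models_ = {}
        for q in range(self.n_cells):
            mask = labels == q
            if mask.any():                       # fit one local estimator per non-empty cell
                self.models_[q] = KernelRidge(kernel="rbf", gamma=self.gamma,
                                              alpha=self.alpha).fit(X[mask], y[mask])
        return self

    def predict(self, X):
        labels = self._labels(X)
        y_hat = np.zeros(len(X))                 # points routed to an empty cell stay at 0
        for q, model in self.models_.items():
            mask = labels == q
            if mask.any():
                y_hat[mask] = model.predict(X[mask])
        return y_hat
```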
diff --git a/demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..25eff1067a42735952fd9bc6cc9e41c0c4c698dd --- /dev/null +++ b/demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt @@ -0,0 +1,374 @@ +WEBVTT + +00:00.000 --> 00:07.000 +Hello everyone, I'm Luigi Carretino, and this is a joint work with Stefano Vigonia, + +00:07.000 --> 00:10.000 +Daniele Calandriello, and Lorenzo Rosasco. + +00:10.000 --> 00:16.000 +The problem that we study in this work is a standard regression problem, where we want + +00:16.000 --> 00:24.000 +to estimate an unknown function f star given n pairs of points, x's and y's, and then + +00:24.000 --> 00:34.000 +given n pairs of points, x's and y's, where y's are noisy evaluations of the functions + +00:34.000 --> 00:38.000 +f star on the input points axis. + +00:41.000 --> 00:46.000 +A well-established method to learn nonlinear functions is kernel ridge regression. + +00:46.000 --> 00:53.000 +The basic idea is to map the input points into a higher dimensional space, where linear + +00:53.000 --> 00:59.000 +relationships can be learned that then translate in nonlinear ones in the input space. + +01:01.000 --> 01:07.000 +To formalize this, we can think about solving a standard empirical risk minimization problem + +01:07.000 --> 01:12.000 +regularized over a spatial function which is a reproducing kernel Hilbert space. + +01:14.000 --> 01:20.000 +Numerically speaking, the solution of this type of problem boils down to solving a linear + +01:20.000 --> 01:26.000 +system. Particularly, we can see here that the linear system is going to be Kc equal + +01:26.000 --> 01:33.000 +y, where K is the kernel matrix evaluated in all the pairs of points of the training + +01:33.000 --> 01:39.000 +sets, c are the weights that we aim to learn, and y's are the output points. + +01:40.000 --> 01:45.000 +We know that this method is optimal from a statistical point of view, but a drawback + +01:45.000 --> 01:52.000 +is that it suffers from computational scalability. In fact, in terms of time complexity, if we + +01:52.000 --> 01:57.000 +have n training points and we want to solve the linear system directly, we'll have to + +01:57.000 --> 02:03.000 +invert the matrix K, and this will cost us n cubed in time. + +02:06.000 --> 02:11.000 +Multiple ways of accelerating this process have been proposed over time. + +02:11.000 --> 02:17.000 +The first one is to solve the methods iteratively instead of inverting directly the matrix K. + +02:18.000 --> 02:25.000 +This allows us to only have matrix vector multiplications, and so the overall cost of + +02:25.000 --> 02:30.000 +an iterative method to solve this linear system is going to be Tn squared. + +02:31.000 --> 02:39.000 +Another method is the one known as sketching, where we can see this as subsampling the linear + +02:39.000 --> 02:46.000 +system, in particular subsampling columns of this linear system, where we can take m + +02:46.000 --> 02:52.000 +columns of the linear system uniformly at random to get a smaller one, and the cost + +02:52.000 --> 02:55.000 +of this will be m squared n. + +02:57.000 --> 03:04.000 +Another method instead is splitting. This allows us to divide the main problem into + +03:04.000 --> 03:12.000 +many, in this case Q, subproblems, each one that can be solved independently and so + +03:12.000 --> 03:20.000 +potentially can be distributed. 
So we can have a cost which boils down to n over Q to + +03:20.000 --> 03:22.000 +the power of 3. + +03:25.000 --> 03:30.000 +Combinations of these methods have been proposed in the literature. In particular, if + +03:30.000 --> 03:35.000 +we combine iterating and sketching, we can get a solver that can solve the problem in + +03:35.000 --> 03:38.000 +a time complexity of Tmn. + +03:40.000 --> 03:47.000 +If instead we combine sketching and splitting, we can get a solver that can be computed + +03:47.000 --> 03:51.000 +in m squared times n over Q. + +03:51.000 --> 03:59.000 +And in this work, we try to blend all these techniques to derive a new algorithm, which + +03:59.000 --> 04:09.000 +we will call PARC, that can achieve a time complexity of Tm times n over Q to the power + +04:09.000 --> 04:10.000 +of 2. + +04:12.000 --> 04:18.000 +So as we just said, in this work, we propose a new large-scale kernel regression solver + +04:18.000 --> 04:22.000 +that combines the computational benefits of iteration, sketching, and splitting. + +04:23.000 --> 04:27.000 +Notice, though, that these are approximation techniques and they may come at the cost of + +04:27.000 --> 04:35.000 +accuracy. But we are able to show that this new algorithm is able to preserve generalization + +04:35.000 --> 04:37.000 +under suitable partitions. + +04:38.000 --> 04:44.000 +Now also notice that instead of general splitting, we are going to need to focus on a + +04:44.000 --> 04:48.000 +particular type, which is the partitions. + +04:48.000 --> 04:53.000 +So we introduce a new principal partition scheme for kernel methods. + +04:56.000 --> 05:01.000 +We now look at the difference between data splitting and space partitioning. + +05:01.000 --> 05:08.000 +Given a set of points, the procedure of splitting takes groups of points at random and assign + +05:08.000 --> 05:10.000 +them to different splits or clusters. + +05:10.000 --> 05:14.000 +In this picture, for example, we divide the points in four splits. + +05:15.000 --> 05:21.000 +Partitioning instead divides the space in different cells, and then the points are implicitly + +05:21.000 --> 05:25.000 +assigned to a particular cluster based on which cell they belong to. + +05:27.000 --> 05:32.000 +Notice that with the splitting methods, we don't consider local information while we + +05:32.000 --> 05:37.000 +perform the splitting, but we do when we perform partitioning. + +05:37.000 --> 05:42.000 +Now, from this picture, the concept of partitioning a space seems pretty straightforward. + +05:43.000 --> 05:48.000 +However, when you start considering high dimensional feature space, subtle problems can + +05:48.000 --> 05:49.000 +appear. + +05:50.000 --> 05:55.000 +So first, as a recap, remember that there are two important spaces to consider in our + +05:55.000 --> 05:56.000 +regression problem. + +05:57.000 --> 06:04.000 +The input space X with its input space features and the kernel space H with its input space + +06:04.000 --> 06:10.000 +features, and the kernel space H, which potentially has many more implicit features. + +06:13.000 --> 06:17.000 +Traditionally, partition methods are applied directly to the input space. + +06:18.000 --> 06:24.000 +For example, a classical approach is to select a subset of points as centroids and then + +06:24.000 --> 06:30.000 +partition the space in cells by assigning each portion of the space to the closest centroid, + +06:30.000 --> 06:32.000 +which is called a Voronoi partition. 
+ +06:32.000 --> 06:38.000 +Since we are in the input space, closest here is defined according to a simple Euclidean + +06:38.000 --> 06:39.000 +distance. + +06:40.000 --> 06:45.000 +However, remember that our target function and our whole regression does not happen + +06:45.000 --> 06:51.000 +directly on the input data space, but rather on the data mapped in the feature space. + +06:52.000 --> 06:58.000 +And after we apply our feature map to the data, the concept of closest and the partition + +06:58.000 --> 06:59.000 +can radically change. + +06:59.000 --> 07:05.000 +For example, here on the right, we choose a kernel space associated with a cosine similarity + +07:06.000 --> 07:12.000 +and again plot how the centroids partition the input space, but this time we chose closest + +07:12.000 --> 07:14.000 +according to the new cosine distance. + +07:15.000 --> 07:20.000 +The resulting partition is very different from the Euclidean one as it captures the + +07:20.000 --> 07:22.000 +non-linearity of the kernel function. + +07:22.000 --> 07:28.000 +In the paper, we discuss how this difference can impact the regression and we identified + +07:28.000 --> 07:34.000 +sufficient conditions that the partition should satisfy in order to guarantee good generalization + +07:34.000 --> 07:35.000 +of the learning process. + +07:37.000 --> 07:43.000 +Crucially, we will see that these guarantees depend not on how the input space is partitioned, + +07:43.000 --> 07:45.000 +but rather how the feature space is partitioned. + +07:45.000 --> 07:51.000 +As a consequence, for our PARC methods, we focus on choosing centroids solely using the + +07:51.000 --> 07:53.000 +kernel version of the distance. + +07:57.000 --> 08:00.000 +We are now ready to present in more detail how the PARC algorithm works. + +08:01.000 --> 08:07.000 +First of all, PARC partitioned the feature space into Q Voronoi cells and the first thing + +08:07.000 --> 08:16.000 +to do is to identify the centroids in the feature space that allows us to describe the + +08:16.000 --> 08:17.000 +Voronoi cells. + +08:19.000 --> 08:25.000 +Then inside each Voronoi cell, we learn a local estimator using an uniterated and sketched + +08:25.000 --> 08:27.000 +version of kernel ridge regression. + +08:30.000 --> 08:36.000 +And then at prediction time, when a new sample arrives, we can use the Q Voronoi feature + +08:36.000 --> 08:38.000 +to identify the new sample. + +08:40.000 --> 08:47.000 +We use the local estimator corresponding to the Voronoi cell to which the new points fall + +08:47.000 --> 08:48.000 +on. + +08:52.000 --> 08:57.000 +The generalization error of standard kernel ridge regression without partitioning can + +08:57.000 --> 09:02.000 +be upper bounded by two terms, a bias term and a variance term. + +09:02.000 --> 09:10.000 +In our work, we can show that also the generalization error of PARC can be upper bounded by a bias + +09:10.000 --> 09:11.000 +term and a variance term. + +09:11.000 --> 09:16.000 +But this time, these two terms are weighted and they are weighted by a certain quantity + +09:16.000 --> 09:25.000 +that depends on an angle theta, which is the minimum angle between all the subspaces of + +09:25.000 --> 09:26.000 +the partitions. + +09:26.000 --> 09:33.000 +For example, when all the subspaces are orthogonal between each other, we recover the exact same + +09:33.000 --> 09:36.000 +generalization error of standard kernel ridge regression. 
+ +09:38.000 --> 09:45.000 +But we are also able to show that for angles which are small enough, we are able to obtain + +09:45.000 --> 09:50.000 +a generalization error which is of the same order of standard kernel ridge regression. + +09:50.000 --> 09:54.000 +These theoretical results suggest us how to construct a good partition. + +09:54.000 --> 10:00.000 +So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality + +10:00.000 --> 10:01.000 +between the Voronoi cells. + +10:01.000 --> 10:06.000 +And in particular, we use the Schur complement to measure the orthogonality. + +10:10.000 --> 10:16.000 +We also use the Schur complement to measure the orthogonality of the Voronoi centroids. + +10:16.000 --> 10:20.000 +And in particular, we use the Schur complement to measure the orthogonality. + +10:24.000 --> 10:28.000 +Given all these ingredients, we are now able to measure the computational complexity of + +10:28.000 --> 10:32.000 +PARC, which has a time complexity that is the sum of two terms. + +10:33.000 --> 10:40.000 +A first term, q squared n log n, which is the cost of computing the centroids with the + +10:40.000 --> 10:41.000 +just mentioned procedure. + +10:41.000 --> 10:46.000 +And a second term, q squared n log n, which is the cost of computing the most expensive + +10:46.000 --> 10:47.000 +local estimator. + +10:51.000 --> 10:57.000 +Empirically, we performed experiments on data set of millions and of billions of points, + +10:57.000 --> 11:01.000 +and we compared with the currently fastest global kernel methods and with some other + +11:01.000 --> 11:02.000 +splitting kernel methods. + +11:03.000 --> 11:08.000 +We can see that PARC is the only method that manages to match the accuracy of the global + +11:08.000 --> 11:11.000 +estimator. + +11:11.000 --> 11:13.000 +Thank you all for your attention. + +11:13.000 --> 11:40.000 +And thank you to the poster for all your questions and more details. + diff --git a/demo_data/nips-2021/25958/video.mp4 b/demo_data/nips-2021/25958/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cb913f6e8ff187ddd51208943e0ccc1dbca6f87b --- /dev/null +++ b/demo_data/nips-2021/25958/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fefd926545331be9df0497e824634fa23129d26c9c9e7fdbe67c0382b98b4556 +size 22931245 diff --git a/demo_data/nips-2021/25959/metadata.json b/demo_data/nips-2021/25959/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..02cdf9646c5a2fcf61e1661261dea7c99dac40fc --- /dev/null +++ b/demo_data/nips-2021/25959/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Adversarial Feature Desensitization" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25959/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25959/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..82a869b6d72be2646cde4844ea7a0691a70143a3 --- /dev/null +++ b/demo_data/nips-2021/25959/transcript_whisper_large-v2.txt @@ -0,0 +1,117 @@ +Hello, my name is Pouya Bahshiban and I'm going to tell you about our paper titled +Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators +at MIWA, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim, +Kartika Hoja, Mojtaba Farmazi, Tourez Dale, Lake Richards and Erin Oji. A common assumption in +machine learning is that the train and test samples come from the same distribution. 
+While this is a reasonable assumption under most circumstances, it is intentionally violated in the
+regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
+perturbations that cause the input to be misclassified. In the case of white box attacks,
+the model itself is transparent to the attacker and the attacker uses it to identify the possible
+inputs that would lead to misclassifications. A famous example of this is the image of a panda
+that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
+gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
+and unless the classifier is specifically trained to be robust against these attacks,
+the attacks could completely break down the classifier's performance.
+This issue becomes even more critical when we consider the vast usage of these machine learning
+systems in our societies. For example, the possible security concerns that arise in face
+recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
+So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
+feature learning function f that projects inputs x to a latent or feature space z
+and a classifier that uses the latent code z to predict the correct class label y hat.
+The perturbation function or the attack generates a perturbed sample x prime
+within the epsilon neighborhood of the input x, which we're showing here as B of x and epsilon,
+by maximizing the classification objective, the opposite of how we normally optimize the classifier's
+parameters. Many methods have been proposed to defend the models against adversarial attacks.
+Two of these methods that have withstood the test of time so far are the adversarial training
+by Madry et al., which proposes a defense method by solving a minimax optimization problem
+that involves finding an adversarial input by maximizing the classification loss in the inner
+loop, followed by a classifier training step that minimizes the classifier loss on these adversarial inputs.
+This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
+The adversarial training method essentially learns to separate the distributions of adversarial
+examples belonging to different classes. The second method is the trades method by Zhang et al,
+which proposes to push the decision boundary of the classifier away from the data.
+Trades achieves this by introducing a regularization term to the original learning
+objective for classification that penalizes the mismatch between the predicted label
+for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
+this procedure, where now the defense method learns to separate the distributions of clean examples
+belonging to different classes while minimizing the loss of the classifier.
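A minimal PyTorch sketch of the white-box attack and the Madry-style adversarial training step described above; the PGD hyperparameters and the [0, 1] input range are assumptions, not the configuration used in the paper:

```python
import torch
import torch.nn.functional as F

def pgd_attack(model, x, y, eps=8/255, alpha=2/255, steps=10):
    """Projected gradient descent inside the L-infinity ball B(x, eps):
    ascend the classification loss, then project back into the ball.
    Assumes inputs scaled to [0, 1]."""
    x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad = torch.autograd.grad(loss, x_adv)[0]
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()          # maximize the loss
            x_adv = x + (x_adv - x).clamp(-eps, eps)     # stay inside B(x, eps)
            x_adv = x_adv.clamp(0, 1)
    return x_adv.detach()

def adversarial_training_step(model, optimizer, x, y):
    """One Madry-style min-max step: attack in the inner loop, then fit on the result."""
    x_adv = pgd_attack(model, x, y)
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x_adv), y)
    loss.backward()
    optimizer.step()
    return loss.item()
```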
Let us now look at this problem from the perspective of domain adaptation. In domain adaptation,
+we have a classifier trained for a source domain, but we want the classifier to also perform the
+same task on a related target domain that we might not have enough data for, or for which sampling
+data might be expensive. The domain adaptation theory proposed by Ben David et al answers the
+question of under what conditions can we adapt a classifier trained on the source domain for use
+in the target domain. Here we consider the original clean distributions as the source domain and the
+distribution of adversarial images generated from those images as the target domain. Although here
+the target domain continuously evolves because the adversarial examples are based on the current
+state of the model at each time step. And similar to the domain adaptation theory, our goal here
+is to learn how to perform well on both source and target domains, meaning the natural and
+adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
+what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
+feature learning function f that projects inputs x to latent space or feature space z and the
+classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
+and adversarial examples as input domains dx and d' x and their induced feature distributions
+which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
+as the classification error over the domains dz and d' z, which we are going to refer to as the
+clean error and the adversarial error. The domain adaptation theory now gives a bound
+on the adversarial error in terms of the natural error and the distance between the two domains.
+Fortunately, from the prior work, we know that the h delta h distance, which measures the distance
+between two domains, can be estimated using a classifier trained to discriminate between the
+two domains. Now our defense method called adversarial feature desensitization essentially
+minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
+has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
+For this, we first update the parameters theta and phi in the feature learning function f and
+task classifier c to minimize the classification loss on the natural domain. This is shown with
+green arrows and green boxes marked 1 on both the equation and on the diagram.
+Secondly, we estimate the h delta h distance using an additional domain discriminator
+network that predicts the domain identity from the latent code z. We update the domain
+discriminator parameters psi to minimize the domain classification loss. And finally,
+in the third step, we update the feature learning network parameters theta to maximize the domain
+classification loss in an adversarial way. These two steps are marked with red arrows in the figure
+and red boxes on the equation. Similar to the previous two methods, adversarial training and trades
+that I showed you, here we can also graphically demonstrate this procedure.
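A schematic of the three AFD updates described above, in PyTorch. The module and optimizer names, the two-logit domain discriminator, and the label-flipping surrogate used in step (3) are assumptions made for illustration, not the authors' implementation:

```python
import torch
import torch.nn.functional as F

def afd_step(f, c, d, opt_fc, opt_d, x, y, attack):
    """One round of the three updates described above (a schematic, not the authors' code).

    f: feature extractor, c: task classifier, d: domain discriminator (2 logits),
    opt_fc optimizes (f, c), opt_d optimizes d, attack(model_fn, x, y) -> x_adv.
    """
    x_adv = attack(lambda inp: c(f(inp)), x, y)

    # (1) Task loss on the natural domain (the step marked in green in the talk).
    opt_fc.zero_grad()
    task_loss = F.cross_entropy(c(f(x)), y)
    task_loss.backward()
    opt_fc.step()

    # (2) Domain discriminator learns to tell clean (0) from adversarial (1) features.
    opt_d.zero_grad()
    z, z_adv = f(x).detach(), f(x_adv).detach()
    dom_y = torch.cat([torch.zeros(len(z), dtype=torch.long, device=z.device),
                       torch.ones(len(z_adv), dtype=torch.long, device=z.device)])
    disc_loss = F.cross_entropy(torch.cat([d(z), d(z_adv)]), dom_y)
    disc_loss.backward()
    opt_d.step()

    # (3) Feature extractor is updated against the discriminator; the usual
    # label-flipping surrogate stands in for "maximize the domain loss".
    opt_fc.zero_grad()
    clean_label = torch.zeros(len(x_adv), dtype=torch.long, device=x_adv.device)
    fool_loss = F.cross_entropy(d(f(x_adv)), clean_label)
    fool_loss.backward()
    opt_fc.step()
    return task_loss.item(), disc_loss.item(), fool_loss.item()
```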
In our method AFD, we learn to separate the classes from the distributions of clean examples while at the
+same time we optimize a domain classifier that learns the boundary between the clean and adversarial
+examples for each class. And finally, we push the adversarial examples to the opposite side of that
+boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations
+and hence the name adversarial feature desensitization. We tested our method on four
+data sets and compared it with a number of other baselines, including adversarial training and
+trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
+Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovsky and Goodtuner.
+In the table, we evaluated all methods on several white box and black box attacks with
+nominal strengths for each data set. Overall, our method AFD and especially AFDWGAN showed superior
+performance against most attacks in most data sets. However, AFD was behind trades on several attacks,
+especially on the CIFAR-100 and TinyImageNet data sets that have more classes.
+We also looked across attack methods and attack strengths, which we controlled with the parameter
+epsilon. The diagrams on the right show the robust accuracy for each defense method across
+eight attack methods and various epsilon values for each of them. Overall, our results in these
+diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
+across attack strengths. To quantify these differences, we also computed the area under
+the curve for each method for each attack and summarized them in a table on the left.
+As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
+compared to other baselines. If you remember from previous slides, the domain adaptation theory
+predicted a bound on the adversarial error which can also be turned into a bound on the generalization
+gap between the natural and adversarial domains. We empirically tested this prediction in our experiments
+under two settings. Under the first setting, we varied the epsilon value for the PGD L-infinity
+attack which was used during the training. And under the second setting, we used a diverse set of
+attacks and various attack strengths for each of them.
+And under both scenarios, we found that the domain discriminator, which was originally trained on a
+particular attack and attack strength, in our case a PGD L-infinity attack with a fixed epsilon
+for each data set, could well predict the generalization gap to unseen attacks and
+different attack magnitudes. This suggests that adversarial training against a domain classifier
+like that used in our proposed method could potentially lead to robust models with better
+generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
+and attack strengths, it occasionally was worse compared to other baselines, especially in data
+sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
+domain classifiers in these data sets and leaves much space for future work on
+investigating the effect of domain classifiers on the robustness of feature learning functions.
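For the aggregate robustness numbers mentioned above (robust accuracy across attack strengths summarized as area under the curve), a minimal helper might look like this; `model_eval` is a hypothetical callable returning accuracy under a given attack and strength, and normalizing by the epsilon range is an assumed convention:

```python
import numpy as np

def robust_accuracy_auc(model_eval, attack, epsilons):
    """Average robust accuracy across attack strengths, via the area under the
    robust-accuracy-vs-epsilon curve (normalized by the epsilon range)."""
    accs = [model_eval(attack, eps) for eps in epsilons]
    return np.trapz(accs, epsilons) / (epsilons[-1] - epsilons[0])
```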
+Also, AFD required more backward computations compared to some of the other baselines +such as adversarial training, and as a result, its training time was on average about 31% +longer than adversarial training. We invite you to read our paper for more details and please +get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it. diff --git a/demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..fca1d6baca2c3255fe5eefdc3db5c23c391d5910 --- /dev/null +++ b/demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt @@ -0,0 +1,353 @@ +WEBVTT + +00:00.000 --> 00:13.120 +Hello, my name is Pouya Bahshiban and I'm going to tell you about our paper titled + +00:13.120 --> 00:18.720 +Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators + +00:18.720 --> 00:24.400 +at MIWA, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim, + +00:24.400 --> 00:32.160 +Kartika Hoja, Mojtaba Farmazi, Tourez Dale, Lake Richards and Erin Oji. A common assumption in + +00:32.160 --> 00:36.560 +machine learning is that the train and test samples come from the same distribution. + +00:37.200 --> 00:42.960 +While this is a reasonable assumption under most circumstances, it is intentionally violated in the + +00:42.960 --> 00:49.600 +regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input + +00:49.600 --> 00:55.600 +perturbations that cause the input to be misclassified. In the case of white box attacks, + +00:55.600 --> 01:01.600 +the model itself is transparent to the attacker and the attacker uses it to identify the possible + +01:01.600 --> 01:07.760 +inputs that would lead to misclassifications. A famous example of this is the image of a panda + +01:07.760 --> 01:13.360 +that when perturbed with imperceptible noise, alters the model's prediction from a panda to a + +01:13.360 --> 01:19.840 +gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods + +01:19.840 --> 01:25.280 +and unless the classifier is specifically trained to be robust against these attacks, + +01:25.280 --> 01:28.720 +the attacks could completely break down the classifier's performance. + +01:30.240 --> 01:35.600 +This issue becomes even more critical when we consider the vast usage of these machine learning + +01:35.600 --> 01:41.040 +systems in our societies. For example, the possible security concerns that rise in face + +01:41.040 --> 01:46.720 +recognition systems prone to adversarial attacks or the safety in autonomous driving systems. + +01:48.080 --> 01:54.000 +So what is an adversarial attack? To formally define the adversarial attacks, let's assume a + +01:54.000 --> 02:00.080 +feature learning function f that projects inputs x to latent space with feature space z + +02:01.600 --> 02:08.720 +and a classifier that uses the latent code z to predict the correct class label y hat. + +02:08.720 --> 02:14.480 +The perturbation function or the attack generates a perturbed sample x prime + +02:14.480 --> 02:21.520 +within the epsilon neighborhood of the input x, which we're showing here as b of x and epsilon. + +02:22.160 --> 02:28.880 +By maximizing the classification objective, the opposite of how we normally optimize the classifier's + +02:28.880 --> 02:36.720 +parameter. 
Many methods have been proposed to defend the models against adversarial attacks. + +02:36.720 --> 02:42.640 +Two of these methods that have withstood the test of time so far are the adversarial training + +02:43.200 --> 02:50.160 +by Alexander Modrianov, which proposes a defense method by solving a minimax optimization problem + +02:50.160 --> 02:56.000 +that involves finding an adversarial input by maximizing the classification loss in the inner + +02:56.000 --> 03:03.840 +loop followed by a classifier training to minimizing the classifier loss on these adversarial inputs. + +03:03.840 --> 03:09.920 +This procedure is graphically shown for two hypothetical classes in the diagram on this slide. + +03:10.560 --> 03:15.440 +The adversarial training method essentially learns to separate the distributions of adversarial + +03:15.440 --> 03:22.400 +examples belonging to different classes. The second method is the trades method by Zhang et al, + +03:22.400 --> 03:27.440 +which proposes to push the decision boundary of the classifier away from the data. + +03:27.440 --> 03:32.480 +Trades achieves this by introducing a regularization term to the original learning + +03:32.480 --> 03:38.320 +objective for classification that penalizes the mismatch between the predicted label + +03:38.320 --> 03:44.400 +for the clean and perturbed inputs. The diagram on the right side again graphically illustrates + +03:44.400 --> 03:50.000 +this procedure, where now the defense method learns to separate the distributions of clean examples + +03:50.000 --> 03:54.400 +belonging to different classes while minimizing the loss of the classifier. + +03:54.400 --> 03:59.920 +The third method is the trade method by Wang et al, which proposes to push the decision boundary + +03:59.920 --> 04:06.880 +of the classifier to the inner loop followed by a classifier training to minimizing the + +04:06.880 --> 04:13.120 +classification loss on these adversarial inputs. The third method is the trade method by Zhang et al, + +04:13.120 --> 04:18.720 +which proposes to push the decision boundary of the classifier to the inner loop followed by a + +04:18.720 --> 04:27.840 +classifier training to minimizing the classification loss on these adversarial inputs to the inner + +04:27.840 --> 04:34.640 +loop. The third method is the trade method by Wang et al, which proposes to push the decision + +04:34.640 --> 04:39.920 +boundary of the classifier to minimizing the classification loss. The fourth method is the + +04:39.920 --> 04:45.600 +trade method by Wang et al, which proposes to push the decision boundary of the classifier + +04:45.600 --> 04:52.160 +for a source domain, but we want the classifier to also perform the same task on a related target + +04:52.160 --> 05:00.960 +domain that we might not have enough data for or that the generating procedure for sampling + +05:00.960 --> 05:09.440 +domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the + +05:09.440 --> 05:15.840 +question of under what conditions can we adapt a classifier trained on the source domain for use + +05:15.840 --> 05:23.920 +in the target domain. Here we consider the original clean distributions as the source domain and the + +05:23.920 --> 05:31.280 +distribution of adversarial images generated from those images as the target domain. 
Although here + +05:31.280 --> 05:38.240 +the target domain continuously evolves because the adversarial examples are based on the current + +05:38.240 --> 05:46.000 +state of the model at each time step. And similar to the domain adaptation theory, our goal here + +05:46.000 --> 05:52.960 +is to learn how to perform well on both source and target domains, meaning the natural and + +05:52.960 --> 06:02.240 +adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into + +06:02.240 --> 06:08.960 +what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a + +06:08.960 --> 06:14.880 +feature learning function f that projects inputs x to latent space or feature space z and the + +06:14.880 --> 06:23.040 +classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural + +06:23.040 --> 06:31.440 +and adversarial examples as input domains dx and d' x and their induced feature distributions + +06:31.440 --> 06:42.560 +which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z + +06:42.560 --> 06:50.320 +as the classification error over the domains dz and d' z, what we are going to refer to as the + +06:50.320 --> 06:58.880 +clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bond + +06:58.880 --> 07:04.320 +on the adversarial error in terms of the natural error and the distance between the two domains. + +07:05.120 --> 07:11.680 +Fortunately, from the prior work, we know that h delta h distance, which measures the distance + +07:11.680 --> 07:17.440 +between two domains, can be estimated using the classifier trained to discriminate between the + +07:17.440 --> 07:26.080 +two domains. Now our defense method called adversarial feature desensitization essentially + +07:26.080 --> 07:34.720 +minimizes the bound on the adversarial error epsilon' z using a three-step procedure which + +07:34.720 --> 07:40.560 +has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al. + +07:42.240 --> 07:49.280 +For this, we first update the parameters theta and phi in the feature learning function f and + +07:49.280 --> 07:56.320 +task classifier c to minimize the classification loss on the natural domain. This is shown with + +07:56.320 --> 08:01.920 +green arrows and green boxes marked 1 on both the equation and on the diagram. + +08:04.000 --> 08:10.400 +Secondly, we estimate the h delta h distance using an additional domain discriminator + +08:10.960 --> 08:17.600 +network that predicts the domain identity from the latent code z. We update the domain + +08:17.600 --> 08:24.720 +discriminator parameters psi to minimize the domain classification loss. And finally, + +08:24.720 --> 08:31.680 +in the third step, we update the feature learning network parameters theta to maximize the domain + +08:31.680 --> 08:39.600 +classification loss in an adversarial way. These two steps are marked with red arrows in the figure + +08:39.600 --> 08:48.960 +and red boxes on the equation. Similar to previous two methods, adversarial training and trades that + +08:48.960 --> 08:55.760 +I showed you, we here we can also graphically demonstrate this procedure. 
In our method AFD, + +08:55.760 --> 09:01.040 +we learn to separate the classes from the distributions of clean examples while at the + +09:01.040 --> 09:07.840 +same time we optimize a domain classifier that learns the boundary between the clean and adversarial + +09:07.840 --> 09:14.560 +examples for each class. And finally, we push the adversarial examples to the opposite side of that + +09:14.560 --> 09:22.400 +boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations + +09:22.400 --> 09:30.480 +and hence the name adversarial feature desensitization. We tested our method on four + +09:30.480 --> 09:35.840 +data sets and compared them with a number of other baselines including with adversarial training and + +09:35.840 --> 09:43.760 +trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from + +09:43.760 --> 09:50.880 +Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovski and Goodtuner. + +09:52.000 --> 09:57.840 +In the table, we evaluated all methods on several white box and black box attacks with + +09:57.840 --> 10:07.360 +nominal strengths into each data set. Overall, our method AFD and especially AFDWGAN showed superior + +10:07.360 --> 10:15.200 +performance against most attacks in most data sets. However, AFD was behind trades on several attacks + +10:15.200 --> 10:20.720 +especially on CIFAR-100 and TinyImageNet data set that had more classes in it. + +10:20.720 --> 10:26.080 +We also looked in trust attack methods and attack strengths which we controlled with the parameter + +10:26.080 --> 10:32.800 +epsilon. The diagrams on the right show the robust accuracy for each defense method across + +10:32.800 --> 10:41.200 +eight attack methods and various epsilon values for each of them. Overall, our results in these + +10:41.200 --> 10:48.240 +diagrams showed that AFD's robustness generalizes better than the baselines across attacks and + +10:48.240 --> 10:55.200 +across attack strengths. To quantify these differences, we also computed the area under + +10:55.200 --> 11:00.000 +the curve for each method for each attack and summarized them in a table on the left. + +11:00.880 --> 11:06.800 +As you can see, AFD's robust performance generalizes better to unseen and stronger attacks + +11:06.800 --> 11:15.680 +compared to other baselines. If you remember from previous slides, the domain adaptation theory + +11:15.680 --> 11:22.400 +predicted a bound on the adversarial error which can also be turned into a bound on the generalization + +11:22.400 --> 11:30.320 +gap between natural and adversarial attacks. We empirically tested this prediction in our experiments + +11:30.320 --> 11:37.600 +under two settings. Under the first setting, we varied the epsilon value for the PGDL-infinity + +11:37.600 --> 11:45.600 +attack which was used during the training. And under the second setting, we varied the + +11:45.600 --> 11:51.120 +epsilon value for the PGDL-infinity attack which was used during the training. And under the second setting, we used a diverse set of attacks and various attack strengths for each of them. 
+ +11:52.000 --> 11:58.480 +And under both scenarios, we found that the domain discriminator, which was originally trained on a + +11:58.480 --> 12:05.280 +particular attack and attack strength, in our case it was PGDL-infinity attack with a fixed epsilon + +12:05.280 --> 12:10.960 +for each data set, could well predict the generalization gap to unseen attacks and + +12:10.960 --> 12:18.000 +different attack magnitudes. This suggests that the adversarial training against a domain classifier + +12:18.000 --> 12:24.000 +like that used in our proposed method could potentially lead to robust models with better + +12:24.000 --> 12:33.520 +generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks + +12:33.520 --> 12:39.200 +and attack strengths, it occasionally was worse compared to other baselines, especially in data + +12:39.200 --> 12:45.760 +sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training + +12:46.320 --> 12:51.680 +domain classifiers in these data sets and leaves much space for future work on + +12:51.680 --> 12:57.120 +investigating the effect of domain classifiers on the robustness of feature learning functions. + +12:58.080 --> 13:04.400 +Also, AFD required more backward computations compared to some of the other baselines + +13:04.400 --> 13:11.120 +such as adversarial training, and as a result, its training time was on average about 31% + +13:11.120 --> 13:17.680 +longer than adversarial training. We invite you to read our paper for more details and please + +13:17.680 --> 13:34.720 +get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it. + diff --git a/demo_data/nips-2021/25959/video.mp4 b/demo_data/nips-2021/25959/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..87b5a1216d08b9ea631ecdca5706fca63ee1a2da --- /dev/null +++ b/demo_data/nips-2021/25959/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76fac80c58c0fd077be83cb3d4b052aaf70c0128d8884b24f83a34a9f9c72fe3 +size 86886949 diff --git a/demo_data/nips-2021/25963/metadata.json b/demo_data/nips-2021/25963/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8d796f4001015b731ada9a01cfbca6ff53f6f50d --- /dev/null +++ b/demo_data/nips-2021/25963/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Reusing Combinatorial Structure: Faster Iterative Projections over Submodular Base Polytopes" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25963/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25963/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb9d4183459a033eb131737389f0d6a976f15f0 --- /dev/null +++ b/demo_data/nips-2021/25963/transcript_whisper_large-v2.txt @@ -0,0 +1,178 @@ +Hello, I'm Hassam Murtaghi. I'm a PhD student at Georgia Tech. Along with my collaborator +Jay Mundra, we will present our work on reusing combinatorial structure, faster projections +over submodular-based polytopes. This is joint work with Swati Gupta. +In this talk, we consider a sequence of similar structured optimization problems a setup often +encountered in practice. We first start with our main problem of minimizing a convex function +over a decision set P. At the next time step, this problem sees some perturbation and we +obtain another similar problem, and so on. 
An example of this setup is the case of iterative
+projections, where at each time step we are computing the projection of a new point y_t
+that is close to previously projected points y_i. These iterative projections form a key
+step in many optimal learning algorithms, and they are currently solved from scratch every
+iteration. They are not viewed in the context of an iterative environment where previously
+computed projections can be exploited to speed up subsequent ones.
+Thus, in this talk, we ask: is it possible to speed up similar iterative optimization
+problems by reusing structural information from previous minimizers?
+Let me now give you some more details about our setup. Here is a table that summarizes
+various widespread first-order optimization algorithms. The first two algorithms are conditional
+gradient variants, and they only solve linear optimization every iteration. Their convergence
+rates depend on the dimension of the problem and on geometric constants for the underlying
+decision set, such as the pyramidal width for the away-step Frank-Wolfe variant given in
+the second row. On the other hand, the remaining three algorithms
+are projection-based algorithms that compute a projection every iteration; their
+convergence rates, however, are optimal in the sense that they only rely on the condition
+number of the function, and they are dimension-independent. Further, to capture a wide range of combinatorial
+sets, we consider the case where the decision set P is given by a submodular polytope, and
+the challenge is that these polytopes have an exponential number of constraints. Thus,
+computing a projection over those polytopes is a big computational bottleneck in projection-based
+algorithms. Motivated by this trade-off in convergence rates versus runtime, we further
+ask: is it possible to speed up iterative projections over submodular polytopes by reusing
+structural information from previous minimizers? I'm now going to give more introduction on
+the problem and submodularity, and a review of first-order methods. So, as mentioned, we
+assume that the combinatorial structure in a problem is given by a submodular function.
+A set function F, defined over a ground set E of n elements, is submodular if it satisfies
+the following property. Furthermore, the base polytope associated with F is defined as the
+following system of linear inequalities, and here we see that B(F) is modeled using an
+exponential number of constraints, because we have a constraint for each subset of the
+ground set. An example is the permutahedron, a polytope whose vertices are permutations
+of 1 through n. And here we have an example in the slide for when n is equal to 3. These
+polytopes are extensively used in online learning over rankings of items. A special class of
+submodular polytopes arises from cardinality-based functions: a cardinality-based function
+F is defined as F(S) = g(|S|), where g is a concave function. And here
+we have another table that summarizes various machine and online learning problems and the
+submodular set function that gives rise to them. We see the permutahedron in the second
+row of this table, and it is in fact a cardinality-based polytope. Other non-cardinality-based examples
+include spanning trees and independent sets of matroids.
+So let's go back to our main problem of minimizing a convex function over the base polytope.
+There typically exist three main paradigms to solve this problem.
The first is a class
+of methods known as conditional gradient methods, and as I mentioned before, those
+assume access to B(F) via a linear optimization oracle. These methods are specifically
+advantageous for base polytopes because linear optimization over base polytopes can be
+done very efficiently using Edmonds' greedy algorithm. The second class of methods are
+mirror descent variants, which compute a projection every iteration to ensure feasibility.
+And again, as I also previously mentioned, although those methods have optimal convergence
+rates and are robust, they have remained of a theoretical nature due to being computationally
+expensive. The third class of methods are combinatorial algorithms specifically tailored
+for convex optimization over submodular base polytopes. Those algorithms instead require
+solving a submodular function minimization problem every iteration, which again can be
+very expensive. However, those algorithms enjoy the nice property of returning an exact
+optimal solution. In this talk, we will focus on bridging the efficiency of CG methods and
+the structural properties and exactness of combinatorial algorithms to speed up iterative
+projections appearing in mirror descent and beyond. So first, let's consider the simpler
+case when our polytope is cardinality-based. Here we have a cardinality-based submodular
+function F, and for notation we define the vector c to be the vector of discrete derivatives
+of the concave function g. We now give the following duality result, which states that
+the problem of computing a Bregman projection over a cardinality-based polytope is dual
+to isotonic optimization. Although our results hold for general Bregman projections, we will
+focus on the case of Euclidean projections for simplicity. To that end, consider a vector
+y whose Euclidean projection over a cardinality-based polytope we are trying to compute,
+and let e1 through en be an ordering of the ground set such that y is decreasing. In this
+case, we have the following primal problem, and the dual to that is the following isotonic
+regression problem. And further, we can map between the two problems using the following identity here.
+So just to give you some historical context, previously the best known running time for
+projections was O(n^2), using a primal algorithm by Gupta et al. Later that
+year, Lim and Wright used the same duality approach to compute projections over the permutahedron,
+and we extended their approach to general cardinality-based polytopes. Now the dual
+isotonic regression problem can be solved in O(n) time using a simple algorithm called the
+pool adjacent violators (PAV) algorithm, and this basically gives us an O(n log n) algorithm by
+solving the problem in the dual space and mapping it back to the primal space. This is currently
+the fastest known algorithm. The key takeaway is that projections over these polytopes
+can be computed very efficiently; in fact, computing a projection and solving linear optimization
+have the same running time. Now let's demonstrate our result with an example. Here we are going
+to project this vector y onto the probability simplex, and the probability simplex is modeled
+by the cardinality-based submodular function given here on the slide. We see that y is already
+ordered for simplicity, and c is the vector of discrete derivatives. Now the algorithm will
+proceed as follows.
It initializes the dual iterate with the vector whose isotonic regression we are trying to
+compute, c minus y, and here we have an adjacent violation because the
+second coordinate is strictly smaller than the first coordinate. The algorithm will basically
+average those two coordinates to obtain the following solution z star, and here we see that
+the ordering constraints are satisfied and z star is in fact the dual optimal. Next it will map it
+back to a primal optimal. And let's go back to this figure from the previous slide, which compares
+a basic linear regression fit with an isotonic regression fit. Here, in the red stepwise curve,
+the segments where the curve remains flat are where a block of consecutive adjacent violators
+is averaged, similar to our example. This very efficient algorithm for computing
+Bregman projections over cardinality-based polytopes unfortunately does not extend to
+general submodular base polytopes. And now my collaborator Jay will present different combinatorial
+strategies for dealing with those polytopes. We now describe our toolkit for speeding up
+projections on general submodular base polytopes. There are two basic objects that we can learn from.
+First, given projections of previous points, can we do better than computing a new projection from
+scratch? Second, given an iterative algorithm to compute a projection, can we use the combinatorial
+structure present in the sequence of iterates to speed up the algorithm and terminate it early?
+We have the well-known first-order optimality condition on the left. It helps us verify if a
+point is indeed optimal. This check reduces to a linear optimization over the base polytope,
+which can be done using Edmonds' greedy algorithm. We have an example. Suppose we know the gradient
+at a point x star and want to check if x star is indeed optimal. We look at the distinct values
+of the partial derivatives at x star and arrange them in increasing order. Each time we see a
+gap in this order, we require that x star evaluated on the prefix set equals the submodular function
+value on that set. In the figure, the first such gap is after we have seen e1 and e5. Therefore,
+x star of S1 must equal f of S1. Similarly, x star of S2 must equal f of S2. Finally, x star of E must equal f of
+E. These sets S1, S2, and E are called tight sets at x star, and they define the face containing the point x
+star. This leads us to two interesting observations that we use later. One, if we know precisely
+what the tight sets are at the optimal point, we can also calculate the optimal point for all
+suitable functions h. Two, knowing the gradient at the optimal point gives us these
+tight sets. We give an example using our combinatorial idea. Suppose we know a point
+z_k that is close to our optimum x star. If the function is smooth, this implies that the gradients at z_k
+and x star are close. This gives us a way to learn some tight sets defining the optimal face.
+In the example, for each coordinate, the blue line in the middle represents the partial derivative
+value at z_k, and the blue shade represents the possible variation in that value for the optimal
+point x star. That is, the corresponding partial derivative for x star lies in the shaded interval.
+The largest values in these intervals for e1 and e5 are lower than the lowest values in these
+intervals for every other element. This helps us conclude that the set {e1, e5}, that is S1,
+is a tight set at x star. Similarly, we infer that S2 is also a tight set at x star.
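The interval argument just described can be phrased as a small check. The sketch below is only an illustration of that narrated rule, not the paper's actual tool or its constants: given the partial derivatives at a nearby point z_k and a per-coordinate radius bounding how far the corresponding partial derivative at the optimum can move (in the paper this would come from smoothness and the distance to the optimum), a prefix of the elements sorted by partial derivative is inferred to be tight when all of its intervals lie strictly below the intervals of the remaining elements.

```python
# Illustrative check for the gradient-interval argument described above.
# grad_zk[e] is the partial derivative at the nearby point z_k for element e;
# radius[e] bounds how far the corresponding partial derivative at the optimum can differ.
# Both are placeholders for illustration; the paper's precise certificate is not reproduced here.

def inferred_tight_prefixes(grad_zk, radius):
    """Return prefix sets (in increasing order of partial derivative) whose gradient
    intervals lie entirely below the intervals of all remaining elements."""
    order = sorted(range(len(grad_zk)), key=lambda e: grad_zk[e])
    inferred = []
    for i in range(1, len(order)):           # proper, non-empty prefixes
        prefix, rest = order[:i], order[i:]
        hi = max(grad_zk[e] + radius[e] for e in prefix)
        lo = min(grad_zk[e] - radius[e] for e in rest)
        if hi < lo:                          # a clear gap, as in the figure
            inferred.append(set(prefix))
    return inferred

# Toy numbers mimicking the example: elements 0 and 4 (think e1 and e5) sit clearly below the rest.
grads = [0.1, 0.9, 1.0, 0.8, 0.2]
radii = [0.05] * 5
print(inferred_tight_prefixes(grads, radii))   # [{0, 4}]
```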
+We now use that idea to give our first two tools. These apply more generally, but we demonstrate
+them using Euclidean projections. Suppose we already know the projection x_i of a point y_i,
+and we wish to find the projection x_t of a point y_t, given that y_t is close to y_i.
+The non-expansiveness of the projection implies that the gradients at x_i and x_t are also close,
+and therefore we can infer some tight sets at x_t even before solving.
+Now suppose we start computing the projection of y_t using an iterative algorithm, and
+we use the iterates z_i that converge to x_t. An iterate z_i that is close to x_t also has a
+gradient that is close to the gradient at x_t, and once again we can infer some tight sets at x_t
+as we approach the optimum. We also conducted an experiment to show that tool T1 can recover
+most tight sets from previous projections. We now give two tools that help us round an
+approximate solution exactly to the projection. First is our tool T3, called Relax:
+we give a heuristic to check if we have already found all the tight sets at the optimum.
+We also show that we can round combinatorially when we know the function f to be integral
+and an iterate z_t is close enough to the optimum x_t. This is our tool T4.
+We can also reuse previously known vertices of the polytope. Suppose that our optimum is x_t,
+and we are given a close-by point x_i as a convex combination of some vertices of the polytope.
+We can use those vertices to warm-start the search for x_t. Now our sixth tool, Restrict:
+once we know a few tight sets for x_t using our inference tools T1 and T2,
+we needn't search for the optimum over the whole base polytope. We can restrict ourselves to the
+face of the polytope that satisfies these constraints. We show that a simple extension
+of Edmonds' greedy algorithm provides a linear optimization (LO) oracle for each face of the polytope.
+We now bring together these tools and apply them to the away-step Frank-Wolfe algorithm,
+giving the algorithm we dub adaptive away-step Frank-Wolfe, or A2FW for short.
+First, we warm-start A2FW using tight sets for the optimum inferred from previously projected points,
+and active sets from previous projections. While the algorithm runs and generates new
+iterates, it keeps inferring new tight sets for the optimal point using these iterates.
+In each iteration, if a new tight set has been found, the algorithm checks if all tight sets have been
+found. If so, it stops and outputs the exact solution. Otherwise, it simply restricts the
+problem to a lower-dimensional face and keeps going. Note that the linear optimization is then over a
+restricted face of the polytope. Let's see an example. Suppose we are optimizing over the
+polytope P. We look for, and find, the best Frank-Wolfe vertex and the best away vertex.
+Since the direction opposite to the away vertex is the better direction to move in, we take an
+away step and find the next iterate z_{t+1}. Now, z_{t+1} is
+close enough to x star that it allows us to detect another tight set and round to the face F_new.
+One way to do that is to round to an arbitrary vertex in F_new using our LO oracle. Another
+option is to relax to F_new and see if the solution obtained is feasible. If the feasibility
+check is inconclusive, we return to the previous strategy. Eventually, we reach the optimum
+x star either way. We give this theorem about the primal gap for the modified algorithm.
+The function h is L-smooth and mu-strongly convex, and D refers to the diameter of B(F).
+Notice how this compares to the AFW algorithm. When we restrict to a face F of BF, our guarantee +depends only on the pyramidal width of F instead of the pyramidal width of BF. This pyramidal width +can be much lower for the restricted face. For instance, it depends on the dimension of the face +for the probability simplex. Therefore, A2FW leads to a faster convergence. We now show the +effectiveness of our toolkit and the A2FW algorithm using experiments. For our computations, +we simulate an online recommendation system where we are learning over rankings of items +displayed to users. Our loss functions are stochastic model click-through rates. This +can be seen as optimization over the permutahedron. We use online mirror descent which performs +iterative projections and uses away step Frank-Wulf for these projections. We benchmark the +original AFW algorithm against variants modified by our tools. We report significant improvement +in both runtime and the number of AFW iterations. The green line stands for OMD with the original +unoptimized AFW. The yellow line stands for OMD with A2FW algorithm. We do note that both OMDPAV, +that is OMD with projections using the poor adjacent violators algorithm, and OFW were +significantly faster than OMD with any AFW variant. However, OFW does not lead to optimum +regret rates while OMDPAV works only for cardinality-based submodular polytopes. To +conclude, we studied iterative projections for prevalent submodular-based polytopes. We presented +an algorithm for cardinality-based polytopes. For general polytopes, we developed a combinatorial +toolkit to speed up iterative projections and applied it to the AFW algorithm and computationally +showed that our algorithm is orders of magnitude faster than the original AFW variant. diff --git a/demo_data/nips-2021/25963/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25963/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..8ee958efe67dfc27e83d1b445f0cc067bff64100 --- /dev/null +++ b/demo_data/nips-2021/25963/transcript_whisper_large-v2.vtt @@ -0,0 +1,536 @@ +WEBVTT + +00:00.000 --> 00:13.040 +Hello, I'm Hassam Murtaghi. I'm a PhD student at Georgia Tech. Along with my collaborator + +00:13.040 --> 00:16.880 +Jay Mundra, we will present our work on reusing combinatorial structure, faster projections + +00:16.880 --> 00:20.260 +over submodular-based polytopes. This is joint work with Swati Gupta. + +00:20.260 --> 00:24.220 +In this talk, we consider a sequence of similar structured optimization problems a setup often + +00:24.220 --> 00:28.220 +encountered in practice. We first start with our main problem of minimizing a convex function + +00:28.220 --> 00:32.260 +over a decision set P. At the next time step, this problem sees some perturbation and we + +00:32.260 --> 00:36.700 +obtain another similar problem, and so on. An example of this setup is the case of iterative + +00:36.700 --> 00:40.340 +projections where at each time step, we are computing the projection of a new point y + +00:40.340 --> 00:44.860 +t that is close to previously projected points y i. These iterative projections form a key + +00:44.860 --> 00:48.140 +step in many optimal learning algorithms and they are currently solved from scratch every + +00:48.140 --> 00:51.900 +iteration. They are not viewed in the context of an iterative environment where previously + +00:51.900 --> 00:55.140 +computed projections can be exploited to speed up subsequent ones. 
+ +00:55.140 --> 00:59.500 +Thus, in this talk, we ask, is it possible to speed up similar iterative optimization + +00:59.500 --> 01:03.660 +problems by reusing structural information from previous minimizers? + +01:03.660 --> 01:07.580 +Let me now give you some more details about our setup. Here is a table that summarizes + +01:07.580 --> 01:11.180 +various widespread first-order optimization algorithms. The first two algorithms are conditional + +01:11.180 --> 01:16.660 +gradient variants and they only solve linear optimization every iteration. Their convergence + +01:16.660 --> 01:21.140 +rates depend on the dimension of the problem and on geometric constants for the underlying + +01:21.140 --> 01:25.140 +decision set, such as the pyramidal width for the waystep-Fraenkel variant given in + +01:25.140 --> 01:28.340 +the second row. On the other hand, the remaining third algorithms + +01:28.340 --> 01:32.580 +are projection-based algorithms that compute the projection every iteration, and their + +01:32.580 --> 01:37.660 +convergence rates, however, are optimal in the sense that they only rely on the condition + +01:37.660 --> 01:43.260 +number of the function and they are dimension-independent. Further, to capture a wide range of combinatorial + +01:43.260 --> 01:47.980 +sets, we consider the case where decision set P is given by a submodular polytope, and + +01:47.980 --> 01:53.380 +the challenge is that these polytopes have an exponential number of constraints. Thus, + +01:53.380 --> 01:58.020 +computing a projection over those polytopes is a big computational bottleneck in projection-based + +01:58.020 --> 02:03.820 +algorithms. Motivated by the straight-off in convergence rates versus runtime, we further + +02:03.820 --> 02:08.740 +ask, is it possible to speed up iterative projections over submodular polytopes by reusing + +02:08.740 --> 02:14.300 +structural information from previous minimizers? I'm now going to give more introduction on + +02:14.300 --> 02:18.500 +the problem and submodularity and review of first-order methods. So, as mentioned, we + +02:18.500 --> 02:22.220 +assume that the combinatorial structure in a problem is given by a submodular function. + +02:22.220 --> 02:26.780 +Set function F, defined over a ground set E of n elements, is submodular if it satisfies + +02:26.780 --> 02:33.260 +the following property. Furthermore, the base polytope associated with F is defined as the + +02:33.260 --> 02:38.060 +following system of linear inequalities, and here we see that V of F is modeled using an + +02:38.060 --> 02:41.500 +exponential number of constraints because we have a constraint for each subset of the + +02:41.500 --> 02:46.860 +concept. An example is the permutahedron, a polytope whose vertices are permutations + +02:46.860 --> 02:51.740 +of 1 through n. And here we have an example in the slide for when n is equal to 3. These + +02:51.740 --> 02:58.220 +polytopes are extensively used in online learning over rankings of items. A special class of + +02:58.220 --> 03:02.140 +submodular polytopes are known as Cardinality-based functions, and a Cardinality-based function + +03:02.140 --> 03:07.620 +F is defined as F of S equal to G Cardinality of S, where G is a concave function. And here + +03:07.620 --> 03:10.940 +we have another table that summarizes various machine and online learning problems in a + +03:10.940 --> 03:14.380 +submodular set function that gives rise to them. 
We see the permutahedron in the second + +03:14.380 --> 03:19.180 +row of this table, and it is in fact a Cardinality-based polytope. Other non-Cardinality-based examples + +03:19.180 --> 03:22.220 +include spanning trees and independent sets of matroids. + +03:24.060 --> 03:28.220 +So let's go back to our main problem of minimizing a convex function over the base polytope. + +03:28.220 --> 03:32.620 +So there typically exist three main paradigms to solve this problem. The first is a class + +03:32.620 --> 03:37.020 +of methods, known as conditional gradient methods, and as I mentioned before, those + +03:37.020 --> 03:42.620 +assume access to B of F via linear optimization oracle. And these methods are specifically + +03:42.620 --> 03:46.780 +advantageous for base polytopes because linear optimization over base polytopes could be + +03:46.780 --> 03:51.180 +done very efficiently using Edmunds' greedy algorithm. The second class of methods are + +03:51.180 --> 03:55.500 +mere descent variants, and those compute a projection every iteration to ensure feasibility. + +03:56.060 --> 03:59.980 +And again, as I also previously mentioned, although those methods have optimal convergence + +03:59.980 --> 04:05.100 +rates and are robust, they are, they remained of theoretical nature due to being computationally + +04:05.100 --> 04:10.060 +expensive. The third class of methods are combinatorial algorithms specifically tailored + +04:10.060 --> 04:15.820 +for convex optimization over some modular-based polytopes. Those algorithms require instead + +04:15.820 --> 04:20.860 +solving a some modular function minimization problem every iteration, which again can be + +04:20.860 --> 04:25.020 +very expensive. However, those algorithms enjoy the nice property of returning exact + +04:25.020 --> 04:30.700 +optimal solution. In this talk, we will focus on bridging the efficiency of CG methods and + +04:30.700 --> 04:35.420 +the structural properties and exactness of combinatorial algorithms to speed up iterative + +04:35.420 --> 04:40.780 +projections appearing in mere descent and beyond. So first, let's consider the simpler + +04:40.780 --> 04:44.380 +case when our polytope is cardinality-based. So here we have a cardinality-based some modular + +04:44.380 --> 04:48.940 +function F, and for notation we define this vector c to be the vector of discrete derivatives + +04:48.940 --> 04:53.340 +of the concave function g. We now give the following Duati result, which states that + +04:53.340 --> 04:57.180 +the problem of computing a Bregman projection over a cardinality-based polytope is dual + +04:57.180 --> 05:02.860 +to isotonic optimization. Although our results hold for general Bregman projections, we will + +05:02.860 --> 05:08.620 +focus on the case of Euclidean projections for simplicity. To that end, consider a vector + +05:08.620 --> 05:11.980 +y that we're trying to compute its Euclidean projection over a cardinality-based polytope, + +05:11.980 --> 05:17.340 +and let e1 through en be an ordering of the ground set such that y is decreasing. In this + +05:17.340 --> 05:21.580 +case, we have the following primal problem, and the dual to that is the following isotonic + +05:21.580 --> 05:27.820 +regression problem. And further, we can map between the two problems using the following identity here. + +05:29.580 --> 05:32.860 +So just to give you some historical context, previously the best known running time for + +05:32.860 --> 05:37.500 +projections was O n squared using a primal algorithm by Gupta et al. 
Later on in that + +05:37.500 --> 05:41.180 +year, Lim and Wright used the same Duati approach to compute projections over the permutahedron, + +05:41.180 --> 05:45.020 +and we extended their approach to general cardinality-based polytopes. Now the dual + +05:45.020 --> 05:49.340 +isotonic regression problem could be solved in O n time using a simple algorithm called + +05:49.340 --> 05:53.900 +pool-adjacent violators algorithm, and this basically gives us an O n log n algorithm by + +05:53.900 --> 05:59.180 +solving the problem in the dual space and mapping it back to the primal space. And this is currently + +05:59.180 --> 06:04.060 +the fastest known algorithm. And the key takeaway is that solving projections over these polytopes + +06:04.060 --> 06:09.420 +can be very efficiently done. In fact, computing a projection and solving linear optimization + +06:09.420 --> 06:15.260 +have the same running time. Now let's demonstrate our result with an example. So here we are going + +06:15.260 --> 06:20.060 +to project this vector y onto the probability simplex, and the probability simplex is modeled + +06:20.060 --> 06:24.940 +by this cardinality-based modular function here given on the slide. And we see that y is already + +06:24.940 --> 06:29.900 +ordered for simplicity and c is the vector of discrete derivatives. Now the algorithm will + +06:29.900 --> 06:35.260 +proceed as follows. It initializes the dual iterates by the vector that we're trying to + +06:35.260 --> 06:39.980 +compute the isotonic regression for, c minus y, and here we have an adjacent violation because the + +06:39.980 --> 06:45.180 +second coordinate is strictly smaller than the first coordinate. Now the algorithm will basically + +06:45.180 --> 06:49.740 +average those two coordinates to obtain the following solution z star, and here we see that + +06:49.740 --> 06:54.060 +the ordering constraints are satisfied and z star is in fact the dual optimal. Next it will map it + +06:54.060 --> 06:58.700 +back to a primal optimal. And let's go back to this figure from the previous slide that just compares + +06:58.700 --> 07:04.060 +a basic linear regression fit with an isotonic regression fit. Here in the red stepwise curve, + +07:04.060 --> 07:08.060 +the points at which the curve remains flat is where a block of consecutive adjacent violated + +07:08.060 --> 07:12.460 +points are averaged similar to our example. This very efficient algorithm for computing + +07:12.460 --> 07:15.820 +regimen projections over cardinality-based polytopes unfortunately does not extend to + +07:15.820 --> 07:20.460 +general submodular based polytopes. And now my collaborator Jay will present different combinatorial + +07:20.460 --> 07:25.100 +strategies for dealing with those polytopes. We now describe our toolkit for speeding up + +07:25.100 --> 07:31.100 +projections on general submodular based polytopes. There are two basic objects that we can learn from. + +07:31.100 --> 07:35.900 +First, given projections of previous points, can we do better than computing a new projection from + +07:35.900 --> 07:41.820 +scratch? Second, given an iterative algorithm to compute a projection, can we use the combinatorial + +07:41.820 --> 07:46.780 +structure present in the sequence of iterates to speed up the algorithm and terminate it early? + +07:49.180 --> 07:53.180 +We have the well-known first-order optimality condition on the left. It helps us verify if a + +07:53.180 --> 07:58.140 +point is indeed optimal. 
This check is reduced to a linear optimization over the base polytope, + +07:58.140 --> 08:03.740 +which can be done using Edmunds-Greedy algorithm. We have an example. Suppose we know the gradient + +08:03.740 --> 08:08.780 +at a point x star and want to check if x star is indeed optimal. We look at the distinct values + +08:08.780 --> 08:13.900 +of the partial derivatives at x star and arrange them in an increasing order. Each time we see a + +08:13.900 --> 08:19.580 +gap in this order, we want that the point x star on the prefix set equal the submodular function + +08:19.580 --> 08:26.380 +value on that set. In the figure, the first such gap is after we have seen even an E5. Therefore, + +08:26.380 --> 08:35.900 +x star S1 must equal f of S1. Similarly, x star S2 must equal f of S2. Finally, xE must equal f of + +08:35.900 --> 08:42.940 +E. These sets S1, S2, and E are called tight sets at x and define the face containing the point x + +08:42.940 --> 08:49.100 +star. This leads us to two interesting observations that we use later. One, that if we know precisely + +08:49.100 --> 08:53.980 +what the tight sets are at the optimal points, we can also calculate the optimal point for all + +08:53.980 --> 08:59.100 +suitable functions h. Two, that knowing the gradient at the optimal point gives us these + +08:59.100 --> 09:06.220 +tight sets. We give an example using our combinatorial idea. Suppose we know a point + +09:06.220 --> 09:11.420 +zk that is close to our optimal x star. If the function is smooth, this implies gradient at zk + +09:11.420 --> 09:16.540 +and x star are close. This gives us a way to learn some tight sets defining the optimal face. + +09:17.260 --> 09:21.740 +In the example, for each coordinate, the blue line in the middle represents the partial derivative + +09:21.740 --> 09:26.700 +value at zk and the blue shade represents the possible variation in that value for the optimal + +09:26.700 --> 09:31.500 +point x star. That is, the corresponding partial derivative for x star lies in the shaded interval. + +09:32.140 --> 09:36.860 +The largest values in these intervals for E1 and E5 are lower than the lowest values in these + +09:36.860 --> 09:44.300 +intervals for every other element. This helps us conclude that the set E1 and E5, that is S1, + +09:44.300 --> 09:50.540 +is a tight set at x star. Similarly, we infer that S2 is also a tight set at x star. + +09:51.500 --> 09:56.460 +We now use that idea to give our first two tools. These apply more generally, but we demonstrate + +09:56.460 --> 10:01.820 +them using Euclidean projections. Suppose we already know the projection xi of a point yi, + +10:01.820 --> 10:06.540 +and we wish to find the projection xt of point yt, given that yt is close to yi. + +10:07.660 --> 10:11.980 +The non-expansiveness of projection implies that the gradients at xi and xt are also close, + +10:11.980 --> 10:15.820 +and therefore we can infer some tight sets at xt even before solving. + +10:16.940 --> 10:20.620 +Suppose we start computing the projection of yt using an iterative algorithm. + +10:20.620 --> 10:26.860 +We now use the iterates zi that converge to xt. An iterate zt that is close to xt also has a + +10:26.860 --> 10:32.780 +gradient that is close to the gradient at xt, and once again we can infer some tight sets at xt + +10:32.780 --> 10:39.740 +as we approach the optimal. We also conducted an experiment to show that tool T1 can recover + +10:39.740 --> 10:44.300 +most tight sets from previous projections. 
We now give two tools that help us round an + +10:44.300 --> 10:49.260 +approximate solution exactly to the projection. First is our tool T3 called Relax. + +10:49.260 --> 10:53.500 +We give a heuristic to check if we have already found all the tight sets at the optimal. + +10:55.020 --> 10:59.660 +We also show that we can round combinatorially when we know the function f to be integral, + +10:59.660 --> 11:04.140 +and an iterate zt is close enough to the optimal xt. This is our tool T4. + +11:05.900 --> 11:10.620 +We can reuse previously known vertices of the polytope. Suppose that our optimal is xt, + +11:10.620 --> 11:15.660 +and we are given a close by point xi as a convex combination of some vertices in the polytope. + +11:15.660 --> 11:21.980 +We can use those vertices to warm start the search for xt. Now our sixth tool, Restrict. + +11:23.580 --> 11:27.260 +Once we know a few tight sets for xt using our inferred tools T1 and T2, + +11:27.260 --> 11:32.540 +we needn't search over the optimal or the whole base polytope. We can restrict ourselves to the + +11:32.540 --> 11:38.380 +face of the polytope that satisfies these constraints. We show that a simple extension + +11:38.380 --> 11:42.300 +of Edmunds' greedy algorithm provides yellow oracle for each face of the polytope. + +11:42.300 --> 11:46.300 +We now bring together these tools and apply them to the awaystep-frank-wolff algorithm, + +11:46.300 --> 11:50.700 +giving the algorithm we dub adaptive awaystep-frank-wolff, or A2FW for short. + +11:51.660 --> 11:57.340 +First, warm start A2FW using tight sets for the optimal inferred from previous projected points, + +11:57.340 --> 12:01.660 +and active sets from previous projected points. While the algorithm runs and generates new + +12:01.660 --> 12:05.660 +iterates, it keeps inferring new tight sets for the optimal point using these iterates. + +12:05.660 --> 12:09.580 +In each iteration, if a new set has been found, the algorithm checks if all tight sets have been + +12:09.580 --> 12:16.300 +found. If indeed so, then stop and output the exact solution. Otherwise, simply restrict the + +12:16.300 --> 12:22.060 +problem to a low-dimensional face and keep going on. Note that the linear optimization is over a + +12:22.060 --> 12:27.420 +restricted face of the polytope. Let's see an example. Suppose we are optimizing over the + +12:27.420 --> 12:32.860 +polytope P. We look for the best frank-wolff vertex and the best away vertex. We find that + +12:32.860 --> 12:37.500 +the best frank-wolff vertex is the best away vertex. Since the direction opposite to the away + +12:37.500 --> 12:44.060 +vertex is the better direction to move in, we find the next iterate ZT plus 1. Now, ZT plus 1 is + +12:44.060 --> 12:50.540 +close enough to X star that it allows us to detect another tight set and round to the face F new. + +12:50.540 --> 12:56.140 +One way to do that is to round to an arbitrary vertex in F new using our yellow oracle. Another + +12:56.140 --> 13:00.700 +option is to relax to F new and see if the solution obtained is feasible. If feasibility + +13:00.700 --> 13:06.300 +check is uncertain, return to the previous strategy. Eventually, we reach the optimal + +13:06.300 --> 13:11.660 +X star either way. We give this theorem about the primal gap for the modified algorithm. + +13:11.660 --> 13:16.460 +The function h is l-smooth and mu strongly convex and d refers to the diameter of BF. + +13:17.100 --> 13:22.940 +Notice how this compares to the AFW algorithm. 
When we restrict to a face F of BF, our guarantee + +13:22.940 --> 13:29.020 +depends only on the pyramidal width of F instead of the pyramidal width of BF. This pyramidal width + +13:29.020 --> 13:33.420 +can be much lower for the restricted face. For instance, it depends on the dimension of the face + +13:33.420 --> 13:40.620 +for the probability simplex. Therefore, A2FW leads to a faster convergence. We now show the + +13:40.620 --> 13:46.460 +effectiveness of our toolkit and the A2FW algorithm using experiments. For our computations, + +13:46.460 --> 13:50.940 +we simulate an online recommendation system where we are learning over rankings of items + +13:50.940 --> 13:55.660 +displayed to users. Our loss functions are stochastic model click-through rates. This + +13:55.660 --> 14:01.340 +can be seen as optimization over the permutahedron. We use online mirror descent which performs + +14:01.340 --> 14:07.020 +iterative projections and uses away step Frank-Wulf for these projections. We benchmark the + +14:07.020 --> 14:14.780 +original AFW algorithm against variants modified by our tools. We report significant improvement + +14:14.780 --> 14:19.820 +in both runtime and the number of AFW iterations. The green line stands for OMD with the original + +14:19.820 --> 14:27.180 +unoptimized AFW. The yellow line stands for OMD with A2FW algorithm. We do note that both OMDPAV, + +14:27.180 --> 14:31.900 +that is OMD with projections using the poor adjacent violators algorithm, and OFW were + +14:31.900 --> 14:38.140 +significantly faster than OMD with any AFW variant. However, OFW does not lead to optimum + +14:38.140 --> 14:43.420 +regret rates while OMDPAV works only for cardinality-based submodular polytopes. To + +14:43.420 --> 14:48.140 +conclude, we studied iterative projections for prevalent submodular-based polytopes. We presented + +14:48.140 --> 14:53.020 +an algorithm for cardinality-based polytopes. For general polytopes, we developed a combinatorial + +14:53.020 --> 14:58.380 +toolkit to speed up iterative projections and applied it to the AFW algorithm and computationally + +14:58.380 --> 15:18.540 +showed that our algorithm is orders of magnitude faster than the original AFW variant. + diff --git a/demo_data/nips-2021/25963/video.mp4 b/demo_data/nips-2021/25963/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..aca06876ffc331bb06fbe54c9e7f2042fdebba8b --- /dev/null +++ b/demo_data/nips-2021/25963/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17d5bf2a983d390139e7980e31b6e37781054da4158899547b604d8ff24dbfb +size 97013818 diff --git a/demo_data/nips-2021/25965/metadata.json b/demo_data/nips-2021/25965/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4c96089054c5d042047d87a4566ad8f04c2e3263 --- /dev/null +++ b/demo_data/nips-2021/25965/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Residual2Vec: Debiasing graph embedding with random graphs" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25965/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25965/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b2bf2cd0220b9ae785a560cbeef969bae2b14c3 --- /dev/null +++ b/demo_data/nips-2021/25965/transcript_whisper_large-v2.txt @@ -0,0 +1,136 @@ +How many friends do you have? +At least you have more friends than I do. +Well, on average. +Don't get me wrong, I am not a pity person. 
+This is a mathematical fact known as the friendship paradox.
+Suppose we have two persons, A, who has one friend, and B, who has three friends.
+Now let me ask: in which friend list am I likely to appear?
+Because B has three times more friends, I am three times more likely to appear in
+B's friend list.
+The friendship paradox dictates that, on average, your friends have more friends than you do.
+The more friends someone has, the more likely they are to appear in your friend list.
+Beyond an interesting piece of trivia, the friendship paradox has substantial importance
+because it may introduce biases in graph embeddings.
+Hello everyone, my name is Sadamori Kojaku, and we will walk you through a new insight
+into biases in graph embedding arising from the friendship paradox.
+Graph embedding is a technique to map a graph into a vector space that reflects
+the structure of the graph.
+A widespread paradigm is the approach based on Word2Vec.
+In this approach, one somehow generates a sequence of nodes from the graph.
+The nodes in the sentences are then mapped to a vector space by Word2Vec.
+Now the key is that Word2Vec does not learn the graph directly, but through the sentences
+generated from the graph.
+Unlike word embedding, where the input sentences are the actual data, for graph embedding
+the input sentences are artificially generated, and how to generate them is a critical modeling
+decision.
+This leads us to the question of how to generate the sentences from the graph.
+A common way is to use random walks.
+The walker starts from a node in the graph, and this node is the first node in the sentence.
+Then the walker moves to one of the neighbors, selected randomly.
+This new node is added to the sentence.
+By repeating this process, we can generate a sentence of nodes from this graph.
+The friendship paradox comes into play when the walker follows an edge:
+it is more likely to visit a node with many neighbors.
+In other words, following edges is a biased sampling that preferentially leads random
+walkers to nodes with many neighbors.
+To see this effect, let us consider a graph with core-periphery structure, where core nodes
+have more neighbors than peripheral nodes.
+A sentence can be generated from this graph by running a random walk.
+Now, the core nodes are about 20% of the nodes in the graph.
+But when looking at the generated sentence, the core nodes are overrepresented, which is
+because of the bias due to the friendship paradox.
+The fact that the sentence is biased by the friendship paradox leads us to our main question:
+does the sampling bias have a negative impact?
+If so, how can we fix it?
+Surprisingly, it has no effect, because Word2Vec itself has an overlooked built-in debiasing
+feature that happens to negate the bias due to the friendship paradox.
+This built-in debiasing feature can be easily utilized to negate other types of biases,
+and we demonstrate how to do this.
+Our starting point is a sentence of words.
+Word2Vec picks a word called the center and surrounding words called the context, and then models the
+conditional probability using a softmax function, where the conditional probability reflects
+the dot similarity of the two words' vectors.
+We want to fit this model to the data, but it is computationally challenging due to the
+normalization constant, which extends over all unique words in the corpus.
+A common way to reduce this burden is negative sampling.
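The talk next turns to how Word2Vec copes with this normalization cost. Before that, the following toy script (my own illustration on a made-up graph, not the paper's code or data) makes the random-walk sampling bias described above concrete: node visit frequencies come out roughly proportional to degree, so the hub is heavily overrepresented in the generated sentences.

```python
# Toy illustration of the friendship-paradox bias in random-walk sentence generation.
# On an undirected, connected, non-bipartite graph, a simple random walk visits each node
# roughly in proportion to its degree, so high-degree "core" nodes dominate the sentences.
import random
from collections import Counter

random.seed(0)

# A made-up graph: node 0 is a hub connected to five peripheral nodes.
adj = {
    0: [1, 2, 3, 4, 5],
    1: [0, 2, 5],
    2: [0, 1],
    3: [0, 4],
    4: [0, 3],
    5: [0, 1],
}

def random_walk(start, length):
    walk = [start]
    for _ in range(length - 1):
        walk.append(random.choice(adj[walk[-1]]))
    return walk

# Generate "sentences" the way the walk-based embeddings above do.
sentences = [random_walk(node, 80) for node in adj for _ in range(50)]
counts = Counter(node for sentence in sentences for node in sentence)
total = sum(counts.values())

for node in sorted(adj):
    print(f"node {node}: degree={len(adj[node])}, visit share={counts[node] / total:.2f}")
# The hub (degree 5) takes roughly 5/16 of all visits, far above its 1/6 share of the nodes.
```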
+Now, it is often underappreciated that negative sampling is actually a simplified version
+of noise contrastive estimation.
+And it is this simplification that gives rise to an interesting feature of Word2Vec.
+How does noise contrastive estimation, or NCE, work?
+NCE samples k random contexts from a so-called noise distribution.
+This noise distribution is roughly proportional to the frequency of a word in the corpus.
+The random contexts are labeled as 0, and the actual context is labeled as 1.
+Then NCE calculates the probability that a word comes from the actual data using a Bayesian
+framework.
+By putting the prior and likelihood together, we have a posterior like this.
+This function is a sigmoid function and takes the dot similarity and the noise distribution
+as its arguments.
+Now the key feature of NCE is that it is asymptotically unbiased for the model
+of Word2Vec.
+Meaning, if the data is actually generated from this model, and we increase the amount
+of training data, then the embedding vectors converge to the true vectors.
+Beyond Word2Vec, noise contrastive estimation is also an unbiased estimator for a more general
+model that takes a real-valued function f instead of the dot similarity.
+Now, negative sampling simplifies noise contrastive estimation.
+It estimates the same probability, but simply drops the noise distribution term.
+You might be wondering what happens without this term.
+To see this, we rewrite it in the form of noise contrastive estimation, where we define
+a new function f' which consists of the original function f as well as the noise distribution.
+This is asymptotically unbiased for a probability model which now includes the noise distribution.
+So all in all, Word2Vec trained with skip-gram negative sampling is asymptotically unbiased for
+this probability model, or more specifically for Word2Vec, this function.
+In this model, the noise distribution offsets the modeled probability, serving as a baseline.
+The embedding vectors capture the residual from the baseline.
+Now, recall that the baseline probability is roughly proportional to the frequency.
+Therefore, the embedding vectors capture the information other than the frequency.
+In other words, SGNS Word2Vec has a built-in debiasing feature for frequency bias.
+Now let us revisit the friendship paradox.
+The sampling bias due to the friendship paradox is that the frequency of a word is determined
+solely by the degree of the node.
+Notice that this frequency is actually accounted for by the baseline probability.
+Therefore, the friendship paradox has no effect, thanks to the built-in debiasing feature of
+SGNS Word2Vec.
+This realization leads us to Residual2Vec.
+The key idea is to model the baseline probability explicitly to control what bias to remove
+in the embedding.
+So how can we model the baseline more specifically?
+We start from the given graph and randomize its structure, then generate a sequence using
+random walks, then calculate the conditional probability as the baseline, which is based
+on the idea that we should remove biases arising from the trivial structure.
+This debiasing feature is useful for predicting links in the graph.
+Residual2Vec performs the best or nearly the best for all six graphs of different domains.
+Furthermore, Residual2Vec is the best or the second best performer on a community detection
+benchmark.
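
An illustrative aside: a toy sketch of the baseline idea described above. Random-walk visit frequencies on a small made-up graph are compared against a degree-proportional baseline (what a degree-preserving randomized graph would give), so only the residual would be left for the embedding to explain; for a plain undirected graph the residual is essentially zero, which is the friendship-paradox effect being absorbed by the baseline.

```python
import numpy as np

rng = np.random.default_rng(1)

# A small, made-up undirected graph as an adjacency matrix.
A = np.array([
    [0, 1, 1, 1, 1, 0],
    [1, 0, 1, 1, 0, 1],
    [1, 1, 0, 1, 0, 0],
    [1, 1, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
], dtype=float)
n = len(A)
deg = A.sum(axis=1)

# Count how often each node is visited by random walks on the real graph.
counts = np.zeros(n)
for _ in range(2000):
    v = rng.integers(n)
    for _ in range(10):
        v = rng.choice(n, p=A[v] / deg[v])
        counts[v] += 1
walk_freq = counts / counts.sum()

# Degree-proportional baseline: the visit frequency a degree-preserving
# randomized graph would produce.
baseline = deg / deg.sum()

print("random-walk frequency:", np.round(walk_freq, 3))
print("baseline frequency   :", np.round(baseline, 3))
print("log residual         :", np.round(np.log(walk_freq / baseline), 3))
```
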
+To showcase the debiasing feature, we constructed a citation graph of general issues using the +web of science, where the nodes are general issues connected by undirected and weighted +citations. +When applying grove embedding, all genres are concentrated on the center, reflecting +temporal aspects of the issues. +This is because the old issues have time to accumulate many citations, and therefore well +connected to many different issues. +For subject-wise, grove separates different fields to some extent. +With Residual2Vec, we can remove the biases due to time. +In effect, the old genres now spread out, and the disciplinary separations are more +clearly visible. +Beyond eyeballing the embeddings, we test the embeddings quantitatively by predicting +the genre impact factor as well as the subject categories. +We find that the impact factor and the subject of genres can be well predicted by removing +the temporal biases as well as the friendship paradox effect. +In summary, we show that World2Vec has a built-in debiasing feature attributed to negative sampling. +Inspired by this finding, we propose Residual2Vec that can negate other types of structural +biases. +We demonstrate that removing biases not only improves the performance, but also enabling +us to control on the biases in the final representation. +Our results highlighted a new potential of negative sampling as a way to mitigate biases +in representations, which may be useful to address the problem of the biases in AI. +Although we have not studied the biases in AI, given the wide usage of negative sampling +to train AI, our approach may lead to methods and studies that expose and mitigate biases +in AI. +We believe that our approach contributes to the effort to create transparent and accountable +machine learning methods, especially because our method enables us to explicitly control +the biases in the graph representation. +That's all for the presentation, and finally I'd like to acknowledge Jason Yoon, Isabel +Constantino, and Yongyuan An for creating and adding momentum to this project for years, +and for all of you who watched this video. +If you want to know more in detail, please check out our paper. +Thanks! diff --git a/demo_data/nips-2021/25965/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25965/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..054fcd20ed1c150675263975266dbba758d2d0f5 --- /dev/null +++ b/demo_data/nips-2021/25965/transcript_whisper_large-v2.vtt @@ -0,0 +1,410 @@ +WEBVTT + +00:00.000 --> 00:10.880 +How many friends do you have? + +00:10.880 --> 00:13.480 +At least you have more friends than I do. + +00:13.480 --> 00:15.960 +Well, on average. + +00:15.960 --> 00:18.280 +Don't get me wrong, I am not a pity person. + +00:18.280 --> 00:23.920 +This is a mathematical fact known as the friendship paradox. + +00:23.920 --> 00:30.800 +Suppose we have two persons, A who has one friend and B who has three friends. + +00:30.800 --> 00:36.520 +Now let me ask in which friend list am I likely to appear? + +00:36.520 --> 00:42.400 +Because B has three times more friends, I am three times more likely to appear in the + +00:42.400 --> 00:45.600 +B's friend list. + +00:45.600 --> 00:52.140 +The friendship paradox dictates that on average, your friends have more friends than you do. + +00:52.140 --> 00:58.280 +The more friends someone has, the more likely someone appears in your friend list. 
+ +00:58.280 --> 01:04.120 +Beyond an interesting piece of trivia, the friendship paradox has substantial importance + +01:04.120 --> 01:10.040 +because it may introduce biases in graph embeddings. + +01:10.040 --> 01:15.680 +Hello everyone, my name is Sadamori Kojak, and we will walk you through a new insight + +01:15.680 --> 01:21.340 +into biases in graph embedding arising from the friendship paradox. + +01:21.340 --> 01:26.160 +The graph embedding is a technique to map a graph into a vector space that reflects + +01:26.160 --> 01:28.360 +the structure of the graph. + +01:28.360 --> 01:34.040 +A widespread paradigm is the approach based on Word2Vec. + +01:34.040 --> 01:39.480 +In this approach, one somehow generates a sequence of nodes from the graph. + +01:39.480 --> 01:45.600 +The nodes in the sentences are then mapped to a vector space by Word2Vec. + +01:45.600 --> 01:52.360 +Now the key is that Word2Vec does not directly learn the graph, but through the sentences + +01:52.360 --> 01:55.360 +generated from the graph. + +01:55.360 --> 02:01.120 +Unlike the word embedding, where the input sentences are the actual data, for graph embedding, + +02:01.120 --> 02:07.640 +the input sentence is artificially generated, and how to generate it is a critical modeling + +02:07.640 --> 02:08.640 +decision. + +02:08.640 --> 02:15.280 +This leads us to the question of how to generate the sentences from the graph. + +02:15.280 --> 02:20.160 +A common way is to use random walks. + +02:20.160 --> 02:28.560 +The worker starts from a node in the graph, and this node is the first node in the sentence. + +02:28.560 --> 02:32.840 +Then the worker moves to one of the neighbors selected randomly. + +02:32.840 --> 02:35.920 +This new node is added to the sentence. + +02:35.920 --> 02:43.320 +By repeating this process, we can generate a sentence of nodes from this graph. + +02:43.320 --> 02:48.940 +The friendship paradox comes into play when the worker follows an edge. + +02:48.940 --> 02:53.320 +It is more likely to visit a node with many neighbors. + +02:53.320 --> 02:58.800 +In other words, following edges is a bias sampling that preferentially leads random + +02:58.800 --> 03:02.400 +workers to nodes with many neighbors. + +03:02.400 --> 03:07.640 +To see this effect, let us consider a graph with co-peripheral structure, where kernels + +03:07.640 --> 03:10.600 +have more neighbors than periphery. + +03:10.600 --> 03:15.560 +A sentence can be generated from this graph by running a random walk. + +03:15.560 --> 03:21.200 +Now, the kernels are about 20% of nodes in the graph. + +03:21.200 --> 03:26.200 +But when looking at the generated sentence, the kernels are overrepresented, which is + +03:26.200 --> 03:30.160 +because of the bias due to the friendship paradox. + +03:30.160 --> 03:38.160 +The fact that the sentence is biased by the friendship paradox leads us to our main question. + +03:38.160 --> 03:41.760 +Does the sampling bias have negative impact? + +03:41.760 --> 03:44.360 +If so, how can we fix it? + +03:44.360 --> 03:50.920 +Surprisingly, it has no effect because Word2Vec itself has an overlooked built-in devising + +03:50.920 --> 03:56.440 +feature that happens to negate the bias due to the friendship paradox. + +03:56.440 --> 04:03.040 +This built-in devising feature can be easily utilized to negate other types of biases, + +04:03.040 --> 04:06.640 +and we demonstrate how to do this. + +04:06.640 --> 04:10.280 +Our starting point is a sentence of words. 
+ +04:10.280 --> 04:17.480 +Word2Vec picks a word called center and surrounding words called context, and then models the + +04:17.480 --> 04:24.400 +conditional probability using a softmax function, where the conditional probability is reflected + +04:24.400 --> 04:28.880 +as a dot similarity of the two vectors of the words. + +04:28.880 --> 04:35.240 +We want to fit this model to the data, but it is computationally challenging due to the + +04:35.240 --> 04:42.080 +normalization constant, which extends over all unique words in the corpus. + +04:42.080 --> 04:46.160 +A common way to reduce this burden is negative sampling. + +04:46.160 --> 04:53.720 +Now, it is often underappreciated that negative sampling is actually a simplified version + +04:53.720 --> 04:56.800 +of noise contrastive estimation. + +04:56.800 --> 05:04.920 +And it is this simplification that gives rise to an interesting feature of Word2Vec. + +05:04.920 --> 05:09.880 +How does the noise contrastive estimation, or NCE, works? + +05:09.880 --> 05:16.840 +NCE samples k random contexts from so-called noise distribution. + +05:16.840 --> 05:23.400 +This noise distribution is roughly proportional to the frequency of a word in the corpus. + +05:23.400 --> 05:30.240 +The random contexts are labeled as 0, and the actual context is labeled as 1. + +05:30.240 --> 05:37.160 +Then NCE calculates the probability that a word comes from actual data using a Bayesian + +05:37.160 --> 05:39.040 +framework. + +05:39.040 --> 05:46.040 +By putting the prior likelihood together, we have a posterior like this. + +05:46.040 --> 05:52.320 +This function is a sigmoid function and takes the dot similarity and the noise distribution + +05:52.320 --> 05:54.400 +as the arguments. + +05:54.400 --> 06:01.160 +Now the key feature of the NCE is that it is asymptomatically unbiased for the model + +06:01.160 --> 06:03.000 +of the Word2Vec. + +06:03.000 --> 06:08.160 +Meaning if the data is actually generated from this model, and we increase the number + +06:08.160 --> 06:14.240 +of trainings, then the embedding vectors converge to the true vectors. + +06:14.240 --> 06:20.560 +Beyond Word2Vec, the noise contrastive estimation is also an unbiased estimator for a more general + +06:20.560 --> 06:27.680 +model that takes a real value function f instead of the dot similarity. + +06:27.680 --> 06:33.400 +Now the negative sampling simplifies the noise contrastive estimation. + +06:33.400 --> 06:40.640 +It estimates the same probability, but variably drops the term of the noise distribution. + +06:40.640 --> 06:45.000 +You might be wondering what happens without this term. + +06:45.000 --> 06:50.880 +To see this, we rewrite it in form of the noise contrastive estimation, where we define + +06:50.880 --> 06:59.480 +a new function f' which consists of the original function f as well as the noise distribution. + +06:59.480 --> 07:08.640 +This is asymptomatically unbiased for a probability model which now includes the noise distribution. + +07:08.640 --> 07:16.160 +So all in all, Word2Vec trained with skip-gram-negative sampling is asymptomatically unbiased for + +07:16.160 --> 07:23.440 +this probability model, or more specifically for Word2Vec, this function. + +07:23.440 --> 07:30.840 +In this model, the noise distribution offsets the modeled probability, serving as a baseline. + +07:30.840 --> 07:35.720 +The embedding vectors captures the residual from the baseline. 
+ +07:35.720 --> 07:41.760 +Now, remind that the baseline probability is roughly proportional to the frequency. + +07:41.760 --> 07:48.160 +Therefore, the embedding vectors capture the information other than the frequency. + +07:48.160 --> 07:57.320 +In other words, SGNS Word2Vec has a built-in debiasing feature for frequency bias. + +07:57.320 --> 08:01.800 +Now let us revisit the friendship paradox. + +08:01.800 --> 08:08.280 +The sampling bias due to the friendship paradox is that the frequency of a word is determined + +08:08.280 --> 08:12.000 +thoroughly by the degree of noise. + +08:12.000 --> 08:17.480 +Notice that this frequency is actually accounted for by the baseline probability. + +08:17.480 --> 08:24.840 +Therefore, the friendship paradox has no effect thanks to the built-in debiasing feature of + +08:24.840 --> 08:28.400 +SGNS Word2Vec. + +08:28.400 --> 08:33.840 +This realization leads us to Residual2Vec. + +08:33.840 --> 08:41.120 +The key idea is to model the baseline probability explicitly to control what bias to remove + +08:41.120 --> 08:43.160 +in embedding. + +08:43.160 --> 08:47.880 +So how can we model the baseline more specifically? + +08:47.880 --> 08:54.880 +We start from the given graph and randomize the structure, then generate a sequence using + +08:54.880 --> 09:01.600 +random walks, then calculate the conditional probability as the baseline, which is based + +09:01.600 --> 09:08.080 +on the idea that we should remove biases arising from the trivial structure. + +09:08.080 --> 09:12.640 +This debiasing feature is useful to predict links in the graph. + +09:12.640 --> 09:20.320 +Residual2Vec performs the best or nearly the best for all six graphs of different domains. + +09:20.320 --> 09:27.360 +Furthermore, Residual2Vec is the best or the second best performer for a community detection + +09:27.360 --> 09:29.360 +benchmark. + +09:29.360 --> 09:36.280 +To showcase the debiasing feature, we constructed a citation graph of general issues using the + +09:36.280 --> 09:43.800 +web of science, where the nodes are general issues connected by undirected and weighted + +09:43.800 --> 09:45.800 +citations. + +09:45.800 --> 09:51.640 +When applying grove embedding, all genres are concentrated on the center, reflecting + +09:51.640 --> 09:55.040 +temporal aspects of the issues. + +09:55.040 --> 10:01.440 +This is because the old issues have time to accumulate many citations, and therefore well + +10:01.440 --> 10:04.840 +connected to many different issues. + +10:04.840 --> 10:10.720 +For subject-wise, grove separates different fields to some extent. + +10:10.720 --> 10:15.560 +With Residual2Vec, we can remove the biases due to time. + +10:15.560 --> 10:21.720 +In effect, the old genres now spread out, and the disciplinary separations are more + +10:21.720 --> 10:23.920 +clearly visible. + +10:23.920 --> 10:29.480 +Beyond eyeballing the embeddings, we test the embeddings quantitatively by predicting + +10:29.480 --> 10:35.160 +the genre impact factor as well as the subject categories. + +10:35.160 --> 10:41.280 +We find that the impact factor and the subject of genres can be well predicted by removing + +10:41.280 --> 10:46.560 +the temporal biases as well as the friendship paradox effect. + +10:46.560 --> 10:54.600 +In summary, we show that World2Vec has a built-in debiasing feature attributed to negative sampling. + +10:54.600 --> 11:00.320 +Inspired by this finding, we propose Residual2Vec that can negate other types of structural + +11:00.320 --> 11:02.320 +biases. 
+ +11:02.320 --> 11:08.360 +We demonstrate that removing biases not only improves the performance, but also enabling + +11:08.360 --> 11:13.400 +us to control on the biases in the final representation. + +11:13.400 --> 11:19.480 +Our results highlighted a new potential of negative sampling as a way to mitigate biases + +11:19.480 --> 11:27.200 +in representations, which may be useful to address the problem of the biases in AI. + +11:27.200 --> 11:33.320 +Although we have not studied the biases in AI, given the wide usage of negative sampling + +11:33.320 --> 11:39.880 +to train AI, our approach may lead to methods and studies that expose and mitigate biases + +11:39.880 --> 11:41.920 +in AI. + +11:41.920 --> 11:47.720 +We believe that our approach contributes to the effort to create transparent and accountable + +11:47.720 --> 11:53.520 +machine learning methods, especially because our method enables us to explicitly control + +11:53.520 --> 11:57.160 +the biases in the graph representation. + +11:57.160 --> 12:03.520 +That's all for the presentation, and finally I'd like to acknowledge Jason Yoon, Isabel + +12:03.520 --> 12:11.280 +Constantino, and Yongyuan An for creating and adding momentum to this project for years, + +12:11.280 --> 12:15.160 +and for all of you who watched this video. + +12:15.160 --> 12:19.640 +If you want to know more in detail, please check out our paper. + +12:19.640 --> 12:27.880 +Thanks! + diff --git a/demo_data/nips-2021/25965/video.mp4 b/demo_data/nips-2021/25965/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..128211948363ab92e183390f09a01a49c46661f7 --- /dev/null +++ b/demo_data/nips-2021/25965/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f31ed21fe17212ce4286f2adfa7761931ae1255d6ce0379a658e42870f4a7e +size 25519466 diff --git a/demo_data/nips-2021/25969/metadata.json b/demo_data/nips-2021/25969/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..22e501ac798101dc0010bd2fcb74ed4a56efaa4a --- /dev/null +++ b/demo_data/nips-2021/25969/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Play to Grade: Testing Coding Games as Classifying Markov Decision Process" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25969/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25969/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..2429d8ae71e7e37b2fae07ba6c7feefa631448e5 --- /dev/null +++ b/demo_data/nips-2021/25969/transcript_whisper_large-v2.txt @@ -0,0 +1,160 @@ +Hello everyone, my name is Alan. I'm a PhD student from Stanford University. I'm presenting +our work Play to Grade, testing coding games as classifying Markov decision process. This +is joint work with Emma Bronskill and Chris Peach. +In this talk, we will highlight the central problem that we're trying to solve, which +is scaling up quality feedback for students learning to code is crucial. Grading interactive +coding game is very difficult, and we frame this as an instance of identifying if a program +has the same behavior as a desired MDP. Even with 11 label programs, we can achieve 94% +accuracy on real student assignment from code.org. +Each year, hundreds of thousands of people, children and adults alike, want to learn coding. +Modern massive online education platforms like code.org serves over 40% of US K-12 students. 
+Scaling up quality feedback for these students is crucial, especially in areas where there
+are shortages of computer science teachers.
+Interactive coding assignments are becoming more popular. It's a lot more fun for students
+to program them. They're also a common type of program for students to code. For example,
+web pages are interactive. However, in order to grade them, teachers often need to play
+each student's homework for 20 seconds to a couple of minutes. This quickly becomes a scaling
+issue. A 20-student classroom might still be manageable, but in a large university where
+there are hundreds of students taking the same class, or on an online education platform
+like code.org, grading these assignments is a real challenge. This places a real burden
+on teachers.
+Why is it difficult to develop automatic grading tools? First of all, each assignment is different
+from the others. Traditional machine learning solutions that rely on collecting a large
+data set simply won't work here. Oftentimes, assignments for the same class can even change
+from year to year. Spending effort to collect a large labeled data set is a hard sell to teachers.
+Second, the same assignment can be written in different coding languages. The solutions
+could end up looking quite different. Lastly, code solutions can be very long, especially
+when interaction is involved. Unfortunately, current state-of-the-art code analysis solutions
+don't scale beyond 10 lines of code. In this work, we hope to offer a new solution
+inspired by how human teachers grade these assignments.
+Let's take a look at how a teacher plays to grade a student's homework. This is what
+a correct solution for code.org's coding assignment, Bounce, looks like. The teacher
+controls a paddle to bounce a ball into a goal post and gets one score.
+Here's what an incorrect student submission looks like. The student didn't put in the boundary
+condition for the wall, and the ball goes right through it.
+Here's another incorrect submission. Instead of getting a point after successfully bouncing
+the ball into the goal post, the player gets a point whenever the ball bounces on the wall
+or the paddle. This is clearly not the correct behavior.
+However, a teacher isn't just playing the game normally. In order to grade it, the teacher
+has to play it in a specific way to expose bugs in the game. Take a look at both programs
+on the left and right. Both have wall boundary problems, but we would never know if the teacher
+didn't try to bounce the ball on the wall. The right panel shows that a game, though broken,
+can look like a perfectly correct game.
+Using the Markov Decision Process framework from reinforcement learning, we can characterize
+the intuition we have built up. The MDP framework can be used to describe any interactive environment,
+not just games. It includes a state space, an action space, a transition dynamics that defines
+how the game moves from one frame to the next, and a reward function. We can train an agent
+using a reinforcement learning algorithm that learns to maximize the reward. So how does
+the MDP framework help us understand programs with bugs?
+We can treat each program as its own MDP. The teacher's correct program is the correct
+or desired MDP, while the student's program is another MDP, or a test MDP. We can frame
+grading as an instance of identifying if a test MDP has the same behavior as a desired
+MDP. Using components from the MDP framework, we can express bugs as a distance between the two
+MDPs' transition and reward functions.
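
An illustrative aside: a loose sketch of the bug-as-distance idea, with two tiny hand-written step functions standing in for the teacher's and the student's MDPs. The 0/1 transition-mismatch term and the toy dynamics are assumptions for illustration, not the paper's implementation.

```python
from typing import Callable, Tuple

State, Action = int, int
Step = Callable[[State, Action], Tuple[State, float]]   # (next state, reward)

def teacher_step(s: State, a: Action) -> Tuple[State, float]:
    # Toy "correct" MDP: move left/right over 5 cells, reward for reaching the right wall.
    s2 = max(0, min(4, s + (1 if a == 1 else -1)))
    return s2, float(s2 == 4)

def student_step(s: State, a: Action) -> Tuple[State, float]:
    # Toy "buggy" MDP: the boundary check on the right edge is missing.
    s2 = max(0, s + (1 if a == 1 else -1))
    return s2, float(s2 == 4)

def bug_distance(step_a: Step, step_b: Step, s: State, a: Action) -> float:
    """d(s, a): transition mismatch plus reward gap between the two MDPs."""
    sa, ra = step_a(s, a)
    sb, rb = step_b(s, a)
    return float(sa != sb) + abs(ra - rb)

# A single state-action pair with d(s, a) > 0 is enough to flag the program, and the
# same quantity can serve as the reward that steers a bug-hunting RL agent.
for s in range(5):
    print(f"state {s}: d(s, a=right) = {bug_distance(teacher_step, student_step, s, 1)}")
```
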
The ball going through the wall is clearly not
+a correct transition. Receiving a reward when you shouldn't can also be captured by the
+difference in the reward function output. More precisely, we can treat grading as calculating
+a distance between two MDPs. Equation 1 might suggest that we should check over all states.
+However, since the distance is non-negative and we're interested in the overall sum, we
+only need to find one state-action pair in the test MDP to know if the overall distance
+is non-zero. If we set this distance as a reward for an RL agent, we can make the task
+of reaching bug states a lot more intelligent and efficient. This RL agent's objective
+is to reach states that have the highest potential to be different between the two MDPs with
+respect to this distance function. We do have one more challenge that remains.
+The distance function d(s, a) requires access to both MDPs' transition and reward functions.
+We cannot assume we have access to the student program's inner mechanism. We can't control
+the randomness in the student's code either, meaning the two MDPs can have different random
+initial starting positions. Therefore, when we interact with the student's MDP, we need
+to learn a parametrized distance function that can tell us how far the observed state-action
+pairs from the student MDP are from the correct MDP.
+Now we have two parametrized models. The agent requires training to find the bug. The classifier
+requires training to identify the bug. We call this the cold start problem. So, if I
+have a classifier that can classify which state triggers a bug, then we can simply replace
+the reward function in the MDP with this classifier and directly teach our agent. If I have an
+agent that can always reach the bug state, I can probably just collect a dataset of trajectories
+and train a good classifier. But at the beginning, neither the agent nor the classifier can do
+a very good job. Therefore, we introduce a procedure called
+collaborative training. The agent will start out as a random agent, or we can train
+the agent to maximize the original reward in the MDP. It collects trajectories and trains
+the classifier. Then we use the classifier as a reward function to guide the agent on
+how to reach bug states. They both start out bad, but the agent can help the classifier
+learn, and the classifier can in return teach the agent.
+We present two baselines to train the bug classifier. Since we have some training data,
+though not a lot, we can simply apply coarse labeling, creating a dataset where all state-action
+pairs from the correctly labeled MDP are treated as non-bug states and all state-action pairs from the
+broken MDP as bug states. This is incredibly noisy, because not all state-action pairs from
+the broken MDP are bug states, only a few of them are. But this is a good baseline to
+have. We can also train an unsupervised learning
+model to memorize all state-action pairs from the correct MDP and use log probability or
+reconstruction loss to detect abnormal state-action pairs in the broken MDP.
+Inspired by Hoare triples and the MDP state equivalence literature, we designed two models to fully
+capture this notion of MDP-based state difference. We assume that the students can specify and
+set a random seed for their game. Therefore, the game objects, such as a ball, will not
+always appear in the same initial state. Therefore, it is crucial for us to approximate one MDP's
+transition dynamics and reward function. When our agent interacts with a new MDP, this is
+where the Hoare-LSTM comes in. We train it to model the correct MDP's transition dynamics and
+reward function and detect bug states in the new MDP when sufficient deviation occurs from
+the prediction. We further introduce the contrastive Hoare-LSTM.
+Sometimes the agent will explore a new region that it might not have visited in the correct
+MDP. The prediction difference between the observed state and the predicted state is in
+fact a function approximation error. In order to reduce this error, we approximate both
+the correct MDP and the broken MDP.
+Let's take a look at how these models work. We introduce a car environment. Here, the
+student miscalculated the boundary of this environment, so whenever the car goes outside
+of the red dotted line, it will get stuck and can only wriggle back and forth. This
+is a task where you will always reach a bug state at the end of each trajectory. Therefore,
+every single agent is already an optimal agent. We create a specific one that only knows how
+to drive north in a straight line.
+As we can see, almost all models, except the Gaussian mixture model, can get close to 100% accuracy
+at classifying bug states and non-bug states. However, the agent that only knows how to
+drive north is not a very interesting agent, and we probably will never use that in real
+life. So what if we make it a little bit harder?
+We can create an agent that drives the car randomly. Now the trajectory will become different
+each time. We see a significant drop in performance for baseline solutions like noisy supervised
+learning and the variational autoencoder. However, our LSTM-based models can still do very well,
+at close to 100% accuracy. This is a pretty challenging task, because we're measuring the
+accuracy of each classifier on every state in a trajectory, even though we're in a toy
+environment.
+Let's make this setting even harder. The car environment stays the same, but now
+bugs can only be triggered if the agent successfully drives the car into some small red rectangular
+areas. Not all agents are optimal now, and it would be unlikely for a single-direction
+agent to ever see a bug state. We can now showcase the power of collaborative training
+through this example.
+We can see that at the beginning, the agent is pretty random, and the classifier is pretty
+bad, except for the LSTM models. However, after only one round of collaborative training,
+we see a substantial improvement for the two baseline models: both the noisy supervised learning
+model and the variational autoencoder are able to improve their accuracy by 30% and precision
+by 60%. This shows that collaborative training is helping both the agent and the
+classifier to be more optimal, even for the weaker classifiers.
+We also notice that this improvement is not monotonic. Just like every other AI training
+scheme, overfitting sometimes happens. Only the most expressive classifiers, our proposed
+Hoare-LSTM and contrastive Hoare-LSTM, can remain stable and even mildly improve their recall
+in the last round of collaborative training.
+We can directly examine the agent's learning by looking at its trajectory. At first, the
+agent drives the car randomly, but after only one round of collaborative training, the agent
+becomes sharply focused and only visits the possible buggy areas.
+We verify our method on a real student dataset that we obtained from code.org. We used this
+assignment as our motivating example earlier. Bounce is a simple coding exercise where 450,000
+students have submitted their solutions.
We built a simulator that can run and execute +students' programs that conforms to the OpenAI GEM API. For each student program, we have +created goal labels for bug behaviors. We further binarize them into a single label +indicating correct or incorrect. +Bounce is a lot more complicated than car. Learning to bounce a ball into the goalpost +and understanding the physics is a lot more difficult for the agent. Therefore, we pre-train +the agent using the score as a reward. We call this play-to-win agent. Then we use this +agent to train our bug classifier. We're able to reach 94% accuracy with only 11 label +programs as training data. A similar algorithm that uses code as text input cannot match +our method's performance due to the smallness of the training dataset. +In addition to just grading, since we're able to determine bugs at the state level, +we can simply record a few frames before and after the bug occurs and compile a short video +for the students to demonstrate what the bug is in their assignment. +To summarize our work, we provide a fully functional simulator and a massive amount +of real student programs with goal labels. We demonstrate that our solution achieves +a high performance. However, there are still many problems remain. For example, can we +know which bug is triggered in the student program? This is helpful for providing fine-grained +feedback to the students. Training an RL agent with a classifier has also been explored in +other areas like SafeRL, where unsafe states are predicted by a classifier. +At last, we pose this question of creativity. Can our formulation accommodate creativity? +Creative programs are different but not broken. A ball can move faster or slower than the +teacher's solution, but it doesn't mean it's wrong. Exploring how we can recognize +and encourage student creativity is crucial for automated grading. Thanks for listening. +Come and chat with me during the poster session. diff --git a/demo_data/nips-2021/25969/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25969/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..3276ca3a500ae5bb92749466a2cdb62196908908 --- /dev/null +++ b/demo_data/nips-2021/25969/transcript_whisper_large-v2.vtt @@ -0,0 +1,482 @@ +WEBVTT + +00:00.000 --> 00:14.160 +Hello everyone, my name is Alan. I'm a PhD student from Stanford University. I'm presenting + +00:14.160 --> 00:19.880 +our work Play to Grade, testing coding games as classifying Markov decision process. This + +00:19.880 --> 00:23.720 +is joint work with Emma Bronskill and Chris Peach. + +00:23.720 --> 00:28.000 +In this talk, we will highlight the central problem that we're trying to solve, which + +00:28.000 --> 00:34.240 +is scaling up quality feedback for students learning to code is crucial. Grading interactive + +00:34.240 --> 00:40.040 +coding game is very difficult, and we frame this as an instance of identifying if a program + +00:40.040 --> 00:48.040 +has the same behavior as a desired MDP. Even with 11 label programs, we can achieve 94% + +00:48.040 --> 00:52.560 +accuracy on real student assignment from code.org. + +00:52.560 --> 00:58.560 +Each year, hundreds of thousands of people, children and adults alike, want to learn coding. + +00:58.560 --> 01:06.680 +Modern massive online education platforms like code.org serves over 40% of US K-12 students. 
+ +01:06.680 --> 01:11.580 +Scaling up quality feedback for these students is crucial, especially in areas where there + +01:11.580 --> 01:15.780 +are shortages of computer science teachers. + +01:15.780 --> 01:20.040 +Interactive coding assignments are becoming more popular. It's a lot more fun for students + +01:20.040 --> 01:25.880 +to program them. They're also a common type of programs for students to code. For example, + +01:25.880 --> 01:31.160 +web pages are interactive. However, in order to grade them, teachers often need to play + +01:31.160 --> 01:36.560 +each student homework for 20 seconds to a couple minutes. This quickly becomes a scaling + +01:36.560 --> 01:42.760 +issue. A 20-student classroom might still be manageable, but in a large university where + +01:42.760 --> 01:47.840 +there are hundreds of students taking the same class or on an online education platform + +01:47.840 --> 01:53.720 +like code.org, grading these assignments is a real challenge. This places a real burden + +01:53.720 --> 01:55.960 +on teachers. + +01:55.960 --> 02:01.680 +Why is it difficult to develop automatic grading tools? First of all, each assignment is different + +02:01.680 --> 02:06.280 +from each other. Traditional machine learning solutions that rely on collecting a large + +02:06.280 --> 02:13.040 +set of data set simply won't work here. Oftentimes, assignments for the same class can even change + +02:13.040 --> 02:19.840 +from year to year. Spending effort to collect a large label data set is a hard sell to teachers. + +02:19.840 --> 02:24.840 +Second, the same assignment can be written in different coding languages. The solutions + +02:24.840 --> 02:31.080 +could end up looking quite different. At last, code solutions can be very long, especially + +02:31.080 --> 02:36.840 +when interaction is involved. Unfortunately, current state-of-the-art code analysis solutions + +02:36.840 --> 02:42.240 +don't scale beyond 10 lines of code. In this work, we hope to offer a new solution + +02:42.240 --> 02:46.480 +inspired by human teachers' grade these assignments. + +02:46.480 --> 02:50.760 +Let's take a look at how a teacher plays to grade a student homework. This is what + +02:50.760 --> 02:55.760 +a correct solution for code.org's coding assignment, Bounce, looks like. The teacher + +02:55.760 --> 03:02.080 +controls a paddle to bounce a ball into a goal post and gets one score. + +03:02.080 --> 03:06.080 +Here's what an incorrect student submission looks like. The student didn't put the boundary + +03:06.080 --> 03:10.120 +condition for the wall and the ball goes right through it. + +03:10.120 --> 03:14.920 +Here's another incorrect submission. Instead of getting a point after successfully bouncing + +03:14.920 --> 03:19.840 +the ball into the goal post, the player gets a point whenever the ball bounces on wall + +03:19.840 --> 03:24.040 +and paddle. This is clearly not the correct behavior. + +03:24.040 --> 03:29.640 +However, a teacher isn't just playing the game normally. In order to grade it, the teacher + +03:29.640 --> 03:35.880 +has to play it in a specific way to expose bugs in the game. Take a look at both programs + +03:35.880 --> 03:41.320 +on the left and right. Both have wall boundary problems, but we would never know if the teacher + +03:41.320 --> 03:47.200 +didn't try to bounce the ball on the wall. The right panel shows a game, though broken, + +03:47.200 --> 03:50.420 +can look like a perfectly correct game. 
+ +03:50.420 --> 03:55.040 +Using the Markov Decision Process framework from reinforcement learning, we can characterize + +03:55.040 --> 04:00.720 +the intuition we have built up. The MDP framework can be used to describe any interactive environment, + +04:00.720 --> 04:06.280 +not just games. It includes a state space, action space, a transition dynamics that defines + +04:06.280 --> 04:12.160 +how the game moves from one frame to the next, and a reward function. We can train an agent + +04:12.160 --> 04:16.800 +using a reinforcement learning algorithm that learns to maximize the reward. So how does + +04:16.800 --> 04:21.480 +the MDP framework help us understand programs with bugs? + +04:21.480 --> 04:26.600 +We can treat each program as its own MDP. The teacher's correct program is the correct + +04:26.600 --> 04:33.480 +or desired MDP, while the student's program is another MDP or a test MDP. We can frame + +04:33.480 --> 04:39.240 +grading as an instance of identifying if a test MDP has the same behavior as a desired + +04:39.240 --> 04:46.000 +MDP. Using components from the MDP framework, we can express bugs as distance between two + +04:46.000 --> 04:50.960 +MDPs' transition and reward functions. The ball going through the wall is clearly not + +04:50.960 --> 04:55.600 +a correct transition. Receive reward when you shouldn't can also be captured by the + +04:55.600 --> 05:02.240 +difference in the reward function output. More precisely, we can treat grading as calculating + +05:02.240 --> 05:09.000 +a distance between two MDPs. Equation 1 might suggest that we should check over all states. + +05:09.000 --> 05:14.160 +However, since distance is non-negative and we're interested in the overall sum, we + +05:14.160 --> 05:19.800 +only need to find one state-action pair in the test MDP to know if the overall distance + +05:19.800 --> 05:25.720 +is non-zero. If we set this distance as a reward for an RL agent, we can make the task + +05:25.720 --> 05:32.120 +of reaching bug states a lot more intelligent and efficient. This RL agent's objective + +05:32.120 --> 05:37.680 +is to reach states that have the highest potential to be different between the two MDPs with + +05:37.680 --> 05:43.700 +respect to this distance function. We do have one more challenge that remains. + +05:43.700 --> 05:50.620 +The distance function DSA requires access to both MDPs' transition and reward functions. + +05:50.620 --> 05:55.960 +We cannot assume we have access to the student program's inner mechanism. We can't control + +05:55.960 --> 06:01.320 +the randomness in the student's code either, meaning two MDPs can have different random + +06:01.320 --> 06:07.000 +initial starting positions. Therefore, when we interact with the student's MDP, we need + +06:07.000 --> 06:12.640 +to learn a parametrized distance function that can tell us how far the observed state-action + +06:12.640 --> 06:17.600 +pairs from the student MDP is from the correct MDP. + +06:17.600 --> 06:23.120 +Now we have two parametrized models. The agent requires training to find the bug. The classifier + +06:23.120 --> 06:28.880 +requires training to identify the bug. We call this the code star problem. So, if I + +06:28.880 --> 06:34.360 +have a classifier that can classify which state triggers a bug, then we can simply replace + +06:34.360 --> 06:40.640 +reward function in the MDP with this classifier and directly teach our agent. 
If I have an + +06:40.640 --> 06:46.480 +agent that can always reach the bug state, I can probably just collect a dataset of trajectories + +06:46.480 --> 06:52.240 +and train a good classifier. But at the beginning, neither the agent nor the classifier can do + +06:52.240 --> 06:56.640 +a very good job. Therefore, we introduce a procedure called + +06:56.640 --> 07:01.440 +collaborative training. The agent will start out as a random agent, where we can train + +07:01.440 --> 07:07.680 +the agent to maximize the original reward in the MDP. It collects trajectories and trains + +07:07.680 --> 07:12.960 +the classifier. Then we use the classifier as a reward function to guide the agent on + +07:12.960 --> 07:18.360 +how to reach bug states. They both start out bad, but the agent can help the classifier + +07:18.360 --> 07:23.280 +learn and the classifier can in return teach the agent. + +07:23.280 --> 07:28.360 +We present two baselines to train the bug classifier. Since we have some training data, + +07:28.360 --> 07:33.560 +though not a lot, we can simply apply coarse labeling, creating a dataset where all state-action + +07:33.560 --> 07:40.240 +pairs from the correct labeled MDP as non-bug states and all state-action pairs from the + +07:40.240 --> 07:46.160 +broken MDP as bug states. This is incredibly noisy because not all state-action pairs from + +07:46.160 --> 07:51.600 +the broken MDP are bug states, only a few of them are. But this is a good baseline to + +07:51.600 --> 07:54.640 +have. We can also train an unsupervised learning + +07:54.640 --> 08:00.120 +model to memorize all state-action pairs from the correct MDP and use log probability or + +08:00.120 --> 08:06.200 +reconstruction loss to detect abnormal state-action pairs in the broken MDP. + +08:06.200 --> 08:12.280 +Inspired by Hohr-Triples and MDP state equivalence literature, we designed two models to fully + +08:12.280 --> 08:18.240 +capture this notion of MDP-based state difference. We assume that the students can specify and + +08:18.240 --> 08:23.680 +set random seed for their game. Therefore, the game objects, such as a ball, will not + +08:23.680 --> 08:30.000 +always appear in the same initial state. Therefore, it is crucial for us to approximate one MDP's + +08:30.000 --> 08:35.840 +transition dynamics and reward function. When our agent interacts with a new MDP, this is + +08:35.840 --> 08:41.560 +where Hohr-LSTM comes in. We train it to model the correct MDP's transition dynamics and + +08:41.560 --> 08:47.320 +reward function and treat bug states in the new MDP when sufficient deviation occurs from + +08:47.320 --> 08:52.440 +the prediction. We further introduce contrastive Hohr-LSTM. + +08:52.440 --> 08:57.880 +Sometimes the agent will explore a new region that it might not have visited in the correct + +08:57.880 --> 09:03.800 +MDP. The predictive difference between the observed state and predictive state is in + +09:03.800 --> 09:09.560 +fact a function approximation error. In order to reduce this error, we approximate both + +09:09.560 --> 09:13.760 +the correct MDP and the broken MDP. + +09:13.760 --> 09:18.600 +Let's take a look at how these models work. We introduce a car environment. In here, the + +09:18.600 --> 09:23.480 +student miscalculated the boundary of this environment, so whenever the car goes outside + +09:23.480 --> 09:28.240 +of the red dotted line, it will get stuck and can only wriggle back and forth. 
This + +09:28.240 --> 09:35.160 +is a task where you will always reach a bug state at the end of each trajectory. Therefore, + +09:35.160 --> 09:41.280 +every single agent is already an optimal agent. We create a specific one that only knows how + +09:41.280 --> 09:44.360 +to drive north in a straight line. + +09:44.360 --> 09:50.840 +As we can see, almost all models, except Gaussian mixture model, can be close to 100% accuracy + +09:50.840 --> 09:56.680 +at classifying bug states and non-bug states. However, the agent that only knows how to + +09:56.680 --> 10:01.440 +drive north is not a very interesting agent, and we probably will never use that in real + +10:01.440 --> 10:05.560 +life. So what if we make it a little bit harder? + +10:05.560 --> 10:10.840 +We can create an agent that drives the car randomly. Now the trajectory will become different + +10:10.840 --> 10:16.660 +each time. We see a significant drop in performance for baseline solutions like noisy supervised + +10:16.660 --> 10:22.880 +learning and variational autoencoder. However, our LSTM-based models can still do very well + +10:22.880 --> 10:28.400 +at close to 100% accuracy. This is a pretty challenging task because we're measuring the + +10:28.400 --> 10:33.880 +accuracy of each classifier on every state in a trajectory, even though we're in a toy + +10:33.880 --> 10:35.760 +environment. + +10:35.760 --> 10:40.360 +Let's make this setting even harder. The car environment can stay the same, but for now, + +10:40.360 --> 10:45.800 +bugs can only be triggered if the agent successfully drives the car into some small red rectangular + +10:45.800 --> 10:51.400 +areas. Not all agents are optimal now, and it would be unlikely for a single-direction + +10:51.400 --> 10:56.400 +agent to ever see a bug state. We can now showcase the power of collaborative training + +10:56.400 --> 10:58.880 +through this example. + +10:58.880 --> 11:03.200 +We can see at the beginning, the agent is pretty random, and the classifier is pretty + +11:03.200 --> 11:10.240 +bad except for the LSTM models. However, after only one round of collaborative training, + +11:10.240 --> 11:15.360 +we see a substantial improvement for the two baseline models, both noisy supervised learning + +11:15.360 --> 11:21.720 +model and variational autoencoder are able to improve their accuracy by 30% and precision + +11:21.720 --> 11:26.800 +by 60%. This shows that the collaborative training is helping both the agent and the + +11:26.800 --> 11:32.000 +classifier to be more optimal, even for the weaker classifiers. + +11:32.000 --> 11:37.680 +We also notice that this improvement is not monotonic. Just like every other AI training + +11:37.680 --> 11:43.560 +scheme, overfitting sometimes happens. Only the most expressive classifiers, our proposed + +11:43.560 --> 11:49.840 +Horl LSTM and contrastive Horl LSTM can remain stable and even mildly improve their recall + +11:49.840 --> 11:53.700 +in the last round of collaborative training. + +11:53.700 --> 12:00.040 +We can directly examine the agent's learning by looking at its trajectory. At first, the + +12:00.040 --> 12:05.320 +agent drives the car randomly, but after only one round of collaborative training, the agent + +12:05.320 --> 12:11.000 +becomes sharply focused and only visits the possible buggy areas. + +12:11.000 --> 12:16.560 +We verify our method on a real student dataset that we obtained from code.org. We use this + +12:16.560 --> 12:23.920 +assignment as our motivating examples earlier. 
Bounce is a simple coding exercise where 450,000 + +12:23.920 --> 12:28.520 +students have submitted their solutions. We built a simulator that can run and execute + +12:28.520 --> 12:34.280 +students' programs that conforms to the OpenAI GEM API. For each student program, we have + +12:34.280 --> 12:40.040 +created goal labels for bug behaviors. We further binarize them into a single label + +12:40.040 --> 12:43.360 +indicating correct or incorrect. + +12:43.360 --> 12:48.360 +Bounce is a lot more complicated than car. Learning to bounce a ball into the goalpost + +12:48.360 --> 12:54.080 +and understanding the physics is a lot more difficult for the agent. Therefore, we pre-train + +12:54.080 --> 12:59.680 +the agent using the score as a reward. We call this play-to-win agent. Then we use this + +12:59.680 --> 13:06.400 +agent to train our bug classifier. We're able to reach 94% accuracy with only 11 label + +13:06.400 --> 13:13.440 +programs as training data. A similar algorithm that uses code as text input cannot match + +13:13.440 --> 13:18.960 +our method's performance due to the smallness of the training dataset. + +13:18.960 --> 13:24.240 +In addition to just grading, since we're able to determine bugs at the state level, + +13:24.240 --> 13:30.040 +we can simply record a few frames before and after the bug occurs and compile a short video + +13:30.040 --> 13:35.040 +for the students to demonstrate what the bug is in their assignment. + +13:35.040 --> 13:39.640 +To summarize our work, we provide a fully functional simulator and a massive amount + +13:39.640 --> 13:44.080 +of real student programs with goal labels. We demonstrate that our solution achieves + +13:44.080 --> 13:48.920 +a high performance. However, there are still many problems remain. For example, can we + +13:48.920 --> 13:53.840 +know which bug is triggered in the student program? This is helpful for providing fine-grained + +13:53.840 --> 13:59.200 +feedback to the students. Training an RL agent with a classifier has also been explored in + +13:59.200 --> 14:04.520 +other areas like SafeRL, where unsafe states are predicted by a classifier. + +14:04.520 --> 14:10.760 +At last, we pose this question of creativity. Can our formulation accommodate creativity? + +14:10.760 --> 14:15.520 +Creative programs are different but not broken. A ball can move faster or slower than the + +14:15.520 --> 14:20.040 +teacher's solution, but it doesn't mean it's wrong. Exploring how we can recognize + +14:20.040 --> 14:25.200 +and encourage student creativity is crucial for automated grading. Thanks for listening. + +14:25.200 --> 14:34.840 +Come and chat with me during the poster session. 
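
An illustrative aside following the talk above: a heavily simplified, runnable sketch of the collaborative-training loop between the bug-hunting agent and the bug classifier. The stub environment, classifier, and agent update below are invented for illustration and are not the paper's code.

```python
import random

random.seed(0)

def rollout(policy, steps=20):
    """Stub environment: states are the integers 0..9 and state 7 hides the bug."""
    s, traj = 0, []
    for _ in range(steps):
        s = max(0, min(9, s + policy(s)))
        traj.append((s, s == 7))            # (state, bug flag observed in this rollout)
    return traj

def train_classifier(trajectories):
    """Stub classifier: remember which states were flagged in the collected data."""
    flagged = {s for traj in trajectories for s, is_bug in traj if is_bug}
    return lambda s: 1.0 if s in flagged else 0.0

def improve_agent(classifier):
    """Stub agent update: walk greedily toward states the classifier rewards."""
    rewarded = [s for s in range(10) if classifier(s) > 0]
    target = rewarded[0] if rewarded else 9
    return lambda s: 1 if s < target else (-1 if s > target else 0)

policy = lambda s: random.choice([-1, 1])         # round 0: a random agent
for round_id in range(3):
    data = [rollout(policy) for _ in range(20)]   # the agent collects trajectories
    hits = sum(any(is_bug for _, is_bug in traj) for traj in data)
    print(f"round {round_id}: {hits}/20 trajectories reached the bug state")
    classifier = train_classifier(data)           # the trajectories train the classifier
    policy = improve_agent(classifier)            # the classifier's score guides the agent
```
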
+ diff --git a/demo_data/nips-2021/25969/video.mp4 b/demo_data/nips-2021/25969/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a545347226015937aa4a5a3d780c3e19387f849b --- /dev/null +++ b/demo_data/nips-2021/25969/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242e9cb2408cc72eb911a822a9c40a56b17801b8a9ed2cec81d8dc1cdf20c7b6 +size 56882548 diff --git a/demo_data/nips-2021/25970/metadata.json b/demo_data/nips-2021/25970/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..0eae451699b307bd39ea7bff25d57dd90eefbf7d --- /dev/null +++ b/demo_data/nips-2021/25970/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Improving Coherence and Consistency in Neural Sequence Models with Dual-System, Neuro-Symbolic Reasoning" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25970/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25970/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc9568c09ef6d1db2bdeac32acd12fa446acd5ef --- /dev/null +++ b/demo_data/nips-2021/25970/transcript_whisper_large-v2.txt @@ -0,0 +1,93 @@ +Hi, my name is Maxwell Nye, and today I'll be talking about improving coherence and consistency +in neural sequence models with dual system neurosymbolic reasoning. +So I first want to give a little bit of a demo, which is to ask this question. +A bat and a ball cost $1.10 in total. +The bat costs $1 more than the ball. +How much does the ball cost? +So I'll let you think a little bit for this. +So one answer that sort of might jump out at you is $0.10, but this is actually incorrect +because the sum of the two objects should be $1.10. +So the correct answer is actually $0.05. +And this is an example from a cognitive reflection test, and these are questions designed to +have a particular answer which comes to mind quite quickly, which is in fact wrong. +And something that's interesting is that large-scale language models such as GPT-3 predict the +wrong answers as well. +And this is true not just for the sort of the classic cognitive reflection test, but +also for variants with different numbers. +So this is sort of an interesting thing. +It talks about how neural language models often have issues with consistency and coherence. +So another place that we can see this a little more concretely is the clutter data set. +In the clutter data set, models are trained to... +There are sentences about people and their family relationships and stories about those +people. +And this was originally devised as a question-answering data set where you ask what the relations +are. +One thing you can do is ask models to be trained on this data set and then generate new stories. +And when you do that, you'll see that often the generated stories have inconsistency. +So if we look at the bottom of the screen here, we can see an example of this. +Robert and his brother Antonio played harmonicas together. +Robert's daughter, Elsie, asked him to play with her. +Elsie doesn't like having to babysit her younger brother, Antonio. +And so we can see that this is a common sense error because Elsie is not the younger brother +of Antonio. +Or Elsie's younger brother is not Antonio. +So what we've done is we've built a dual system model using large-scale neural networks and +symbolic deliberative logic in order to try to help with these consistency issues. +So the model is as follows. +You use neural generation to generate sentences in a particular story. 
+You might generate the next sentence using a model such as GPT-3 or BART.
+What you can then do is parse that sentence into its semantic meaning with respect to
+the family relationships and check whether or not it matches the current state of the
+family relationships that's been described so far, and only accept the candidate sentence
+generations that are actually consistent.
+So this has a few components.
+One of the components here is a symbolic world model.
+In the case of this clutter domain, the symbolic world model that we built encodes people and
+their family relationships.
+So in other words, you could take a sentence and encode what the underlying family relationship
+is.
+And what you can do is use SMT solvers such as the Z3 solver to check consistency.
+So given a new sentence, you can check that it doesn't disobey the rules of ancestry that
+we've defined here.
+And so some of those are, for example, what is the relationship between children and grandchildren?
+And another is what are the rules about ancestry, such as whether you can be your own ancestor,
+et cetera.
+So one question is how is this semantic parsing done?
+And it turns out we can actually do this quite cheaply using GPT-3.
+So what we can see here in the dotted box is an actual example of a few-shot prompt
+we can use to parse each new candidate sentence from the System 1
+generation model into the semantic form that we can then give to the world model
+solver.
+So the results here show that stories that use this dual-system neurosymbolic approach
+show improved coherence over sentences that were constructed by a neural model alone.
+So the example here is that what we've done is we've used human judgments on which of
+the following sentences make more sense given the prior context of the story.
+And we see that if we use a symbolic world model and the parsing scheme described above,
+humans prefer the judgments given by this model.
+We can also apply the same sort of reasoning to a completely different task.
+Here we can discuss the grounded instruction following task, in the grounded instruction following
+domain called gSCAN.
+In this domain, the goal is to have an agent, which is shown by this pink triangle, follow
+a command to perform some simple action in this grid world.
+So you can see here, walk to a small yellow cylinder might be an example of a command.
+Prior work has shown that one thing you can do is encode the initial state, encode the
+instruction and then train a neural model to predict the action sequences.
+Other work has also shown that one thing you can do is train a model to predict a distribution
+over the correct target location as part of the neural model.
+That will also increase the performance of the model.
+What we do here is show that if you do both of these things, you predict both an action
+sequence and a target location, like what is the location you should end up in, and
+then check whether or not, when you execute the set of instructions, you will end up in
+the predicted target location.
+You can sort of check consistency between these two different predictions and only accept
+those instruction sequences which match the target location prediction.
+And this also leads to higher accuracy, especially in a low-data regime.
+We have more details about the results in the paper.
+So that's a little bit of an overview of our paper.
+Our takeaways are that you can build systems that combine neural methods and explicit
+world knowledge.
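
An illustrative aside: the talk checks candidate sentences against a symbolic world model with an SMT solver (Z3); the sketch below is a much simpler stand-in that does the check directly in Python, rejecting a candidate relation that would make someone their own ancestor. The relation dictionaries and the particular candidate parse are assumptions for illustration.

```python
def ancestors(parent_of, person):
    """Transitive closure of the parent-of relation above `person`."""
    seen, frontier = set(), set(parent_of.get(person, set()))
    while frontier:
        p = frontier.pop()
        if p not in seen:
            seen.add(p)
            frontier |= parent_of.get(p, set())
    return seen

def consistent(parent_of):
    """World-model rule used here: nobody may be their own ancestor."""
    return all(person not in ancestors(parent_of, person) for person in parent_of)

# Relations already parsed from the story so far (child -> set of parents).
world = {"Elsie": {"Robert"}}

# A hypothetical parse of a candidate next sentence that would close an ancestry cycle.
candidate = {"Antonio": {"Elsie"}, "Robert": {"Antonio"}}

merged = {k: world.get(k, set()) | candidate.get(k, set()) for k in set(world) | set(candidate)}
print("accept candidate sentence?", consistent(merged))   # False: Elsie would be her own ancestor
```
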
+And if you add just a little bit of world knowledge, you can really help increase coherence +and consistency for these large sequence models. +There are some challenges here about parsing in larger scale domains and also what it would +mean to automatically build a more complete world model. +Thank you very much. diff --git a/demo_data/nips-2021/25970/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25970/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..2354cc55f10533d1e8323fc9707b9da875b9e3bf --- /dev/null +++ b/demo_data/nips-2021/25970/transcript_whisper_large-v2.vtt @@ -0,0 +1,281 @@ +WEBVTT + +00:00.000 --> 00:14.520 +Hi, my name is Maxwell Nye, and today I'll be talking about improving coherence and consistency + +00:14.520 --> 00:19.620 +in neural sequence models with dual system neurosymbolic reasoning. + +00:19.620 --> 00:23.800 +So I first want to give a little bit of a demo, which is to ask this question. + +00:23.800 --> 00:26.920 +A bat and a ball cost $1.10 in total. + +00:26.920 --> 00:29.300 +The bat costs $1 more than the ball. + +00:29.300 --> 00:31.720 +How much does the ball cost? + +00:31.720 --> 00:34.920 +So I'll let you think a little bit for this. + +00:34.920 --> 00:39.200 +So one answer that sort of might jump out at you is $0.10, but this is actually incorrect + +00:39.200 --> 00:43.920 +because the sum of the two objects should be $1.10. + +00:43.920 --> 00:46.880 +So the correct answer is actually $0.05. + +00:46.880 --> 00:54.240 +And this is an example from a cognitive reflection test, and these are questions designed to + +00:54.240 --> 01:00.140 +have a particular answer which comes to mind quite quickly, which is in fact wrong. + +01:00.140 --> 01:06.640 +And something that's interesting is that large-scale language models such as GPT-3 predict the + +01:06.640 --> 01:08.320 +wrong answers as well. + +01:08.320 --> 01:11.300 +And this is true not just for the sort of the classic cognitive reflection test, but + +01:11.300 --> 01:15.160 +also for variants with different numbers. + +01:15.160 --> 01:19.680 +So this is sort of an interesting thing. + +01:19.680 --> 01:27.400 +It talks about how neural language models often have issues with consistency and coherence. + +01:27.400 --> 01:30.720 +So another place that we can see this a little more concretely is the clutter data set. + +01:30.720 --> 01:36.680 +In the clutter data set, models are trained to... + +01:36.680 --> 01:42.080 +There are sentences about people and their family relationships and stories about those + +01:42.080 --> 01:43.840 +people. + +01:43.840 --> 01:48.800 +And this was originally devised as a question-answering data set where you ask what the relations + +01:48.800 --> 01:49.800 +are. + +01:49.800 --> 01:58.080 +One thing you can do is ask models to be trained on this data set and then generate new stories. + +01:58.080 --> 02:02.880 +And when you do that, you'll see that often the generated stories have inconsistency. + +02:02.880 --> 02:06.560 +So if we look at the bottom of the screen here, we can see an example of this. + +02:06.560 --> 02:10.080 +Robert and his brother Antonio played harmonicas together. + +02:10.080 --> 02:13.440 +Robert's daughter, Elsie, asked him to play with her. + +02:13.440 --> 02:17.280 +Elsie doesn't like having to babysit her younger brother, Antonio. 
+ +02:17.280 --> 02:21.240 +And so we can see that this is a common sense error because Elsie is not the younger brother + +02:21.240 --> 02:22.240 +of Antonio. + +02:22.240 --> 02:27.720 +Or Elsie's younger brother is not Antonio. + +02:27.720 --> 02:35.760 +So what we've done is we've built a dual system model using large-scale neural networks and + +02:35.760 --> 02:42.800 +symbolic deliberative logic in order to try to help with these consistency issues. + +02:42.800 --> 02:44.400 +So the model is as follows. + +02:44.400 --> 02:52.680 +You use neural generation to generate sentences in a particular story. + +02:52.680 --> 02:59.360 +You might generate the next sentence using a model such as GPT-3 or BART. + +02:59.360 --> 03:10.320 +What you can then do is parse that sentence into the semantic meaning with respect to + +03:10.320 --> 03:15.520 +the family relationships and check whether or not it matches the current state of the + +03:15.520 --> 03:20.960 +family relationships that's been described so far, and only accept the candidate sentence + +03:20.960 --> 03:25.800 +generations that are actually consistent. + +03:25.800 --> 03:27.600 +So this has a few components. + +03:27.600 --> 03:30.380 +One of the components here is a symbolic world model. + +03:30.380 --> 03:35.160 +In the case of this clutter domain, the symbolic world model that we built encodes people and + +03:35.160 --> 03:36.160 +their family relationships. + +03:36.160 --> 03:42.840 +So in other words, you could take a sentence and encode what the underlying family relationship + +03:42.840 --> 03:43.840 +is. + +03:43.840 --> 03:50.680 +And what you can do is you can use SMT solvers such as the Z3 solver to check consistency. + +03:50.680 --> 03:57.240 +So given a new sentence, you can check that it doesn't disobey the rules of ancestry that + +03:57.240 --> 03:58.240 +we've defined here. + +03:58.240 --> 04:04.120 +And so some of those are, for example, what is the relationship between children and grandchildren? + +04:04.120 --> 04:10.000 +And then another is what are the rules about whether ancestry, can you be your own ancestor, + +04:10.000 --> 04:12.180 +et cetera. + +04:12.180 --> 04:15.040 +So one question is how is this semantic parsing done? + +04:15.040 --> 04:19.560 +And it turns out we can actually do this quite cheaply using GPT-3. + +04:19.560 --> 04:26.920 +So what we can see here in the dotted box is an actual example of a few-shot prompt + +04:26.920 --> 04:34.440 +we can use to parse each new sentence, each new candidate sentence from the system one + +04:34.440 --> 04:42.360 +generation model and parse it into the semantic form that we can then give to the world model + +04:42.360 --> 04:46.280 +solver. + +04:46.280 --> 04:52.120 +So the results here show that models that use this dual system neurosymbolic stories + +04:52.120 --> 05:02.160 +show improved coherence over just sentences that were constructed by a neural model. + +05:02.160 --> 05:10.160 +So the example here is that what we've done is we've used human judgments on which of + +05:10.160 --> 05:14.800 +the following sentences make more sense given the prior context of the story. + +05:14.800 --> 05:25.280 +And we see that if we use a symbolic world model and the parsing scheme described above, + +05:25.280 --> 05:32.520 +humans prefer the judgments given by this model. + +05:32.520 --> 05:36.360 +We can also apply the same sort of reasoning to a completely different task. 
+ +05:36.360 --> 05:42.080 +Here we can discuss the grounded instruction following task, the grounded instruction following + +05:42.080 --> 05:44.020 +domain called gscan. + +05:44.020 --> 05:49.360 +In this domain, the goal is to have an agent, which is shown by this pink triangle, follow + +05:49.360 --> 05:53.240 +a command to perform some simple action in this grid world. + +05:53.240 --> 06:00.520 +So you can see here, walk to a small yellow cylinder might be an example of a command. + +06:00.520 --> 06:06.800 +Prior work has shown that one thing you can do is encode the initial state, encode the + +06:06.800 --> 06:14.280 +instruction and then train a neural model to predict the action sequences. + +06:14.280 --> 06:19.600 +Other work has also shown that one thing you can do is train a model to predict a distribution + +06:19.600 --> 06:25.200 +over the correct target location as part of the neural model. + +06:25.200 --> 06:29.600 +That will also increase the performance of the model. + +06:29.600 --> 06:38.400 +What we do here is show that if you do both of these things, you predict both an action + +06:38.400 --> 06:43.800 +sequence and a target location, like what is the location you should end up in, and + +06:43.800 --> 06:48.600 +then check whether or not when you execute the set of instructions, you will end up in + +06:48.600 --> 06:50.720 +the predicted target location. + +06:50.720 --> 06:57.800 +You can sort of check consistency between these two different predictions and only accept + +06:57.800 --> 07:06.560 +those instruction sequences which match the target location prediction. + +07:06.560 --> 07:14.700 +And this leads to also higher accuracy, especially in a low data regime. + +07:14.700 --> 07:18.320 +We have more details about the results of the paper. + +07:18.320 --> 07:21.160 +So that's a little bit of an overview of our paper. + +07:21.160 --> 07:24.520 +Our takeaways are that you can build systems with combined neural methods and explicit + +07:24.520 --> 07:25.560 +world knowledge. + +07:25.560 --> 07:28.880 +And if you add just a little bit of world knowledge, you can really help increase coherence + +07:28.880 --> 07:34.880 +and consistency for these large sequence models. + +07:34.880 --> 07:38.520 +There are some challenges here about parsing in larger scale domains and also what it would + +07:38.520 --> 07:41.360 +mean to automatically build a more complete world model. + +07:41.360 --> 08:01.360 +Thank you very much. 
+ diff --git a/demo_data/nips-2021/25970/video.mp4 b/demo_data/nips-2021/25970/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..87726480df75320b75a2d599bc4a1440a80930da --- /dev/null +++ b/demo_data/nips-2021/25970/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ddd8de0e21478e0b4497e5ce84758719566d05f4e8560191cb008e6d9d817d +size 49642027 diff --git a/demo_data/nips-2021/25973/metadata.json b/demo_data/nips-2021/25973/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7d06b3d67d39a010a0379614bbff78b7f2ec40f0 --- /dev/null +++ b/demo_data/nips-2021/25973/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Learning to Iteratively Solve Routing Problems with Dual-Aspect Collaborative Transformer" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25973/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25973/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..db532fc2a428aceec419aa8a41f9a5d2988eee41 --- /dev/null +++ b/demo_data/nips-2021/25973/transcript_whisper_large-v2.txt @@ -0,0 +1,40 @@ +Hi everyone, I'm Jingwen, a PhD student in National University of Singapore. +In this paper, we introduce dual-aspect collaborative transformer for solving routine problems. +Until now, the neural solvers for VRPs could be classified in two types. +The first one is the neural construction solver. +It starts from an empty solution and iteratively selects a customer node to the solution, +until all customers have been visited. +And in this paper, we focus more on the neural improvement solvers. +It starts from an incomplete solution and iteratively improves the solution +based on the node features and solution features, until reaching a step limit T. +Although the transformer has shown the efficiency for processing the sequence data, +its positional encoding method may not be optimal for encoding the VRP solutions, +because it only learns a unified set of embeddings and combines the node embeddings +and the positional embeddings together. +Also, it can only encode the linear sequences, +which cannot capture the circularity and symmetry of VRP solutions. +So in this paper, we introduce the dual-aspect augmentation, +which could better describe the VRP solutions. +We separate the learnings to node feature embeddings and positional feature embeddings +based on the cross-aspect referential attention. +And in this table, we compare the performance of dual-aspect and single-aspect. +We can see the dual-aspect outperforms the single-aspect. +And here we introduce the cyclic positional encoding. +In this figure, we describe the embedding vectors and correlations between every two embeddings +of the original PE and our CPE method in subfeature A and B. +In subfeature C, we describe the top two principal components after PCA projection. +And we can see our PCA method can better capture the circularity of VRP solutions. +And here we did some ablation studies on the CPE method, +which can achieve better generalization performance. +And now we introduce our curriculum learning strategy in the training process. +And in this method, we're training with an unstepped PPO method and a curriculum learning strategy. +It gradually prescribes higher quality solutions as the initial stage for training. +And in this graph, we describe two curves. +The blue one is the PPO method only, and the green one is the PPO method only. 
+And the green one is the PPO method with our curriculum learning strategy. +And we can see the green one is more stable and achieves lower objective values. +And here is the comparison performance of our method and some baselines on both TST and CVRP. +We can see our DACT outperforms the existing transformer-based improvement models. +So, based on these experiments, we can see our DACT performs very well for the routing problems. +And in the future, we hope to use this method to solve more combinatorial optimization problems. +Thank you. diff --git a/demo_data/nips-2021/25973/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25973/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..7eb64066de104768a4808068903788de874e49c9 --- /dev/null +++ b/demo_data/nips-2021/25973/transcript_whisper_large-v2.vtt @@ -0,0 +1,122 @@ +WEBVTT + +00:00.000 --> 00:13.440 +Hi everyone, I'm Jingwen, a PhD student in National University of Singapore. + +00:13.440 --> 00:21.040 +In this paper, we introduce dual-aspect collaborative transformer for solving routine problems. + +00:21.040 --> 00:25.280 +Until now, the neural solvers for VRPs could be classified in two types. + +00:25.280 --> 00:27.680 +The first one is the neural construction solver. + +00:27.680 --> 00:34.480 +It starts from an empty solution and iteratively selects a customer node to the solution, + +00:34.480 --> 00:36.880 +until all customers have been visited. + +00:36.880 --> 00:41.200 +And in this paper, we focus more on the neural improvement solvers. + +00:41.200 --> 00:45.680 +It starts from an incomplete solution and iteratively improves the solution + +00:45.680 --> 00:50.480 +based on the node features and solution features, until reaching a step limit T. + +00:52.160 --> 00:56.800 +Although the transformer has shown the efficiency for processing the sequence data, + +00:56.800 --> 01:01.600 +its positional encoding method may not be optimal for encoding the VRP solutions, + +01:01.600 --> 01:06.640 +because it only learns a unified set of embeddings and combines the node embeddings + +01:06.640 --> 01:08.320 +and the positional embeddings together. + +01:09.440 --> 01:12.720 +Also, it can only encode the linear sequences, + +01:12.720 --> 01:17.200 +which cannot capture the circularity and symmetry of VRP solutions. + +01:17.200 --> 01:21.040 +So in this paper, we introduce the dual-aspect augmentation, + +01:21.040 --> 01:24.320 +which could better describe the VRP solutions. + +01:24.320 --> 01:29.280 +We separate the learnings to node feature embeddings and positional feature embeddings + +01:29.280 --> 01:31.920 +based on the cross-aspect referential attention. + +01:32.480 --> 01:38.000 +And in this table, we compare the performance of dual-aspect and single-aspect. + +01:38.000 --> 01:41.360 +We can see the dual-aspect outperforms the single-aspect. + +01:41.360 --> 01:44.720 +And here we introduce the cyclic positional encoding. + +01:44.720 --> 01:50.400 +In this figure, we describe the embedding vectors and correlations between every two embeddings + +01:50.400 --> 01:55.360 +of the original PE and our CPE method in subfeature A and B. + +01:55.360 --> 02:01.760 +In subfeature C, we describe the top two principal components after PCA projection. + +02:01.760 --> 02:07.600 +And we can see our PCA method can better capture the circularity of VRP solutions. 
+ +02:09.120 --> 02:13.840 +And here we did some ablation studies on the CPE method, + +02:13.840 --> 02:16.800 +which can achieve better generalization performance. + +02:16.800 --> 02:22.320 +And now we introduce our curriculum learning strategy in the training process. + +02:23.280 --> 02:29.440 +And in this method, we're training with an unstepped PPO method and a curriculum learning strategy. + +02:30.080 --> 02:36.080 +It gradually prescribes higher quality solutions as the initial stage for training. + +02:36.080 --> 02:39.040 +And in this graph, we describe two curves. + +02:39.680 --> 02:45.440 +The blue one is the PPO method only, and the green one is the PPO method only. + +02:45.440 --> 02:50.080 +And the green one is the PPO method with our curriculum learning strategy. + +02:50.080 --> 02:54.880 +And we can see the green one is more stable and achieves lower objective values. + +02:56.800 --> 03:04.320 +And here is the comparison performance of our method and some baselines on both TST and CVRP. + +03:04.320 --> 03:10.480 +We can see our DACT outperforms the existing transformer-based improvement models. + +03:10.480 --> 03:17.040 +So, based on these experiments, we can see our DACT performs very well for the routing problems. + +03:17.040 --> 03:23.520 +And in the future, we hope to use this method to solve more combinatorial optimization problems. + +03:23.520 --> 03:41.520 +Thank you. + diff --git a/demo_data/nips-2021/25973/video.mp4 b/demo_data/nips-2021/25973/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..39f8e5454209ce6ceec2137709d388f14be61841 --- /dev/null +++ b/demo_data/nips-2021/25973/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397b569cbe49771e2e420a6fb6e8050f4d988cc6bcc2f05f44eaa109a8788472 +size 21886445 diff --git a/demo_data/nips-2021/25974/metadata.json b/demo_data/nips-2021/25974/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6ecb7b1a7a20b1f55e20f4af59ee12afb065d04c --- /dev/null +++ b/demo_data/nips-2021/25974/metadata.json @@ -0,0 +1,3 @@ +{ + "title": "Gradient Starvation: A Learning Proclivity in Neural Networks" +} \ No newline at end of file diff --git a/demo_data/nips-2021/25974/transcript_whisper_large-v2.txt b/demo_data/nips-2021/25974/transcript_whisper_large-v2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c896084ef14dd0bab0cc8126f1544593b046cd65 --- /dev/null +++ b/demo_data/nips-2021/25974/transcript_whisper_large-v2.txt @@ -0,0 +1,130 @@ +Hi, I am Mohamed Pezeshki from Mila and today I am going to talk about creating starvation. +This is a joint work with Omar Kaba, Joshua Bengio, Aaron Korvel, Doina Prikop, and Guillaume +Lajra. +Let me start with a story. +Back in 1904, there was a horse called Hans and people believed that he could do arithmetic. +Here is an article from New York Times published in 1904. +The article says that Hans is an expert in numbers. +For example, when two numbers of 5 and 9 are written on a blackboard, Hans replies by tapping +on the ground 14 times. +Seven years later, in an article, Oscar Feinst unveiled that the so-called clever Hans was +not actually capable of doing any arithmetic and instead reading subtle hints in his trainer's +behavior indicating when to stop tapping. +As the article says, even the trainer was not aware of providing these shortcut signals. +So Hans was clever but probably not in doing arithmetic. +Its cleverness was in reading his trainer's clues. 
+A similar phenomenon has been observed in many applications of machine learning. +Essentially, the situations where the model seemingly has a very good performance but +in fact it hasn't learned true underlying relationships between the input and the target. +In this paper by Robert Gares and co-authors, they list several instances of what they call +shortcut learning. +For example, in a task of image captioning, the model predicts grazing sheep only by seeing +the green hillside. +In another instance, the network hallucinates a teapot with high confidence in an image +of pure noise. +This is another and indeed dangerous example of the task of pneumonia detection from x-ray +images. +The model appears to have a very good performance even on the test set. +However, the heat maps reveal that the network is not looking at the long section at all +and just latching on some features in the corner of the image. +The intuition behind this phenomenon is a folk knowledge in one form or another. +Given a strongly correlated and fast to learn features in training data, grading the sense +is biased towards learning them first. +However, this intuition is a bit abstract and hand-wavy, so let's look at a more concrete +example. +Consider a 2D classification task with red and blue data points as shown. +If you train in raw network and this data, here is the decision boundary that we learn. +Now consider slightly different arrangements of the data points such that the blue data +points are slightly shifted to the left and the red data points are shifted to the right, +making the data linearly separable. +Now if we train in neural network on this, we get an almost linear decision boundary. +Note that the network is only making its predictions based on the feature along the x-axis. +Indicated in the red circle here, you can see that the decision boundary is very close +to the data points. +However, the network is super confident on its predictions and the training loss is indeed +zero. +So you can see that the slightly perturbing data point can get the network to predict +an incorrect label with high confidence. +This problem will be even more visible when testing the model on OOD, meaning out of distribution +test data. +An online interactive demo of this work is available on a blog post we wrote. +If you wish to play with it a bit, please visit the link provided here. +So we hypothesize that what is happening here is gradient starvation. +Gradient starvation is a phenomenon in which a neural network captures statistically dominant +features while remaining invariant to the rest. +Here gradient descent leads to parameter updates, predominantly in directions that only capture +these dominant features, thus starving the gradient from other potentially informative +features. +Here, the notions of feature and dominancy of a feature is rather vague. +To define them more formally, we need to look into the learning dynamics. +In the interest of time, I will be covering only the general intuition of our results +and encourage interested audiences to take a look at the full paper for detailed treatment. +So the two main theorems of the paper can be summarized into these two plots that I +now explain. +Let's first start with gradient starvation itself on the left. +We train a model with common binary cross entropy loss. +On the x-axis we have training iterations or epochs, and on the y-axis we monitor two +features z1 and z2. 
+Their dynamics depend on several factors, including their strength, meaning how easy +or how hard it is for the network to learn those features, and their correlation with +the target. +Here, z1 has a larger correlation and hence converges to a value around 6, and z2 with +a smaller correlation converges to a value around 2. +However, the strength is equal, i.e. kappa is set to be 1. +Again, it means that both of these features are as easy for the network to learn. +Now let's keep their correlation fixed but increase the strength of z1. +A kappa equal to 2 means that z1 is learned easier than z2. +We can immediately see that although their correlation is still the same as before, z1 +is overestimated while z2 is underestimated. +If we make kappa to be 4 or 8, it becomes more evident that simply because z1 is easier +to learn, it is being overestimated, while z2 is being starved. +Our theory shows that an increase in the strength of feature z1 has a detrimental effect on +the learning of feature z2. +Now our second theory shows that adding this term, indicated in the red rectangle, to the +loss decouples the features. +As you can see, a spectral decoupling decouples the features at the converged solution. +Regardless of the value of kappa, all of the experiments on z1 and z2 converge to the same +place. +Again, we refer interested audience to the paper for more theory as well as more intuition. +Now let's look at some experiments. +Recall the task that we studied earlier. +When the data is not linearly separable, we learn the curve decision boundary. +On the right, we see how z1 and z2 evolve. +When the data is linearly separable with a small margin, a linear decision boundary is +learned. +We observe that z1 is overestimated, while z2 is heavily underestimated. +Now let's see what happens if we add spectral decoupling. +Spectral decoupling suppresses z1 and as a result allows z2 to grow. +It also appears that other regularization methods do not succeed at learning a curve +decision boundary. +So we observed that spectral decoupling leads to a decision boundary with a larger margin. +What happens in real-world tasks? +The distance to the decision boundary is not trivial to compute when working with nonlinear +models. +However, we can use a proxy. +The amount of perturbation required to fool the network is a proxy to the margin. +Look at the plot on the right. +On the x-axis, we have the amount of perturbation and on the y-axis, we have how many of the +examples are misclassified. +You can see that with a fixed amount of perturbation, a model with vanilla binary cross entropy +is much more vulnerable compared to a model trained with spectral decoupling. +In another experiment, we studied colored MNIST, a well-known task of OOD generalization +where the color is spuriously correlated with the labels. +Also another task of OOD generalization is a classification task on the CILIB8 dataset +where the training data is again biased with respect to the color of the hair and the gender +such that most of male images have black hair while the majority of females have blonde +hair. +Here, we skip the details in the interest of time. +However, let me just draw your attention to the superiority of spectral decoupling in +these both tasks. +Finally to conclude, we talked about the clever hands effect. +We showed that a similar phenomenon can happen in neural networks and we called that gradient +starvation. +To understand gradient starvation, we looked into the learning dynamics. 
+We showed that the presence of a strongly correlated feature could result in a starvation +of other features. +We also showed that spectral decoupling provides some degree of control over what features +to learn and decouples essentially the features. +Thanks for your attention. +If you're interested to chat more, please visit our poster this afternoon. +Thank you very much. diff --git a/demo_data/nips-2021/25974/transcript_whisper_large-v2.vtt b/demo_data/nips-2021/25974/transcript_whisper_large-v2.vtt new file mode 100644 index 0000000000000000000000000000000000000000..ccf7ae80f593d8408cd20f7c5355ab8a3f7ec382 --- /dev/null +++ b/demo_data/nips-2021/25974/transcript_whisper_large-v2.vtt @@ -0,0 +1,392 @@ +WEBVTT + +00:00.000 --> 00:15.280 +Hi, I am Mohamed Pezeshki from Mila and today I am going to talk about creating starvation. + +00:15.280 --> 00:22.280 +This is a joint work with Omar Kaba, Joshua Bengio, Aaron Korvel, Doina Prikop, and Guillaume + +00:22.280 --> 00:23.280 +Lajra. + +00:23.280 --> 00:25.480 +Let me start with a story. + +00:25.480 --> 00:32.000 +Back in 1904, there was a horse called Hans and people believed that he could do arithmetic. + +00:32.000 --> 00:36.440 +Here is an article from New York Times published in 1904. + +00:36.440 --> 00:39.720 +The article says that Hans is an expert in numbers. + +00:39.720 --> 00:45.960 +For example, when two numbers of 5 and 9 are written on a blackboard, Hans replies by tapping + +00:45.960 --> 00:49.000 +on the ground 14 times. + +00:49.000 --> 00:54.960 +Seven years later, in an article, Oscar Feinst unveiled that the so-called clever Hans was + +00:54.960 --> 01:01.320 +not actually capable of doing any arithmetic and instead reading subtle hints in his trainer's + +01:01.320 --> 01:05.720 +behavior indicating when to stop tapping. + +01:05.720 --> 01:12.900 +As the article says, even the trainer was not aware of providing these shortcut signals. + +01:12.900 --> 01:16.800 +So Hans was clever but probably not in doing arithmetic. + +01:16.800 --> 01:21.120 +Its cleverness was in reading his trainer's clues. + +01:21.120 --> 01:26.480 +A similar phenomenon has been observed in many applications of machine learning. + +01:26.480 --> 01:32.200 +Essentially, the situations where the model seemingly has a very good performance but + +01:32.200 --> 01:38.960 +in fact it hasn't learned true underlying relationships between the input and the target. + +01:38.960 --> 01:47.040 +In this paper by Robert Gares and co-authors, they list several instances of what they call + +01:47.040 --> 01:48.520 +shortcut learning. + +01:48.520 --> 01:55.080 +For example, in a task of image captioning, the model predicts grazing sheep only by seeing + +01:55.080 --> 01:57.840 +the green hillside. + +01:57.840 --> 02:03.480 +In another instance, the network hallucinates a teapot with high confidence in an image + +02:03.480 --> 02:06.400 +of pure noise. + +02:06.400 --> 02:11.960 +This is another and indeed dangerous example of the task of pneumonia detection from x-ray + +02:11.960 --> 02:13.160 +images. + +02:13.160 --> 02:17.280 +The model appears to have a very good performance even on the test set. + +02:17.280 --> 02:23.600 +However, the heat maps reveal that the network is not looking at the long section at all + +02:23.600 --> 02:28.440 +and just latching on some features in the corner of the image. + +02:28.440 --> 02:33.880 +The intuition behind this phenomenon is a folk knowledge in one form or another. 
+ +02:33.880 --> 02:39.520 +Given a strongly correlated and fast to learn features in training data, grading the sense + +02:39.520 --> 02:42.560 +is biased towards learning them first. + +02:42.560 --> 02:48.960 +However, this intuition is a bit abstract and hand-wavy, so let's look at a more concrete + +02:48.960 --> 02:51.240 +example. + +02:51.240 --> 02:57.240 +Consider a 2D classification task with red and blue data points as shown. + +02:57.240 --> 03:03.240 +If you train in raw network and this data, here is the decision boundary that we learn. + +03:03.240 --> 03:08.680 +Now consider slightly different arrangements of the data points such that the blue data + +03:08.680 --> 03:14.400 +points are slightly shifted to the left and the red data points are shifted to the right, + +03:14.400 --> 03:17.640 +making the data linearly separable. + +03:17.640 --> 03:23.480 +Now if we train in neural network on this, we get an almost linear decision boundary. + +03:23.480 --> 03:30.600 +Note that the network is only making its predictions based on the feature along the x-axis. + +03:30.600 --> 03:35.200 +Indicated in the red circle here, you can see that the decision boundary is very close + +03:35.200 --> 03:36.520 +to the data points. + +03:36.520 --> 03:42.400 +However, the network is super confident on its predictions and the training loss is indeed + +03:42.400 --> 03:43.880 +zero. + +03:43.880 --> 03:49.560 +So you can see that the slightly perturbing data point can get the network to predict + +03:49.560 --> 03:52.600 +an incorrect label with high confidence. + +03:52.600 --> 03:59.720 +This problem will be even more visible when testing the model on OOD, meaning out of distribution + +03:59.720 --> 04:03.040 +test data. + +04:03.040 --> 04:07.440 +An online interactive demo of this work is available on a blog post we wrote. + +04:07.440 --> 04:12.440 +If you wish to play with it a bit, please visit the link provided here. + +04:12.440 --> 04:18.240 +So we hypothesize that what is happening here is gradient starvation. + +04:18.240 --> 04:24.880 +Gradient starvation is a phenomenon in which a neural network captures statistically dominant + +04:24.880 --> 04:31.160 +features while remaining invariant to the rest. + +04:31.160 --> 04:37.000 +Here gradient descent leads to parameter updates, predominantly in directions that only capture + +04:37.000 --> 04:43.320 +these dominant features, thus starving the gradient from other potentially informative + +04:43.320 --> 04:44.320 +features. + +04:44.320 --> 04:50.280 +Here, the notions of feature and dominancy of a feature is rather vague. + +04:50.280 --> 04:55.520 +To define them more formally, we need to look into the learning dynamics. + +04:55.520 --> 05:00.720 +In the interest of time, I will be covering only the general intuition of our results + +05:00.720 --> 05:07.360 +and encourage interested audiences to take a look at the full paper for detailed treatment. + +05:07.360 --> 05:13.160 +So the two main theorems of the paper can be summarized into these two plots that I + +05:13.160 --> 05:14.160 +now explain. + +05:14.160 --> 05:19.720 +Let's first start with gradient starvation itself on the left. + +05:19.720 --> 05:23.800 +We train a model with common binary cross entropy loss. + +05:23.800 --> 05:29.000 +On the x-axis we have training iterations or epochs, and on the y-axis we monitor two + +05:29.000 --> 05:32.120 +features z1 and z2. 
+ +05:32.120 --> 05:37.480 +Their dynamics depend on several factors, including their strength, meaning how easy + +05:37.480 --> 05:43.280 +or how hard it is for the network to learn those features, and their correlation with + +05:43.280 --> 05:44.280 +the target. + +05:44.280 --> 05:51.600 +Here, z1 has a larger correlation and hence converges to a value around 6, and z2 with + +05:51.600 --> 05:55.800 +a smaller correlation converges to a value around 2. + +05:55.800 --> 06:01.440 +However, the strength is equal, i.e. kappa is set to be 1. + +06:01.440 --> 06:09.800 +Again, it means that both of these features are as easy for the network to learn. + +06:09.800 --> 06:20.280 +Now let's keep their correlation fixed but increase the strength of z1. + +06:20.280 --> 06:25.400 +A kappa equal to 2 means that z1 is learned easier than z2. + +06:25.400 --> 06:31.640 +We can immediately see that although their correlation is still the same as before, z1 + +06:31.640 --> 06:36.560 +is overestimated while z2 is underestimated. + +06:36.560 --> 06:44.000 +If we make kappa to be 4 or 8, it becomes more evident that simply because z1 is easier + +06:44.000 --> 06:51.400 +to learn, it is being overestimated, while z2 is being starved. + +06:51.400 --> 06:58.520 +Our theory shows that an increase in the strength of feature z1 has a detrimental effect on + +06:58.520 --> 07:01.760 +the learning of feature z2. + +07:01.760 --> 07:08.840 +Now our second theory shows that adding this term, indicated in the red rectangle, to the + +07:08.840 --> 07:11.800 +loss decouples the features. + +07:11.800 --> 07:17.640 +As you can see, a spectral decoupling decouples the features at the converged solution. + +07:17.640 --> 07:25.680 +Regardless of the value of kappa, all of the experiments on z1 and z2 converge to the same + +07:25.680 --> 07:26.680 +place. + +07:26.680 --> 07:33.640 +Again, we refer interested audience to the paper for more theory as well as more intuition. + +07:33.640 --> 07:36.720 +Now let's look at some experiments. + +07:36.720 --> 07:39.080 +Recall the task that we studied earlier. + +07:39.080 --> 07:44.880 +When the data is not linearly separable, we learn the curve decision boundary. + +07:44.880 --> 07:49.840 +On the right, we see how z1 and z2 evolve. + +07:49.840 --> 07:55.080 +When the data is linearly separable with a small margin, a linear decision boundary is + +07:55.080 --> 07:56.080 +learned. + +07:56.080 --> 08:02.920 +We observe that z1 is overestimated, while z2 is heavily underestimated. + +08:02.920 --> 08:07.880 +Now let's see what happens if we add spectral decoupling. + +08:07.880 --> 08:14.320 +Spectral decoupling suppresses z1 and as a result allows z2 to grow. + +08:14.320 --> 08:20.480 +It also appears that other regularization methods do not succeed at learning a curve + +08:20.480 --> 08:23.240 +decision boundary. + +08:23.240 --> 08:30.860 +So we observed that spectral decoupling leads to a decision boundary with a larger margin. + +08:30.860 --> 08:33.520 +What happens in real-world tasks? + +08:33.520 --> 08:38.640 +The distance to the decision boundary is not trivial to compute when working with nonlinear + +08:38.640 --> 08:39.640 +models. + +08:39.640 --> 08:42.040 +However, we can use a proxy. + +08:42.040 --> 08:48.200 +The amount of perturbation required to fool the network is a proxy to the margin. + +08:48.200 --> 08:50.400 +Look at the plot on the right. 
+ +08:50.400 --> 08:56.040 +On the x-axis, we have the amount of perturbation and on the y-axis, we have how many of the + +08:56.040 --> 08:59.760 +examples are misclassified. + +08:59.760 --> 09:07.420 +You can see that with a fixed amount of perturbation, a model with vanilla binary cross entropy + +09:07.420 --> 09:14.320 +is much more vulnerable compared to a model trained with spectral decoupling. + +09:14.320 --> 09:20.320 +In another experiment, we studied colored MNIST, a well-known task of OOD generalization + +09:20.320 --> 09:27.960 +where the color is spuriously correlated with the labels. + +09:27.960 --> 09:33.740 +Also another task of OOD generalization is a classification task on the CILIB8 dataset + +09:33.740 --> 09:44.100 +where the training data is again biased with respect to the color of the hair and the gender + +09:44.100 --> 09:50.080 +such that most of male images have black hair while the majority of females have blonde + +09:50.080 --> 09:51.080 +hair. + +09:51.080 --> 09:54.760 +Here, we skip the details in the interest of time. + +09:54.760 --> 10:00.920 +However, let me just draw your attention to the superiority of spectral decoupling in + +10:00.920 --> 10:03.840 +these both tasks. + +10:03.840 --> 10:09.080 +Finally to conclude, we talked about the clever hands effect. + +10:09.080 --> 10:15.360 +We showed that a similar phenomenon can happen in neural networks and we called that gradient + +10:15.360 --> 10:16.360 +starvation. + +10:16.360 --> 10:21.600 +To understand gradient starvation, we looked into the learning dynamics. + +10:21.600 --> 10:29.080 +We showed that the presence of a strongly correlated feature could result in a starvation + +10:29.080 --> 10:30.960 +of other features. + +10:30.960 --> 10:36.560 +We also showed that spectral decoupling provides some degree of control over what features + +10:36.560 --> 10:44.040 +to learn and decouples essentially the features. + +10:44.040 --> 10:45.720 +Thanks for your attention. + +10:45.720 --> 10:50.880 +If you're interested to chat more, please visit our poster this afternoon. + +10:50.880 --> 11:01.760 +Thank you very much. 
+ diff --git a/demo_data/nips-2021/25974/video.mp4 b/demo_data/nips-2021/25974/video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ed9120f48bcd5b3594545694aea4fba85d5a7333 --- /dev/null +++ b/demo_data/nips-2021/25974/video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b35962663480a872582f840effc9ebb43cf01117c7246aa1fa2ae7fffc0f410 +size 69848602 diff --git a/demo_data/ted_talks.json b/demo_data/ted_talks.json new file mode 100644 index 0000000000000000000000000000000000000000..ae371a715d5252206bb2c85314b2dd6cfc8f7898 --- /dev/null +++ b/demo_data/ted_talks.json @@ -0,0 +1 @@ +{"text":{"0":["I feel incredibly lucky to be from a country that's generally considered to be the best place in the world to be a woman.","In 1975, when I was seven years old, women in Iceland went on a strike.","They did no work that day, whether they held professional jobs or had the work of the home.","They marched into the center of Reykjav\u00edk -- 90 percent of women participated -- and peacefully and in solidarity asked for equality.","Nothing worked in Iceland that day, because nothing works when women are not at work.","(Applause)","Five years later, Icelanders had the courage to be the first country in the world to democratically elect a woman as their president.","I will never forget this day, that President Vigd\u00eds, as we know her by her first name, stepped out on the balcony of her own home, a single mom with her daughter by her side as she had won.","(Applause)","This woman was an incredible role model for me and everyone growing up at that time, including boys.","She frequently shares the story of how a young boy approached her after a couple of terms in office and asked, \"Can boys really grow up to be president?\"","(Laughter)","Role models really matter, but even with such strong role models who I am so grateful for, when I was encouraged to run for president, my first reaction was, \"Who am I to run for president?","Who am I to be president?\"","It turns out that women are less likely to consider running than men.","So a study done in the US in 2011 showed that 62 percent of men had considered running for office, but 45 percent of women.","That's gap of 16 percentage points, and it's the same gap that existed a decade earlier.","And it really is a shame, because I am so convinced that the world is in real need for women leaders and more principle-based leadership in general.","So my decision to run ultimately came down to the fact that I felt that I had to do my bit, even if I had no political experience, to step up and try to be part of creating the world that will make sense and be sustainable for our kids, and a world where we truly allow both our boys and girls to be all they can be.","And it was the journey of my life.","It was amazing.","The journey started with potentially as many as 20 candidates.","It boiled down to nine candidates qualifying, and ultimately the race came down to four of us, three men and me.","(Applause)","But that's not all the drama yet.","You may think you have drama in the US, but I can --","(Laughter)","I can assure you we had our own drama in Iceland.","So our sitting president of 20 years announced initially that he was not going to run, which is probably what gave rise to so many candidates considering running.","Then later he changed his mind when our prime minister resigned following the infamous Panama Papers that implicated him and his family.","And there was a popular protest in Iceland, so the 
sitting president thought they needed a trusted leader.","A few days later, relations to his wife and her family's companies were also discovered in the Panama Papers, and so he withdrew from the race again.","Before doing so, he said he was doing that because now there were two qualified men who he felt could fill his shoes running for office.","So on May 9, 45 days before election day, it was not looking too good for me.","I did not even make the graph in the newspaper.","The polls had me at 1 percent, but that was still the highest that any woman announcing her candidacy had earned.","So it would be an understatement to say that I had to work extremely hard to get my seat at the table and access to television, because the network decided that they would only include those with 2.5 percent or more in the polls in the first TV debate.","I found out on the afternoon of the first TV debate that I would participate along with the three men, and I found out on live TV that I came in at exactly 2.5 percent on the day of the first TV debate.","(Applause)","So, challenges.","The foremost challenges I had to face and overcome on this journey had to do with media, muscle and money.","Let's start with media.","There are those who say gender doesn't matter when it comes to media and politics.","I can't say that I agree.","It proved harder for me to both get access and airtime in media.","As a matter of fact, the leading candidate appeared in broadcast media 87 times in the months leading up to the elections, whereas I appeared 31 times.","And I am not saying media is doing this consciously.","I think largely this has to do with unconscious bias, because in media, much like everywhere else, we have both conscious and unconscious bias, and we need to have the courage to talk about it if we want to change it.","When I finally got access to TV, the first question I got was, \"Are you going to quit?\"","And that was a hard one.","But of course, with 1 percent to 2.5 percent in the polls, maybe it's understandable.","But media really matters, and every time I appeared on TV, we saw and experienced a rise in the polls, so I know firsthand how much this matters and why we have to talk about it.","I was the only one out of the final four candidates that never got a front page interview.","I was sometimes left out of the questions asked of all other candidates and out of coverage about the elections.","So I did face this, but I will say this to compliment the Icelandic media.","I got few if any comments about my hair and pantsuit.","(Applause)","So kudos to them.","But there is another experience that's very important.","I ran as an independent candidate, not with any political party or muscle behind me.","That lack of experience and lack of access to resources probably came at a cost to our campaign, but it also allowed us to innovate and do politics differently.","We ran a positive campaign, and we probably changed the tone of the election for others by doing that.","It may be the reason why I had less airtime on TV, because I wanted to show other contenders respect.","When access to media proved to be so difficult, we ran our own media.","I ran live Facebook sessions where I took questions from voters on anything and responded on the spot.","And we put all the questions I got and all the answers on an open Facebook because we thought transparency is important if you want to establish trust.","And when reaching young voters proved to be challenging, I became a Snapchatter.","I got young people to teach me 
how to do that, and I used every filter on Snapchat during the last part of the campaign.","And I actually had to use a lot of humor and humility, as I was very bad at it.","But we grew the following amongst young people by doing that.","So it's possible to run a different type of campaign.","But unfortunately, one cannot talk about politics without mentioning money.","I am sad that it is that way, but it's true, and we had less financial resources than the other candidates.","This probably was partly due to the fact that I think I had a harder time asking for financial support.","And maybe I also had the ambition to do more with less.","Some would call that very womanly of me.","But even with one third the media, one third the financial resources, and only an entrepreneurial team, but an amazing team, we managed to surprise everyone on election night, when the first numbers came in.","I surprised myself, as you may see in that photo.","(Laughter)","So the first numbers, I came in neck to neck to the leading candidate.","(Cheers)","Well, too early, because I didn't quite pull that, but I came in second, and we went a long way from the one percent, with nearly a third of the vote, and we beat the polls by an unprecedented margin, or 10 percentage points above what the last poll came in at.","Some people call me the real winner of the election because of this, and there are many people who encouraged me to run again.","But what really makes me proud is to know that I earned proportionately higher percentage support from the young people, and a lot of people encouraged my daughter to run in 2040.","(Applause)","She is 13, and she had never been on TV before.","And on election day, I observed her on TV repeatedly, and she was smart, she was self-confident, she was sincere, and she was supportive of her mother.","This was probably the highlight of my campaign.","(Applause)","But there was another one.","These are preschool girls out on a walk, and they found a poster of me on a bus stop, and they saw the need to kiss it.","Audience: Aw!","This picture was really enough of a win for me.","What we see, we can be.","So screw fear and challenges.","(Applause)","It matters that women run, and it's time for women to run for office, be it the office of the CEO or the office of the president.","I also managed to put an impression on your very own \"New Yorker.\"","I earned a new title, \"A living emoji of sincerity.\"","(Cheers)","It is possibly my proudest title yet, and the reason is that women too often get penalized for using what I call their emotional capital, but I know from experience that we become so good when we do just that.","(Applause)","And we need more of that.","We celebrated as if we had won on election night, because that's how we felt.","So you don't necessarily have to reach that office.","You just have to go for it, and you, your family, your friends, everyone working with you, if you do it well, you will grow beyond anything you will experience before.","So we had a good time, and I learned a lot on this journey, probably more lessons than I can share here in the time we have today.","But rest assured, it was hard work.","I lost a lot of sleep during those months.","It took resilience and perseverance to not quit, but I learned something that I knew before on the one percent day, and that is that you can only be good when you are truly, authentically listening to your own voice and working in alignment with that.","As a good sister of mine sometimes says, you may cheat on your 
intuition, but your intuition never cheats on you.","I think it's also very important, and you all know this, that on any journey you go on, it's the team you take along.","It's having people around you who share your values, your vision, but are different in every other way.","That's the formula for success for me, and I am blessed with an amazing husband, here today, an incredible family --","(Applause)","and great friends, and we came together as entrepreneurs in the political arena, and pulled something off that everyone said would be impossible.","As a matter of fact, the leading PR expert told me before I made my decision that I would do well to get seven percent.","I appreciated his perspective, because he was probably right, and he was basing it on valuable experience.","But on the one percent day, I decided here to show him that he was wrong.","It's very important to mention this, because I did lose a lot of sleep, and I worked hard, and so did the people with me.","We can never go the distance if we forget to take care of ourselves.","And it's two things that I think are very important in that, in surrounding yourself with people and practices that nourish you, but it's equally important, maybe even more important, to have the courage to get rid of people and practices that take away your energy, including the wonderful bloggers and commentators.","I took a lot of support from others in doing this, and I made the decision to go high when others went low, and that's partly how I kept my energy going throughout all of this.","And when I lost my energy for a moment -- and I did from time to time, it wasn't easy -- I went back to why I decided to run, and how I had decided to run my own race.","I called it a 4G campaign, the G's representing the Icelandic words.","And the first one is called \"Gagn.\"","I ran to do good, to be of service, and I wanted servant leadership to be at the center of how I worked and everybody else in the campaign.","Second one is \"Gle\u00f0i,\" or joy.","I decided to enjoy the journey.","There was a lot to be taken out of the journey, no matter if the destination was reached or not.","And I tried my utmost to inspire others to do so as well.","Third is \"Gagns\u00e6i.\"","I was open to any questions.","I kept no secrets, and it was all open, on Facebook and websites.","Because I think if you're choosing your president, you deserve answers to your questions.","Last but not least, I don't need to explain that in this room, we ran on the principle of Girlpower.","(Cheers)","I am incredibly glad that I had the courage to run, to risk failure but receive success on so many levels.","I can't tell you that it was easy, but I can tell you, and I think my entire team will agree with me, that it was worth it.","Thank you.","(Applause)","Thank you.","Thank you.","(Applause)","Pat Mitchell: I'm not letting you go yet.","Halla T\u00f3masd\u00f3ttir: What a great crowd.","PM: I can't let you go without saying that probably everybody in the room is ready to move to Iceland and vote for you.","But of course we probably can't vote there, but one thing we can get from Iceland and have always gotten is inspiration.","I mean, I'm old enough to remember 1975 when all the Icelandic women walked out, and that really was a very big factor in launching the women's movement.","You made a reference to it earlier.","I'd love to bring the picture back up and just have us remember what it was like when a country came to a standstill.","And then what you may not know because our 
American media did not report it, the Icelandic women walked out again on Monday.","Right?","HT: Yes, they did.","PM: Can you tell us about that?","HT: Yes, so 41 years after the original strike, we may be the best place in the world to be a woman, but our work isn't done.","So at 2:38pm on Monday, women in Iceland left work, because that's when they had earned their day's salary.","(Applause)","What's really cool about this is that young women and men participated in greater numbers than before, because it is time that we close the pay gap.","PM: So I'm not going to ask Halla to commit right now to what she's doing next, but I will say that you'd have a very large volunteer army should you decide to do that again.","Thank you Halla.","HT: Thank you all.","(Applause)"],"1":["It's not about technology, it's about people and stories.","I could show you what recently was on television as a high quality video: 60 Minutes, many of you may have seen it.","And it was the now director of the entire piece of the veteran's administration -- who, himself, had lost an arm 39 years ago in Vietnam -- who was adamantly opposed to these crazy devices that don't work.","And it turns out that with 60 Minutes cameras rolling in the background, after he pretty much made his position clear on this -- he had his hook and he had his -- he wore this arm for less than two hours and was able to pour himself a drink and got quite emotional over the fact that, quote -- his quote -- it's the first time he's felt like he's had an arm in 39 years.","But that would sort of be jumping to the middle of the story, and I'm not going to show you that polished video.","I'm going to, instead, in a minute or two, show you an early, crude video because I think it's a better way to tell a story.","A few years ago I was visited by the guy that runs DARPA, the people that fund all the advanced technologies that businesses and universities probably wouldn't take the risk of doing.","They have a particular interest in ones that will help our soldiers.","I get this sort of unrequested -- by me anyway -- visit, and sitting in my conference room is a very senior surgeon from the military and the guy that runs DARPA.","They proceed to tell me a story which comes down to basically the following.","We have used such advanced technologies now and made them available in the most remote places that we put soldiers: hills of Afghanistan, Iraq ...","They were quite proud of the fact that you know, before the dust clears, if some soldier has been hurt they will have collected him or her, they will have brought him back, they will be getting world-class triage emergency care faster than you and I would be getting it if we were hurt in a car accident in a major city in the United States.","That's the good news.","The bad news is if they've collected this person and he or she is missing an arm or leg, part of the face, it's probably not coming back.","So, they started giving me the statistics on how many of these kids had lost an arm.","And then the surgeon pointed out, with a lot of anger, he said, \"Why is it?","At the end of the Civil War, they were shooting each other with muskets.","If somebody lost an arm, we gave them a wooden stick with a hook on it.","Now we've got F18s and F22s, and if somebody loses an arm, we give them a plastic stick with a hook on it.\"","And they basically said, \"This is unacceptable,\" and then the punchline: \"So, Dean, we're here because you make medical stuff.","You're going to give us an arm.\"","And I was 
waiting for the 500 pages of bureaucracy, paperwork and DODs.","No, the guy says, \"We're going to bring a guy into this conference room, and wearing the arm you're going to give us, he or she is going to pick up a raisin or a grape off this table.","If it's the grape, they won't break it.\"","Great he needs efferent, afferent, haptic response sensors.","\"If it's the raisin, they won't drop it.\"","So he wants fine motor control: flex at the wrist, flex at the elbow, abduct and flex at the shoulder.","Either way they were going to eat it.","\"Oh, by the way Dean.","It's going to fit on a 50th percentile female frame -- namely 32 inches from the long finger -- and weigh less than nine pounds.\"","50th percentile female frame.","\"And it's going to be completely self contained including all its power.\"","So, they finished that.","And I, as you can tell, am a bashful guy.","I told them they're nuts.","(Laughter) They've been watching too much \"Terminator.\"","(Laughter) Then, the surgeon says to me, \"Dean, you need to know more than two dozen of these kids have come back bilateral.\"","Now, I cannot imagine -- I'm sorry, you may have a better imagination than I do -- I can't imagine losing my arm, and typically at 22 years old.","But compared to that, losing two?","Seems like that would be an inconvenience.","Anyway, I went home that night.","I thought about it.","I literally could not sleep thinking about, \"I wonder how you'd roll over with no shoulders.\"","So, I decided we've got to do this.","And trust me, I've got a day job, I've got a lot of day jobs.","Most of my day job keeps me busy funding my fantasies like FIRST and water and power .... And I've got a lot of day jobs.","But I figured I gotta do this.","Did a little investigation, went down to Washington, told them I still think they're nuts but we're going to do it.","And I told them I'd build them an arm.","I told them it would probably take five years to get through the FDA, and probably 10 years to be reasonably functional.","Look what it takes to make things like iPods.","\"Great,\" he said, \"You got two years.\"","(Laughter) I said, \"I'll tell you what.","I'll build you an arm that's under nine pounds that has all that capability in one year.","It will take the other nine to make it functional and useful.\"","We sort of agreed to disagree.","I went back and I started putting a team together, the best guys I could find with a passion to do this.","At the end of exactly one year we had a device with 14 degrees of freedom, all the sensors, all the microprocessors, all the stuff inside.","I could show you it with a cosmesis on it that's so real it's eerie, but then you wouldn't see all this cool stuff.","I then thought it would be years before we'd be able to make it really, really useful.","It turned out, as I think you could see in Aimee's capabilities and attitudes, people with a desire to do something are quite remarkable and nature is quite adaptable.","Anyway, with less than 10 hours of use, two guys -- one that's bilateral.","He's literally, he's got no shoulder on one side, and he's high trans-humeral on the other.","And that's Chuck and Randy together, after 10 hours -- were playing in our office.","And we took some pretty cruddy home movies.","At the end of the one I'm going to show, it's only about a minute and a couple of seconds long, Chuck does something that to this day I'm jealous of, I can't do it.","He picks up a spoon, picks it up, scoops out some Shredded Wheat and milk, holds the spoon level as he 
translates it, moving all these joints simultaneously, to his mouth, and he doesn't drop any milk.","(Laughter) I cannot do that.","(Laughter) His wife was standing behind me.","She's standing behind me at the time and she says, \"Dean, Chuck hasn't fed himself in 19 years.","So, you've got a choice: We keep the arm, or you keep Chuck.\"","(Laughter) (Applause)","So, can we see that?","This is Chuck showing simultaneous control of all the joints.","He's punching our controls guy.","The guy behind him is our engineer\/surgeon, which is a convenient guy to have around.","There's Randy, these guys are passing a rubber little puck between them.","And just as in the spirit of FIRST, gracious professionalism, they are quite proud of this, so they decide to share a drink.","This is a non-trivial thing to do, by the way.","Imagine doing that with a wooden stick and a hook on the end of it, doing either of those.","Now Chuck is doing something quite extraordinary, at least for my limited physical skill.","And now he's going to do what DARPA asked me for.","He's going to pick up a grape -- he didn't drop it, he didn't break it -- and he's going to eat it.","So, that's where we were at the end of about 15 months.","(Applause)","But, as I've learned from Richard, the technology, the processors, the sensors, the motors, is not the story.","I hadn't dealt with this kind of problem or frankly, this whole segment of the medical world.","I'll give you some astounding things that have happened as we started this.","After we were pretty much convinced we had a good design, and we'd have to make all the standard engineering trade-offs you always make -- you can always get three out of four of anything you want; the weight, the size, the cost, the functionality -- I put a bunch of guys in my plane and I said, \"We're flying down to Walter Reed, and we're going talk to these kids, because frankly it doesn't matter whether we like this arm.","It doesn't matter whether the Department of Defense likes this arm.\"","When I told them that they weren't entirely enthusiastic, but I told them, \"It really doesn't matter what their opinion is.","There is only one opinion that matters, the kids that are either going to use it or not.\"","I told a bunch of my engineers, \"Look we're going to walk into Walter Reed, and you're going to see people, lots of them, missing major body parts.","They're probably going to be angry, depressed, frustrated.","We're probably going to have to give them support, encouragement.","But we've got to extract from them enough information to make sure we're doing the right thing.\"","We walked into Walter Reed and I could not have been more wrong.","We did see a bunch of people, a lot of them missing a lot of body parts, and parts they had left were burned; half a face gone, an ear burned off.","They were sitting at a table.","They were brought together for us.","And we started asking them all questions.","\"Look,\" I'd say to them, \"We're not quite as good as nature yet.","I could give you fine motor control, or I could let you curl 40 pounds; I probably can't do both.","I can give you fast control with low reduction ratios in these gears, or I can give you power; I can't give you both.","And we were trying to get them to all help us know what to give them.","Not only were they enthusiastic, they kept thinking they're there to help us.","\"Well, would it help if I ...\" \"Guys, and woman, you've given enough.","We're here to help you.","We need data.","We need to know what you need.\"","After 
a half an hour, maybe, there was one guy at the far end of the table who wasn't saying much.","You could see he was missing an arm.","He was leaning on his other arm.","I called down to the end, \"Hey, you haven't said much.","If we needed this or this, what would you want?\"","And he said, \"You know, I'm the lucky guy at this table.","I lost my right arm, but I'm a lefty.\"","(Laughter) So, he wouldn't say much.","He had a great spirit, like all the rest of them had great spirits.","And he made a few comments.","And then the meeting ended.","We said goodbye to all these guys.","And that guy pushed himself back from the table ... he has no legs.","So, we left.","And I was thinking, \"We didn't give them support and encouragement; they gave it to us.","They're not finished giving yet.\"","It was astounding.","So, we went back.","And I started working harder, faster.","Then we went out to Brooke Army Medical Center.","And we saw lots of these kids, lots of them.","And it was astounding how positive they are.","So, we went back, and we've been working harder yet.","We're in clinical trials, we've got five of them on people.","We're screaming along.","And I get a call and we go back to Washington.","We go back to Walter Reed, and a kid, literally, 20 some-odd days before that was blown up.","And they shipped him to Germany and 24 hours later they shipped him from Germany to Walter Reed.","And he was there, and they said we needed to come.","And I went down and they rolled him into a room.","He's got no legs.","He's got no arms.","He's got a small residual limb on one side.","Half of his face is gone, but they said his vision is coming back.","He had one good eye.","His name is Brandon Marrocco.","And he said, \"I need your arms, but I need two of them.\"","\"You'll get them.\"","This kid was from Staten Island.","And he said, \"I had a truck, before I went over there, and it had a stick.","You think I'll be able to drive it?\"","\"Sure.\"","And I turned around and went, \"How are we going to do this?\"","(Laughter) Anyway, he was just like all the rest of them.","He doesn't really want a lot.","He wants to help.","He told me that he wanted to go back to help his buddies.","So, I was on my way out here.","I was asked to stop at Texas.","There were 3,500 people, the Veteran's Administration, U.S. ... 
just 3,500 at this huge event to help the families of all the kids -- some that have died, some that are like Brandon -- and they wanted me to speak.","I said, \"What am I going to say?","This is not a happy thing.","Look, if this happens to you, I can give you ...","This stuff is still not as good at the original equipment.\"","\"You need to come.\"","So, I went.","And, as I think you get the point, there were a lot people there recovering.","Some further along than others.","But universally, these people that had been through this had astounding attitudes, and just the fact that people care makes a huge difference to them.","I'll shut up, except one message or concern I have.","I don't think anybody does it intentionally, but there were people there literally talking about, \"Well, how much will they get?\"","You know, this country is involved as we've all heard, in this great healthcare debate.","\"Who is entitled to what?","Who is entitled to how much?","Who is going to pay for it?\"","Those are tough questions.","I don't have an answer to that.","Not everybody can be entitled to everything simply because you were born here.","It's not possible.","It would be nice but let's be realistic.","They were tough questions.","There's polarized groups down there.","I don't know the answers.","There are other questions that are tough.","\"Should we be there?","How do we get out?","What do we need to do?\"","There's very polarized answers to that question too, and I don't have any answers to that.","Those are political questions, economic questions, strategic questions.","I don't have the answer.","But let me give you a simple concern or maybe statement, then.","It is an easy answer.","I know what these kids deserve on the healthcare side.","I was talking to one of them, and he was really liking this arm -- it's way, way, way better than a plastic stick with a hook on it -- but there's nobody in this room that would rather have that than the one you got.","But I was saying to him, \"You know, the first airplane went 100 feet in 1903.","Wilbur and Orville.","But you know what?","It wouldn't have made an old pigeon jealous.","But now we got Eagles out there, F15s, even that Bald Eagle.","I've never seen a bird flying around at Mach 2.","I think eventually we'll make these things extraordinary.\"","And I said to that kid, \"I'll stop when your buddies are envious of your Luke arm because of what it can do, and how it does it.","And we'll keep working.","And I'm not going to stop working until we do that.\"","And I think this country ought to continue its great debate, whining and complaining, \"I'm entitled.\"","\"You're a victim.\"","And whining and complaining about what our foreign policy ought to be.","But while we have the luxury of whining and complaining about who's paying for what and how much we get, the people that are out there giving us that great privilege of whining and complaining, I know what they deserve: everything humanly possible.","And we ought to give it to them.","(Applause)"],"2":["I thought if I skipped it might help my nerves, but I'm actually having a paradoxical reaction to that, so that was a bad idea.","(Laughter)","Anyway, I was really delighted to receive the invitation to present to you some of my music and some of my work as a composer, presumably because it appeals to my well-known and abundant narcissism.","(Laughter) And I'm not kidding, I just think we should just say that and move forward.","(Laughter)","So, but the thing is, a dilemma quickly arose, and that 
is that I'm really bored with music, and I'm really bored with the role of the composer, and so I decided to put that idea, boredom, as the focus of my presentation to you today.","And I'm going to share my music with you, but I hope that I'm going to do so in a way that tells a story, tells a story about how I used boredom as a catalyst for creativity and invention, and how boredom actually forced me to change the fundamental question that I was asking in my discipline, and how boredom also, in a sense, pushed me towards taking on roles beyond the sort of most traditional, narrow definition of a composer.","What I'd like to do today is to start with an excerpt of a piece of music at the piano.","(Music) Okay, I wrote that.","(Laughter) No, it's not \u2014 (Applause) Oh, why thank you.","No, no, I didn't write that.","In fact, that was a piece by Beethoven, and so I was not functioning as a composer.","Just now I was functioning in the role of the interpreter, and there I am, interpreter.","So, an interpreter of what?","Of a piece of music, right?","But we can ask the question, \"But is it music?\"","And I say this rhetorically, because of course by just about any standard we would have to concede that this is, of course, a piece of music, but I put this here now because, just to set it in your brains for the moment, because we're going to return to this question.","It's going to be a kind of a refrain as we go through the presentation.","So here we have this piece of music by Beethoven, and my problem with it is, it's boring.","I mean, you \u2014 I'm just like, a hush, huh -- It's like -- (Laughter) It's Beethoven, how can you say that?","No, well, I don't know, it's very familiar to me.","I had to practice it as a kid, and I'm really sick of it.","So -- (Laughter) I would, so what I might like to try to do is to change it, to transform it in some ways, to personalize it, so I might take the opening, like this idea -- (Music) and then I might substitute -- (Music) and then I might improvise on that melody that goes forward from there -- (Music) (Music)","So that might be the kind of thing -- Why thank you.","(Applause) That would be the kind of thing that I would do, and it's not necessarily better than the Beethoven.","In fact, I think it's not better than it.","The thing is -- (Laughter) -- it's more interesting to me.","It's less boring for me.","I'm really leaning into me, because I, because I have to think about what decisions I'm going to make on the fly as that Beethoven text is running in time through my head and I'm trying to figure out what kinds of transformations I'm going to make to it.","So this is an engaging enterprise for me, and I've really leaned into that first person pronoun thing there, and now my face appears twice, so I think we can agree that this is a fundamentally solipsistic enterprise.","(Laughter) But it's an engaging one, and it's interesting to me for a while, but then I get bored with it, and by it, I actually mean, the piano, because it becomes, it's this familiar instrument, it's timbral range is actually pretty compressed, at least when you play on the keyboard, and if you're not doing things like listening to it after you've lit it on fire or something like that, you know.","It gets a little bit boring, and so pretty soon I go through other instruments, they become familiar, and eventually I find myself designing and constructing my own instrument, and I brought one with me today, and I thought I would play a little bit on it for you so you can hear 
what it sounds like.","(Music)","You gotta have doorstops, that's important.","(Laughter) I've got combs.","They're the only combs that I own.","(Music) They're all mounted on my instruments.","(Laughter)","(Music)","I can actually do all sorts of things.","I can play with a violin bow.","I don't have to use the chopsticks.","So we have this sound.","(Music) And with a bank of live electronics, I can change the sounds radically.","(Music) (Music) Like that, and like this.","(Music) And so forth.","So this gives you a little bit of an idea of the sound world of this instrument, which I think is quite interesting and it puts me in the role of the inventor, and the nice thing about \u2014 This instrument is called the Mouseketeer ... (Laughter) and the cool thing about it is I'm the world's greatest Mouseketeer player.","(Laughter) Okay?","(Applause) So in that regard, this is one of the things, this is one of the privileges of being, and here's another role, the inventor, and by the way, when I told you that I'm the world's greatest, if you're keeping score, we've had narcissism and solipsism and now a healthy dose of egocentricism.","I know some of you are just, you know, bingo!","Or, I don't know.","(Laughter)","Anyway, so this is also a really enjoyable role.","I should concede also that I'm the world's worst Mouseketeer player, and it was this distinction that I was most worried about when I was on that prior side of the tenure divide.","I'm glad I'm past that.","We're not going to go into that.","I'm crying on the inside.","There are still scars.","Anyway, but I guess my point is that all of these enterprises are engaging to me in their multiplicity, but as I've presented them to you today, they're actually solitary enterprises, and so pretty soon I want to commune with other people, and so I'm delighted that in fact I get to compose works for them.","I get to write, sometimes for soloists and I get to work with one person, sometimes full orchestras, and I work with a lot of people, and this is probably the capacity, the role creatively for which I'm probably best known professionally.","Now, some of my scores as a composer look like this, and others look like this, and some look like this, and I make all of these by hand, and it's really tedious.","It takes a long, long time to make these scores, and right now I'm working on a piece that's 180 pages in length, and it's just a big chunk of my life, and I'm just pulling out hair.","I have a lot of it, and that's a good thing I suppose.","(Laughter)","So this gets really boring and really tiresome for me, so after a while the process of notating is not only boring, but I actually want the notation to be more interesting, and so that's pushed me to do other projects like this one.","This is an excerpt from a score called \"The Metaphysics of Notation.\"","The full score is 72 feet wide.","It's a bunch of crazy pictographic notation.","Let's zoom in on one section of it right here.","You can see it's rather detailed.","I do all of this with drafting templates, with straight edges, with French curves, and by freehand, and the 72 feet was actually split into 12 six-foot-wide panels that were installed around the Cantor Arts Center Museum lobby balcony, and it appeared for one year in the museum, and during that year, it was experienced as visual art most of the week, except, as you can see in these pictures, on Fridays, from noon til one, and only during that time, various performers came and interpreted these strange and undefined 
pictographic glyphs.","(Laughter)","Now this was a really exciting experience for me.","It was gratifying musically, but I think the more important thing is it was exciting because I got to take on another role, especially given that it appeared in a museum, and that is as visual artist.","(Laughter) We're going to fill up the whole thing, don't worry.","(Laughter) I am multitudes.","(Laughter)","So one of the things is that, I mean, some people would say, like, \"Oh, you're being a dilettante,\" and maybe that's true.","I can understand how, I mean, because I don't have a pedigree in visual art and I don't have any training, but it's just something that I wanted to do as an extension of my composition, as an extension of a kind of creative impulse.","I can understand the question, though.","\"But is it music?\"","I mean, there's not any traditional notation.","I can also understand that sort of implicit criticism in this piece, \"S-tog,\" which I made when I was living in Copenhagen.","I took the Copenhagen subway map and I renamed all the stations to abstract musical provocations, and the players, who are synchronized with stopwatches, follow the timetables, which are listed in minutes past the hour.","So this is a case of actually adapting something, or maybe stealing something, and then turning it into a musical notation.","Another adaptation would be this piece.","I took the idea of the wristwatch, and I turned it into a musical score.","I made my own faces, and had a company fabricate them, and the players follow these scores.","They follow the second hands, and as they pass over the various symbols, the players respond musically.","Here's another example from another piece, and then its realization.","So in these two capacities, I've been scavenger, in the sense of taking, like, the subway map, right, or thief maybe, and I've also been designer, in the case of making the wristwatches.","And once again, this is, for me, interesting.","Another role that I like to take on is that of the performance artist.","Some of my pieces have these kind of weird theatric elements, and I often perform them.","I want to show you a clip from a piece called \"Echolalia.\"","This is actually being performed by Brian McWhorter, who is an extraordinary performer.","Let's watch a little bit of this, and please notice the instrumentation.","(Music)","Okay, I hear you were laughing nervously because you too could hear that the drill was a little bit sharp, the intonation was a little questionable.","(Laughter) Let's watch just another clip.","(Music)","You can see the mayhem continues, and there's, you know, there were no clarinets and trumpets and flutes and violins.","Here's a piece that has an even more unusual, more peculiar instrumentation.","This is \"Tl\u00f6n,\" for three conductors and no players.","(Laughter)","This was based on the experience of actually watching two people having a virulent argument in sign language, which produced no decibels to speak of, but affectively, psychologically, was a very loud experience.","So, yeah, I get it, with, like, the weird appliances and then the total absence of conventional instruments and this glut of conductors, people might, you know, wonder, yeah, \"Is this music?\"","But let's move on to a piece where clearly I'm behaving myself, and that is my \"Concerto for Orchestra.\"","You're going to notice a lot of conventional instruments in this clip.","(Music) (Music)","This, in fact, is not the title of this piece.","I was a bit mischievous.","In fact, to 
make it more interesting, I put a space right in here, and this is the actual title of the piece.","Let's continue with that same excerpt.","(Music)","It's better with a florist, right?","(Laughter) (Music) Or at least it's less boring.","Let's watch a couple more clips.","(Music)","So with all these theatric elements, this pushes me in another role, and that would be, possibly, the dramaturge.","I was playing nice.","I had to write the orchestra bits, right?","Okay?","But then there was this other stuff, right?","There was the florist, and I can understand that, once again, we're putting pressure on the ontology of music as we know it conventionally, but let's look at one last piece today I'm going to share with you.","This is going to be a piece called \"Aphasia,\" and it's for hand gestures synchronized to sound, and this invites yet another role, and final one I'll share with you, which is that of the choreographer.","And the score for the piece looks like this, and it instructs me, the performer, to make various hand gestures at very specific times synchronized with an audio tape, and that audio tape is made up exclusively of vocal samples.","I recorded an awesome singer, and I took the sound of his voice in my computer, and I warped it in countless ways to come up with the soundtrack that you're about to hear.","And I'll perform just an excerpt of \"Aphasia\" for you here.","Okay?","(Music) So that gives you a little taste of that piece.","(Applause)","Yeah, okay, that's kind of weird stuff.","Is it music?","Here's how I want to conclude.","I've decided, ultimately, that this is the wrong question, that this is not the important question.","The important question is, \"Is it interesting?\"","And I follow this question, not worrying about \"Is it music?\"","-- not worrying about the definition of the thing that I'm making.","I allow my creativity to push me in directions that are simply interesting to me, and I don't worry about the likeness of the result to some notion, some paradigm, of what music composition is supposed to be, and that has actually urged me, in a sense, to take on a whole bunch of different roles, and so what I want you to think about is, to what extent might you change the fundamental question in your discipline, and, okay, I'm going to put one extra little footnote in here, because, like, I realized I mentioned some psychological defects earlier, and we also, along the way, had a fair amount of obsessive behavior, and there was some delusional behavior and things like that, and here I think we could say that this is an argument for self-loathing and a kind of schizophrenia, at least in the popular use of the term, and I really mean dissociative identity disorder, okay.","(Laughter) Anyway, despite those perils, I would urge you to think about the possibility that you might take on roles in your own work, whether they are neighboring or far-flung from your professional definition.","And with that, I thank you very much.","(Applause)","(Applause)"],"3":["By raising your hand, how many of you know at least one person on the screen?","Wow, it's almost a full house.","It's true, they are very famous in their fields.","And do you know what all of them have in common?","They all died of pancreatic cancer.","However, although it's very, very sad this news, it's also thanks to their personal stories that we have raised awareness of how lethal this disease can be.","It's become the third cause of cancer deaths, and only eight percent of the patients will survive beyond 
five years.","That's a very tiny number, especially if you compare it with breast cancer, where the survival rate is almost 90 percent.","So it doesn't really come as a surprise that being diagnosed with pancreatic cancer means facing an almost certain death sentence.","What's shocking, though, is that in the last 40 years, this number hasn't changed a bit, while much more progress has been made with other types of tumors.","So how can we make pancreatic cancer treatment more effective?","As a biomedical entrepreneur, I like to work on problems that seem impossible, understanding their limitations and trying to find new, innovative solutions that can change their outcome.","The first piece of bad news with pancreatic cancer is that your pancreas is in the middle of your belly, literally.","It's depicted in orange on the screen.","But you can barely see it until I remove all the other organs in front.","It's also surrounded by many other vital organs, like the liver, the stomach, the bile duct.","And the ability of the tumor to grow into those organs is the reason why pancreatic cancer is one of the most painful tumor types.","The hard-to-reach location also prevents the doctor from surgically removing it, as is routinely done for breast cancer, for example.","So all of these reasons leave chemotherapy as the only option for the pancreatic cancer patient.","This brings us to the second piece of bad news.","Pancreatic cancer tumors have very few blood vessels.","Why should we care about the blood vessel of a tumor?","Let's think for a second how chemotherapy works.","The drug is injected in the vein and it navigates throughout the body until it reaches the tumor site.","It's like driving on a highway, trying to reach a destination.","But what if your destination doesn't have an exit on the highway?","You will never get there.","And that's exactly the same problem for chemotherapy and pancreatic cancer.","The drugs navigate throughout all of your body.","They will reach healthy organs, resulting in high toxic effect for the patients overall, but very little will go to the tumor.","Therefore, the efficacy is very limited.","To me, it seems very counterintuitive to have a whole-body treatment to target a specific organ.","However, in the last 40 years, a lot of money, research and effort have gone towards finding new, powerful drugs to treat pancreatic cancer, but nothing has been done in changing the way we deliver them to the patient.","So after two pieces of bad news, I'm going to give you good news, hopefully.","With a collaborator at MIT and the Massachusetts General Hospital in Boston, we have revolutionized the way we treat cancer by making localized drug delivery a reality.","We are basically parachuting you on top of your destination, avoiding your having to drive all around the highway.","We have embedded the drug into devices that look like this one.","They are flexible enough that they can be folded to fit into the catheter, so the doctor can implant it directly on top of the tumor with minimally invasive surgery.","But they are solid enough that once they are positioned on top of the tumor, they will act as a cage.","They will actually physically prevent the tumor from entering other organs, controlling the metastasis.","The devices are also biodegradable.","That means that once in the body, they start dissolving, delivering the drug only locally, slowly and more effectively than what is done with the current whole-body treatment.","In pre-clinical study, we have demonstrated that 
this localized approach is able to improve by 12 times the response to treatment.","So we took a drug that is already known and by just delivering it locally where it's needed the most, we allow a response that is 12 times more powerful, reducing the systemic toxic effect.","We are working relentlessly to bring this technology to the next level.","We are finalizing the pre-clinical testing and the animal model required prior to asking the FDA for approval for clinical trials.","Currently, the majority of patients will die from pancreatic cancer.","We are hoping that one day, we can reduce their pain, extend their life and potentially make pancreatic cancer a curable disease.","By rethinking the way we deliver the drug, we don't only make it more powerful and less toxic, we are also opening the door to finding new innovative solutions for almost all other impossible problems in pancreatic cancer patients and beyond.","Thank you very much.","(Applause)"],"4":["Good morning everybody.","I work with really amazing, little, itty-bitty creatures called cells.","And let me tell you what it's like to grow these cells in the lab.","I work in a lab where we take cells out of their native environment.","We plate them into dishes that we sometimes call petri dishes.","And we feed them -- sterilely of course -- with what we call cell culture media -- which is like their food -- and we grow them in incubators.","Why do I do this?","We observe the cells in a plate, and they're just on the surface.","But what we're really trying to do in my lab is to engineer tissues out of them.","What does that even mean?","Well it means growing an actual heart, let's say, or grow a piece of bone that can be put into the body.","Not only that, but they can also be used for disease models.","And for this purpose, traditional cell culture techniques just really aren't enough.","The cells are kind of homesick; the dish doesn't feel like their home.","And so we need to do better at copying their natural environment to get them to thrive.","We call this the biomimetic paradigm -- copying nature in the lab.","Let's take the example of the heart, the topic of a lot of my research.","What makes the heart unique?","Well, the heart beats, rhythmically, tirelessly, faithfully.","We copy this in the lab by outfitting cell culture systems with electrodes.","These electrodes act like mini pacemakers to get the cells to contract in the lab.","What else do we know about the heart?","Well, heart cells are pretty greedy.","Nature feeds the heart cells in your body with a very, very dense blood supply.","In the lab, we micro-pattern channels in the biomaterials on which we grow the cells, and this allows us to flow the cell culture media, the cells' food, through the scaffolds where we're growing the cells -- a lot like what you might expect from a capillary bed in the heart.","So this brings me to lesson number one: life can do a lot with very little.","Let's take the example of electrical stimulation.","Let's see how powerful just one of these essentials can be.","On the left, we see a tiny piece of beating heart tissue that I engineered from rat cells in the lab.","It's about the size of a mini marshmallow.","And after one week, it's beating.","You can see it in the upper left-hand corner.","But don't worry if you can't see it so well.","It's amazing that these cells beat at all.","But what's really amazing is that the cells, when we electrically stimulate them, like with a pacemaker, that they beat so much more.","But that brings me 
to lesson number two: cells do all the work.","In a sense, tissue engineers have a bit of an identity crisis here, because structural engineers build bridges and big things, computer engineers, computers, but what we are doing is actually building enabling technologies for the cells themselves.","What does this mean for us?","Let's do something really simple.","Let's remind ourselves that cells are not an abstract concept.","Let's remember that our cells sustain our lives in a very real way.","\"We are what we eat,\" could easily be described as, \"We are what our cells eat.\"","And in the case of the flora in our gut, these cells may not even be human.","But it's also worth noting that cells also mediate our experience of life.","Behind every sound, sight, touch, taste and smell is a corresponding set of cells that receive this information and interpret it for us.","It begs the question: shall we expand our sense of environmental stewardship to include the ecosystem of our own bodies?","I invite you to talk about this with me further, and in the meantime, I wish you luck.","May none of your non-cancer cells become endangered species.","Thank you.","(Applause)"],"5":["Chris Anderson: This is such a strange thing.","Your software, Linux, is in millions of computers, it probably powers much of the Internet.","And I think that there are, like, a billion and a half active Android devices out there.","Your software is in every single one of them.","It's kind of amazing.","You must have some amazing software headquarters driving all this.","That's what I thought -- and I was shocked when I saw a picture of it.","I mean, this is -- this is the Linux world headquarters.","(Laughter)","(Applause)","Linus Torvalds: It really doesn't look like much.","And I have to say, the most interesting part in this picture, that people mostly react to, is the walking desk.","It is the most interesting part in my office and I'm not actually using it anymore.","And I think the two things are related.","The way I work is ...","I want to not have external stimulation.","You can kind of see, on the walls are this light green.","I'm told that at mental institutions they use that on the walls.","(Laughter)","It's like a calming color, it's not something that really stimulates you.","What you can't see is the computer here, you only see the screen, but the main thing I worry about in my computer is -- it doesn't have to be big and powerful, although I like that -- it really has to be completely silent.","I know people who work for Google and they have their own small data center at home, and I don't do that.","My office is the most boring office you'll ever see.","And I sit there alone in the quiet.","If the cat comes up, it sits in my lap.","And I want to hear the cat purring, not the sound of the fans in the computer.","CA: So this is astonishing, because working this way, you're able to run this vast technology empire -- it is an empire -- so that's an amazing testament to the power of open source.","Tell us how you got to understand open source and how it lead to the development of Linux.","LT: I mean, I still work alone.","Really -- I work alone in my house, often in my bathrobe.","When a photographer shows up, I dress up, so I have clothes on.","(Laughter)","And that's how I've always worked.","I mean, this was how I started Linux, too.","I did not start Linux as a collaborative project.","I started it as one in a series of many projects I had done at the time for myself, partly because I needed the end result, 
but even more because I just enjoyed programming.","So it was about the end of the journey, which, 25 years later, we still have not reached.","But it was really about the fact that I was looking for a project on my own and there was no open source, really, on my radar at all.","And what happened is ... the project grows and becomes something you want to show off to people.","Really, this is more of a, \"Wow, look at what I did!\"","And trust me -- it was not that great back then.","I made it publicly available, and it wasn't even open source at that point.","At that point it was source that was open, but there was no intention behind using the kind of open-source methodology that we think of today to improve it.","It was more like, \"Look, I've been working on this for half a year, I'd love to have comments.\"","And other people approached me.","At the University of Helsinki, I had a friend who was one of the open source -- it was called mainly \"free software\" back then -- and he actually introduced me to the notion that, hey, you can use these open-source licenses that had been around.","And I thought about it for a while.","I was actually worried about the whole commercial interests coming in.","I mean, that's one of the worries I think most people who start out have, is that they worry about somebody taking advantage of their work, right?","And I decided, \"What the hell?\"","And --","CA: And then at some point, someone contributed some code that you thought, \"Wow, that really is interesting, I would not have thought of that.","This could actually improve this.\"","LT: It didn't even start by people contributing code, it was more that people started contributing ideas.","And just the fact that somebody else takes a look at your project -- and I'm sure it's true of other things, too, but it's definitely true in code -- is that somebody else takes an interest in your code, looks at it enough to actually give you feedback and give you ideas.","That was a huge thing for me.","I was 21 at the time, so I was young, but I had already programmed for half my life, basically.","And every project before that had been completely personal and it was a revelation when people just started commenting, started giving feedback on your code.","And even before they started giving code back, that was, I think, one of the big moments where I said, \"I love other people!\"","Don't get me wrong -- I'm actually not a people person.","(Laughter)","I don't really love other people --","(Laughter)","But I love computers, I love interacting with other people on email, because it kind of gives you that buffer.","But I do love other people who comment and get involved in my project.","And it made it so much more.","CA: So was there a moment when you saw what was being built and it suddenly started taking off, and you thought, \"Wait a sec, this actually could be something huge, not just a personal project that I'm getting nice feedback on, but a kind of explosive development in the whole technology world\"?","LT: Not really.","I mean, the big point for me, really, was not when it was becoming huge, it was when it was becoming little.","The big point for me was not being alone and having 10, maybe 100 people being involved -- that was a big point.","Then everything else was very gradual.","Going from 100 people to a million people is not a big deal -- to me.","Well, I mean, maybe it is if you're --","(Laughter)","If you want to sell your result then it's a huge deal -- don't get me wrong.","But if you're 
interested in the technology and you're interested in the project, the big part was getting the community.","Then the community grew gradually.","And there's actually not a single point where I went like, \"Wow, that just took off!\"","because it -- I mean -- it took a long time, relatively.","CA: So all the technologists that I talk to really credit you with massively changing their work.","And it's not just Linux, it's this thing called Git, which is this management system for software development.","Tell us briefly about that and your role in that.","LT: So one of the issues we had, and this took a while to start to appear, is when you ...","When you grow from having 10 people or 100 people working on a project to having 10,000 people, which -- I mean, right now we're in the situation where just on the kernel, we have 1,000 people involved in every single release and that's every two months, roughly two or three months.","Some of those people don't do a lot.","There's a lot of people who make small, small changes.","But to maintain this, the scale changes how you have to maintain it.","And we went through a lot of pain.","And there are whole projects that do only source-code maintenance.","CVS is the one that used to be the most commonly used, and I hated CVS with a passion and refused to touch it and tried something else that was radical and interesting and everybody else hated.","CA: (Laughs)","LT: And we were in this bad spot, where we had thousands of people who wanted to participate, but in many ways, I was the kind of break point, where I could not scale to the point where I could work with thousands of people.","So Git is my second big project, which was only created for me to maintain my first big project.","And this is literally how I work.","I don't code for -- well, I do code for fun -- but I want to code for something meaningful so every single project I've ever done has been something I needed and --","CA: So really, both Linux and Git kind of arose almost as an unintended consequence of your desire not to have to work with too many people.","LT: Absolutely.","Yes.","(Laughter)","CA: That's amazing.","LT: Yeah.","(Applause)","And yet, you're the man who's transformed technology not just once but twice, and we have to try and understand why it is.","You've given us some clues, but ...","Here's a picture of you as a kid, with a Rubik's Cube.","You mentioned that you've been programming since you were like 10 or 11, half your life.","Were you this sort of computer genius, you know, \u00fcbernerd, were you the star at school who could do everything?","What were you like as a kid?","LT: Yeah, I think I was the prototypical nerd.","I mean, I was ...","I was not a people person back then.","That's my younger brother.","I was clearly more interested in the Rubik's Cube than my younger brother.","(Laughter)","My younger sister, who's not in the picture, when we had family meetings -- and it's not a huge family, but I have, like, a couple of cousins -- she would prep me beforehand.","Like, before I stepped into the room she would say, \"OK. That's so-and-so ...\" Because I was not -- I was a geek.","I was into computers, I was into math, I was into physics.","I was good at that.","I don't think I was particularly exceptional.","Apparently, my sister said that my biggest exceptional quality was that I would not let go.","CA: OK, so let's go there, because that's interesting.","You would not let go.","So that's not about being a geek and being smart, that's about being ... 
stubborn?","LT: That's about being stubborn.","That's about, like, just starting something and not saying, \"OK, I'm done, let's do something else -- Look: shiny!\"","And I notice that in many other parts in my life, too.","I lived in Silicon Valley for seven years.","And I worked for the same company, in Silicon Valley, for the whole time.","That is unheard of.","That's not how Silicon Valley works.","The whole point of Silicon Valley is that people jump between jobs to kind of mix up the pot.","And that's not the kind of person I am.","CA: But during the actual development of Linux itself, that stubbornness sometimes brought you in conflict with other people.","Talk about that a bit.","Was that essential to sort of maintain the quality of what was being built?","How would you describe what happened?","LT: I don't know if it's essential.","Going back to the \"I'm not a people person,\" -- sometimes I'm also ... shall we say, \"myopic\" when it comes to other people's feelings, and that sometimes makes you say things that hurt other people.","And I'm not proud of that.","(Applause)","But, at the same time, it's -- I get people who tell me that I should be nice.","And then when I try to explain to them that maybe you're nice, maybe you should be more aggressive, they see that as me being not nice.","(Laughter)","What I'm trying to say is we are different.","I'm not a people person; it's not something I'm particularly proud of, but it's part of me.","And one of the things I really like about open source is it really allows different people to work together.","We don't have to like each other -- and sometimes we really don't like each other.","Really -- I mean, there are very, very heated arguments.","But you can, actually, you can find things that -- you don't even agree to disagree, it's just that you're interested in really different things.","And coming back to the point where I said earlier that I was afraid of commercial people taking advantage of your work, it turned out, and very quickly turned out, that those commercial people were lovely, lovely people.","And they did all the things that I was not at all interested in doing, and they had completely different goals.","And they used open source in ways that I just did not want to go.","But because it was open source they could do it, and it actually works really beautifully together.","And I actually think it works the same way.","You need to have the people-people, the communicators, the warm and friendly people who like --","(Laughter)","really want to hug you and get you into the community.","But that's not everybody.","And that's not me.","I care about the technology.","There are people who care about the UI.","I can't do UI to save my life.","I mean, if I was stranded on an island and the only way to get off that island was the make a pretty UI, I'd die there.","(Laughter)","So there's different kinds of people, and I'm not making excuses, I'm trying to explain.","CA: Now, when we talked last week, you talked about some other trait that you have, which I found really interesting.","It's this idea called taste.","And I've just got a couple of images here.","I think this is an example of not particularly good taste in code, and this one is better taste, which one can immediately see.","What is the difference between these two?","LT: So this is -- How many people here actually have coded?","CA: Oh my goodness.","LT: So I guarantee you, everybody who raised their hand, they have done what's called a singly-linked list.","And it's 
taught -- This, the first not very good taste approach, is basically how it's taught to be done when you start out coding.","And you don't have to understand the code.","The most interesting part to me is the last if statement.","Because what happens in a singly-linked list -- this is trying to remove an existing entry from a list -- and there's a difference between if it's the first entry or whether it's an entry in the middle.","Because if it's the first entry, you have to change the pointer to the first entry.","If it's in the middle, you have to change the pointer of a previous entry.","So they're two completely different cases.","CA: And that's better.","LT: And this is better.","It does not have the if statement.","And it doesn't really matter -- I don't want you understand why it doesn't have the if statement, but I want you to understand that sometimes you can see a problem in a different way and rewrite it so that a special case goes away and becomes the normal case.","And that's good code.","But this is simple code.","This is CS 101.","This is not important -- although, details are important.","To me, the sign of people I really want to work with is that they have good taste, which is how ...","I sent you this stupid example that is not relevant because it's too small.","Good taste is much bigger than this.","Good taste is about really seeing the big patterns and kind of instinctively knowing what's the right way to do things.","CA: OK, so we're putting the pieces together here now.","You have taste, in a way that's meaningful to software people.","You're --","(Laughter)","LT: I think it was meaningful to some people here.","CA: You're a very smart computer coder, and you're hellish stubborn.","But there must be something else.","I mean, you've changed the future.","You must have the ability of these grand visions of the future.","You're a visionary, right?","LT: I've actually felt slightly uncomfortable at TED for the last two days, because there's a lot of vision going on, right?","And I am not a visionary.","I do not have a five-year plan.","I'm an engineer.","And I think it's really -- I mean -- I'm perfectly happy with all the people who are walking around and just staring at the clouds and looking at the stars and saying, \"I want to go there.\"","But I'm looking at the ground, and I want to fix the pothole that's right in front of me before I fall in.","This is the kind of person I am.","(Cheers)","(Applause)","CA: So you spoke to me last week about these two guys.","Who are they and how do you relate to them?","LT: Well, so this is kind of clich\u00e9 in technology, the whole Tesla versus Edison, where Tesla is seen as the visionary scientist and crazy idea man.","And people love Tesla.","I mean, there are people who name their companies after him.","(Laughter)","The other person there is Edison, who is actually often vilified for being kind of pedestrian and is -- I mean, his most famous quote is, \"Genius is one percent inspiration and 99 percent perspiration.\"","And I'm in the Edison camp, even if people don't always like him.","Because if you actually compare the two, Tesla has kind of this mind grab these days, but who actually changed the world?","Edison may not have been a nice person, he did a lot of things -- he was maybe not so intellectual, not so visionary.","But I think I'm more of an Edison than a Tesla.","CA: So our theme at TED this week is dreams -- big, bold, audacious dreams.","You're really the antidote to that.","LT: I'm trying to dial it down a bit, 
yes.","CA: That's good.","(Laughter) We embrace you, we embrace you.","Companies like Google and many others have made, arguably, like, billions of dollars out of your software.","Does that piss you off?","LT: No.","No, it doesn't piss me off for several reasons.","And one of them is, I'm doing fine.","I'm really doing fine.","But the other reason is -- I mean, without doing the whole open source and really letting go thing, Linux would never have been what it is.","And it's brought experiences I don't really enjoy, public talking, but at the same time, this is an experience.","Trust me.","So there's a lot of things going on that make me a very happy man and thinking I did the right choices.","CA: Is the open source idea -- this is, I think we'll end here -- is the open source idea fully realized now in the world, or is there more that it could go, are there more things that it could do?","LT: So, I'm of two minds there.","I think one reason open source works so well in code is that at the end of the day, code tends to be somewhat black and white.","There's often a fairly good way to decide, this is done correctly and this is not done well.","Code either works or it doesn't, which means that there's less room for arguments.","And we have arguments despite this, right?","In many other areas -- I mean, people have talked about open politics and things like that -- and it's really hard sometimes to say that, yes, you can apply the same principles in some other areas just because the black and white turns into not just gray, but different colors.","So, obviously open source in science is making a comeback.","Science was there first.","But then science ended up being pretty closed, with very expensive journals and some of that going on.","And open source is making a comeback in science, with things like arXiv and open journals.","Wikipedia changed the world, too.","So there are other examples, I'm sure there are more to come.","CA: But you're not a visionary, and so it's not up to you to name them.","LT: No.","(Laughter)","It's up to you guys to make them, right?","CA: Exactly.","Linus Torvalds, thank you for Linux, thank you for the Internet, thank you for all those Android phones.","Thank you for coming here to TED and revealing so much of yourself.","LT: Thank you.","(Applause)"],"6":["(Music)","Amanda Palmer (singing): Ground Control to Major Tom,","Ground Control to Major Tom,","Take your protein pills and put your helmet on.","Al Gore: Ten, Nine, Eight, Seven, Six ...","AP: Ground Control to Major Tom,","AG: Five, Four, Three, Two, One ... 
AP: Commencing countdown, engines on.","Check ignition and may God's love be with you.","AG: Liftoff.","AP: This is Ground Control to Major Tom,","You've really made the grade","And the papers want to know whose shirts you wear.","Now it's time to leave the capsule if you dare.","\"This is Major Tom to Ground Control,","I'm stepping through the door","And I'm floating in a most peculiar way","And the stars look very different today.","For here am I floating round my tin can.","Far above the world,","Planet Earth is blue and there's nothing I can do.\"","(Music)","\"Though I'm past 100,000 miles,","I'm feeling very still, and I think my spaceship knows which way to go.","Tell my wife I love her very much she knows.\"","Ground Control to Major Tom, your circuit's dead, there's something wrong.","Can you hear me, Major Tom?","Can you hear me, Major Tom?","Can you hear me, Major Tom?","Can you ...","\"Here am I floating round my tin can, far above the Moon.","Planet Earth is blue and there's nothing I can do.","(Music)","[\"I'm not a prophet or a stone-age man, just a mortal with the potential of a superman ... ...","I'm living on.\"","David Bowie, 1947-2016]","(Applause)"],"7":["Namaste.","Good morning.","I'm very happy to be here in India.","And I've been thinking a lot about what I have learned over these last particularly 11 years with V-Day and \"The Vagina Monologues,\" traveling the world, essentially meeting with women and girls across the planet to stop violence against women.","What I want to talk about today is this particular cell, or grouping of cells, that is in each and every one of us.","And I want to call it the girl cell.","And it's in men as well as in women.","I want you to imagine that this particular grouping of cells is central to the evolution of our species and the continuation of the human race.","And I want you imagine that at some point in history a group of powerful people invested in owning and controlling the world understood that the suppression of this particular cell, the oppression of these cells, the reinterpretation of these cells, the undermining of these cells, getting us to believe in the weakness of these cells and the crushing, eradicating, destroying, reducing these cells, basically began the process of killing off the girl cell, which was, by the way, patriarchy.","I want you to imagine that the girl is a chip in the huge macrocosm of collective consciousness.","And it is essential to balance, to wisdom and to actually the future of all of us.","And then I want you to imagine that this girl cell is compassion, and it's empathy, and it's passion itself, and it's vulnerability, and it's openness, and it's intensity, and it's association, and it's relationship, and it is intuitive.","And then let's think how compassion informs wisdom, and that vulnerability is our greatest strength, and that emotions have inherent logic, which lead to radical, appropriate, saving action.","And then let's remember that we've been taught the exact opposite by the powers that be, that compassion clouds your thinking, that it gets in the way, that vulnerability is weakness, that emotions are not to be trusted, and you're not supposed to take things personally, which is one of my favorites.","I think the whole world has essentially been brought up not to be a girl.","How do we bring up boys?","What does it mean to be a boy?","To be a boy really means not to be a girl.","To be a man means not to be a girl.","To be a woman means not to be a girl.","To be strong means not to be 
a girl.","To be a leader means not to be a girl.","I actually think that being a girl is so powerful that we've had to train everyone not to be that.","(Laughter)","And I'd also like to say that the irony of course, is that denying girl, suppressing girl, suppressing emotion, refusing feeling has lead thus here.","Where we have now come to live in a world where the most extreme forms of violence, the most horrific poverty, genocide, mass rapes, the destruction of the Earth, is completely out of control.","And because we have suppressed our girl cells and suppressed our girl-ship, we do not feel what is going on.","So, we are not being charged with the adequate response to what is happening.","I want to talk a little bit about the Democratic Republic of Congo.","For me, it was the turning point of my life.","I have spent a lot of time there in the last three years.","I feel up to that point I had seen a lot in the world, a lot of violence.","I essentially lived in the rape mines of the world for the last 12 years.","But the Democratic Republic of Congo really was the turning point in my soul.","I went and I spent time in a place called Bukavu in a hospital called the Panzi Hospital, with a doctor who was as close to a saint as any person I've ever met.","His name is Dr. Denis Mukwege.","In the Congo, for those of you who don't know, there has been a war raging for the last 12 years, a war that has killed nearly six million people.","It is estimated that somewhere between 300,000 and 500,000 women have been raped there.","When I spent my first weeks at Panzi hospital I sat with women who sat and lined up every day to tell me their stories.","Their stories were so horrific, and so mind-blowing and so on the other side of human existence, that to be perfectly honest with you, I was shattered.","And I will tell you that what happened is through that shattering, listening to the stories of eight-year-old girls who had their insides eviscerated, who had guns and bayonets and things shoved inside them so they had holes, literally, inside them where their pee and poop came out of them.","Listening to the story of 80-year-old women who were tied to chains and circled, and where groups of men would come and rape them periodically, all in the name of economic exploitation to steal the minerals so the West can have it and profit from them.","My mind was so shattered.","But what happened for me is that that shattering actually emboldened me in a way I have never been emboldened.","That shattering, that opening of my girl cell, that kind of massive breakthrough of my heart allowed me to become more courageous, and braver, and actually more clever than I had been in the past in my life.","I want to say that I think the powers that be know that empire-building is actually -- that feelings get in the way of empire-building.","Feelings get in the way of the mass acquisition of the Earth, and excavating the Earth, and destroying things.","I remember, for example, when my father, who was very, very violent, used to beat me.","And he would actually say, while he was beating me, \"Don't you cry.","Don't you dare cry.\"","Because my crying somehow exposed his brutality to him.","And even in the moment he didn't want to be reminded of what he was doing.","I know that we have systematically annihilated the girl cell.","And I want to say we've annihilated it in men as well as in women.","And I think in some ways we've been much harsher to men in the annihilation of their girl cell.","(Applause) I see how boys have 
been brought up, and I see this across the planet: to be tough, to be hardened, to distance themselves from their tenderness, to not cry.","I actually realized once in Kosovo, when I watched a man break down, that bullets are actually hardened tears, that when we don't allow men to have their girl self and have their vulnerability, and have their compassion, and have their hearts, that they become hardened and hurtful and violent.","And I think we have taught men to be secure when they are insecure, to pretend they know things when they don't know things, or why would we be where we are?","To pretend they're not a mess when they are a mess.","And I will tell you a very funny story.","On my way here on the airplane, I was walking up and down the aisle of the plane.","And all these men, literally at least 10 men, were in their little seats watching chick flicks.","And they were all alone, and I thought, \"This is the secret life of men.\"","(Laughter)","I've traveled, as I said, to many, many countries, and I've seen, if we do what we do to the girl inside us then obviously it's horrific to think what we do to girls in the world.","And we heard from Sunitha yesterday, and Kavita about what we do to girls.","But I just want to say that I've met girls with knife wounds and cigarette burns, who are literally being treated like ashtrays.","I've seen girls be treated like garbage cans.","I've seen girls who were beaten by their mothers and brothers and fathers and uncles.","I've seen girls starving themselves to death in America in institutions to look like some idealized version of themselves.","I've seen that we cut girls and we control them and we keep them illiterate, or we make them feel bad about being too smart.","We silence them.","We make them feel guilty for being smart.","We get them to behave, to tone it down, not to be too intense.","We sell them, we kill them as embryos, we enslave them, we rape them.","We are so accustomed to robbing girls of the subject of being the subjects of their lives that we have now actually objectified them and turned them into commodities.","The selling of girls is rampant across the planet.","And in many places they are worth less than goats and cows.","But I also want to talk about the fact that if one in eight people on the planet are girls between the ages of 10 to 24, they are they key, really, in the developing world, as well as in the whole world, to the future of humanity.","And if girls are in trouble because they face systematic disadvantages that keep them where society wants them to be, including lack of access to healthcare, education, healthy foods, labor force participation.","The burden of all the household tasks usually falls on girls and younger siblings, which ensures that they will never overcome these barriers.","The state of girls, the condition of girls, will, in my belief -- and that's the girl inside us and the girl in the world -- determine whether the species survives.","And what I want to suggest is that, having talked to girls, because I just finished a new book called \"I Am an Emotional Creature: The Secret Life of Girls Around the World,\" I've been talking to girls for five years, and one of the things that I've seen is true everywhere is that the verb that's been enforced on girl is the verb \"to please.\"","Girls are trained to please.","I want to change the verb.","I want us all to change the verb.","I want the verb to be \"educate,\" or \"activate,\" or \"engage,\" or \"confront,\" or \"defy,\" or \"create.\"","If we 
teach girls to change the verb we will actually enforce the girl inside us and the girl inside them.","And I have to now share a few stories of girls I've seen across the planet who have engaged their girl, who have taken on their girl in spite of all the circumstances around them.","I know a 14-year-old girl in the Netherlands, for example, who is demanding that she take a boat and go around the entire world by herself.","There is a teenage girl who just recently went out and knew that she needed 56 stars tattooed on the right side of her face.","There is a girl, Julia Butterfly Hill, who lived for a year in a tree because she wanted to protect the wild oaks.","There is a girl who I met 14 years ago in Afghanistan who I have adopted as my daughter because her mother was killed.","Her mother was a revolutionary.","And this girl, when she was 17 years old, wore a burqa in Afghanistan, and went into the stadiums and documented the atrocities that were going on towards women, underneath her burqa, with a video.","And that video became the video that went out all over the world after 9\/11 to show what was going on in Afghanistan.","I want to talk about Rachel Corrie who was in her teens when she stood in front of an Israeli tank to say, \"End the occupation.\"","And she knew she risked death and she was literally gunned down and rolled over by that tank.","And I want to talk about a girl that I just met recently in Bukavu, who was impregnated by her rapist.","And she was holding her baby.","And I asked her if she loved her baby.","And she looked into her baby's eyes and she said, \"Of course I love my baby.","How could I not love my baby?","It's my baby and it's full of love.\"","The capacity for girls to overcome situations and to move on levels, to me, is mind-blowing.","There is a girl named Dorcas, and I just met her in Kenya.","Dorcas is 15 years old, and she was trained in self-defense.","A few months ago she was picked up on the street by three older men.","They kidnapped her, they put her in a car.","And through her self-defense, she grabbed their Adam's apples, she punched them in the eyes and she got herself free and out of the car.","In Kenya, in August, I went to visit one of the V-Day safe houses for girls, a house we opened seven years ago with an amazing woman named Agnes Pareyio.","Agnes was a woman who was cut when she was a little girl, she was female genitally mutilated.","And she made a decision as many women do across this planet, that what was done to her would not be enforced and done to other women and girls.","So, for years Agnes walked through the Rift valley.","She taught girls what a healthy vagina looked like, and what a mutilated vagina looked like.","And in that time she saved many girls.","And when we met her we asked her what we could do for her, and she said, \"Well, if you got me a Jeep I could get around a lot faster.\"","So, we got her a Jeep.","And then she saved 4,500 girls.","And then we asked her, \"Okay, what else do you need?\"","And she said, \"Well, now, I need a house.\"","So, seven years ago Agnes built the first V-Day safe house in Narok, Kenya, in the Masai land.","And it was a house where girls could run away, they could save their clitoris, they wouldn't be cut, they could go to school.","And in the years that Agnes has had the house, she has changed the situation there.","She has literally become deputy mayor.","She's changed the rules.","The whole community has bought in to what she's doing.","When we were there she was doing a ritual where 
she reconciles girls, who have run away, with their families.","And there was a young girl named Jaclyn.","Jaclyn was 14 years old and she was in her Masai family and there's a drought in Kenya.","So cows are dying, and cows are the most valued possession.","And Jaclyn overheard her father talking to an old man about how he was about to sell her for the cows.","And she knew that meant she would be cut.","She knew that meant she wouldn't go to school.","She knew that meant she wouldn't have a future.","She knew she would have to marry that old man, and she was 14.","So, one afternoon, she'd heard about the safe house, Jaclyn left her father's house and she walked for two days, two days through Masai land.","She slept with the hyenas.","She hid at night.","She imagined her father killing her on one hand, and Mama Agnes greeting her, with the hope that she would greet her when she got to the house.","And when she got to the house she was greeted.","Agnes took her in, and Agnes loved her, and Agnes supported her for the year.","She went to school and she found her voice, and she found her identity, and she found her heart.","Then, her time was ready when she had to go back to talk to her father about the reconciliation, after a year.","I had the privilege of being in the hut when she was reunited with her father and reconciled.","In that hut, we walked in, and her father and his four wives were sitting there, and her sisters who had just returned because they had all fled when she had fled, and her primary mother, who had been beaten in standing up for her with the elders.","When her father saw her and saw who she had become, in her full girl self, he threw his arms around her and broke down crying.","He said, \"You are beautiful.","You have grown into a gorgeous woman.","We will not cut you.","And I give you my word, here and now, that we will not cut your sisters either.\"","And what she said to him was, \"You were willing to sell me for four cows, and a calf and some blankets.","But I promise you, now that I will be educated I will always take care of you, and I will come back and I will build you a house.","And I will be in your corner for the rest of your life.\"","For me, that is the power of girls.","And that is the power of transformation.","I want to close today with a new piece from my book.","And I want to do it tonight for the girl in everybody here.","And I want to do it for Sunitha.","And I want to do it for the girls that Sunitha talked about yesterday, the girls who survive, the girls who can become somebody else.","But I really want to do it for each and every person here, to value the girl in us, to value the part that cries, to value the part that's emotional, to value the part that's vulnerable, to understand that's where the future lies.","This is called \"I'm An Emotional Creature.\"","And it happened because I met a girl in Watts, L.A.","I was asking girls if they like being a girl, and all the girls were like, \"No, I hate it.","I can't stand it.","It's all bad.","My brothers get everything.\"","And this girl just sat up and went, \"I love being a girl.","I'm an emotional creature!\"","(Laughter) This is for her:","I love being a girl.","I can feel what you're feeling as you're feeling inside the feeling before.","I am an emotional creature.","Things do not come to me as intellectual theories or hard-pressed ideas.","They pulse through my organs and legs and burn up my ears.","Oh, I know when your girlfriend's really pissed off, even though she appears to give you what 
you want.","I know when a storm is coming.","I can feel the invisible stirrings in the air.","I can tell you he won't call back.","It's a vibe I share.","I am an emotional creature.","I love that I do not take things lightly.","Everything is intense to me, the way I walk in the street, the way my momma wakes me up, the way it's unbearable when I lose, the way I hear bad news.","I am an emotional creature.","I am connected to everything and everyone.","I was born like that.","Don't you say all negative that it's only only a teenage thing, or it's only because I'm a girl.","These feelings make me better.","They make me present.","They make me ready.","They make me strong.","I am an emotional creature.","There is a particular way of knowing.","It's like the older women somehow forgot.","I rejoice that it's still in my body.","Oh, I know when the coconut's about to fall.","I know we have pushed the Earth too far.","I know my father isn't coming back, and that no one's prepared for the fire.","I know that lipstick means more than show, and boys are super insecure, and so-called terrorists are made, not born.","I know that one kiss could take away all my decision-making ability.","(Laughter) And you know what?","Sometimes it should.","This is not extreme.","It's a girl thing, what we would all be if the big door inside us flew open.","Don't tell me not to cry, to calm it down, not to be so extreme, to be reasonable.","I am an emotional creature.","It's how the earth got made, how the wind continues to pollinate.","You don't tell the Atlantic Ocean to behave.","I am an emotional creature.","Why would you want to shut me down or turn me off?","I am your remaining memory.","I can take you back.","Nothing's been diluted.","Nothing's leaked out.","I love, hear me, I love that I can feel the feelings inside you, even if they stop my life, even if they break my heart, even if they take me off track, they make me responsible.","I am an emotional, I am an emotional, incondotional, devotional creature.","And I love, hear me, I love, love, love being a girl.","Can you say it with me?","I love, I love, love, love being a girl!","Thank you very much.","(Applause)"],"8":["John Hockenberry: It's great to be here with you, Tom.","And I want to start with a question that has just been consuming me since I first became familiar with your work.","In you work there's always this kind of hybrid quality of a natural force in some sort of interplay with creative force.","Are they ever in equilibrium in the way that you see your work?","Tom Shannon: Yeah, the subject matter that I'm looking for, it's usually to solve a question.","I had the question popped into my head: What does the cone that connects the sun and the Earth look like if you could connect the two spheres?","And in proportion, what would the size of the sphere and the length, and what would the taper be to the Earth?","And so I went about and made that sculpture, turning it out of solid bronze.","And I did one that was about 35 feet long.","The sun end was about four inches in diameter, and then it tapered over about 35 feet to about a millimeter at the Earth end.","And so for me, it was really exciting just to see what it looks like if you could step outside and into a larger context, as though you were an astronaut, and see these two things as an object, because they are so intimately bound, and one is meaningless without the other.","JH: Is there a relief in playing with these forces?","And I'm wondering how much of a sense of discovery there is in 
playing with these forces.","TS: Well, like the magnetically levitated objects -- like that silver one there, that was the result of hundreds of experiments with magnets, trying to find a way to make something float with the least possible connection to the ground.","So I got it down to just one tether to be able to support that.","JH: Now is this electromagnetic here, or are these static?","TS: Those are permanent magnets, yeah.","JH: Because if the power went out, there would just be a big noise.","TS: Yeah.","It's really unsatisfactory having plug-in art.","JH: I agree.","TS: The magnetic works are a combination of gravity and magnetism, so it's a kind of mixture of these ambient forces that influence everything.","The sun has a tremendous field that extends way beyond the planets and the Earth's magnetic field protects us from the sun.","So there's this huge invisible shape structures that magnetism takes in the universe.","But with the pendulum, it allows me to manifest these invisible forces that are holding the magnets up.","My sculptures are normally very simplified.","I try to refine them down to very simple forms.","But the paintings become very complex, because I think the fields that are supporting them, they're billowing, and they're interpenetrating, and they're interference patterns.","JH: And they're non-deterministic.","I mean, you don't know necessarily where you're headed when you begin, even though the forces can be calculated.","So the evolution of this -- I gather this isn't your first pendulum.","TS: No.","(JH: No.)","TS: The first one I did was in the late 70's, and I just had a simple cone with a spigot at the bottom of it.","I threw it into an orbit, and it only had one color, and when it got to the center, the paint kept running out, so I had to run in there, didn't have any control over the spigot remotely.","So that told me right away: I need a remote control device.","But then I started dreaming of having six colors.","I sort of think about it as the DNA -- these colors, the red, blue, yellow, the primary colors and white and black.","And if you put them together in different combinations -- just like printing in a sense, like how a magazine color is printed -- and put them under certain forces, which is orbiting them or passing them back and forth or drawing with them, these amazing things started appearing.","JH: It looks like we're loaded for bear here.","TS: Yeah, well let's put a couple of canvases.","I'll ask a couple of my sons to set up the canvases here.","I want to just say -- so this is Jack, Nick and Louie.","JH: Thanks guys.","TS: So here are the --","JH: All right, I'll get out of the way here.","TS: I'm just going to throw this into an orbit and see if I can paint everybody's shoes in the front.","(Laughter)","JH: Whoa.","That is ... 
ooh, nice.","TS: So something like this.","I'm doing this as a demo, and it's more playful, but inevitably, all of this can be used.","I can redeem this painting, just continuing on, doing layers upon layers.","And I keep it around for a couple of weeks, and I'm contemplating it, and I'll do another session with it and bring it up to another level, where all of this becomes the background, the depth of it.","JH: That's fantastic.","So the valves at the bottom of those tubes there are like radio-controlled airplane valves.","TS: Yes, they're servos with cams that pinch these rubber tubes.","And they can pinch them very tight and stop it, or you can have them wide open.","And all of the colors come out one central port at the bottom.","You can always be changing colors, put aluminum paint, or I could put anything into this.","It could be tomato sauce, or anything could be dispensed -- sand, powders or anything like that.","JH: So many forces there.","You've got gravity, you've got the centrifugal force, you've got the fluid dynamics.","Each of these beautiful paintings, are they images in and of themselves, or are they records of a physical event called the pendulum approaching the canvas?","TS: Well, this painting here, I wanted to do something very simple, a simple, iconic image of two ripples interfering.","So the one on the right was done first, and then the one on the left was done over it.","And then I left gaps so you could see the one that was done before.","And then when I did the second one, it really disturbed the piece -- these big blue lines crashing through the center of it -- and so it created a kind of tension and an overlap.","There are lines in front of the one on the right, and there are lines behind the one on the left, and so it takes it into different planes.","What it's also about, just the little events, the events of the interpenetration of --","JH: Two stars, or --","TS: Two things that happened -- there's an interference pattern, and then a third thing happens.","There are shapes that come about just by the marriage of two events that are happening, and I'm very interested in that.","Like the occurrence of moire patterns.","Like this green one, this is a painting I did about 10 years ago, but it has some -- see, in the upper third -- there are these moires and interference patterns that are radio kind of imagery.","And that's something that in painting I've never seen done.","I've never seen a representation of a kind of radio interference patterns, which are so ubiquitous and such an important part of our lives.","JH: Is that a literal part of the image, or is my eye making that interference pattern -- is my eye completing that interference pattern?","TS: It is the paint actually, makes it real.","It's really manifested there.","If I throw a very concentric circle, or concentric ellipse, it just dutifully makes these evenly spaced lines, which get closer and closer together, which describes how gravity works.","There's something very appealing about the exactitude of science that I really enjoy.","And I love the shapes that I see in scientific observations and apparatus, especially astronomical forms and the idea of the vastness of it, the scale, is very interesting to me.","My focus in recent years has kind of shifted more toward biology.","Some of these paintings, when you look at them very close, odd things appear that really look like horses or birds or crocodiles, elephants.","There are lots of things that appear.","When you look into it, it's sort of like 
looking at cloud patterns, but sometimes they're very modeled and highly rendered.","And then there are all these forms that we don't know what they are, but they're equally well-resolved and complex.","So I think, conceivably, those could be predictive.","Because since it has the ability to make forms that look like forms that we're familiar with in biology, it's also making other forms that we're not familiar with.","And maybe it's the kind of forms we'll discover underneath the surface of Mars, where there are probably lakes with fish swimming under the surface.","JH: Oh, let's hope so.","Oh, my God, let's.","Oh, please, yes.","Oh, I'm so there.","You know, it seems at this stage in your life, you also very personally are in this state of confrontation with a sort of dissonant -- I suppose it's an electromagnetic force that somehow governs your Parkinson's and this creative force that is both the artist who is in the here and now and this sort of arc of your whole life.","Is that relevant to your work?","TS: As it turns out, this device kind of comes in handy, because I don't have to have the fine motor skills to do, that I can operate slides, which is more of a mental process.","I'm looking at it and making decisions: It needs more red, it needs more blue, it needs a different shape.","And so I make these creative decisions and can execute them in a much, much simpler way.","I mean, I've got the symptoms.","I guess Parkinson's kind of creeps up over the years, but at a certain point you start seeing the symptoms.","In my case, my left hand has a significant tremor and my left leg also.","I'm left-handed, and so I draw.","All my creations really start on small drawings, which I have thousands of, and it's my way of just thinking.","I draw with a simple pencil, and at first, the Parkinson's was really upsetting, because I couldn't get the pencil to stand still.","JH: So you're not a gatekeeper for these forces.","You don't think of yourself as the master of these forces.","You think of yourself as the servant.","TS: Nature is -- well, it's a godsend.","It just has so much in it.","And I think nature wants to express itself in the sense that we are nature, humans are of the universe.","The universe is in our mind, and our minds are in the universe.","And we are expressions of the universe, basically.","As humans, ultimately being part of the universe, we're kind of the spokespeople or the observer part of the constituency of the universe.","And to interface with it, with a device that lets these forces that are everywhere act and show what they can do, giving them pigment and paint just like an artist, it's a good ally.","It's a terrific studio assistant.","JH: Well, I love the idea that somewhere within this idea of fine motion and control with the traditional skills that you have with your hand, some sort of more elemental force gets revealed, and that's the beauty here.","Tom, thank you so much.","It's been really, really great.","TS: Thank you, John.","(Applause)"],"9":["We humans have always been very concerned about the health of our bodies, but we haven't always been that good at figuring out what's important.","Take the ancient Egyptians, for example: very concerned about the body parts they thought they'd need in the afterlife, but they left some parts out.","This part, for example.","Although they very carefully preserved the stomach, the lungs, the liver, and so forth, they just mushed up the brain, drained it out through the nose, and threw it away, which makes sense, really, 
because what does a brain do for us anyway?","But imagine if there were a kind of neglected organ in our bodies that weighed just as much as the brain and in some ways was just as important to who we are, but we knew so little about and treated with such disregard.","And imagine if, through new scientific advances, we were just beginning to understand its importance to how we think of ourselves.","Wouldn't you want to know more about it?","Well, it turns out that we do have something just like that: our gut, or rather, its microbes.","But it's not just the microbes in our gut that are important.","Microbes all over our body turn out to be really critical to a whole range of differences that make different people who we are.","So for example, have you ever noticed how some people get bitten by mosquitos way more often than others?","It turns out that everyone's anecdotal experience out camping is actually true.","For example, I seldom get bitten by mosquitos, but my partner Amanda attracts them in droves, and the reason why is that we have different microbes on our skin that produce different chemicals that the mosquitos detect.","Now, microbes are also really important in the field of medicine.","So, for example, what microbes you have in your gut determine whether particular painkillers are toxic to your liver.","They also determine whether or not other drugs will work for your heart condition.","And, if you're a fruit fly, at least, your microbes determine who you want to have sex with.","We haven't demonstrated this in humans yet but maybe it's just a matter of time before we find out.","(Laughter)","So microbes are performing a huge range of functions.","They help us digest our food.","They help educate our immune system.","They help us resist disease, and they may even be affecting our behavior.","So what would a map of all these microbial communities look like?","Well, it wouldn't look exactly like this, but it's a helpful guide for understanding biodiversity.","Different parts of the world have different landscapes of organisms that are immediately characteristic of one place or another or another.","With microbiology, it's kind of the same, although I've got to be honest with you: All the microbes essentially look the same under a microscope.","So instead of trying to identify them visually, what we do is we look at their DNA sequences, and in a project called the Human Microbiome Project, NIH funded this $173 million project where hundreds of researchers came together to map out all the A's, T's, G's, and C's, and all of these microbes in the human body.","So when we take them together, they look like this.","It's a bit more difficult to tell who lives where now, isn't it?","What my lab does is develop computational techniques that allow us to take all these terabytes of sequence data and turn them into something that's a bit more useful as a map, and so when we do that with the human microbiome data from 250 healthy volunteers, it looks like this.","Each point here represents all the complex microbes in an entire microbial community.","See, I told you they basically all look the same.","So what we're looking at is each point represents one microbial community from one body site of one healthy volunteer.","And so you can see that there's different parts of the map in different colors, almost like separate continents.","And what it turns out to be is that those, as the different regions of the body, have very different microbes in them.","So what we have is we have the oral 
community up there in green.","Over on the other side, we have the skin community in blue, the vaginal community in purple, and then right down at the bottom, we have the fecal community in brown.","And we've just over the last few years found out that the microbes in different parts of the body are amazingly different from one another.","So if I look at just one person's microbes in the mouth and in the gut, it turns out that the difference between those two microbial communities is enormous.","It's bigger than the difference between the microbes in this reef and the microbes in this prairie.","So this is incredible when you think about it.","What it means is that a few feet of difference in the human body makes more of a difference to your microbial ecology than hundreds of miles on Earth.","And this is not to say that two people look basically the same in the same body habitat, either.","So you probably heard that we're pretty much all the same in terms of our human DNA.","You're 99.99 percent identical in terms of your human DNA to the person sitting next to you.","But that's not true of your gut microbes: you might only share 10 percent similarity with the person sitting next to you in terms of your gut microbes.","So that's as different as the bacteria on this prairie and the bacteria in this forest.","So these different microbes have all these different kinds of functions that I told you about, everything from digesting food to involvement in different kinds of diseases, metabolizing drugs, and so forth.","So how do they do all this stuff?","Well, in part it's because although there's just three pounds of those microbes in our gut, they really outnumber us.","And so how much do they outnumber us?","Well, it depends on what you think of as our bodies.","Is it our cells?","Well, each of us consists of about 10 trillion human cells, but we harbor as many as 100 trillion microbial cells.","So they outnumber us 10 to one.","Now, you might think, well, we're human because of our DNA, but it turns out that each of us has about 20,000 human genes, depending on what you count exactly, but as many as two million to 20 million microbial genes.","So whichever way we look at it, we're vastly outnumbered by our microbial symbionts.","And it turns out that in addition to traces of our human DNA, we also leave traces of our microbial DNA on everything we touch.","We showed in a study a few years ago that you can actually match the palm of someone's hand up to the computer mouse that they use routinely with up to 95 percent accuracy.","So this came out in a scientific journal a few years ago, but more importantly, it was featured on \"CSI: Miami,\" so you really know it's true.","(Laughter)","So where do our microbes come from in the first place?","Well if, as I do, you have dogs or kids, you probably have some dark suspicions about that, all of which are true, by the way.","So just like we can match you to your computer equipment by the microbes you share, we can also match you up to your dog.","But it turns out that in adults, microbial communities are relatively stable, so even if you live together with someone, you'll maintain your separate microbial identity over a period of weeks, months, even years.","It turns out that our first microbial communities depend a lot on how we're born.","So babies that come out the regular way, all of their microbes are basically like the vaginal community, whereas babies that are delivered by C-section, all of their microbes instead look like skin.","And this 
might be associated with some of the differences in health associated with Cesarean birth, such as more asthma, more allergies, even more obesity, all of which have been linked to microbes now, and when you think about it, until recently, every surviving mammal had been delivered by the birth canal, and so the lack of those protective microbes that we've co-evolved with might be really important for a lot of these different conditions that we now know involve the microbiome.","When my own daughter was born a couple of years ago by emergency C-section, we took matters into our own hands and made sure she was coated with those vaginal microbes that she would have gotten naturally.","Now, it's really difficult to tell whether this has had an effect on her health specifically, right?","With a sample size of just one child, no matter how much we love her, you don't really have enough of a sample size to figure out what happens on average, but at two years old, she hasn't had an ear infection yet, so we're keeping our fingers crossed on that one.","And what's more, we're starting to do clinical trials with more children to figure out whether this has a protective effect generally.","So how we're born has a tremendous effect on what microbes we have initially, but where do we go after that?","What I'm showing you again here is this map of the Human Microbiome Project Data, so each point represents a sample from one body site from one of 250 healthy adults.","And you've seen children develop physically.","You've seen them develop mentally.","Now, for the first time, you're going to see one of my colleague's children develop microbially.","So what we are going to look at is we're going to look at this one baby's stool, the fecal community, which represents the gut, sampled every week for almost two and a half years.","And so we're starting on day one.","What's going to happen is that the infant is going to start off as this yellow dot, and you can see that he's starting off basically in the vaginal community, as we would expect from his delivery mode.","And what's going to happen over these two and a half years is that he's going to travel all the way down to resemble the adult fecal community from healthy volunteers down at the bottom.","So I'm just going to start this going and we'll see how that happens.","What you can see, and remember each step in this is just one week, what you can see is that week to week, the change in the microbial community of the feces of this one child, the differences week to week are much greater than the differences between individual healthy adults in the Human Microbiome Project cohort, which are those brown dots down at the bottom.","And you can see he's starting to approach the adult fecal community.","This is up to about two years.","But something amazing is about to happen here.","So he's getting antibiotics for an ear infection.","What you can see is this huge change in the community, followed by a relatively rapid recovery.","I'll just rewind that for you.","And what we can see is that just over these few weeks, we have a much more radical change, a setback of many months of normal development, followed by a relatively rapid recovery, and by the time he reaches day 838, which is the end of this video, you can see that he has essentially reached the healthy adult stool community, despite that antibiotic intervention.","So this is really interesting because it raises fundamental questions about what happens when we intervene at different ages in a child's 
life.","So does what we do early on, where the microbiome is changing so rapidly, actually matter, or is it like throwing a stone into a stormy sea, where the ripples will just be lost?","Well, fascinatingly, it turns out that if you give children antibiotics in the first six months of life, they're more likely to become obese later on than if they don't get antibiotics then or only get them later, and so what we do early on may have profound impacts on the gut microbial community and on later health that we're only beginning to understand.","So this is fascinating, because one day, in addition to the effects that antibiotics have on antibiotic-resistant bacteria, which are very important, they may also be degrading our gut microbial ecosystems, and so one day we may come to regard antibiotics with the same horror that we currently reserve for those metal tools that the Egyptians used to use to mush up the brains before they drained them out for embalming.","So I mentioned that microbes have all these important functions, and they've also now, just over the past few years, been connected to a whole range of different diseases, including inflammatory bowel disease, heart disease, colon cancer, and even obesity.","Obesity has a really large effect, as it turns out, and today, we can tell whether you're lean or obese with 90 percent accuracy by looking at the microbes in your gut.","Now, although that might sound impressive, in some ways it's a little bit problematic as a medical test, because you can probably tell which of these people is obese without knowing anything about their gut microbes, but it turns out that even if we sequence their complete genomes and had all their human DNA, we could only predict which one was obese with about 60 percent accuracy.","So that's amazing, right?","What it means that the three pounds of microbes that you carry around with you may be more important for some health conditions than every single gene in your genome.","And then in mice, we can do a lot more.","So in mice, microbes have been linked to all kinds of additional conditions, including things like multiple sclerosis, depression, autism, and again, obesity.","But how can we tell whether these microbial differences that correlate with disease are cause or effect?","Well, one thing we can do is we can raise some mice without any microbes of their own in a germ-free bubble.","Then we can add in some microbes that we think are important, and see what happens.","When we take the microbes from an obese mouse and transplant them into a genetically normal mouse that's been raised in a bubble with no microbes of its own, it becomes fatter than if it got them from a regular mouse.","Why this happens is absolutely amazing, though.","Sometimes what's going on is that the microbes are helping them digest food more efficiently from the same diet, so they're taking more energy from their food, but other times, the microbes are actually affecting their behavior.","What they're doing is they're eating more than the normal mouse, so they only get fat if we let them eat as much as they want.","So this is really remarkable, right?","The implication is that microbes can affect mammalian behavior.","So you might be wondering whether we can also do this sort of thing across species, and it turns out that if you take microbes from an obese person and transplant them into mice you've raised germ-free, those mice will also become fatter than if they received the microbes from a lean person, but we can design a microbial 
community that we inoculate them with that prevents them from gaining this weight.","We can also do this for malnutrition.","So in a project funded by the Gates Foundation, what we're looking at is children in Malawi who have kwashiorkor, a profound form of malnutrition, and mice that get the kwashiorkor community transplanted into them lose 30 percent of their body mass in just three weeks, but we can restore their health by using the same peanut butter-based supplement that is used for the children in the clinic, and the mice that receive the community from the healthy identical twins of the kwashiorkor children do fine.","This is truly amazing because it suggests that we can pilot therapies by trying them out in a whole bunch of different mice with individual people's gut communities and perhaps tailor those therapies all the way down to the individual level.","So I think it's really important that everyone has a chance to participate in this discovery.","So, a couple of years ago, we started this project called American Gut, which allows you to claim a place for yourself on this microbial map.","This is now the largest crowd-funded science project that we know of -- over 8,000 people have signed up at this point.","What happens is, they send in their samples, we sequence the DNA of their microbes and then release the results back to them.","We also release them, de-identified, to scientists, to educators, to interested members of the general public, and so forth, so anyone can have access to the data.","On the other hand, when we do tours of our lab at the BioFrontiers Institute, and we explain that we use robots and lasers to look at poop, it turns out that not everyone wants to know.","(Laughter) But I'm guessing that many of you do, and so I brought some kits here if you're interested in trying this out for yourself.","So why might we want to do this?","Well, it turns out that microbes are not just important for finding out where we are in terms of our health, but they can actually cure disease.","This is one of the newest things we've been able to visualize with colleagues at the University of Minnesota.","So here's that map of the human microbiome again.","What we're looking at now -- I'm going to add in the community of some people with C. 
diff.","So, this is a terrible form of diarrhea where you have to go up to 20 times a day, and these people have failed antibiotic therapy for two years before they're eligible for this trial.","So what would happen if we transplanted some of the stool from a healthy donor, that star down at the bottom, into these patients.","Would the good microbes do battle with the bad microbes and help to restore their health?","So let's watch exactly what happens there.","Four of those patients are about to get a transplant from that healthy donor at the bottom, and what you can see is that immediately, you have this radical change in the gut community.","So one day after you do that transplant, all those symptoms clear up, the diarrhea vanishes, and they're essentially healthy again, coming to resemble the donor's community, and they stay there.","(Applause)","So we're just at the beginning of this discovery.","We're just finding out that microbes have implications for all these different kinds of diseases, ranging from inflammatory bowel disease to obesity, and perhaps even autism and depression.","What we need to do, though, is we need to develop a kind of microbial GPS, where we don't just know where we are currently but also where we want to go and what we need to do in order to get there, and we need to be able to make this simple enough that even a child can use it.","(Laughter)","Thank you.","(Applause)"],"10":["(Applause)","(Music)","(Applause)"],"11":["I have the feeling that we can all agree that we're moving towards a new model of the state and society.","But, we're absolutely clueless as to what this is or what it should be.","It seems like we need to have a conversation about democracy","in our day and age.","Let's think about it this way: We are 21st-century citizens, doing our very, very best to interact with 19th century-designed institutions that are based on an information technology of the 15th century.","Let's have a look at some of the characteristics of this system.","First of all, it's designed for an information technology that's over 500 years old.","And the best possible system that could be designed for it is one where the few make daily decisions in the name of the many.","And the many get to vote once every couple of years.","In the second place, the costs of participating in this system are incredibly high.","You either have to have a fair bit of money and influence, or you have to devote your entire life to politics.","You have to become a party member and slowly start working up the ranks until maybe, one day, you'll get to sit at a table where a decision is being made.","And last but not least, the language of the system \u2014 it's incredibly cryptic.","It's done for lawyers, by lawyers,","and no one else can understand.","So, it's a system where we can choose our authorities, but we are completely left out on how those authorities reach their decisions.","So, in a day where a new information technology allows us to participate globally in any conversation, our barriers of information are completely lowered and we can, more than ever before, express our desires and our concerns.","Our political system remains the same for the past 200 years and expects us to be contented with being simply passive recipients","of a monologue.","So, it's really not surprising that this kind of system is only able to produce two kinds of results: silence or noise.","Silence, in terms of citizens not engaging, simply not wanting to participate.","There's this commonplace [idea] that I 
truly, truly dislike, and it's this idea that we citizens are naturally apathetic.","That we shun commitment.","But, can you really blame us for not jumping at the opportunity of going to the middle of the city in the middle of a working day to attend, physically, a public hearing that has no impact whatsoever?","Conflict is bound to happen between a system that no longer represents, nor has any dialogue capacity, and citizens that are increasingly used to representing themselves.","And, then we find noise: Chile, Argentina, Brazil, Mexico Italy, France, Spain, the United States, they're all democracies.","Their citizens have access to the ballot boxes.","But they still feel the need,","they need to take to the streets in order to be heard.","To me, it seems like the 18th-century slogan that was the basis for the formation of our modern democracies, \"No taxation without representation,\" can now be updated to \"No representation without a conversation.\"","We want our seat at the table.","And rightly so.","But in order to be part of this conversation, we need to know what we want to do next, because political action is being able to move from agitation to construction.","My generation has been incredibly good at using new networks and technologies to organize protests, protests that were able to successfully impose agendas, roll back extremely pernicious legislation, and even overthrow authoritarian governments.","And we should be immensely proud of this.","But, we also must admit that we haven't been good at using those same networks and technologies to successfully articulate an alternative to what we're seeing and find the consensus and build the alliances that are needed","to make it happen.","And so the risk that we face is that we can create these huge power vacuums that will very quickly get filled up by de facto powers, like the military or highly motivated and already organized groups","that generally lie on the extremes.","But our democracy is neither just a matter of voting once every couple of years.","But it's not either the ability to bring millions onto the streets.","So the question I'd like to raise here, and I do believe it's the most important question we need to answer, is this one: If Internet is the new printing press, then what is democracy for the Internet era?","What institutions do we want to build","for the 21st-century society?","I don't have the answer, just in case.","I don't think anyone does.","But I truly believe we can't afford to ignore this question anymore.","So, I'd like to share our experience and what we've learned so far and hopefully contribute two cents","to this conversation.","Two years ago, with a group of friends from Argentina, we started thinking, \"how can we get our representatives, our elected representatives, to represent us?\"","Marshall McLuhan once said that politics is solving today's problems with yesterday's tools.","So the question that motivated us was, can we try and solve some of today's problems with the tools that we use every single day of our lives?","Our first approach was to design and develop a piece of software called DemocracyOS.","DemocracyOS is an open-source web application that is designed to become a bridge between citizens and their elected representatives","to make it easier for us to participate from our everyday lives.","So first of all, you can get informed so every new project that gets introduced in Congress gets immediately translated and explained in plain language on this platform.","But we all know 
that social change is not going to come from just knowing more information, but from doing something with it.","So better access to information should lead to a conversation about what we're going to do next, and DemocracyOS allows for that.","Because we believe that democracy is not just a matter of stacking up preferences, one on top of each other, but that our healthy and robust public debate","should be, once again, one of its fundamental values.","So DemocracyOS is about persuading and being persuaded.","It's about reaching a consensus as much as finding a proper way of channeling our disagreement.","And finally, you can vote how you would like your elected representative to vote.","And if you do not feel comfortable voting on a certain issue, you can always delegate your vote to someone else, allowing","for a dynamic and emerging social leadership.","It suddenly became very easy for us to simply compare these results with how our representatives were voting in Congress.","But, it also became very evident that technology was not going to do the trick.","What we needed to do to was to find actors that were able to grab this distributed knowledge in society and use it to make better and more fair decisions.","So we reached out to traditional political parties and we offered them DemocracyOS.","We said, \"Look, here you have a platform that you can use to build a two-way conversation with your constituencies.\"","And yes, we failed.","We failed big time.","We were sent to play outside like little kids.","Amongst other things, we were called naive.","And I must be honest: I think, in hindsight, we were.","Because the challenges that we face, they're not technological, they're cultural.","Political parties were never willing to change the way they make their decisions.","So it suddenly became a bit obvious that if we wanted to move forward with this idea,","we needed to do it ourselves.","And so we took quite a leap of faith, and in August last year, we founded our own political party, El Partido de la Red, or the Net Party, in the city of Buenos Aires.","And taking an even bigger leap of faith, we ran for elections in October last year with this idea: if we want a seat in Congress, our candidate, our representatives were always going to vote according to what citizens decided on DemocracyOS.","Every single project that got introduced in Congress, we were going vote according to what citizens decided on an online platform.","It was our way of hacking the political system.","We understood that if we wanted to become part of the conversation, to have a seat at the table, we needed to become valid stakeholders,","and the only way of doing it is to play by the system rules.","But we were hacking it in the sense that we were radically changing the way a political party makes its decisions.","For the first time, we were making our decisions together with those who we were","affecting directly by those decisions.","It was a very, very bold move for a two-month-old party in the city of Buenos Aires.","But it got attention.","We got 22,000 votes, that's 1.2 percent of the votes, and we came in second for the local options.","So, even if that wasn't enough to win a seat in Congress, it was enough for us to become part of the conversation, to the extent that next month, Congress, as an institution, is launching for the first time in Argentina's history, a DemocracyOS to discuss, with the citizens, three pieces of legislation: two on urban transportation and","one on the use of public space.","Of 
course, our elected representatives are not saying, \"Yes, we're going to vote according to what citizens decide,\" but they're willing to try.","They're willing to open up a new space for citizen engagement and hopefully","they'll be willing to listen as well.","Our political system can be transformed, and not by subverting it, by destroying it, but by rewiring it with the tools that","Internet affords us now.","But a real challenge is to find, to design to create, to empower those connectors that are able to innovate, to transform noise and silence into signal and finally bring our democracies","to the 21st century.","I'm not saying it's easy.","But in our experience, we actually stand a chance of making it work.","And in my heart, it's most definitely worth trying.","Thank you.","(Applause)"],"12":["I\u2019d like to dedicate this next song to Carmelo, who was put to sleep a couple of days ago, because he got too old.","But apparently he was a very nice dog and he always let the cat sleep in the dog bed.","\u266b (Dog panting noise) Heh, heh, heh, heh, heh, heh, heh, heh, heh, heh.","\u266b","\u266b I'm just a'walking my dog, singing my song, strolling along.","\u266b","\u266b Yeah, it's just me and my dog, catching some sun.","We can't go wrong.","\u266b","\u266b My life was lonely and blue.","\u266b","\u266b Yeah, I was sad as a sailor, \u266b","\u266b I was an angry 'un too.","\u266b","\u266b Then there was you -- appeared when I was entangled with youth and fear, \u266b","\u266b and nerves jingle jangled, vermouth and beer were getting me mangled up.","\u266b","\u266b But then I looked in your eyes \u266b","\u266b and I was no more a failure.","\u266b","\u266b You looked so wacky and wise.","\u266b","\u266b And I said, \"Lord, I'm happy, 'cause I'm just a'walking my dog, \u266b","\u266b catching some sun.","We can't go wrong.\"","\u266b","\u266b Yeah, it's just me and my dog, singing our song, strolling along.","\u266b","\u266b 'Cause I don't care about your hating and your doubt, \u266b","\u266b and I don't care what the politicians spout.","\u266b","\u266b If you need a companion, why, just go out to the pound, \u266b","\u266b and find yourself a hound, and make that doggie proud, \u266b","\u266b 'cause that's what it's all about.","\u266b","\u266b (Dog panting noise) Heh, heh, heh, heh, heh, heh, heh, heh, heh, heh.","\u266b","\u266b My life was tragic and sad.","\u266b","\u266b I was the archetypal loser.","\u266b","\u266b I was a pageant gone bad.","\u266b","\u266b And then there was you -- on time, and wagging your tail \u266b","\u266b in the cutest mime that you was in jail.","\u266b","\u266b I said, \"Woof, be mine!\"","and you gave a wail and then \u266b","\u266b I was no longer alone.","\u266b","\u266b And I was no more a boozer.","\u266b","\u266b We'll make the happiest home.","\u266b","\u266b And I said, \"Lord, I'm happy, 'cause I\u2019m just a'walking my dog, \u266b","\u266b singing my song, strolling along.\"","\u266b","\u266b Yeah, it's just me and my dog, catching some sun.","We can't go wrong, \u266b","\u266b 'cause I don't care about your hating and your doubt, \u266b","\u266b and I don\u2019t care what the politicians spout.","\u266b","\u266b If you need a companion, why, just go out to the pound, \u266b","\u266b and find yourself a hound, and make that doggie proud, \u266b","\u266b 'cause that's what it's all about, \u266b","\u266b that's what it's all about, \u266b","\u266b that's what it's all abou-BOW-WOW-WOW-WOW \u266b","\u266b that's what it's all 
about.","\u266b","\u266b (Dog panting noise) Heh, heh, heh, heh, heh.","\u266b","Good dog!","Thank you."],"13":["I'm here to talk to you about how globalized we are, how globalized we aren't, and why it's important to actually be accurate in making those kinds of assessments.","And the leading point of view on this, whether measured by number of books sold, mentions in media, or surveys that I've run with groups ranging from my students to delegates to the World Trade Organization, is this view that national borders really don't matter very much anymore, cross-border integration is close to complete, and we live in one world.","And what's interesting about this view is, again, it's a view that's held by pro-globalizers like Tom Friedman, from whose book this quote is obviously excerpted, but it's also held by anti-globalizers, who see this giant globalization tsunami that's about to wreck all our lives if it hasn't already done so.","The other thing I would add is that this is not a new view.","I'm a little bit of an amateur historian, so I've spent some time going back, trying to see the first mention of this kind of thing.","And the best, earliest quote that I could find was one from David Livingstone, writing in the 1850s about how the railroad, the steam ship, and the telegraph were integrating East Africa perfectly with the rest of the world.","Now clearly, David Livingstone was a little bit ahead of his time, but it does seem useful to ask ourselves, \"Just how global are we?\"","before we think about where we go from here.","So the best way I've found of trying to get people to take seriously the idea that the world may not be flat, may not even be close to flat, is with some data.","So one of the things I've been doing over the last few years is really compiling data on things that could either happen within national borders or across national borders, and I've looked at the cross-border component as a percentage of the total.","I'm not going to present all the data that I have here today, but let me just give you a few data points.","I'm going to talk a little bit about one kind of information flow, one kind of flow of people, one kind of flow of capital, and, of course, trade in products and services.","So let's start off with plain old telephone service.","Of all the voice-calling minutes in the world last year, what percentage do you think were accounted for by cross-border phone calls?","Pick a percentage in your own mind.","The answer turns out to be two percent.","If you include Internet telephony, you might be able to push this number up to six or seven percent, but it's nowhere near what people tend to estimate.","Or let's turn to people moving across borders.","One particular thing we might look at, in terms of long-term flows of people, is what percentage of the world's population is accounted for by first-generation immigrants?","Again, please pick a percentage.","Turns out to be a little bit higher.","It's actually about three percent.","Or think of investment.","Take all the real investment that went on in the world in 2010.","What percentage of that was accounted for by foreign direct investment?","Not quite ten percent.","And then finally, the one statistic that I suspect many of the people in this room have seen: the export-to-GDP ratio.","If you look at the official statistics, they typically indicate a little bit above 30 percent.","However, there's a big problem with the official statistics, in that if, for instance, a Japanese component supplier ships something 
to China to be put into an iPod, and then the iPod gets shipped to the U.S., that component ends up getting counted multiple times.","So nobody knows how bad this bias with the official statistics actually is, so I thought I would ask the person who's spearheading the effort to generate data on this, Pascal Lamy, the Director of the World Trade Organization, what his best guess would be of exports as a percentage of GDP, without the double- and triple-counting, and it's actually probably a bit under 20 percent, rather than the 30 percent-plus numbers that we're talking about.","So it's very clear that if you look at these numbers or all the other numbers that I talk about in my book, \"World 3.0,\" that we're very, very far from the no-border effect benchmark, which would imply internationalization levels of the order of 85, 90, 95 percent.","So clearly, apocalyptically-minded authors have overstated the case.","But it's not just the apocalyptics, as I think of them, who are prone to this kind of overstatement.","I've also spent some time surveying audiences in different parts of the world on what they actually guess these numbers to be.","Let me share with you the results of a survey that Harvard Business Review was kind enough to run of its readership as to what people's guesses along these dimensions actually were.","So a couple of observations stand out for me from this slide.","First of all, there is a suggestion of some error.","Okay.","(Laughter) Second, these are pretty large errors.","For four quantities whose average value is less than 10 percent, you have people guessing three, four times that level.","Even though I'm an economist, I find that a pretty large error.","And third, this is not just confined to the readers of the Harvard Business Review.","I've run several dozen such surveys in different parts of the world, and in all cases except one, where a group actually underestimated the trade-to-GDP ratio, people have this tendency towards overestimation, and so I thought it important to give a name to this, and that's what I refer to as globaloney, the difference between the dark blue bars and the light gray bars.","Especially because, I suspect, some of you may still be a little bit skeptical of the claims, I think it's important to just spend a little bit of time thinking about why we might be prone to globaloney.","A couple of different reasons come to mind.","First of all, there's a real dearth of data in the debate.","Let me give you an example.","When I first published some of these data a few years ago in a magazine called Foreign Policy, one of the people who wrote in, not entirely in agreement, was Tom Friedman.","And since my article was titled \"Why the World Isn't Flat,\" that wasn't too surprising.","(Laughter) What was very surprising to me was Tom's critique, which was, \"Ghemawat's data are narrow.\"","And this caused me to scratch my head, because as I went back through his several-hundred-page book, I couldn't find a single figure, chart, table, reference or footnote.","So my point is, I haven't presented a lot of data here to convince you that I'm right, but I would urge you to go away and look for your own data to try and actually assess whether some of these hand-me-down insights that we've been bombarded with actually are correct.","So dearth of data in the debate is one reason.","A second reason has to do with peer pressure.","I remember, I decided to write my \"Why the World Isn't Flat\" article, because I was being interviewed on TV in Mumbai, and the 
interviewer's first question to me was, \"Professor Ghemawat, why do you still believe that the world is round?\"","And I started laughing, because I hadn't come across that formulation before.","(Laughter) And as I was laughing, I was thinking, I really need a more coherent response, especially on national TV.","I'd better write something about this.","(Laughter) But what I can't quite capture for you was the pity and disbelief with which the interviewer asked her question.","The perspective was, here is this poor professor.","He's clearly been in a cave for the last 20,000 years.","He really has no idea as to what's actually going on in the world.","So try this out with your friends and acquaintances, if you like.","You'll find that it's very cool to talk about the world being one, etc.","If you raise questions about that formulation, you really are considered a bit of an antique.","And then the final reason, which I mention, especially to a TED audience, with some trepidation, has to do with what I call \"techno-trances.\"","If you listen to techno music for long periods of time, it does things to your brainwave activity.","(Laughter) Something similar seems to happen with exaggerated conceptions of how technology is going to overpower in the very immediate run all cultural barriers, all political barriers, all geographic barriers, because at this point I know you aren't allowed to ask me questions, but when I get to this point in my lecture with my students, hands go up, and people ask me, \"Yeah, but what about Facebook?\"","And I got this question often enough that I thought I'd better do some research on Facebook.","Because, in some sense, it's the ideal kind of technology to think about.","Theoretically, it makes it as easy to form friendships halfway around the world as opposed to right next door.","What percentage of people's friends on Facebook are actually located in countries other than where people we're analyzing are based?","The answer is probably somewhere between 10 to 15 percent.","Non-negligible, so we don't live in an entirely local or national world, but very, very far from the 95 percent level that you would expect, and the reason's very simple.","We don't, or I hope we don't, form friendships at random on Facebook.","The technology is overlaid on a pre-existing matrix of relationships that we have, and those relationships are what the technology doesn't quite displace.","Those relationships are why we get far fewer than 95 percent of our friends being located in countries other than where we are.","So does all this matter?","Or is globaloney just a harmless way of getting people to pay more attention to globalization-related issues?","I want to suggest that actually, globaloney can be very harmful to your health.","First of all, recognizing that the glass is only 10 to 20 percent full is critical to seeing that there might be potential for additional gains from additional integration, whereas if we thought we were already there, there would be no particular point to pushing harder.","It's a little bit like, we wouldn't be having a conference on radical openness if we already thought we were totally open to all the kinds of influences that are being talked about at this conference.","So being accurate about how limited globalization levels are is critical to even being able to notice that there might be room for something more, something that would contribute further to global welfare.","Which brings me to my second point.","Avoiding overstatement is also very 
helpful because it reduces and in some cases even reverses some of the fears that people have about globalization.","So I actually spend most of my \"World 3.0\" book working through a litany of market failures and fears that people have that they worry globalization is going to exacerbate.","I'm obviously not going to be able to do that for you today, so let me just present to you two headlines as an illustration of what I have in mind.","Think of France and the current debate about immigration.","When you ask people in France what percentage of the French population is immigrants, the answer is about 24 percent.","That's their guess.","Maybe realizing that the number is just eight percent might help cool some of the superheated rhetoric that we see around the immigration issue.","Or to take an even more striking example, when the Chicago Council on Foreign Relations did a survey of Americans, asking them to guess what percentage of the federal budget went to foreign aid, the guess was 30 percent, which is slightly in excess of the actual level \u2014 (\"actually about ... 1%\") (Laughter) \u2014 of U.S. governmental commitments to federal aid.","The reassuring thing about this particular survey was, when it was pointed out to people how far their estimates were from the actual data, some of them \u2014 not all of them \u2014 seemed to become more willing to consider increases in foreign aid.","So foreign aid is actually a great way of sort of wrapping up here, because if you think about it, what I've been talking about today is this notion -- very uncontroversial amongst economists -- that most things are very home-biased.","\"Foreign aid is the most aid to poor people,\" is about the most home-biased thing you can find.","If you look at the OECD countries and how much they spend per domestic poor person, and compare it with how much they spend per poor person in poor countries, the ratio \u2014 Branko Milanovic at the World Bank did the calculations \u2014 turns out to be about 30,000 to one.","Now of course, some of us, if we truly are cosmopolitan, would like to see that ratio being brought down to one-is-to-one.","I'd like to make the suggestion that we don't need to aim for that to make substantial progress from where we are.","If we simply brought that ratio down to 15,000 to one, we would be meeting those aid targets that were agreed at the Rio Summit 20 years ago that the summit that ended last week made no further progress on.","So in summary, while radical openness is great, given how closed we are, even incremental openness could make things dramatically better.","Thank you very much.","(Applause) (Applause)"],"14":["I grew up watching Star Trek.","I love Star Trek.","Star Trek made me want to see alien creatures, creatures from a far-distant world.","But basically, I figured out that I could find those alien creatures right on Earth.","And what I do is I study insects.","I'm obsessed with insects, particularly insect flight.","I think the evolution of insect flight is perhaps one of the most important events in the history of life.","Without insects, there'd be no flowering plants.","Without flowering plants, there would be no clever, fruit-eating primates giving TED Talks.","(Laughter)","Now, David and Hidehiko and Ketaki gave a very compelling story about the similarities between fruit flies and humans, and there are many similarities, and so you might think that if humans are similar to fruit flies, the favorite behavior of a fruit fly might be this, for example -- 
(Laughter) but in my talk, I don't want to emphasize on the similarities between humans and fruit flies, but rather the differences, and focus on the behaviors that I think fruit flies excel at doing.","And so I want to show you a high-speed video sequence of a fly shot at 7,000 frames per second in infrared lighting, and to the right, off-screen, is an electronic looming predator that is going to go at the fly.","The fly is going to sense this predator.","It is going to extend its legs out.","It's going to sashay away to live to fly another day.","Now I have carefully cropped this sequence to be exactly the duration of a human eye blink, so in the time that it would take you to blink your eye, the fly has seen this looming predator, estimated its position, initiated a motor pattern to fly it away, beating its wings at 220 times a second as it does so.","I think this is a fascinating behavior that shows how fast the fly's brain can process information.","Now, flight -- what does it take to fly?","Well, in order to fly, just as in a human aircraft, you need wings that can generate sufficient aerodynamic forces, you need an engine sufficient to generate the power required for flight, and you need a controller, and in the first human aircraft, the controller was basically the brain of Orville and Wilbur sitting in the cockpit.","Now, how does this compare to a fly?","Well, I spent a lot of my early career trying to figure out how insect wings generate enough force to keep the flies in the air.","And you might have heard how engineers proved that bumblebees couldn't fly.","Well, the problem was in thinking that the insect wings function in the way that aircraft wings work.","But they don't.","And we tackle this problem by building giant, dynamically scaled model robot insects that would flap in giant pools of mineral oil where we could study the aerodynamic forces.","And it turns out that the insects flap their wings in a very clever way, at a very high angle of attack that creates a structure at the leading edge of the wing, a little tornado-like structure called a leading edge vortex, and it's that vortex that actually enables the wings to make enough force for the animal to stay in the air.","But the thing that's actually most -- so, what's fascinating is not so much that the wing has some interesting morphology.","What's clever is the way the fly flaps it, which of course ultimately is controlled by the nervous system, and this is what enables flies to perform these remarkable aerial maneuvers.","Now, what about the engine?","The engine of the fly is absolutely fascinating.","They have two types of flight muscle: so-called power muscle, which is stretch-activated, which means that it activates itself and does not need to be controlled on a contraction-by-contraction basis by the nervous system.","It's specialized to generate the enormous power required for flight, and it fills the middle portion of the fly, so when a fly hits your windshield, it's basically the power muscle that you're looking at.","But attached to the base of the wing is a set of little, tiny control muscles that are not very powerful at all, but they're very fast, and they're able to reconfigure the hinge of the wing on a stroke-by-stroke basis, and this is what enables the fly to change its wing and generate the changes in aerodynamic forces which change its flight trajectory.","And of course, the role of the nervous system is to control all this.","So let's look at the controller.","Now flies excel in the sorts of 
sensors that they carry to this problem.","They have antennae that sense odors and detect wind detection.","They have a sophisticated eye which is the fastest visual system on the planet.","They have another set of eyes on the top of their head.","We have no idea what they do.","They have sensors on their wing.","Their wing is covered with sensors, including sensors that sense deformation of the wing.","They can even taste with their wings.","One of the most sophisticated sensors a fly has is a structure called the halteres.","The halteres are actually gyroscopes.","These devices beat back and forth about 200 hertz during flight, and the animal can use them to sense its body rotation and initiate very, very fast corrective maneuvers.","But all of this sensory information has to be processed by a brain, and yes, indeed, flies have a brain, a brain of about 100,000 neurons.","Now several people at this conference have already suggested that fruit flies could serve neuroscience because they're a simple model of brain function.","And the basic punchline of my talk is, I'd like to turn that over on its head.","I don't think they're a simple model of anything.","And I think that flies are a great model.","They're a great model for flies.","(Laughter)","And let's explore this notion of simplicity.","So I think, unfortunately, a lot of neuroscientists, we're all somewhat narcissistic.","When we think of brain, we of course imagine our own brain.","But remember that this kind of brain, which is much, much smaller \u2014 instead of 100 billion neurons, it has 100,000 neurons \u2014 but this is the most common form of brain on the planet and has been for 400 million years.","And is it fair to say that it's simple?","Well, it's simple in the sense that it has fewer neurons, but is that a fair metric?","And I would propose it's not a fair metric.","So let's sort of think about this.","I think we have to compare -- (Laughter) \u2014 we have to compare the size of the brain with what the brain can do.","So I propose we have a Trump number, and the Trump number is the ratio of this man's behavioral repertoire to the number of neurons in his brain.","We'll calculate the Trump number for the fruit fly.","Now, how many people here think the Trump number is higher for the fruit fly?","(Applause)","It's a very smart, smart audience.","Yes, the inequality goes in this direction, or I would posit it.","Now I realize that it is a little bit absurd to compare the behavioral repertoire of a human to a fly.","But let's take another animal just as an example.","Here's a mouse.","A mouse has about 1,000 times as many neurons as a fly.","I used to study mice.","When I studied mice, I used to talk really slowly.","And then something happened when I started to work on flies.","(Laughter) And I think if you compare the natural history of flies and mice, it's really comparable.","They have to forage for food.","They have to engage in courtship.","They have sex.","They hide from predators.","They do a lot of the similar things.","But I would argue that flies do more.","So for example, I'm going to show you a sequence, and I have to say, some of my funding comes from the military, so I'm showing this classified sequence and you cannot discuss it outside of this room.","Okay?","So I want you to look at the payload at the tail of the fruit fly.","Watch it very closely, and you'll see why my six-year-old son now wants to be a neuroscientist.","Wait for it.","Pshhew.","So at least you'll admit that if fruit flies are not as 
clever as mice, they're at least as clever as pigeons.","(Laughter)","Now, I want to get across that it's not just a matter of numbers but also the challenge for a fly to compute everything its brain has to compute with such tiny neurons.","So this is a beautiful image of a visual interneuron from a mouse that came from Jeff Lichtman's lab, and you can see the wonderful images of brains that he showed in his talk.","But up in the corner, in the right corner, you'll see, at the same scale, a visual interneuron from a fly.","And I'll expand this up.","And it's a beautifully complex neuron.","It's just very, very tiny, and there's lots of biophysical challenges with trying to compute information with tiny, tiny neurons.","How small can neurons get?","Well, look at this interesting insect.","It looks sort of like a fly.","It has wings, it has eyes, it has antennae, its legs, complicated life history, it's a parasite, it has to fly around and find caterpillars to parasatize, but not only is its brain the size of a salt grain, which is comparable for a fruit fly, it is the size of a salt grain.","So here's some other organisms at the similar scale.","This animal is the size of a paramecium and an amoeba, and it has a brain of 7,000 neurons that's so small -- you know these things called cell bodies you've been hearing about, where the nucleus of the neuron is?","This animal gets rid of them because they take up too much space.","So this is a session on frontiers in neuroscience.","I would posit that one frontier in neuroscience is to figure out how the brain of that thing works.","But let's think about this.","How can you make a small number of neurons do a lot?","And I think, from an engineering perspective, you think of multiplexing.","You can take a hardware and have that hardware do different things at different times, or have different parts of the hardware doing different things.","And these are the two concepts I'd like to explore.","And they're not concepts that I've come up with, but concepts that have been proposed by others in the past.","And one idea comes from lessons from chewing crabs.","And I don't mean chewing the crabs.","I grew up in Baltimore, and I chew crabs very, very well.","But I'm talking about the crabs actually doing the chewing.","Crab chewing is actually really fascinating.","Crabs have this complicated structure under their carapace called the gastric mill that grinds their food in a variety of different ways.","And here's an endoscopic movie of this structure.","The amazing thing about this is that it's controlled by a really tiny set of neurons, about two dozen neurons that can produce a vast variety of different motor patterns, and the reason it can do this is that this little tiny ganglion in the crab is actually inundated by many, many neuromodulators.","You heard about neuromodulators earlier.","There are more neuromodulators that alter, that innervate this structure than actually neurons in the structure, and they're able to generate a complicated set of patterns.","And this is the work by Eve Marder and her many colleagues who've been studying this fascinating system that show how a smaller cluster of neurons can do many, many, many things because of neuromodulation that can take place on a moment-by-moment basis.","So this is basically multiplexing in time.","Imagine a network of neurons with one neuromodulator.","You select one set of cells to perform one sort of behavior, another neuromodulator, another set of cells, a different pattern, and you can 
imagine you could extrapolate to a very, very complicated system.","Is there any evidence that flies do this?","Well, for many years in my laboratory and other laboratories around the world, we've been studying fly behaviors in little flight simulators.","You can tether a fly to a little stick.","You can measure the aerodynamic forces it's creating.","You can let the fly play a little video game by letting it fly around in a visual display.","So let me show you a little tiny sequence of this.","Here's a fly and a large infrared view of the fly in the flight simulator, and this is a game the flies love to play.","You allow them to steer towards the little stripe, and they'll just steer towards that stripe forever.","It's part of their visual guidance system.","But very, very recently, it's been possible to modify these sorts of behavioral arenas for physiologies.","So this is the preparation that one of my former post-docs, Gaby Maimon, who's now at Rockefeller, developed, and it's basically a flight simulator but under conditions where you actually can stick an electrode in the brain of the fly and record from a genetically identified neuron in the fly's brain.","And this is what one of these experiments looks like.","It was a sequence taken from another post-doc in the lab, Bettina Schnell.","The green trace at the bottom is the membrane potential of a neuron in the fly's brain, and you'll see the fly start to fly, and the fly is actually controlling the rotation of that visual pattern itself by its own wing motion, and you can see this visual interneuron respond to the pattern of wing motion as the fly flies.","So for the first time we've actually been able to record from neurons in the fly's brain while the fly is performing sophisticated behaviors such as flight.","And one of the lessons we've been learning is that the physiology of cells that we've been studying for many years in quiescent flies is not the same as the physiology of those cells when the flies actually engage in active behaviors like flying and walking and so forth.","And why is the physiology different?","Well it turns out it's these neuromodulators, just like the neuromodulators in that little tiny ganglion in the crabs.","So here's a picture of the octopamine system.","Octopamine is a neuromodulator that seems to play an important role in flight and other behaviors.","But this is just one of many neuromodulators that's in the fly's brain.","So I really think that, as we learn more, it's going to turn out that the whole fly brain is just like a large version of this stomatogastric ganglion, and that's one of the reasons why it can do so much with so few neurons.","Now, another idea, another way of multiplexing is multiplexing in space, having different parts of a neuron do different things at the same time.","So here's two sort of canonical neurons from a vertebrate and an invertebrate, a human pyramidal neuron from Ramon y Cajal, and another cell to the right, a non-spiking interneuron, and this is the work of Alan Watson and Malcolm Burrows many years ago, and Malcolm Burrows came up with a pretty interesting idea based on the fact that this neuron from a locust does not fire action potentials.","It's a non-spiking cell.","So a typical cell, like the neurons in our brain, has a region called the dendrites that receives input, and that input sums together and will produce action potentials that run down the axon and then activate all the output regions of the neuron.","But non-spiking neurons are actually quite 
complicated because they can have input synapses and output synapses all interdigitated, and there's no single action potential that drives all the outputs at the same time.","So there's a possibility that you have computational compartments that allow the different parts of the neuron to do different things at the same time.","So these basic concepts of multitasking in time and multitasking in space, I think these are things that are true in our brains as well, but I think the insects are the true masters of this.","So I hope you think of insects a little bit differently next time, and as I say up here, please think before you swat.","(Applause)"]},"talk_id":{"0":"halla_tomasdottir_it_s_time_for_women_to_run_for_office","1":"dean_kamen_the_emotion_behind_invention","2":"mark_applebaum_the_mad_scientist_of_music","3":"laura_indolfi_good_news_in_the_fight_against_pancreatic_cancer","4":"nina_tandon_caring_for_cells","5":"linus_torvalds_the_mind_behind_linux","6":"amanda_palmer_jherek_bischoff_usman_riaz_space_oddity","7":"eve_ensler_embrace_your_inner_girl","8":"tom_shannon_the_painter_and_the_pendulum","9":"rob_knight_how_our_microbes_make_us_who_we_are","10":"kenichi_ebina_s_magic_moves","11":"pia_mancini_how_to_upgrade_democracy_for_the_internet_era","12":"nellie_mckay_sings_the_dog_song","13":"pankaj_ghemawat_actually_the_world_isn_t_flat","14":"michael_dickinson_how_a_fly_flies"}} \ No newline at end of file diff --git a/generate_text_api.py b/generate_text_api.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8de826382699ef2849ce867a746b8548f28f63 --- /dev/null +++ b/generate_text_api.py @@ -0,0 +1,114 @@ +import json + +import aiohttp + + +class TextGenerator: + def __init__(self, host_url): + self.host_url = host_url.rstrip("/") + "/generate" + self.host_url_stream = host_url.rstrip("/") + "/generate_stream" + + async def generate_text_async(self, prompt, max_new_tokens=100, do_sample=True, temperature=0.8): + payload = { + 'inputs': prompt, + 'parameters': { + 'max_new_tokens': max_new_tokens, + 'do_sample': do_sample, + 'temperature': temperature, + } + } + + headers = { + 'Content-Type': 'application/json' + } + + async with aiohttp.ClientSession() as session: + async with session.post(self.host_url, data=json.dumps(payload), headers=headers) as response: + if response.status == 200: + data = await response.json() + text = data["generated_text"] + return text + else: + # Handle error responses here + return None + + def generate_text(self, prompt, max_new_tokens=100, do_sample=True, temperature=0.8): + import requests + + payload = { + 'inputs': prompt, + 'parameters': { + 'max_new_tokens': max_new_tokens, + 'do_sample': do_sample, + 'temperature': temperature, + } + } + + headers = { + 'Content-Type': 'application/json' + } + + response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json() + text = response["generated_text"] + return text + + def generate_text_stream(self, prompt, max_new_tokens=100, do_sample=True, temperature=0.8, stop=[], best_of=1): + import requests + + payload = { + 'inputs': prompt, + 'parameters': { + 'max_new_tokens': max_new_tokens, + 'do_sample': do_sample, + 'temperature': temperature, + 'stop': stop, + 'best_of': best_of, + } + } + + headers = { + 'Content-Type': 'application/json', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive' + } + + response = requests.post(self.host_url_stream, data=json.dumps(payload), headers=headers, stream=True) + + for line in response.iter_lines(): + if line: 
+ print(line) + json_data = line.decode('utf-8') + if json_data.startswith('data:'): + print(json_data) + json_data = json_data[5:] + token_data = json.loads(json_data) + token = token_data['token']['text'] + if not token_data['token']['special']: + yield token + +class SummarizerGenerator: + def __init__(self, api): + self.api = api + + def generate_summary_stream(self, text): + import requests + payload = {"text": text} + + headers = { + 'Content-Type': 'application/json', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive' + } + + response = requests.post(self.api, data=json.dumps(payload), headers=headers, stream=True) + + i = 1 + for line in response.iter_lines(): + if line: + print(line) + data = line.decode('utf-8').removesuffix('<|eot_id|>') + if data.startswith("•"): + data = data.replace("•", "-") + data += "\n\n" if i < 3 else "" + yield data + i += 1 \ No newline at end of file diff --git a/model_inferences/utils/chunking.py b/model_inferences/utils/chunking.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd66956199992dbfe7268c9455f1954dfb2e9c2 --- /dev/null +++ b/model_inferences/utils/chunking.py @@ -0,0 +1,115 @@ +import re +from functools import partial + +import nltk + + +def get_len(tokenizer, text): + return len(tokenizer.encode(text, add_special_tokens=False)) + +class Truncater: + def __init__(self, tokenizer, *, max_length): + self.max_length = max_length + self.tokenizer = tokenizer + + def __call__(self, text): + return self.truncate(text) + + def truncate(self, text): + input_ids = self.tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=self.max_length) + return self.tokenizer.decode(input_ids) + +class Refiner: + def __init__(self, tokenizer, *, chunk_size, max_chunk_size): + assert chunk_size <= max_chunk_size + + self.chunk_size = chunk_size + self.max_chunk_size = max_chunk_size + + self.tokenizer = tokenizer + self.get_len = partial(get_len, tokenizer) + + self.current_summary = None + self.chunks = [] + + self.initial_prompt = "" + self.chunk_prefix = "" + self.summary_prefix = "" + self.refinement_prompt = "" + + def set_prompts(self, *, initial_prompt="", chunk_prefix="", summary_prefix="", refinement_prompt=""): + self.initial_prompt = initial_prompt + self.chunk_prefix = chunk_prefix + self.summary_prefix = summary_prefix + self.refinement_prompt = refinement_prompt + + @property + def current_prompt(self): + if self.current_summary is None: + return self.initial_prompt + else: + return self.refinement_prompt + + def __call__(self, text): + self.chunks = Chunker.chunk_text(text, self.chunk_size, self.max_chunk_size, self.get_len) + return self.refine(text) + + def __len__(self): + return len(self.chunks) + + def refine(self, text): + for chunk in self.chunks: + if self.current_summary is None: + yield chunk + else: + summary = self.summary_prefix + self.current_summary + chunk = self.chunk_prefix + chunk + yield summary + "\n\n" + chunk + + def set_current_summary(self, summary): + self.current_summary = summary + +class Chunker: + def __init__(self, tokenizer, *, chunk_size, max_chunk_size): + assert chunk_size <= max_chunk_size + + self.chunk_size = chunk_size # target chunk size + self.max_chunk_size = max_chunk_size # hard limit + self.tokenizer = tokenizer + self.get_len = partial(get_len, tokenizer) + + def __call__(self, text): + return Chunker.chunk_text(text, self.chunk_size, self.max_chunk_size, self.get_len) + + @staticmethod + def chunk_text(text, chunk_size, max_chunk_size, len_fn): + 
paragraphs = re.split("\n\n|\n(?=[^\n])", text) + text = " ".join(paragraphs) + sentences = nltk.sent_tokenize(text) + sentences = [s.strip() for s in sentences] + chunks = [] + Chunker._chunk_text(sentences, chunks, chunk_size, max_chunk_size, len_fn) + return chunks + + @staticmethod + def _chunk_text(sentences, chunks, chunk_size, max_chunk_size, len_fn): + if not sentences: + return + + remaining_text = " ".join(sentences) + if len_fn(remaining_text) <= max_chunk_size: + chunks.append(remaining_text) + return + + index = 0 + length_so_far = 0 + while index < len(sentences) and length_so_far + len_fn(sentences[index]) <= chunk_size: + length_so_far += len_fn(sentences[index]) + index += 1 + + if index == 0: + raise ValueError("No chunking possible") + else: + chunk = " ".join(sentences[:index]) + chunks.append(chunk) + Chunker._chunk_text(sentences[index:], chunks, chunk_size, max_chunk_size, len_fn) diff --git a/model_inferences/utils/files.py b/model_inferences/utils/files.py new file mode 100644 index 0000000000000000000000000000000000000000..d295f2955a9106ed502ac38bde71aa21dcfc8edb --- /dev/null +++ b/model_inferences/utils/files.py @@ -0,0 +1,29 @@ +import json +from datetime import date + +import webvtt + + +def get_seconds(time_str): + h, m, s_ms = time_str.split(':') + s, ms = s_ms.split('.') + return int(h) * 3600 + int(m) * 60 + int(s) + float('0.' + ms) + +def get_transcript(path): + with open(path) as f: + transcript = f.read() + transcript = transcript.replace('\n', ' ') + return transcript + +def get_captions_from_vtt(path): + vtt = webvtt.read(path) + return [{'start': get_seconds(caption.start), 'end': get_seconds(caption.end), 'text': caption.text} for caption in vtt] + +def save_summary(summary, path, filename, config): + path.mkdir(parents=True, exist_ok=True) + with open(path / filename, "w+") as f: + f.write(summary) + with open(path / "config.txt", "w+") as f: + config["date"] = str(date.today()) + json.dump(config, f, indent=4) + print("Saved", path / filename) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95fc340f1fd35e53d25136c0f65c08bed6d87c8b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests +pandas +nltk \ No newline at end of file
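A minimal usage sketch of the chunking utility added in the diff above, assuming the package layout is importable the same way app.py imports model_inferences.utils.files, and that NLTK's punkt sentence data is available. It exercises the static Chunker.chunk_text directly; word_len is a hypothetical stand-in for the tokenizer-based get_len helper (which expects a Hugging Face tokenizer), and the sample text is invented for illustration only.

    import nltk

    from model_inferences.utils.chunking import Chunker

    # Sentence splitter used inside Chunker.chunk_text (via nltk.sent_tokenize).
    nltk.download("punkt", quiet=True)

    def word_len(text: str) -> int:
        # Hypothetical stand-in for get_len(tokenizer, text): counts whitespace
        # tokens instead of model tokens, just to keep the sketch dependency-free.
        return len(text.split())

    sample_text = (
        "Long lecture transcripts rarely fit into a single model context window. "
        "The chunker packs whole sentences up to a target token budget. "
        "Remaining sentences are processed recursively until everything is assigned."
    )

    # Greedily packs whole sentences toward chunk_size (target) while the remaining
    # text is still larger than max_chunk_size (hard limit), then recurses on the rest.
    chunks = Chunker.chunk_text(sample_text, chunk_size=15, max_chunk_size=20, len_fn=word_len)
    for chunk in chunks:
        print(word_len(chunk), chunk)

With these budgets the first sentence (11 words) becomes its own chunk, and the last two sentences (20 words together) fall under max_chunk_size and are emitted as a single final chunk, which is the fallback branch of _chunk_text.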