ScientiaEtVeritas committed on
Commit
c57bf8a
1 Parent(s): 2412f21

initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. app.py +322 -0
  3. demo_data/lectures/Lecture-01-18.04.2023/English.vtt +2582 -0
  4. demo_data/lectures/Lecture-01-18.04.2023/video.mp4 +3 -0
  5. demo_data/lectures/Lecture-02-20.04.2023/English.vtt +2984 -0
  6. demo_data/lectures/Lecture-02-20.04.2023/video.mp4 +3 -0
  7. demo_data/lectures/Lecture-03-25.04.2023/English.vtt +3102 -0
  8. demo_data/lectures/Lecture-03-25.04.2023/video.mp4 +3 -0
  9. demo_data/lectures/Lecture-04-27.04.2023/English.vtt +2919 -0
  10. demo_data/lectures/Lecture-04-27.04.2023/video.mp4 +3 -0
  11. demo_data/lectures/Lecture-05-02.05.2023/English.vtt +1124 -0
  12. demo_data/lectures/Lecture-05-02.05.2023/video.mp4 +3 -0
  13. demo_data/lectures/Lecture-06-09.05.2023/English.vtt +2970 -0
  14. demo_data/lectures/Lecture-06-09.05.2023/video.mp4 +3 -0
  15. demo_data/lectures/Lecture-07-11.05.2023/English.vtt +2596 -0
  16. demo_data/lectures/Lecture-07-11.05.2023/video.mp4 +3 -0
  17. demo_data/lectures/Lecture-07-16.05.2023/English.vtt +2523 -0
  18. demo_data/lectures/Lecture-07-16.05.2023/video.mp4 +3 -0
  19. demo_data/lectures/Lecture-09-25.05.2023/English.vtt +3039 -0
  20. demo_data/lectures/Lecture-09-25.05.2023/video.mp4 +3 -0
  21. demo_data/lectures/Lecture-10-13.06.2023/English.vtt +2458 -0
  22. demo_data/lectures/Lecture-10-13.06.2023/video.mp4 +3 -0
  23. demo_data/lectures/Lecture-11-15.06.2023/English.vtt +0 -0
  24. demo_data/lectures/Lecture-11-15.06.2023/video.mp4 +3 -0
  25. demo_data/lectures/Lecture-12-20.06.2023/English.vtt +0 -0
  26. demo_data/lectures/Lecture-12-20.06.2023/video.mp4 +3 -0
  27. demo_data/lectures/Lecture-13-04.07.2023/English.vtt +2699 -0
  28. demo_data/lectures/Lecture-13-04.07.2023/video.mp4 +3 -0
  29. demo_data/lectures/Lecture-14-27.06.2023/English.vtt +2753 -0
  30. demo_data/lectures/Lecture-14-27.06.2023/video.mp4 +3 -0
  31. demo_data/lectures/Lecture-15-11.07.2023/English.vtt +2295 -0
  32. demo_data/lectures/Lecture-15-11.07.2023/video.mp4 +3 -0
  33. demo_data/lectures/Lecture-18-18.07.2023/English.vtt +2738 -0
  34. demo_data/lectures/Lecture-18-18.07.2023/video.mp4 +3 -0
  35. demo_data/lectures/Lecture-19-21.07.2023/English.vtt +2860 -0
  36. demo_data/lectures/Lecture-19-21.07.2023/video.mp4 +3 -0
  37. demo_data/nips-2021/25953/metadata.json +3 -0
  38. demo_data/nips-2021/25953/transcript_whisper_large-v2.vtt +581 -0
  39. demo_data/nips-2021/25953/video.mp4 +3 -0
  40. demo_data/nips-2021/25957/metadata.json +3 -0
  41. demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt +539 -0
  42. demo_data/nips-2021/25957/video.mp4 +3 -0
  43. demo_data/nips-2021/25958/metadata.json +3 -0
  44. demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt +374 -0
  45. demo_data/nips-2021/25958/video.mp4 +3 -0
  46. demo_data/nips-2021/25959/metadata.json +3 -0
  47. demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt +353 -0
  48. demo_data/nips-2021/25959/video.mp4 +3 -0
  49. demo_data/nips-2021/25962/metadata.json +3 -0
  50. demo_data/nips-2021/25962/transcript_whisper_large-v2.vtt +155 -0
.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  video.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.psd filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_data/lectures/*/*.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_data/*/.mp4 filter=lfs diff=lfs merge=lfs -text
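For orientation, here is a rough, hypothetical sketch of what the two path-specific patterns cover. Real .gitattributes matching follows gitignore-style rules, so the pathlib matching below is only an approximation, and the broad *.mp4 rule already covers every .mp4 in the repository.

# Illustration only: approximate the path-specific LFS patterns with pathlib matching.
from pathlib import PurePosixPath

examples = [
    "demo_data/lectures/Lecture-01-18.04.2023/video.mp4",
    "demo_data/nips-2021/25953/video.mp4",
]

for path in examples:
    p = PurePosixPath(path)
    print(path,
          p.match("demo_data/lectures/*/*.mp4"),  # matches the lecture videos
          p.match("demo_data/*/.mp4"))            # matches only files literally named ".mp4"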
app.py ADDED
@@ -0,0 +1,322 @@
+ import itertools
+ import json
+ import re
+ from functools import partial
+ from pathlib import Path
+
+ import requests
+ import streamlit as st
+ import webvtt
+ from load_data import get_partition
+ from transformers import AutoTokenizer
+
+ from generate_text_api import TextGenerator
+ from model_inferences.utils.chunking import Truncater
+ from model_inferences.utils.files import get_captions_from_vtt, get_transcript
+
+ USE_PARAGRAPHING_MODEL = True
+
+ def get_sublist_by_flattened_index(A, i):
+     current_index = 0
+     for sublist in A:
+         sublist_length = len(sublist)
+         if current_index <= i < current_index + sublist_length:
+             return sublist, A.index(sublist)
+         current_index += sublist_length
+     return None, None
+
+
+ def get_talk_metadata(video_id):
+     url = "https://www.ted.com/graphql"
+
+     headers = {
+         "Content-Type": "application/json",
+         "Accept": "application/json",
+         "x-operation-name": "Transcript",  # Replace with the actual operation name
+     }
+
+     data = {
+         "query": """
+             query GetTalk($videoId: ID!) {
+                 video(id: $videoId) {
+                     title,
+                     presenterDisplayName,
+                     nativeDownloads {medium}
+                 }
+             }
+         """,
+         "variables": {
+             "videoId": video_id,  # Corrected key to "videoId"
+         },
+     }
+
+     response = requests.post(url, json=data, headers=headers)
+
+     if response.status_code == 200:
+         result = response.json()
+         return result
+     else:
+         print(f"Error: {response.status_code}, {response.text}")
+
+
+ class OfflineTextSegmenterClient:
+     def __init__(self, host_url):
+         self.host_url = host_url.rstrip("/") + "/segment"
+
+     def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
+         payload = {
+             'text': text,
+             'captions': captions,
+             'generate_titles': generate_titles,
+             "prefix_titles": True,
+             "threshold": threshold,
+         }
+
+         headers = {
+             'Content-Type': 'application/json'
+         }
+
+         response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
+         # segments = response["annotated_segments"] if "annotated_segments" in response else response["segments"]
+         return {'segments': response["segments"], 'titles': response["titles"], 'sentences': response["sentences"]}
+
+
+ class Toc:
+
+     def __init__(self):
+         self._items = []
+         self._placeholder = None
+
+     def title(self, text):
+         self._markdown(text, "h1")
+
+     def header(self, text):
+         self._markdown(text, "h2", " " * 2)
+
+     def subheader(self, text):
+         self._markdown(text, "h3", " " * 4)
+
+     def placeholder(self, sidebar=False):
+         self._placeholder = st.sidebar.empty() if sidebar else st.empty()
+
+     def generate(self):
+         if self._placeholder:
+             self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
+
+     def _markdown(self, text, level, space=""):
+         key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())
+         st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
+         self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
+
+
+ custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
+ st.write(custom_css, unsafe_allow_html=True)
+
+ def concat_prompt(prompt_text, text, model_name):
+     if 'flan' in model_name:
+         input_ = prompt_text + "\n\n" + text
+     elif 'galactica' in model_name:
+         input_ = text + "\n\n" + prompt_text
+     return input_
+
+ endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
+ ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
+
+ client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
+ if USE_PARAGRAPHING_MODEL:
+     paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
+ summarizer = TextGenerator(endpoint)
+
+ tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
+
+ # TLDR PROMPT
+
+ SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
+
+ TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
+ {system_prompt}
+ <</SYS>>
+
+ {user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
+
+ TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
+
+ TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
+ TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
+
+ # BULLET POINT PROMPT
+
+ BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
+ {system_prompt}
+ <</SYS>>
+
+ {user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
+
+ BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
+
+ BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=BP_USER_PROMPT)
+ BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
+
+ # Input budget: context window minus generation budget, prompt scaffolding, and BOS token.
+ CONTEXT_LENGTH = 3072
+ MAX_SUMMARY_LENGTH = 1024
+ TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
+ BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
+
+
+ text_generator = TextGenerator(endpoint)
+ temperature = 0.7
+
+
+ def replace_newlines(text):
+     updated_text = re.sub(r'\n+', r'\n\n', text)
+     return updated_text
+
+ def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
+     all_generated_text = prefix
+     truncater = Truncater(tokenizer, max_length=max_input_length)
+     input_ = truncater(input_)
+     input_ = prompt.format(input=input_)
+     for generated_text in summarizer.generate_text_stream(input_, max_new_tokens=MAX_SUMMARY_LENGTH, do_sample=True, temperature=temperature):
+         all_generated_text += replace_newlines(generated_text)
+         generated_text_box.info(all_generated_text)
+     print(all_generated_text)
+     return all_generated_text.strip()
+
+ st.header("Demo: Intelligent Recap")
+
+ if not hasattr(st, 'global_state'):
+     st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
+
+     # NIPS 2021 Talks
+     transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
+     # get titles from metadata.json
+     transcripts_map = {}
+     for transcript_file in transcript_files:
+         base_path = transcript_file.parent
+         metadata = base_path / "metadata.json"
+         txt_file = base_path / "transcript_whisper_large-v2.txt"
+         with open(metadata) as f:
+             metadata = json.load(f)
+         title = metadata["title"]
+         transcript = get_transcript(txt_file)
+         captions = get_captions_from_vtt(transcript_file)
+         transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
+     st.global_state['NIPS 2021 Talks'] = transcripts_map
+
+     # TED Talks
+     data = get_partition("train").sample(15, random_state=41)
+     video_ids = data.talk_id.tolist()
+     transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
+     transcripts_map = {}
+     for video_id, transcript in zip(video_ids, transcripts):
+         metadata = get_talk_metadata(video_id)
+         title = metadata["data"]["video"]["title"]
+         presenter = metadata["data"]["video"]["presenterDisplayName"]
+         print(metadata["data"])
+         if metadata["data"]["video"]["nativeDownloads"] is None:
+             continue
+         video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
+         transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
+     st.global_state['TED Talks'] = transcripts_map
+
+     # KIT Lectures
+     def get_lecture_id(path):
+         return int(path.parts[-2].split('-')[1])
+
+     transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
+     sorted_path_list = sorted(transcript_files, key=get_lecture_id)
+
+     transcripts_map = {}
+     for transcript_file in sorted_path_list:
+         base_path = transcript_file.parent
+         lecture_id = base_path.parts[-1]
+         transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
+         video_path = Path(base_path, "video.mp4")
+         transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
+     st.global_state['KIT Lectures'] = transcripts_map
+
+ type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
+
+ transcripts_map = st.global_state[type_of_document]
+
+ selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
+
+ st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
+
+ input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
+
+ toc = Toc()
+
+ summarization_todos = []
+
+ with st.expander("Adjust Thresholds"):
+     threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
+     paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
+
+ if st.button("Process Transcript"):
+     with st.sidebar:
+         st.header("Table of Contents")
+         toc.placeholder()
+
+     st.header(selected_talk, divider='rainbow')
+     # if 'presenter' in transcripts_map[selected_talk]:
+     #     st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
+
+     captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
+     result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
+     if USE_PARAGRAPHING_MODEL:
+         presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
+         paragraphs = presult['segments']
+     segments, titles, sentences = result['segments'], result['titles'], result['sentences']
+
+     if USE_PARAGRAPHING_MODEL:
+         prev_chapter_idx = 0
+         prev_paragraph_idx = 0
+         segment = []
+         for i, sentence in enumerate(sentences):
+             chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
+             paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
+
+             if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
+                 print("Chapter / Chapter & Paragraph")
+                 segment_text = " ".join(segment)
+                 toc.subheader(titles[prev_chapter_idx])
+                 if len(segment_text) > 1200:
+                     generated_text_box = st.info("")
+                     summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+                 elif len(segment_text) > 450:
+                     generated_text_box = st.info("")
+                     summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+                 st.write(segment_text)
+                 segment = []
+             elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
+                 print("Paragraph")
+                 segment.append("\n\n")
+
+             segment.append(sentence)
+
+             prev_chapter_idx = chapter_idx
+             prev_paragraph_idx = paragraph_idx
+
+         segment_text = " ".join(segment)
+         toc.subheader(titles[prev_chapter_idx])
+         if len(segment_text) > 1200:
+             generated_text_box = st.info("")
+             summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+         elif len(segment_text) > 450:
+             generated_text_box = st.info("")
+             summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+         st.write(segment_text)
+
+     else:
+         segments = [" ".join([sentence for sentence in segment]) for segment in segments]
+         for title, segment in zip(titles, segments):
+             toc.subheader(title)
+             if len(segment) > 1200:
+                 generated_text_box = st.info("")
+                 summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
+             elif len(segment) > 450:
+                 generated_text_box = st.info("")
+                 summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
+             st.write(segment)
+     toc.generate()
+
+     for summarization_todo in summarization_todos:
+         summarization_todo()
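The chaptering view above hinges on mapping a flat sentence index back to its enclosing chapter and paragraph. A minimal, self-contained sketch of that bookkeeping, using invented segment data instead of the real segmentation service:

# Sketch only: invented example data, no Streamlit or segmentation endpoints involved.
def get_sublist_by_flattened_index(A, i):
    """Return (sublist, index_of_sublist) holding the i-th element of the flattened A."""
    current_index = 0
    for sublist_index, sublist in enumerate(A):
        if current_index <= i < current_index + len(sublist):
            return sublist, sublist_index
        current_index += len(sublist)
    return None, None

chapters = [["s0", "s1", "s2"], ["s3", "s4"]]      # chapter segmentation: 2 chapters
paragraphs = [["s0", "s1"], ["s2", "s3"], ["s4"]]  # finer paragraph segmentation

sentences = [s for chapter in chapters for s in chapter]
for i, sentence in enumerate(sentences):
    _, chapter_idx = get_sublist_by_flattened_index(chapters, i)
    _, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
    print(i, sentence, "chapter:", chapter_idx, "paragraph:", paragraph_idx)

# Sentence "s3" starts a new chapter, which is the case where the app flushes the
# accumulated segment, adds a TOC entry, and queues a summary for it.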
demo_data/lectures/Lecture-01-18.04.2023/English.vtt ADDED
@@ -0,0 +1,2582 @@
1
+ WEBVTT
2
+
3
+ 0:00:00.000 --> 0:00:10.115
4
+ That easy to say this is a good translation
5
+ and this is a bad translation.
6
+
7
+ 0:00:10.115 --> 0:00:12.947
8
+ How can we evaluate?
9
+
10
+ 0:00:13.413 --> 0:00:26.083
11
+ We will put an emphasis on machine translation
12
+ because that is currently the state of the
13
+
14
+ 0:00:26.083 --> 0:00:26.787
15
+ art.
16
+
17
+ 0:00:28.028 --> 0:00:35.120
18
+ But we are now focused on the details of neural
19
+ networks where we are describing the basic
20
+
21
+ 0:00:35.120 --> 0:00:39.095
22
+ ideas and how to use the info machine translation.
23
+
24
+ 0:00:39.095 --> 0:00:41.979
25
+ This is not a neural network course.
26
+
27
+ 0:00:42.242 --> 0:00:49.574
28
+ If you have some background in Neo Networks,
29
+ that is of course of an advantage, but it should
30
+
31
+ 0:00:49.574 --> 0:00:51.134
32
+ not be a challenge.
33
+
34
+ 0:00:51.134 --> 0:00:58.076
35
+ If you have not done the details, we'll shortly
36
+ cover the background and the main ideas.
37
+
38
+ 0:00:58.076 --> 0:01:00.338
39
+ How can we use them for for?
40
+
41
+ 0:01:00.280 --> 0:01:06.880
42
+ Machine translation: We will starve the first
43
+ two, three lectures with some like more traditional
44
+
45
+ 0:01:06.880 --> 0:01:12.740
46
+ approaches how they work because they still
47
+ give some good intuition, some good ideas.
48
+
49
+ 0:01:12.872 --> 0:01:17.141
50
+ And they help us to understand where our systems
51
+ might be better.
52
+
53
+ 0:01:17.657 --> 0:01:22.942
54
+ And yeah, we have an innocence on really what
55
+ do we need to do to build a strong system.
56
+
57
+ 0:01:23.343 --> 0:01:35.534
58
+ And then we have a part on experience where
59
+ it's about how to build the systems and how
60
+
61
+ 0:01:35.534 --> 0:01:37.335
62
+ to apply it.
63
+
64
+ 0:01:39.799 --> 0:01:47.774
65
+ For additional reading materials, so we have
66
+ the slides on the website.
67
+
68
+ 0:01:47.774 --> 0:01:55.305
69
+ There is also links to papers which cover
70
+ the topic of the lecture.
71
+
72
+ 0:01:55.235 --> 0:01:58.436
73
+ If You'd Like to Study Additional Books.
74
+
75
+ 0:01:59.559 --> 0:02:07.158
76
+ Think the most relevant is this machine translation
77
+ from Philip Kurnan, which gives an introduction
78
+
79
+ 0:02:07.158 --> 0:02:09.210
80
+ about machine translation.
81
+
82
+ 0:02:09.210 --> 0:02:15.897
83
+ But this lecture is, of course, not a one
84
+ to one like we don't go through the book, but
85
+
86
+ 0:02:15.897 --> 0:02:17.873
87
+ it covers related topics.
88
+
89
+ 0:02:18.678 --> 0:02:25.094
90
+ Is a previous version of that statistical
91
+ machine translation focusing on that part,
92
+
93
+ 0:02:25.094 --> 0:02:28.717
94
+ and we cover some of that part rather than
95
+ all.
96
+
97
+ 0:02:28.717 --> 0:02:35.510
98
+ If you want to have more basics about natural
99
+ language processing, this might be helpful.
100
+
101
+ 0:02:39.099 --> 0:02:53.738
102
+ In addition, there is an online course on
103
+ machine translation which we also develop here
104
+
105
+ 0:02:53.738 --> 0:02:57.521
106
+ at which is available.
107
+
108
+ 0:02:57.377 --> 0:03:04.894
109
+ Input where you're, of course, free to use
110
+ that I might give you some other type of presentation
111
+
112
+ 0:03:04.894 --> 0:03:07.141
113
+ of the lecture important is.
114
+
115
+ 0:03:07.141 --> 0:03:14.193
116
+ It's, of course, a lot shorter and book doesn't
117
+ cover all the topics which you're covering
118
+
119
+ 0:03:14.193 --> 0:03:15.432
120
+ in the lecture.
121
+
122
+ 0:03:15.655 --> 0:03:19.407
123
+ So, of course, for the exam everything which
124
+ was in the lecture is important.
125
+
126
+ 0:03:19.679 --> 0:03:25.012
127
+ This covers like the first half where don't
128
+ know exactly the first X lectures.
129
+
130
+ 0:03:26.026 --> 0:03:28.554
131
+ Feel free to have a look at that.
132
+
133
+ 0:03:28.554 --> 0:03:29.596
134
+ It's shorter.
135
+
136
+ 0:03:29.596 --> 0:03:36.438
137
+ Maybe there's some of you interesting to have
138
+ very short videos or after the lecture single
139
+
140
+ 0:03:36.438 --> 0:03:39.934
141
+ this topic I didn't understand want to repeat.
142
+
143
+ 0:03:40.260 --> 0:03:50.504
144
+ Then this might be helpful, but it's important
145
+ that there is more content in the lecture.
146
+
147
+ 0:03:53.753 --> 0:04:02.859
148
+ The exam will be minutes and oral exam and
149
+ just make an appointment and then.
150
+
151
+ 0:04:05.305 --> 0:04:09.735
152
+ If you think this is a really cool topic,
153
+ want to hear more.
154
+
155
+ 0:04:09.735 --> 0:04:14.747
156
+ There's two similars, one on advanced topics
157
+ in machine translation.
158
+
159
+ 0:04:15.855 --> 0:04:24.347
160
+ Which is every Thursday and there is one which
161
+ was already on Monday.
162
+
163
+ 0:04:24.347 --> 0:04:34.295
164
+ But if you're interested in speech translation
165
+ to contact us and there, I think,.
166
+
167
+ 0:04:34.734 --> 0:04:47.066
168
+ Then there are other lectures, one more learning
169
+ by Professor Vival, and for us some of you
170
+
171
+ 0:04:47.066 --> 0:04:48.942
172
+ have already.
173
+
174
+ 0:04:48.888 --> 0:04:55.496
175
+ Lecture, which is related but of discovering
176
+ more general natural language processing than
177
+
178
+ 0:04:55.496 --> 0:04:57.530
179
+ will be again available in.
180
+
181
+ 0:04:57.597 --> 0:05:07.108
182
+ Winter semester, and then we are concentrating
183
+ on the task of machine translation and mighty.
184
+
185
+ 0:05:11.191 --> 0:05:14.630
186
+ Yeah, and also there's an automatic speech
187
+ emission problem.
188
+
189
+ 0:05:16.616 --> 0:05:27.150
190
+ And this is a bit what we are planning to
191
+ talk about in this semester.
192
+
193
+ 0:05:27.150 --> 0:05:30.859
194
+ Today we have a general.
195
+
196
+ 0:05:31.371 --> 0:05:37.362
197
+ Then on Thursday we are doing a bit of a different
198
+ lecture and that's about the linguistic.
199
+
200
+ 0:05:37.717 --> 0:05:42.475
201
+ It may be quite different from what you're
202
+ more computer scientist, what you've done there,
203
+
204
+ 0:05:42.475 --> 0:05:43.354
205
+ but don't worry.
206
+
207
+ 0:05:43.763 --> 0:05:49.051
208
+ We're coming in a very basic thing that I
209
+ think it's important if you're dealing with
210
+
211
+ 0:05:49.051 --> 0:05:53.663
212
+ natural language to have a bit of an understanding
213
+ of what language isn't.
214
+
215
+ 0:05:53.663 --> 0:05:59.320
216
+ Maybe I've learned about that in high school,
217
+ but also for you this I guess some years ago.
218
+
219
+ 0:05:59.619 --> 0:06:07.381
220
+ And so it's a bit of yeah, it better understand
221
+ also what other challenges there.
222
+
223
+ 0:06:07.307 --> 0:06:16.866
224
+ And especially since we are all dealing with
225
+ our mother time, it may be English, but there
226
+
227
+ 0:06:16.866 --> 0:06:25.270
228
+ is a lot of interesting phenomena which would
229
+ not occur in these two languages.
230
+
231
+ 0:06:25.625 --> 0:06:30.663
232
+ And therefore we'll also look a bit into what
233
+ are things which might happen in other languages.
234
+
235
+ 0:06:30.930 --> 0:06:35.907
236
+ If we want to build machine translation, of
237
+ course we want to build machine Translation
238
+
239
+ 0:06:35.907 --> 0:06:36.472
240
+ for many.
241
+
242
+ 0:06:38.178 --> 0:06:46.989
243
+ Then we will see a lot of these machine learning
244
+ based how to get the data and process the data
245
+
246
+ 0:06:46.989 --> 0:06:47.999
247
+ next week.
248
+
249
+ 0:06:48.208 --> 0:07:03.500
250
+ And then we'll have one lecture about statistical
251
+ machine translation, which was the approach
252
+
253
+ 0:07:03.500 --> 0:07:06.428
254
+ for twenty years.
255
+
256
+ 0:07:07.487 --> 0:07:17.308
257
+ And then maybe surprisingly very early we'll
258
+ talk about evaluation and this is because evaluation
259
+
260
+ 0:07:17.308 --> 0:07:24.424
261
+ is really essential for machine translation
262
+ and it's very challenging.
263
+
264
+ 0:07:24.804 --> 0:07:28.840
265
+ To decide if machine translation output is
266
+ good or bad is really challenging.
267
+
268
+ 0:07:29.349 --> 0:07:38.563
269
+ If you see another translation for a machine
270
+ to decide is not as difficult and even for
271
+
272
+ 0:07:38.563 --> 0:07:48.387
273
+ a machine translation output and ask them to
274
+ rate, you'll get three different answers: And
275
+
276
+ 0:07:48.387 --> 0:07:55.158
277
+ so it's worse to investigate it, and of course
278
+ it's also important to have that at the beginning
279
+
280
+ 0:07:55.158 --> 0:08:01.928
281
+ because if we're later talking about some techniques,
282
+ it will be always saying this technique is
283
+
284
+ 0:08:01.928 --> 0:08:03.813
285
+ better by x percent or so.
286
+
287
+ 0:08:04.284 --> 0:08:06.283
288
+ And we'll also have a practical good course
289
+ of this.
290
+
291
+ 0:08:06.746 --> 0:08:16.553
292
+ Then we're going to build language models
293
+ which are in point to translation models.
294
+
295
+ 0:08:16.736 --> 0:08:28.729
296
+ After the half you have a basic understanding
297
+ of what and basic machine translation.
298
+
299
+ 0:08:29.029 --> 0:08:39.065
300
+ And then on the second part of the lecture
301
+ we will cover more advanced topics.
302
+
303
+ 0:08:39.065 --> 0:08:42.369
304
+ What are the challenging?
305
+
306
+ 0:08:43.463 --> 0:08:48.035
307
+ One challenge is, of course, about additional
308
+ resources about data.
309
+
310
+ 0:08:48.208 --> 0:08:53.807
311
+ So the question is how can we get more data
312
+ or better data and their different ways of
313
+
314
+ 0:08:53.807 --> 0:08:54.258
315
+ doing?
316
+
317
+ 0:08:54.214 --> 0:09:00.230
318
+ Our thralling data will look into our building
319
+ systems which not translate between one language
320
+
321
+ 0:09:00.230 --> 0:09:06.122
322
+ but which translate between fifteen languages
323
+ and youth knowledge and share knowledge between
324
+
325
+ 0:09:06.122 --> 0:09:09.632
326
+ the language so that for each pair they need
327
+ less data.
328
+
329
+ 0:09:11.751 --> 0:09:19.194
330
+ And then we'll have something about efficiency.
331
+
332
+ 0:09:19.194 --> 0:09:27.722
333
+ That is, of course, with more and more complex
334
+ models.
335
+
336
+ 0:09:27.647 --> 0:09:33.053
337
+ Because then nobody can afford to do that,
338
+ so how can you build really efficient things?
339
+
340
+ 0:09:33.393 --> 0:09:38.513
341
+ Who also like energy is getting more expensive
342
+ so it's even more important to build systems.
343
+
344
+ 0:09:39.419 --> 0:09:43.447
345
+ We're Looking to Biases So.
346
+
347
+ 0:09:43.423 --> 0:09:50.364
348
+ That is a machine translation quite interesting
349
+ because some information are represented different
350
+
351
+ 0:09:50.364 --> 0:09:51.345
352
+ in languages.
353
+
354
+ 0:09:51.345 --> 0:09:55.552
355
+ So if you think about German, there is always
356
+ clear or not.
357
+
358
+ 0:09:55.552 --> 0:10:00.950
359
+ But in a lot of situations, it's clear if
360
+ you talk about to teach her about.
361
+
362
+ 0:10:01.321 --> 0:10:03.807
363
+ Another Person If It's Male or Female.
364
+
365
+ 0:10:04.204 --> 0:10:13.832
366
+ From English to German you don't have this
367
+ information, so how do you generate that and
368
+
369
+ 0:10:13.832 --> 0:10:15.364
370
+ what systems?
371
+
372
+ 0:10:15.515 --> 0:10:24.126
373
+ Will just assume things and we'll see that
374
+ exactly this is happening, so in order to address
375
+
376
+ 0:10:24.126 --> 0:10:27.459
377
+ these challenges and try to reduce.
378
+
379
+ 0:10:28.368 --> 0:10:35.186
380
+ The main adaptation is what I said that beginning
381
+ systems are good at the task they are trained.
382
+
383
+ 0:10:35.186 --> 0:10:37.928
384
+ But how can we adapt them to new task?
385
+
386
+ 0:10:38.959 --> 0:10:51.561
387
+ Document level is doing more context and we
388
+ have two lectures about speech translation,
389
+
390
+ 0:10:51.561 --> 0:10:56.859
391
+ so mostly before we are translating.
392
+
393
+ 0:10:57.117 --> 0:11:00.040
394
+ Are now translating audio things.
395
+
396
+ 0:11:00.040 --> 0:11:05.371
397
+ We have just additional challenges and these
398
+ we will address.
399
+
400
+ 0:11:10.450 --> 0:11:22.165
401
+ So to the motivation, why should you work
402
+ on the theme translation and why should you
403
+
404
+ 0:11:22.165 --> 0:11:23.799
405
+ put effort?
406
+
407
+ 0:11:24.224 --> 0:11:30.998
408
+ So we want or we are living in a more global
409
+ society.
410
+
411
+ 0:11:30.998 --> 0:11:37.522
412
+ You have now the chance to communicate with
413
+ people.
414
+
415
+ 0:11:37.897 --> 0:11:44.997
416
+ And the danger of course is that languages
417
+ are dying, and more and more languages are
418
+
419
+ 0:11:44.997 --> 0:11:45.988
420
+ going away.
421
+
422
+ 0:11:46.006 --> 0:11:53.669
423
+ I think at least that some opportunity in
424
+ order to keep more languages is that we have
425
+
426
+ 0:11:53.669 --> 0:12:01.509
427
+ technology solutions which help you to speak
428
+ in your language and still communicate with
429
+
430
+ 0:12:01.509 --> 0:12:04.592
431
+ people who speak another language.
432
+
433
+ 0:12:04.864 --> 0:12:16.776
434
+ And on the one hand there is the need and
435
+ more and more people want to speak in some
436
+
437
+ 0:12:16.776 --> 0:12:19.159
438
+ other languages.
439
+
440
+ 0:12:19.759 --> 0:12:27.980
441
+ For example, Iceland was really keen on getting
442
+ Icelandic into commercial systems and they
443
+
444
+ 0:12:27.980 --> 0:12:36.471
445
+ even provided data and so on because they wanted
446
+ that their language is spoken longer and not
447
+
448
+ 0:12:36.471 --> 0:12:38.548
449
+ just people switching.
450
+
451
+ 0:12:38.959 --> 0:12:47.177
452
+ So there's even like yeah, they were spending
453
+ for promoting this language in order to have
454
+
455
+ 0:12:47.177 --> 0:12:55.125
456
+ all these digital tools available for languages
457
+ which are not spoken by so many people.
458
+
459
+ 0:12:56.156 --> 0:13:07.409
460
+ So it's questionable and it's not completely
461
+ clear technology always provides.
462
+
463
+ 0:13:10.430 --> 0:13:25.622
464
+ If we think about machine translation, there
465
+ are different use cases in which you can use
466
+
467
+ 0:13:25.622 --> 0:13:26.635
468
+ that.
469
+
470
+ 0:13:27.207 --> 0:13:36.978
471
+ And this has some characteristics: So typically
472
+ in this case it is where machine translation
473
+
474
+ 0:13:36.978 --> 0:13:40.068
475
+ was used first anybody.
476
+
477
+ 0:13:40.780 --> 0:13:50.780
478
+ Because most youth outlets around the world
479
+ report at least some of the same events, like
480
+
481
+ 0:13:50.780 --> 0:13:58.669
482
+ was probably covered around the world in a
483
+ lot of different languages.
484
+
485
+ 0:13:59.279 --> 0:14:08.539
486
+ That is one point yes, so the training gator
487
+ is there.
488
+
489
+ 0:14:08.539 --> 0:14:16.284
490
+ That's definitely a good point here and then.
491
+
492
+ 0:14:17.717 --> 0:14:19.425
493
+ Yes, there was my regional idea.
494
+
495
+ 0:14:19.425 --> 0:14:23.256
496
+ The motivation program was a bit different
497
+ by you, but it's a good point.
498
+
499
+ 0:14:23.256 --> 0:14:26.517
500
+ So on the one end you'll understand maybe
501
+ not perfect English.
502
+
503
+ 0:14:26.517 --> 0:14:30.762
504
+ Also, it's for his personal use, so you're
505
+ using machine translation for you use.
506
+
507
+ 0:14:31.311 --> 0:14:37.367
508
+ It's not as important that this is really
509
+ perfect written text, but you're more interested
510
+
511
+ 0:14:37.367 --> 0:14:38.564
512
+ in understanding.
513
+
514
+ 0:14:38.858 --> 0:14:45.570
515
+ Maybe it's more clearer if you think about
516
+ the other situation where it's about dissimination
517
+
518
+ 0:14:45.570 --> 0:14:48.926
519
+ that means producing text in another language.
520
+
521
+ 0:14:48.926 --> 0:14:55.138
522
+ So just imagine you have a website or you
523
+ have a restaurant and you want to offer your
524
+
525
+ 0:14:55.138 --> 0:14:55.566
526
+ menu.
527
+
528
+ 0:14:56.476 --> 0:15:01.948
529
+ And in this case maybe you want to have a
530
+ higher quality because in some of your.
531
+
532
+ 0:15:01.901 --> 0:15:06.396
533
+ You're presenting something of yourself and
534
+ you want to have good quality.
535
+
536
+ 0:15:06.396 --> 0:15:11.490
537
+ Just remember you're writing a letter and
538
+ if you're translating your letter then you
539
+
540
+ 0:15:11.490 --> 0:15:17.123
541
+ don't want to have it full of mistakes because
542
+ it's somehow a bad, bad oppression but if it's
543
+
544
+ 0:15:17.123 --> 0:15:20.300
545
+ assimilation it's about you getting the information.
546
+
547
+ 0:15:20.660 --> 0:15:25.564
548
+ So here you want your disciplination, you're
549
+ producing texts for another language.
550
+
551
+ 0:15:26.006 --> 0:15:31.560
552
+ And then you have the disadvantage that you
553
+ maybe want to have a higher quality.
554
+
555
+ 0:15:31.831 --> 0:15:43.432
556
+ Therefore, typically there is less amount,
557
+ so normally you're getting more information
558
+
559
+ 0:15:43.432 --> 0:15:46.499
560
+ than you're producing.
561
+
562
+ 0:15:49.109 --> 0:15:57.817
563
+ Then of course there is a dynamic scenario
564
+ where there is some type of interaction and
565
+
566
+ 0:15:57.817 --> 0:16:07.099
567
+ the one thing which is interesting about the
568
+ dialogue scenario is there is: So if you're
569
+
570
+ 0:16:07.099 --> 0:16:18.045
571
+ translating a website you have all the data
572
+ available but in a dialogue scenario you.
573
+
574
+ 0:16:18.378 --> 0:16:23.655
575
+ And we'll see that in speech recognition this
576
+ is a big challenge.
577
+
578
+ 0:16:23.655 --> 0:16:30.930
579
+ Just to mention German where in German the
580
+ work is often more at the end, so each harmony.
581
+
582
+ 0:16:32.052 --> 0:16:36.343
583
+ Know that you want to generate the English
584
+ sentence.
585
+
586
+ 0:16:36.343 --> 0:16:42.740
587
+ Now you need to know if you cancel this registration
588
+ to produce a second word.
589
+
590
+ 0:16:42.740 --> 0:16:49.785
591
+ So you have to either guess or do something
592
+ in order to provide the translation before
593
+
594
+ 0:16:49.785 --> 0:16:52.052
595
+ the translation is already.
596
+
597
+ 0:16:57.817 --> 0:17:00.530
598
+ The question, of course, is in the new world.
599
+
600
+ 0:17:00.530 --> 0:17:05.659
601
+ I mean, of course, we can, on the one hand,
602
+ say we don't want to have English, but the
603
+
604
+ 0:17:05.659 --> 0:17:10.789
605
+ question is do we really need that many languages
606
+ and how many are here at the moment?
607
+
608
+ 0:17:11.291 --> 0:17:20.248
609
+ Does anybody have an idea how many languages
610
+ are spoken in the world?
611
+
612
+ 0:17:23.043 --> 0:17:26.510
613
+ This is already the first big challenge.
614
+
615
+ 0:17:26.510 --> 0:17:34.120
616
+ What a language is and what no language is
617
+ is already difficult, and then maybe one point
618
+
619
+ 0:17:34.120 --> 0:17:40.124
620
+ people have to argue first about written language
621
+ or spoken languages.
622
+
623
+ 0:17:40.400 --> 0:17:47.765
624
+ For written languages I think that number
625
+ is still too low, but for a spoken language
626
+
627
+ 0:17:47.765 --> 0:17:53.879
628
+ people normally think: So you see that it's
629
+ really a lot of languages which will be difficult
630
+
631
+ 0:17:53.879 --> 0:17:54.688
632
+ to all happen.
633
+
634
+ 0:17:55.035 --> 0:18:00.662
635
+ And these are just like you see Europe where
636
+ there's relatively few languages.
637
+
638
+ 0:18:00.662 --> 0:18:05.576
639
+ You already have quite a lot of languages,
640
+ even walls and countries.
641
+
642
+ 0:18:06.126 --> 0:18:13.706
643
+ Of course sometimes you share the language,
644
+ but then you have Briton or Gillesian vest
645
+
646
+ 0:18:13.706 --> 0:18:17.104
647
+ where you have languages in a country.
648
+
649
+ 0:18:18.478 --> 0:18:24.902
650
+ And yeah, of course, there's the question:
651
+ When does it start to be a language?
652
+
653
+ 0:18:24.902 --> 0:18:27.793
654
+ And when is it more like a dialect?
655
+
656
+ 0:18:27.793 --> 0:18:28.997
657
+ So is Catalan?
658
+
659
+ 0:18:28.997 --> 0:18:31.727
660
+ Is Swiss German a known language?
661
+
662
+ 0:18:31.727 --> 0:18:33.253
663
+ Or is it the same?
664
+
665
+ 0:18:33.293 --> 0:18:36.887
666
+ So then, of course, it's are like Czech and
667
+ Slovakian.
668
+
669
+ 0:18:36.887 --> 0:18:42.704
670
+ I know heard that people can understand each
671
+ other so they can just continue talking and
672
+
673
+ 0:18:42.704 --> 0:18:45.711
674
+ understand by some of their own language and.
675
+
676
+ 0:18:46.026 --> 0:18:56.498
677
+ Of course, it's partly also like about your
678
+ own nationality, so I think some people said
679
+
680
+ 0:18:56.498 --> 0:18:57.675
681
+ creation.
682
+
683
+ 0:18:58.018 --> 0:19:04.957
684
+ But think for a lot of people you shouldn't
685
+ say that they are part of being creation language.
686
+
687
+ 0:19:05.165 --> 0:19:10.876
688
+ But you see therefore that it is not completely
689
+ clear that there is no hardwater between this
690
+
691
+ 0:19:10.876 --> 0:19:13.974
692
+ and the new language, and this is a different
693
+ one.
694
+
695
+ 0:19:14.094 --> 0:19:19.403
696
+ And of course it's getting more fluent when
697
+ you talk about scientific things.
698
+
699
+ 0:19:19.403 --> 0:19:25.189
700
+ I guess sometimes it's no longer clear if
701
+ it's German or English because we start to
702
+
703
+ 0:19:25.189 --> 0:19:27.707
704
+ use a lot of English terms in there.
705
+
706
+ 0:19:27.707 --> 0:19:31.519
707
+ So of course there's interesting mixes which
708
+ will talk.
709
+
710
+ 0:19:33.193 --> 0:19:38.537
711
+ So should everybody just speak English, and
712
+ these numbers are a bit older, have to admit:
713
+
714
+ 0:19:38.938 --> 0:19:47.124
715
+ However, I don't think they're completely different
716
+ now and it says like how many people know in
717
+
718
+ 0:19:47.124 --> 0:19:54.718
719
+ Europe can speak English for countries where
720
+ English is not the mothertown or for people.
721
+
722
+ 0:19:54.995 --> 0:20:06.740
723
+ In some countries like smaller ones, for smaller
724
+ countries you have quite high numbers.
725
+
726
+ 0:20:07.087 --> 0:20:13.979
727
+ However, there are many countries where you
728
+ have like twenty to thirty percent of the population,
729
+
730
+ 0:20:13.979 --> 0:20:16.370
731
+ only being able to speak English.
732
+
733
+ 0:20:16.370 --> 0:20:22.559
734
+ So if we would only do everything only in
735
+ English, we would exclude half the population
736
+
737
+ 0:20:22.559 --> 0:20:23.333
738
+ of Europe.
739
+
740
+ 0:20:23.563 --> 0:20:30.475
741
+ And therefore providing translations is very
742
+ important and therefore, for example, the European
743
+
744
+ 0:20:30.475 --> 0:20:35.587
745
+ Parliament puts a really large amount of money
746
+ into doing translation.
747
+
748
+ 0:20:35.695 --> 0:20:40.621
749
+ So that's why you can speak in your mother
750
+ too in the European Parliament.
751
+
752
+ 0:20:40.621 --> 0:20:46.204
753
+ Everybody like everyone elected there can
754
+ speak in there and they were translated to
755
+
756
+ 0:20:46.204 --> 0:20:52.247
757
+ all the other languages and it's a huge effort
758
+ and so the question is can we do better with
759
+
760
+ 0:20:52.247 --> 0:20:52.838
761
+ machine.
762
+
763
+ 0:20:53.493 --> 0:20:58.362
764
+ And for other countries things are even more.
765
+
766
+ 0:20:58.362 --> 0:21:05.771
767
+ They may be not worse, difficult, but they
768
+ are even more challenging.
769
+
770
+ 0:21:06.946 --> 0:21:13.764
771
+ So there's even more diversity of languages
772
+ and it might be even more important to do machines.
773
+
774
+ 0:21:16.576 --> 0:21:31.034
775
+ If you see how many people speak French, Portuguese
776
+ or English, it's relatively few compared to
777
+
778
+ 0:21:31.034 --> 0:21:33.443
779
+ the population.
780
+
781
+ 0:21:33.813 --> 0:21:46.882
782
+ So think that this should be around millions
783
+ would understand you, but all the others wouldn't.
784
+
785
+ 0:21:49.289 --> 0:21:54.877
786
+ So it seems to be very important to provide
787
+ some taebo translation.
788
+
789
+ 0:21:54.877 --> 0:21:58.740
790
+ It's a quite big industry as a European Union.
791
+
792
+ 0:21:58.740 --> 0:22:05.643
793
+ This is already also quite long ago, but it
794
+ won't get less spent like in that year.
795
+
796
+ 0:22:05.643 --> 0:22:08.931
797
+ One point three billion on translation.
798
+
799
+ 0:22:09.289 --> 0:22:21.315
800
+ So it might be very helpful to have tools
801
+ in order to provide them, and as said, not
802
+
803
+ 0:22:21.315 --> 0:22:26.267
804
+ all directions might be important.
805
+
806
+ 0:22:26.426 --> 0:22:35.059
807
+ Is even not possible for students, so in the
808
+ European Parliament they don't have all combinations
809
+
810
+ 0:22:35.059 --> 0:22:36.644
811
+ of the different.
812
+
813
+ 0:22:36.977 --> 0:22:42.210
814
+ And language is so if they want to translate
815
+ from Maltese to Estonian or so.
816
+
817
+ 0:22:42.402 --> 0:22:47.361
818
+ And maybe they have a translator for that,
819
+ but there are some directions which don't have
820
+
821
+ 0:22:47.361 --> 0:22:47.692
822
+ that.
823
+
824
+ 0:22:47.692 --> 0:22:52.706
825
+ Then they handle directly, but they would
826
+ translate first to French, German or or English,
827
+
828
+ 0:22:52.706 --> 0:22:57.721
829
+ and then there would be a second translator
830
+ getting the translation and really translating
831
+
832
+ 0:22:57.721 --> 0:22:59.154
833
+ to your Italian language.
834
+
835
+ 0:22:59.299 --> 0:23:06.351
836
+ And it's not always English, so they are really
837
+ selecting what is most helpful.
838
+
839
+ 0:23:06.351 --> 0:23:13.931
840
+ But you see that even in this small setup,
841
+ with this large amount of effort in there,
842
+
843
+ 0:23:13.931 --> 0:23:17.545
844
+ there's not enough ability to translate.
845
+
846
+ 0:23:19.819 --> 0:23:21.443
847
+ And of course this was text.
848
+
849
+ 0:23:21.443 --> 0:23:26.538
850
+ Then you have a lot of other things where
851
+ you want to, for example, do speech translation.
852
+
853
+ 0:23:26.538 --> 0:23:31.744
854
+ There is a lot of conferences which currently
855
+ are all held in English, which of course might
856
+
857
+ 0:23:31.744 --> 0:23:35.831
858
+ also not be the best solution if you've gone
859
+ to some of the conferences.
860
+
861
+ 0:23:36.176 --> 0:23:45.964
862
+ You might have heard some accented speech
863
+ where people speak a language that is very
864
+
865
+ 0:23:45.964 --> 0:23:49.304
866
+ different from their mother.
867
+
868
+ 0:23:49.749 --> 0:23:52.059
869
+ Might be difficult to understand.
870
+
871
+ 0:23:52.212 --> 0:23:59.123
872
+ We're currently having an effort for example
873
+ by ACL, which is the conference organized in
874
+
875
+ 0:23:59.123 --> 0:24:06.112
876
+ this field to provide these translations into
877
+ ten hour languages so that also students who
878
+
879
+ 0:24:06.112 --> 0:24:06.803
880
+ are not.
881
+
882
+ 0:24:06.746 --> 0:24:12.446
883
+ That familiar English is able to read the
884
+ papers and watch the present case.
885
+
886
+ 0:24:16.416 --> 0:24:25.243
887
+ So the question is what can you do here and
888
+ one interesting solution which we'll cover
889
+
890
+ 0:24:25.243 --> 0:24:26.968
891
+ in this lecture?
892
+
893
+ 0:24:27.087 --> 0:24:38.112
894
+ This always comes with a question: is it will
895
+ it replace the human?
896
+
897
+ 0:24:38.112 --> 0:24:40.382
898
+ And yes, the.
899
+
900
+ 0:24:40.300 --> 0:24:49.300
901
+ Idea, but the question doesn't really happen
902
+ and I'm any skeptical about that.
903
+
904
+ 0:24:49.300 --> 0:24:52.946
905
+ So currently we are not seeing.
906
+
907
+ 0:24:53.713 --> 0:24:55.807
908
+ So much more effort needed.
909
+
910
+ 0:24:55.807 --> 0:25:00.294
911
+ Of course, machine translation is now used
912
+ as some type of.
913
+
914
+ 0:25:01.901 --> 0:25:11.785
915
+ If you think about in the European Parliament,
916
+ they will have some humans doing their translation
917
+
918
+ 0:25:11.785 --> 0:25:18.060
919
+ because: If you think about the chancel of
920
+ Germany trembling somewhere and quite sure
921
+
922
+ 0:25:18.060 --> 0:25:18.784
923
+ you want,.
924
+
925
+ 0:25:19.179 --> 0:25:31.805
926
+ And so it's more like we are augmenting the
927
+ possibilities to have more possibilities to
928
+
929
+ 0:25:31.805 --> 0:25:37.400
930
+ provide translation and travel around.
931
+
932
+ 0:25:39.499 --> 0:25:53.650
933
+ How can this technology help so machine translation
934
+ is one way of dealing with?
935
+
936
+ 0:25:54.474 --> 0:26:01.144
937
+ Of course, there is other tasks which do even
938
+ without machine translation.
939
+
940
+ 0:26:01.144 --> 0:26:04.613
941
+ Just think about summarize my lecture.
942
+
943
+ 0:26:04.965 --> 0:26:08.019
944
+ Approaches doing that what they call end to
945
+ end.
946
+
947
+ 0:26:08.019 --> 0:26:11.635
948
+ So you just put an English text and get a
949
+ German summary.
950
+
951
+ 0:26:11.635 --> 0:26:17.058
952
+ However, a good baseline and an important
953
+ thing is to either first lecture into German
954
+
955
+ 0:26:17.058 --> 0:26:22.544
956
+ and then do a summary art, first do a summary
957
+ in English and then translation language.
958
+
959
+ 0:26:23.223 --> 0:26:28.764
960
+ Translation is very important in order to
961
+ different application scenarios.
962
+
963
+ 0:26:28.764 --> 0:26:33.861
964
+ We have that dissemination dialogue but also
965
+ information extraction.
966
+
967
+ 0:26:33.861 --> 0:26:39.993
968
+ So if you want to do like get information
969
+ not only from English websites but from.
970
+
971
+ 0:26:40.300 --> 0:26:42.427
972
+ Very different websites.
973
+
974
+ 0:26:42.427 --> 0:26:46.171
975
+ It's helpful to have this type of solution.
976
+
977
+ 0:26:50.550 --> 0:26:52.772
978
+ Yeah, what can you translate?
979
+
980
+ 0:26:52.772 --> 0:26:59.660
981
+ Of course, we will focus on text, as I said
982
+ for most of them, because it's about translation
983
+
984
+ 0:26:59.660 --> 0:27:06.178
985
+ and anything first translates to text, and
986
+ then change to text, and then we can do text
987
+
988
+ 0:27:06.178 --> 0:27:07.141
989
+ translation.
990
+
991
+ 0:27:09.189 --> 0:27:19.599
992
+ And text is not equals text, so we can do
993
+ translation that is some of the most common.
994
+
995
+ 0:27:19.499 --> 0:27:27.559
996
+ Is working on translation, so just imagine
997
+ you are developing your new.
998
+
999
+ 0:27:27.947 --> 0:27:34.628
1000
+ Nowadays you don't want to have to only be
1001
+ available in English or German books in as
1002
+
1003
+ 0:27:34.628 --> 0:27:40.998
1004
+ many languages as possible, and if you use
1005
+ the standard tools it's not that easy.
1006
+
1007
+ 0:27:41.141 --> 0:27:50.666
1008
+ We have a different type of domain and there
1009
+ again we have very few contexts.
1010
+
1011
+ 0:27:50.666 --> 0:27:56.823
1012
+ Normally we translate: To pick up an app you
1013
+ have the menu and there's like safe.
1014
+
1015
+ 0:27:57.577 --> 0:28:02.535
1016
+ And then you only have safe.
1017
+
1018
+ 0:28:02.535 --> 0:28:14.845
1019
+ How should translate safe should it be written
1020
+ or should it be spicing?
1021
+
1022
+ 0:28:16.856 --> 0:28:24.407
1023
+ Then, of course, if you have like files, it
1024
+ might be that you have meta data to transport.
1025
+
1026
+ 0:28:26.466 --> 0:28:27.137
1027
+ Novels.
1028
+
1029
+ 0:28:27.137 --> 0:28:32.501
1030
+ Some work on that, but yeah, that's always
1031
+ a typical criticism.
1032
+
1033
+ 0:28:32.501 --> 0:28:36.440
1034
+ You'll never be able to translate Shakespeare.
1035
+
1036
+ 0:28:36.656 --> 0:28:43.684
1037
+ Think this is somehow the last use case of
1038
+ machine translation.
1039
+
1040
+ 0:28:43.684 --> 0:28:47.637
1041
+ For a translation of books there's.
1042
+
1043
+ 0:28:47.847 --> 0:28:57.047
1044
+ But the nice thing about machine translation
1045
+ is that it can translate to things which are
1046
+
1047
+ 0:28:57.047 --> 0:29:05.327
1048
+ boring, so think about translating some bureaucrative
1049
+ forms or some regulations.
1050
+
1051
+ 0:29:05.565 --> 0:29:11.302
1052
+ This is normally not very interesting, it's
1053
+ very repetitive, so their automation works
1054
+
1055
+ 0:29:11.302 --> 0:29:11.697
1056
+ well.
1057
+
1058
+ 0:29:11.931 --> 0:29:17.519
1059
+ Of course, there is also translations on Paibos
1060
+ images.
1061
+
1062
+ 0:29:17.519 --> 0:29:24.604
1063
+ I guess you point your camera to an object
1064
+ where it translates things.
1065
+
1066
+ 0:29:25.005 --> 0:29:43.178
1067
+ And we'll cover that at the end, as said,
1068
+ the speech translation.
1069
+
1070
+ 0:29:43.663 --> 0:29:46.795
1071
+ So you can't provide the translation of the
1072
+ lecture.
1073
+
1074
+ 0:29:46.795 --> 0:29:50.518
1075
+ If I'm five slides further then you would
1076
+ see the translation.
1077
+
1078
+ 0:29:50.518 --> 0:29:52.291
1079
+ It might not be very helpful.
1080
+
1081
+ 0:29:54.794 --> 0:29:57.062
1082
+ We are not speaking as we are written.
1083
+
1084
+ 0:29:57.062 --> 0:29:59.097
1085
+ It's again like a domain mismatch.
1086
+
1087
+ 0:29:59.359 --> 0:30:10.161
1088
+ So typically the sentences are not full sentences
1089
+ and I'm saying this is not the right way to
1090
+
1091
+ 0:30:10.161 --> 0:30:19.354
1092
+ praise it and if you just read what was written
1093
+ it might be hard to understand.
1094
+
1095
+ 0:30:23.803 --> 0:30:36.590
1096
+ We are focusing on the first application scenario
1097
+ that is fully automatic machine translation.
1098
+
1099
+ 0:30:37.177 --> 0:30:46.373
1100
+ Of course, there are quite interesting application
1101
+ scenarios for other things where it should
1102
+
1103
+ 0:30:46.373 --> 0:30:47.645
1104
+ be referred.
1105
+
1106
+ 0:30:47.867 --> 0:30:49.695
1107
+ Where it's no longer going to be.
1108
+
1109
+ 0:30:49.695 --> 0:30:52.436
1110
+ We have this tool and it works, but it's a
1111
+ market.
1112
+
1113
+ 0:30:52.436 --> 0:30:57.381
1114
+ We have the machine translation system and
1115
+ the human translator, and they somehow cooperate
1116
+
1117
+ 0:30:57.381 --> 0:30:59.853
1118
+ and try to be as fast as possible in doing
1119
+ a.
1120
+
1121
+ 0:31:00.380 --> 0:31:12.844
1122
+ The easiest idea there would be the first
1123
+ point you take the machine translation.
1124
+
1125
+ 0:31:13.553 --> 0:31:17.297
1126
+ That sometimes might not be the best
1127
+ way of using it.
1128
+
1129
+ 0:31:17.357 --> 0:31:25.308
1130
+ Any ideas or what else you could do, then
1131
+ maybe the machine could aid the human and say
1132
+
1133
+ 0:31:25.308 --> 0:31:27.838
1134
+ I'm not sure about this part.
1135
+
1136
+ 0:31:28.368 --> 0:31:32.319
1137
+ Yeah, very interesting, very good.
1138
+
1139
+ 0:31:32.319 --> 0:31:42.252
1140
+ Of course, the dangerous thing there is you
1141
+ asking something from a machine translation
1142
+
1143
+ 0:31:42.252 --> 0:31:45.638
1144
+ system where it's really bad.
1145
+
1146
+ 0:31:45.845 --> 0:31:50.947
1147
+ There is quality estimation that maybe it
1148
+ will couple that in evaluation so in evaluation
1149
+
1150
+ 0:31:50.947 --> 0:31:55.992
1151
+ you know what is correct translation and you
1152
+ have another output and you try to estimate
1153
+
1154
+ 0:31:55.992 --> 0:31:57.409
1155
+ how good is the quality.
1156
+
1157
+ 0:31:57.409 --> 0:32:02.511
1158
+ In quality estimation you don't have that; you only
1159
+ have the source and the output, and the question is
1160
+
1161
+ 0:32:02.511 --> 0:32:03.531
1162
+ exactly this one.
1163
+
1164
+ 0:32:03.531 --> 0:32:05.401
1165
+ Is it a good translation or not?
1166
+
1167
+ 0:32:05.665 --> 0:32:12.806
1168
+ This might be easier because the system might
1169
+ not know what translation is.
1170
+
1171
+ 0:32:13.053 --> 0:32:23.445
1172
+ Humans are very good at that; for machines it
1173
+ is difficult, but of course that's an interesting
1174
+
1175
+ 0:32:23.445 --> 0:32:24.853
1176
+ application.
1177
+
1178
+ 0:32:25.065 --> 0:32:32.483
1179
+ Be more interactive so that you may be translating
1180
+ if the human changes the fifth word.
1181
+
1182
+ 0:32:32.483 --> 0:32:36.361
1183
+ What does it mean for the remaining sentence?
1184
+
1185
+ 0:32:36.361 --> 0:32:38.131
1186
+ Do I need to change?
1187
+
1188
+ 0:32:38.131 --> 0:32:43.948
1189
+ There are also things like you don't have
1190
+ to repeat the same errors.
1191
+
1192
+ 0:32:47.767 --> 0:32:57.651
1193
+ Ideally, you only want
1194
+ to correct an error once and not at all positions.
1195
+
1196
+ 0:33:00.000 --> 0:33:21.784
1197
+ And then they ask, for example, so before
1198
+ the translation is done they ask: I'm not directly
1199
+
1200
+ 0:33:21.784 --> 0:33:23.324
1201
+ aware of that.
1202
+
1203
+ 0:33:23.324 --> 0:33:33.280
1204
+ I think it's a good way of ending and I think
1205
+ it's where, especially with more advanced dialogue
1206
+
1207
+ 0:33:33.280 --> 0:33:34.717
1208
+ strategy and.
1209
+
1210
+ 0:33:35.275 --> 0:33:38.831
1211
+ Currently I think most of the focus is on
1212
+ at least determining.
1213
+
1214
+ 0:33:39.299 --> 0:33:45.646
1215
+ Don't have this information that is already
1216
+ challenging, so there is quite some work on
1217
+
1218
+ 0:33:45.646 --> 0:33:49.541
1219
+ quality estimation, on detecting that information is missing.
1220
+
1221
+ 0:33:49.789 --> 0:33:53.126
1222
+ But is there something missing?
1223
+
1224
+ 0:33:53.126 --> 0:33:59.904
1225
+ It's really quite challenging and think that
1226
+ is where currently.
1227
+
1228
+ 0:34:00.260 --> 0:34:05.790
1229
+ What is there is there is opportunities to
1230
+ provide or there is models to directly provide
1231
+
1232
+ 0:34:05.790 --> 0:34:06.527
1233
+ additional?
1234
+
1235
+ 0:34:06.786 --> 0:34:13.701
1236
+ You can give them anything you have and provide
1237
+ them.
1238
+
1239
+ 0:34:13.701 --> 0:34:21.129
1240
+ It's a similar situation if you're translating
1241
+ to German.
1242
+
1243
+ 0:34:21.641 --> 0:34:31.401
1244
+ And it would just guess normally or do some
1245
+ random guessing always means it's using some
1246
+
1247
+ 0:34:31.401 --> 0:34:36.445
1248
+ information which should not be really there.
1249
+
1250
+ 0:34:36.776 --> 0:34:46.449
1251
+ So then you can provide it with an additional
1252
+ input, whether it should use formal or informal address.
1253
+
1254
+ 0:34:47.747 --> 0:35:04.687
1255
+ To know that this information is missing.
1256
+
1257
+ 0:35:04.544 --> 0:35:19.504
1258
+ Since you're not specifically modeling this,
1259
+ it's likely that there is a gender difference
1260
+
1261
+ 0:35:19.504 --> 0:35:21.805
1262
+ in languages.
1263
+
1264
+ 0:35:26.046 --> 0:35:39.966
1265
+ Why are we doing research on machine translation?
1266
+ It's a very important task in natural
1267
+
1268
+ 0:35:39.966 --> 0:35:42.860
1269
+ language processing.
1270
+
1271
+ 0:35:43.283 --> 0:35:49.234
1272
+ So of course you have a lot of computer science
1273
+ thing in there and that's the backbone of.
1274
+
1275
+ 0:35:49.569 --> 0:36:01.848
1276
+ However, for the task and for understanding, you can
1277
+ also get information from computational linguistics,
1278
+
1279
+ 0:36:01.848 --> 0:36:08.613
1280
+ which tell you about what language it's good
1281
+ to know.
1282
+
1283
+ 0:36:08.989 --> 0:36:15.425
1284
+ Doesn't mean that in a computer we have to
1285
+ model it exactly the same, but for example
1286
+
1287
+ 0:36:15.425 --> 0:36:22.453
1288
+ to know that there is something like morphology,
1289
+ which means how words are built, and that for
1290
+
1291
+ 0:36:22.453 --> 0:36:24.746
1292
+ some languages it's very easy.
1293
+
1294
+ 0:36:24.746 --> 0:36:28.001
1295
+ In English there is nearly no inflection.
1296
+
1297
+ 0:36:28.688 --> 0:36:35.557
1298
+ Well in Germany you already start for soon
1299
+ you have like different forms and so on.
1300
+
1301
+ 0:36:36.316 --> 0:36:41.991
1302
+ And for other languages, for Finnish, it's
1303
+ even more complicated with Basque.
1304
+
1305
+ 0:36:41.991 --> 0:36:44.498
1306
+ I think for some words more than.
1307
+
1308
+ 0:36:45.045 --> 0:36:52.098
1309
+ So knowing this, of course, gives you some
1310
+ advice.
1311
+
1312
+ 0:36:52.098 --> 0:37:04.682
1313
+ How do I look at that now because we'll see
1314
+ in the basic treat each word as an individual?
1315
+
1316
+ 0:37:06.106 --> 0:37:09.259
1317
+ Of course there is a lot of interest also
1318
+ prone from industry.
1319
+
1320
+ 0:37:09.259 --> 0:37:10.860
1321
+ There is a lot of applications.
1322
+
1323
+ 0:37:11.191 --> 0:37:17.068
1324
+ There's research groups at Google, Facebook,
1325
+ and Amazon.
1326
+
1327
+ 0:37:17.068 --> 0:37:26.349
1328
+ So there's quite a lot of interest in providing
1329
+ that for German and English it is solved.
1330
+
1331
+ 0:37:26.546 --> 0:37:27.569
1332
+ Annoucing it's hard.
1333
+
1334
+ 0:37:27.569 --> 0:37:31.660
1335
+ We're saying that not hard, but of course
1336
+ we haven't acquired high quality in them.
1337
+
1338
+ 0:37:32.212 --> 0:37:39.296
1339
+ But there's currently really a large trend
1340
+ in building systems for low-resource
1341
+
1342
+ 0:37:39.296 --> 0:37:40.202
1343
+ languages.
1344
+
1345
+ 0:37:40.480 --> 0:37:53.302
1346
+ So there are tasks on last year's task on
1347
+ translating from Native American languages:
1348
+
1349
+ 0:37:53.193 --> 0:37:58.503
1350
+ Don't know yet but but five other languages,
1351
+ so how can you translate from them?
1352
+
1353
+ 0:37:58.538 --> 0:38:05.074
1354
+ Then you don't have like millions of sentences,
1355
+ but you might have only the Bible or some more
1356
+
1357
+ 0:38:05.074 --> 0:38:05.486
1358
+ data.
1359
+
1360
+ 0:38:05.486 --> 0:38:08.169
1361
+ Then the question is, what can you do?
1362
+
1363
+ 0:38:08.169 --> 0:38:09.958
1364
+ And how good can you get?
1365
+
1366
+ 0:38:14.794 --> 0:38:17.296
1367
+ One thing is very important.
1368
+
1369
+ 0:38:17.296 --> 0:38:25.751
1370
+ Of course, in a lot of A I is to measure the
1371
+ quality and what you can measure is quite important.
1372
+
1373
+ 0:38:25.986 --> 0:38:37.213
1374
+ So that's why for many years of regular there
1375
+ is different evaluation campaigns where people
1376
+
1377
+ 0:38:37.213 --> 0:38:38.178
1378
+ submit.
1379
+
1380
+ 0:38:39.419 --> 0:38:45.426
1381
+ We're often part of the statistical machine
1382
+ translation original, yet now I think it's
1383
+
1384
+ 0:38:45.426 --> 0:38:51.019
1385
+ a machine translation where it's mostly about
1386
+ European languages and used texts.
1387
+
1388
+ 0:38:51.051 --> 0:38:57.910
1389
+ The International Workshop of Spoken Language
1390
+ Translation, which is translation about lectures
1391
+
1392
+ 0:38:57.910 --> 0:39:04.263
1393
+ which we are co organizing, and there is a
1394
+ bovia as I said building strong systems this
1395
+
1396
+ 0:39:04.263 --> 0:39:04.696
1397
+ time.
1398
+
1399
+ 0:39:04.664 --> 0:39:11.295
1400
+ This has established translating conference
1401
+ presentations from English into ten different
1402
+
1403
+ 0:39:11.295 --> 0:39:17.080
1404
+ languages: And then, of course, you have to
1405
+ deal with things like special vocabulary.
1406
+
1407
+ 0:39:17.037 --> 0:39:23.984
1408
+ Think about terms like recurrent neural networks
1409
+ or convolutional
1410
+
1411
+ 0:39:23.984 --> 0:39:24.740
1412
+ networks.
1413
+
1414
+ 0:39:25.545 --> 0:39:29.917
1415
+ That might be more difficult to translate
1416
+ and you also have to decide who I need to translate
1417
+
1418
+ 0:39:29.917 --> 0:39:33.359
1419
+ or should I keep it in English, and that's
1420
+ not the same in each language.
1421
+
1422
+ 0:39:33.873 --> 0:39:37.045
1423
+ In German maybe mostly you keep it.
1424
+
1425
+ 0:39:37.045 --> 0:39:44.622
1426
+ I think in French people are typically like
1427
+ wanting to translate as much as possible.
1428
+
1429
+ 0:39:44.622 --> 0:39:52.200
1430
+ These are then challenges and then, of course,
1431
+ in Poland where it's also challenging.
1432
+
1433
+ 0:39:53.153 --> 0:39:59.369
1434
+ I think all of the speakers in the test set
1435
+ are not native English speakers, so you need
1436
+
1437
+ 0:39:59.369 --> 0:40:05.655
1438
+ to translate people with a German accent or
1439
+ with a French accent or with a Japanese accent
1440
+
1441
+ 0:40:05.655 --> 0:40:09.178
1442
+ or some other accent, which poses additional challenges.
1443
+
1444
+ 0:40:12.272 --> 0:40:21.279
1445
+ Yes, so there is criticism always with new
1446
+ technologies because people say will never
1447
+
1448
+ 0:40:21.279 --> 0:40:23.688
1449
+ translate Shakespeare.
1450
+
1451
+ 0:40:24.204 --> 0:40:26.845
1452
+ Partly agree with the second.
1453
+
1454
+ 0:40:26.845 --> 0:40:34.682
1455
+ Maybe it's not good at translating Shakespeare,
1456
+ but there's many people working on that.
1457
+
1458
+ 0:40:35.255 --> 0:40:38.039
1459
+ Of course, the poison cookie is a challenge.
1460
+
1461
+ 0:40:38.858 --> 0:40:44.946
1462
+ The point of that criticism is that
1463
+ you can never be sure whether the machine translation
1464
+
1465
+ 0:40:44.946 --> 0:40:47.546
1466
+ system doesn't make a mistake somewhere.
1467
+
1468
+ 0:40:47.546 --> 0:40:53.316
1469
+ So if you can't be sure that there's no error
1470
+ in there, how can you trust the translation?
1471
+
1472
+ 0:40:55.275 --> 0:41:01.892
1473
+ That is partly true; on the other hand, otherwise
1474
+ you have to trust a human translator,
1475
+
1476
+ 0:41:01.892 --> 0:41:06.116
1477
+ and we are sometimes overestimating human
1478
+ performance.
1479
+
1480
+ 0:41:06.746 --> 0:41:15.111
1481
+ There are very good translators, but under a
1482
+ lot of pressure even human translations are not perfect.
1483
+
1484
+ 0:41:15.715 --> 0:41:22.855
1485
+ The question is: When can you trust it enough
1486
+ anyway?
1487
+
1488
+ 0:41:22.855 --> 0:41:28.540
1489
+ You should be careful about trusting them.
1490
+
1491
+ 0:41:31.011 --> 0:41:38.023
1492
+ And I think some of them are too old now because
1493
+ it has been shown that it is helpful to have
1494
+
1495
+ 0:41:38.023 --> 0:41:41.082
1496
+ some type of machine translation system.
1497
+
1498
+ 0:41:41.082 --> 0:41:47.722
1499
+ Of course, it is not buying the car, so typically
1500
+ still a system is not working forever.
1501
+
1502
+ 0:41:48.048 --> 0:41:56.147
1503
+ If you have a dedicated system, which is
1504
+ good for the task it was built for, it is typically
1505
+
1506
+ 0:41:56.147 --> 0:41:57.947
1507
+ not as generalized.
1508
+
1509
+ 0:41:58.278 --> 0:42:07.414
1510
+ That can translate news and chats, and I don't
1511
+ know what.
1512
+
1513
+ 0:42:07.414 --> 0:42:12.770
1514
+ So typically if you want to show.
1515
+
1516
+ 0:42:12.772 --> 0:42:18.796
1517
+ It's not made for, it has not seen very well
1518
+ and then you see a bad quality.
1519
+
1520
+ 0:42:19.179 --> 0:42:27.139
1521
+ But that's also like yeah, therefore you don't
1522
+ build it.
1523
+
1524
+ 0:42:27.139 --> 0:42:42.187
1525
+ If you have a sports car and you are driving
1526
+ off road you should: Yeah, you can also say
1527
+
1528
+ 0:42:42.187 --> 0:42:49.180
1529
+ the other way around, that machine translation
1530
+ is already solved, and more and more
1531
+
1532
+ 0:42:49.180 --> 0:42:50.487
1533
+ people think so.
1534
+
1535
+ 0:42:50.750 --> 0:43:04.275
1536
+ However, there is an impressive performance
1537
+ of machine translation, but it's not stated
1538
+
1539
+ 0:43:04.275 --> 0:43:06.119
1540
+ of the art.
1541
+
1542
+ 0:43:06.586 --> 0:43:11.811
1543
+ And yeah, they're good for some domains and
1544
+ some languages that are even like already.
1545
+
1546
+ 0:43:12.572 --> 0:43:27.359
1547
+ Microsoft, for example, claimed super-human performance
1548
+ for their machine translation system.
1549
+
1550
+ 0:43:27.467 --> 0:43:38.319
1551
+ However, that was one domain, news, and one
1552
+ language pair, with Spanish, where there is a huge amount
1553
+
1554
+ 0:43:38.319 --> 0:43:45.042
1555
+ of training data and you can build a very strong
1556
+ system.
1557
+
1558
+ 0:43:45.505 --> 0:43:48.605
1559
+ And you even don't have to go to these extreme
1560
+ cases.
1561
+
1562
+ 0:43:48.688 --> 0:43:54.328
1563
+ We have worked on Kannada, which is a language
1564
+ in India spoken.
1565
+
1566
+ 0:43:54.328 --> 0:44:01.669
1567
+ I think by also around eighty million people
1568
+ so similar to to German that it has.
1569
+
1570
+ 0:44:01.669 --> 0:44:07.757
1571
+ The quality is significantly worse, it has
1572
+ significantly less data.
1573
+
1574
+ 0:44:08.108 --> 0:44:15.132
1575
+ There are still quite a lot of languages where
1576
+ the quality is not where you want it to be.
1577
+
1578
+ 0:44:15.295 --> 0:44:17.971
1579
+ Scaling this up is not as easy as it seems.
1580
+
1581
+ 0:44:17.971 --> 0:44:23.759
1582
+ That's why we're also interested in multilingual
1583
+ systems with the hope that we don't have to
1584
+
1585
+ 0:44:23.759 --> 0:44:29.548
1586
+ build a system for each possible combination,
1587
+ but we can build a system which can cover many
1588
+
1589
+ 0:44:29.548 --> 0:44:33.655
1590
+ tasks, many languages, and then also need less
1591
+ data for each of them.
1592
+
1593
+ 0:44:39.639 --> 0:44:51.067
1594
+ Let me finish with a brief presentation of the history;
1595
+ it is a bit compressed, covering just the most important points.
1596
+
1597
+ 0:44:51.331 --> 0:45:09.053
1598
+ So machine translation started coming from
1599
+ information theory; there was this idea of
1600
+
1601
+ 0:45:09.053 --> 0:45:13.286
1602
+ treating machine translation as encryption
1603
+ or decryption.
1604
+
1605
+ 0:45:13.533 --> 0:45:21.088
1606
+ If I don't understand it and want to have it in English,
1607
+ I treat it as if it's encrypted English,
1608
+
1609
+ 0:45:21.088 --> 0:45:28.724
1610
+ and then apply my decryption algorithm, which
1611
+ they were working a lot during the Second World
1612
+
1613
+ 0:45:28.724 --> 0:45:29.130
1614
+ War.
1615
+
1616
+ 0:45:29.209 --> 0:45:34.194
1617
+ And so if I can do this decryption, then
1618
+ I get the translation.
1619
+
1620
+ 0:45:34.934 --> 0:45:42.430
1621
+ And they based on that they had rules and
1622
+ so on.
1623
+
1624
+ 0:45:42.430 --> 0:45:50.843
1625
+ So there were the Georgetown experiments,
1626
+ where they translated some sentences
1627
+
1628
+ 0:45:51.691 --> 0:45:57.419
1629
+ from Russian to English, and then they were like: wow.
1630
+
1631
+ 0:45:57.419 --> 0:46:01.511
1632
+ This is solved in some years.
1633
+
1634
+ 0:46:01.511 --> 0:46:04.921
1635
+ Now we can do sentences.
1636
+
1637
+ 0:46:06.546 --> 0:46:18.657
1638
+ As you can imagine this didn't really work
1639
+ out that way, so it's not really happening.
1640
+
1641
+ 0:46:18.657 --> 0:46:24.503
1642
+ The spirit is willing, but flesh is weak.
1643
+
1644
+ 0:46:24.444 --> 0:46:30.779
1645
+ Translated it to Russian and then back again,
1646
+ and then vodka is good but the meat is rotten.
1647
+
1648
+ 0:46:31.271 --> 0:46:39.694
1649
+ Think it never really happened this way, but
1650
+ you can see you can imagine that something
1651
+
1652
+ 0:46:39.694 --> 0:46:49.533
1653
+ like that could happen, and then in in the
1654
+ there was this report saying: It's more challenging
1655
+
1656
+ 0:46:49.533 --> 0:46:56.877
1657
+ than expected and the problem is that we have
1658
+ to invest more.
1659
+
1660
+ 0:46:56.877 --> 0:47:02.801
1661
+ There's no benefit for doing machine translation.
1662
+
1663
+ 0:47:04.044 --> 0:47:09.255
1664
+ At least in some other countries there was
1665
+ a bit, but then for some time there wasn't
1666
+
1667
+ 0:47:09.255 --> 0:47:10.831
1668
+ that big out of progress.
1669
+
1670
+ 0:47:12.152 --> 0:47:26.554
1671
+ Then in the '70s there were rule-
1672
+ based systems that were built on linguistic
1673
+
1674
+ 0:47:26.554 --> 0:47:28.336
1675
+ background.
1676
+
1677
+ 0:47:28.728 --> 0:47:34.013
1678
+ They are now doing very good machine translation,
1679
+ but they had a really huge rule base.
1680
+
1681
+ 0:47:34.314 --> 0:47:43.538
1682
+ So they really had handwritten rules:
1683
+ how to parse sentences, how to transfer parse
1684
+
1685
+ 0:47:43.538 --> 0:47:45.587
1686
+ trees into the target language.
1687
+
1688
+ 0:47:46.306 --> 0:47:55.868
1689
+ When which word should be translated, these
1690
+ rule based systems were quite strong for a
1691
+
1692
+ 0:47:55.868 --> 0:47:57.627
1693
+ very long time.
1694
+
1695
+ 0:47:57.917 --> 0:48:03.947
1696
+ So even until quite recently, for some language pairs and
1697
+ some domains, it was better than machine
1698
+
1699
+ 0:48:03.947 --> 0:48:04.633
1700
+ learning.
1701
+
1702
+ 0:48:05.505 --> 0:48:09.576
1703
+ Well, of course, there was a lot of effort
1704
+ in and a lot of experts were building this.
1705
+
1706
+ 0:48:11.791 --> 0:48:13.170
1707
+ And then.
1708
+
1709
+ 0:48:13.053 --> 0:48:18.782
1710
+ The first statistical machine translations
1711
+ were coming in the early nineties.
1712
+
1713
+ 0:48:18.782 --> 0:48:25.761
1714
+ There were the systems by IBM, which we will refer to
1715
+ as the IBM models, which are quite famous,
1716
+
1717
+ 0:48:25.761 --> 0:48:32.886
1718
+ and they were used in machine translation
1719
+ from the nineties to around two thousand
1720
+
1721
+ 0:48:32.912 --> 0:48:35.891
1722
+ fifteen or so; people were working with the IBM
1723
+ models.
1724
+
1725
+ 0:48:36.496 --> 0:48:44.608
1726
+ And that was the first way of doing a machine
1727
+ translation with statisticals or machine learning.
1728
+
1729
+ 0:48:44.924 --> 0:48:52.143
1730
+ And it was possible through the French-English
1731
+ Hansard corpus from the Canadian Parliament:
1732
+
1733
+ 0:48:52.143 --> 0:48:59.516
1734
+ they also had proceedings in French and English
1735
+ and people tried to use that to translate and.
1736
+
1737
+ 0:49:01.681 --> 0:49:06.919
1738
+ And yes, so that was than the start of statistical
1739
+ machine translation.
1740
+
1741
+ 0:49:07.227 --> 0:49:17.797
1742
+ Then what is called phrase-based machine translation
1743
+ was introduced, where you could add more information
1744
+
1745
+ 0:49:17.797 --> 0:49:26.055
1746
+ and use longer chunks to translate, and phrase-
1747
+ based translation was somehow the standard
1748
+
1749
+ 0:49:26.326 --> 0:49:27.603
1750
+ until around twenty fourteen.
1751
+
1752
+ 0:49:27.767 --> 0:49:37.721
1753
+ With this phrase-based machine translation
1754
+ we saw the first commercial systems.
1755
+
1756
+ 0:49:38.178 --> 0:49:45.301
1757
+ And yeah, that was the first big advantage
1758
+ where really you can see the machine translation.
1759
+
1760
+ 0:49:47.287 --> 0:49:55.511
1761
+ And neural machine translation was mainly
1762
+ introduced.
1763
+
1764
+ 0:49:55.511 --> 0:50:07.239
1765
+ That means there was a shift from traditional
1766
+ statistical modeling to using neural networks.
1767
+
1768
+ 0:50:07.507 --> 0:50:09.496
1769
+ And that was quite impressive.
1770
+
1771
+ 0:50:09.496 --> 0:50:11.999
1772
+ It was really within one or two years.
1773
+
1774
+ 0:50:11.999 --> 0:50:17.453
1775
+ The whole research community shifted from
1776
+ what they had been working on since twenty
1777
+
1778
+ 0:50:17.453 --> 0:50:17.902
1779
+ years.
1780
+
1781
+ 0:50:17.902 --> 0:50:23.485
1782
+ And everybody was using these neural
1783
+ networks, because the performance
1784
+
1785
+ 0:50:23.485 --> 0:50:25.089
1786
+ was really so much better.
1787
+
1788
+ 0:50:25.425 --> 0:50:35.048
1789
+ Especially they are what we also see now with
1790
+ chat boards like the impressive thing.
1791
+
1792
+ 0:50:35.135 --> 0:50:45.261
1793
+ That was very, very challenging if you see
1794
+ machine translation before that, especially
1795
+
1796
+ 0:50:45.261 --> 0:50:47.123
1797
+ if the English.
1798
+
1799
+ 0:50:47.547 --> 0:50:53.352
1800
+ But if you were translating to German you
1801
+ would see that the agreement, for example
1802
+
1803
+ 0:50:53.352 --> 0:50:58.966
1804
+ shown abound and dishewn and boima and this
1805
+ didn't always really work perfect maybe for
1806
+
1807
+ 0:50:58.966 --> 0:51:04.835
1808
+ the short range of work but then it has to
1809
+ be accusative and it's like far away then things
1810
+
1811
+ 0:51:04.835 --> 0:51:06.430
1812
+ didn't really work well.
1813
+
1814
+ 0:51:06.866 --> 0:51:13.323
1815
+ Now with neural machine translation we have a
1816
+ bit of a different problem: So the sentences
1817
+
1818
+ 0:51:13.323 --> 0:51:16.901
1819
+ are typically really nice.
1820
+
1821
+ 0:51:16.901 --> 0:51:24.056
1822
+ They are perfectly written not always but
1823
+ very often.
1824
+
1825
+ 0:51:24.224 --> 0:51:36.587
1826
+ So adequacy, that source and translation should
1827
+ have the same meaning, is typically the bigger problem.
1828
+
1829
+ 0:51:42.002 --> 0:51:46.039
1830
+ So how can we do so last?
1831
+
1832
+ 0:51:46.039 --> 0:51:54.889
1833
+ What are the approaches and how can we do machine
1834
+ translation?
1835
+
1836
+ 0:51:55.235 --> 0:52:01.297
1837
+ So we had first rule-based systems, and in
1838
+ these systems we manually created
1839
+
1840
+ 0:52:01.297 --> 0:52:01.769
1841
+ rules.
1842
+
1843
+ 0:52:01.861 --> 0:52:07.421
1844
+ And there were rules for how to disambiguate
1845
+ ambiguities.
1846
+
1847
+ 0:52:07.421 --> 0:52:16.417
1848
+ For example, we had the word banks look at
1849
+ the context and do rules like to decide when.
1850
+
1851
+ 0:52:17.197 --> 0:52:28.418
1852
+ Or how to translate the structure: you know
1853
+ how to transfer the structure, that the verb
1854
+
1855
+ 0:52:28.418 --> 0:52:33.839
1856
+ has to be split in German and moved to the end.
1857
+
1858
+ 0:52:35.295 --> 0:52:36.675
1859
+ Here's a difficult thing.
1860
+
1861
+ 0:52:36.675 --> 0:52:39.118
1862
+ The nice thing is you don't need any training data.
1863
+
1864
+ 0:52:39.118 --> 0:52:41.295
1865
+ It's not like now with machine learning.
1866
+
1867
+ 0:52:41.295 --> 0:52:46.073
1868
+ If you build a machine translation system,
1869
+ the first question you should ask is do I have
1870
+
1871
+ 0:52:46.073 --> 0:52:46.976
1872
+ data to do that?
1873
+
1874
+ 0:52:46.976 --> 0:52:48.781
1875
+ Do I have parallel data to train?
1876
+
1877
+ 0:52:49.169 --> 0:52:50.885
1878
+ Here there's no data.
1879
+
1880
+ 0:52:50.885 --> 0:52:57.829
1881
+ It is all translated based on rules, but
1882
+ the problem is people creating the rules, and
1883
+
1884
+ 0:52:57.829 --> 0:52:59.857
1885
+ this needs to be experts.
1886
+
1887
+ 0:52:59.799 --> 0:53:06.614
1888
+ Understand at least the grammar in one language,
1889
+ basically the grammar in both languages.
1890
+
1891
+ 0:53:06.614 --> 0:53:09.264
1892
+ It needs to be a real language to.
1893
+
1894
+ 0:53:10.090 --> 0:53:17.308
1895
+ Then we have the two corpus based machine
1896
+ translation approaches, and then we use machine
1897
+
1898
+ 0:53:17.308 --> 0:53:22.682
1899
+ learning to learn how to translate from one
1900
+ language to the other.
1901
+
1902
+ 0:53:22.882 --> 0:53:29.205
1903
+ We should find out ourselves what is the meaning
1904
+ of individual words, which words translate
1905
+
1906
+ 0:53:29.205 --> 0:53:30.236
1907
+ to each other.
1908
+
1909
+ 0:53:30.236 --> 0:53:36.215
1910
+ The only information we give is the German
1911
+ sentence, the English sentence, and then we
1912
+
1913
+ 0:53:36.215 --> 0:53:37.245
1914
+ look for many.
1915
+
1916
+ 0:53:37.697 --> 0:53:42.373
1917
+ So maybe you think there's a Bible for each
1918
+ language.
1919
+
1920
+ 0:53:42.373 --> 0:53:44.971
1921
+ There shouldn't be a problem.
1922
+
1923
+ 0:53:45.605 --> 0:53:52.752
1924
+ But this is not the scale when we're talking
1925
+ about.
1926
+
1927
+ 0:53:52.752 --> 0:54:05.122
1928
+ Small systems have maybe one hundred thousand
1929
+ sentences when we're building large models.
1930
+
1931
+ 0:54:05.745 --> 0:54:19.909
1932
+ The statistical models do statistics about
1933
+ how often words co-occur and how often the
1934
+
1935
+ 0:54:19.909 --> 0:54:21.886
1936
+ words occur together.
1937
+
1938
+ 0:54:22.382 --> 0:54:29.523
1939
+ What we will focus on is what is currently
1940
+ in most cases referred to as neural machine translation.
1941
+
1942
+ 0:54:30.050 --> 0:54:44.792
1943
+ So in this case the idea is that you have
1944
+ a neural model which is a big neural network.
1945
+
1946
+ 0:54:45.345 --> 0:54:55.964
1947
+ And for these machine drums there quite challenging
1948
+ tasks.
1949
+
1950
+ 0:54:55.964 --> 0:55:03.883
1951
+ For example, the Transformer architecture.
1952
+
1953
+ 0:55:03.903 --> 0:55:07.399
1954
+ Cast by Google in two thousand eight.
1955
+
1956
+ 0:55:08.028 --> 0:55:19.287
1957
+ Here want to ask the screw-based machine translation
1958
+ of that part.
1959
+
1960
+ 0:55:22.862 --> 0:55:33.201
1961
+ Would say it's mainly rule based systems because
1962
+ purely rule based systems maybe exist with
1963
+
1964
+ 0:55:33.201 --> 0:55:36.348
1965
+ some very exotic languages.
1966
+
1967
+ 0:55:36.776 --> 0:55:43.947
1968
+ Of course, the idea of investigating if we
1969
+ have this type of rules, that might be still
1970
+
1971
+ 0:55:43.947 --> 0:55:45.006
1972
+ interesting.
1973
+
1974
+ 0:55:45.105 --> 0:55:52.090
1975
+ Maybe you can try to let someone force the
1976
+ rules in there.
1977
+
1978
+ 0:55:52.090 --> 0:55:57.655
1979
+ You might use rules to create artificial data.
1980
+
1981
+ 0:55:57.557 --> 0:56:03.577
1982
+ It might be helpful to have some concepts
1983
+ which were developed by linguistic researchers and to
1984
+
1985
+ 0:56:03.577 --> 0:56:09.464
1986
+ somehow integrate them; whether that helps is still an open
1987
+ question, and of course it
1988
+
1989
+ 0:56:09.464 --> 0:56:13.235
1990
+ is also interesting from more the analyzed
1991
+ perspectives.
1992
+
1993
+ 0:56:13.235 --> 0:56:13.499
1994
+ So.
1995
+
1996
+ 0:56:13.793 --> 0:56:20.755
1997
+ Do the neural networks have these types of concepts
1998
+ of gender or anything?
1999
+
2000
+ 0:56:20.755 --> 0:56:23.560
2001
+ And can we test that though?
2002
+
2003
+ 0:56:30.330 --> 0:56:34.255
2004
+ Yes, and then the other way of describing
2005
+ how this can be done.
2006
+
2007
+ 0:56:34.574 --> 0:56:52.021
2008
+ This was originally mainly for rule-based
2009
+ systems, but it can be used to describe a lot of scenarios.
2010
+
2011
+ 0:56:52.352 --> 0:57:04.135
2012
+ At the lowest level, there are direct
2013
+ translation systems that work for related languages.
2014
+
2015
+ 0:57:04.135 --> 0:57:11.367
2016
+ You mainly look at each word and replace the
2017
+ word by the one.
2018
+
2019
+ 0:57:11.631 --> 0:57:22.642
2020
+ Another idea is that you first do some type
2021
+ of analysis on the source side, so for example
2022
+
2023
+ 0:57:22.642 --> 0:57:28.952
2024
+ you can create what is referred to as a parse
2025
+ tree.
2026
+
2027
+ 0:57:30.150 --> 0:57:36.290
2028
+ Or you can instead, and that is what is called
2029
+ the interlingua-based approach.
2030
+
2031
+ 0:57:36.290 --> 0:57:44.027
2032
+ You take the source sentence and parse it into
2033
+ a semantic representation, which is hopefully
2034
+
2035
+ 0:57:44.027 --> 0:57:44.448
2036
+ the.
2037
+
2038
+ 0:57:44.384 --> 0:57:50.100
2039
+ Only of the meaning of what is said and then
2040
+ you can generate it to any other language because
2041
+
2042
+ 0:57:50.100 --> 0:57:55.335
2043
+ it has the meaning, and then you only need a
2044
+ generation part which can generate all other languages.
2045
+
2046
+ 0:57:57.077 --> 0:58:09.248
2047
+ The idea is somewhat nice to have this type
2048
+ of interlingua, general representation of all
2049
+
2050
+ 0:58:09.248 --> 0:58:17.092
2051
+ meanings, and they always translate into the
2052
+ interlingua.
2053
+
2054
+ 0:58:17.177 --> 0:58:19.189
2055
+ Is the interlingua a real language, and has it been used somewhere?
2056
+
2057
+ 0:58:20.580 --> 0:58:26.684
2058
+ It shouldn't be a natural language because
2059
+ it shouldn't have ambiguities so that's a big
2060
+
2061
+ 0:58:26.684 --> 0:58:32.995
2062
+ difference, since the source and the target language
2063
+ have ambiguities so the idea is they do some
2064
+
2065
+ 0:58:32.995 --> 0:58:39.648
2066
+ semantic representation or what does it mean
2067
+ and so on and therefore it's very easy to generate.
2068
+
2069
+ 0:58:41.962 --> 0:58:45.176
2070
+ However, the challenge is whether this really
2071
+ exists.
2072
+
2073
+ 0:58:45.176 --> 0:58:48.628
2074
+ You cannot define the language for anything
2075
+ in the world.
2076
+
2077
+ 0:58:49.249 --> 0:58:56.867
2078
+ And that's why the Lingo-based approach typically
2079
+ worked for small domains to do hotel reservation,
2080
+
2081
+ 0:58:56.867 --> 0:59:00.676
2082
+ but if you want to define the Lingo for anything.
2083
+
2084
+ 0:59:01.061 --> 0:59:07.961
2085
+ There have been approaches and semantics,
2086
+ but it's yeah, it's not really possible CR.
2087
+
2088
+ 0:59:07.961 --> 0:59:15.905
2089
+ So approaches to this because I mean a seasonal
2090
+ vector's face and bitch eyes and slaves everything
2091
+
2092
+ 0:59:15.905 --> 0:59:20.961
2093
+ that I mitonized that they all could end up
2094
+ in the same space.
2095
+
2096
+ 0:59:21.821 --> 0:59:24.936
2097
+ That is not the question.
2098
+
2099
+ 0:59:24.936 --> 0:59:35.957
2100
+ If you talk about neural networks, it's direct
2101
+ translation on the one you're putting in the
2102
+
2103
+ 0:59:35.957 --> 0:59:36.796
2104
+ input.
2105
+
2106
+ 0:59:36.957 --> 0:59:44.061
2107
+ And you can argue for both that we have been
2108
+ making this representation language agnostic
2109
+
2110
+ 0:59:44.061 --> 0:59:45.324
2111
+ or independent.
2112
+
2113
+ 0:59:47.227 --> 0:59:52.912
2114
+ Until now we were able to make it less language
2115
+ dependent but it's very hard to make it completely
2116
+
2117
+ 0:59:52.912 --> 0:59:54.175
2118
+ language independent.
2119
+
2120
+ 0:59:54.175 --> 0:59:59.286
2121
+ Maybe it's also not necessary and of course
2122
+ if there's again the problem there's not all
2123
+
2124
+ 0:59:59.286 --> 1:00:04.798
2125
+ information and the source and the target there
2126
+ is different types of information if you remove
2127
+
2128
+ 1:00:04.798 --> 1:00:05.602
2129
+ all language.
2130
+
2131
+ 1:00:05.585 --> 1:00:09.408
2132
+ Information might be that you have removed
2133
+ too many information.
2134
+
2135
+ 1:00:10.290 --> 1:00:15.280
2136
+ Talk about this and there's a very interesting
2137
+ research direction in which we are working
2138
+
2139
+ 1:00:15.280 --> 1:00:20.325
2140
+ on on the multilingual part because there is
2141
+ especially the case if we have several source
2142
+
2143
+ 1:00:20.325 --> 1:00:25.205
2144
+ languages, several type of languages who try
2145
+ to generate a representation in the middle
2146
+
2147
+ 1:00:25.205 --> 1:00:27.422
2148
+ which have the few language dependence.
2149
+
2150
+ 1:00:32.752 --> 1:00:46.173
2151
+ Yes, so for a direct base approach, so as
2152
+ said the first one is dictionary based approach.
2153
+
2154
+ 1:00:46.806 --> 1:00:48.805
2155
+ Replace some words with other words.
2156
+
2157
+ 1:00:48.805 --> 1:00:51.345
2158
+ Then you have exactly the same structure.
2159
+
2160
+ 1:00:51.771 --> 1:00:55.334
2161
+ Other problems are one to one correspondence.
2162
+
2163
+ 1:00:55.334 --> 1:01:01.686
2164
+ Some phrases are expressed with several words
2165
+ in English, but one word in German.
2166
+
2167
+ 1:01:01.686 --> 1:01:03.777
2168
+ That's extremely the case.
2169
+
2170
+ 1:01:03.777 --> 1:01:07.805
2171
+ Just think about all our German compounds like the
2172
+ Donau.
2173
+
2174
+ 1:01:08.608 --> 1:01:18.787
2175
+ What is used very often is what is referred
2176
+ to as translation memory.
2177
+
2178
+ 1:01:18.787 --> 1:01:25.074
2179
+ It might seem very simple, but it's like.
2180
+
2181
+ 1:01:26.406 --> 1:01:33.570
2182
+ That means you might think this is not helpful
2183
+ at all, but think about translating law texts.
2184
+
2185
+ 1:01:33.513 --> 1:01:38.701
2186
+ The law text is more like the interactive
2187
+ scenario for the human translator.
2188
+
2189
+ 1:01:38.701 --> 1:01:44.091
2190
+ In law text there is a lot of repetition and
2191
+ a lot of phrases occur very often.
2192
+
2193
+ 1:01:44.424 --> 1:01:55.412
2194
+ The translator just has the translation
2195
+ memory in the background and can retrieve these translations.
2196
+
2197
+ 1:01:55.895 --> 1:02:07.147
2198
+ There is even another benefit in addition
2199
+ to less work: That is also precise in the way
2200
+
2201
+ 1:02:07.147 --> 1:02:19.842
2202
+ that you avoid introducing small differences or
2203
+ mistakes.
2204
+
2205
+ 1:02:20.300 --> 1:02:22.584
2206
+ Especially, it's about consistency.
2207
+
2208
+ 1:02:23.243 --> 1:02:32.954
2209
+ If you once translate the sentence this way
2210
+ you again translate it and especially for some
2211
+
2212
+ 1:02:32.954 --> 1:02:36.903
2213
+ situations like a company they have.
2214
+
2215
+ 1:02:37.217 --> 1:02:47.695
2216
+ With this one, of course, you get more consistent
2217
+ translations.
2218
+
2219
+ 1:02:47.695 --> 1:02:56.700
2220
+ Each one is a style where phrases maybe are
2221
+ retrieved.
2222
+
2223
+ 1:03:01.861 --> 1:03:15.502
2224
+ Then we have these transfer based approaches
2225
+ where we have three steps: analysis means
2226
+
2227
+ 1:03:15.502 --> 1:03:25.975
2228
+ that you determine the syntactic structure, so
2229
+ for example for morphology the base forms.
2230
+
2231
+ 1:03:26.286 --> 1:03:37.277
2232
+ Then you will do a parse tree or dependency structure,
2233
+ that this is the adjective belonging to the noun.
2234
+
2235
+ 1:03:37.917 --> 1:03:42.117
2236
+ Then you can do the transfer where you transfer
2237
+ the structure to the other.
2238
+
2239
+ 1:03:42.382 --> 1:03:46.633
2240
+ There you have to do, for example, it's re-ordering
2241
+ because the satisfaction is different.
2242
+
2243
+ 1:03:46.987 --> 1:03:50.088
2244
+ In German, the adjective is before the noun.
2245
+
2246
+ 1:03:50.088 --> 1:03:52.777
2247
+ In Spanish, it's the other way around.
2248
+
2249
+ 1:03:52.777 --> 1:03:59.256
2250
+ You have first found and then that it's nice
2251
+ and these types of rehonoring can be done there.
2252
+
2253
+ 1:03:59.256 --> 1:04:04.633
2254
+ You might have to do other things like passive
2255
+ voice to active voice and so on.
2256
+
2257
+ 1:04:05.145 --> 1:04:14.074
2258
+ And you do some type of lexical transfer.
2259
+ And then you are doing the
2260
+
2261
+ 1:04:14.074 --> 1:04:16.014
2262
+ generation.
2263
+
2264
+ 1:04:16.014 --> 1:04:25.551
2265
+ Of course, you would do the agreement if it
2266
+ is accusative.
2267
+
2268
+ 1:04:25.551 --> 1:04:29.430
2269
+ What type of adjective?
2270
+
2271
+ 1:04:30.090 --> 1:04:32.048
2272
+ Is some kind of saving.
2273
+
2274
+ 1:04:32.048 --> 1:04:39.720
2275
+ Of course, here, because the analyze has only
2276
+ to be done in the source language, the transfer
2277
+
2278
+ 1:04:39.720 --> 1:04:41.679
2279
+ has to do on the pairs.
2280
+
2281
+ 1:04:41.679 --> 1:04:48.289
2282
+ So if you look at German, English and French
2283
+ in all directions, you only need one analysis per language.
2284
+
2285
+ 1:04:53.273 --> 1:04:59.340
2286
+ Then there is the interlingua approach, which is
2287
+ really about the pure meaning, so you have
2288
+
2289
+ 1:04:59.340 --> 1:05:00.751
2290
+ a semantic grammar.
2291
+
2292
+ 1:05:01.061 --> 1:05:07.930
2293
+ To represent everything and one thing, one
2294
+ nice implication is more extreme than before.
2295
+
2296
+ 1:05:07.930 --> 1:05:15.032
2297
+ You don't have the transfer anymore, so if
2298
+ you add one language to it and you have already.
2299
+
2300
+ 1:05:15.515 --> 1:05:26.188
2301
+ If you add the one analysis and the one generation
2302
+ phase, you can now translate from and to all languages: you need
2303
+
2304
+ 1:05:26.188 --> 1:05:40.172
2305
+ components which do the analysis and components which
2306
+ do the generation, and then you can translate:
2307
+
2308
+ 1:05:41.001 --> 1:05:45.994
2309
+ You can also do other things like paraphrasing.
2310
+
2311
+ 1:05:45.994 --> 1:05:52.236
2312
+ You can translate back to the source language
2313
+ and hopefully.
2314
+
2315
+ 1:05:53.533 --> 1:06:05.013
2316
+ If you're sparkling trying to analyze it,
2317
+ it was also done a lot for ungrammatical speech
2318
+
2319
+ 1:06:05.013 --> 1:06:11.518
2320
+ because the idea is you're in this representation.
2321
+
2322
+ 1:06:12.552 --> 1:06:18.679
2323
+ Of course, it's very much work and it's only
2324
+ realistic for limited domains.
2325
+
2326
+ 1:06:20.000 --> 1:06:25.454
2327
+ Then we have the corpus-based approaches.
2328
+
2329
+ 1:06:25.745 --> 1:06:32.486
2330
+ So we'll talk a lot about parallel data,
2331
+ and what parallel data really is is what you know
2332
+
2333
+ 1:06:32.486 --> 1:06:34.634
2334
+ from the Rosetta stone page.
2335
+
2336
+ 1:06:34.634 --> 1:06:41.227
2337
+ That is, you have a source sentence and you
2338
+ have a target sentence and you know they need
2339
+
2340
+ 1:06:41.227 --> 1:06:42.856
2341
+ to be translations of each other.
2342
+
2343
+ 1:06:43.343 --> 1:06:46.651
2344
+ And that's important, so the alignment is
2345
+ typically at a sentence level.
2346
+
2347
+ 1:06:46.987 --> 1:06:50.252
2348
+ So you know, for each sentence what is a translation?
2349
+
2350
+ 1:06:50.252 --> 1:06:55.756
2351
+ Not always perfect because maybe there's two
2352
+ German sentences and one English, but at that
2353
+
2354
+ 1:06:55.756 --> 1:06:57.570
2355
+ level it's normally possible.
2356
+
2357
+ 1:06:57.570 --> 1:07:03.194
2358
+ At word level you can't do that because it's
2359
+ a very complicated thing and sense level that's
2360
+
2361
+ 1:07:03.194 --> 1:07:04.464
2362
+ normally relatively easy.
2363
+
2364
+ 1:07:05.986 --> 1:07:12.693
2365
+ Some type of machine learning which tries
2366
+ to learn this mapping between sentences on the
2367
+
2368
+ 1:07:12.693 --> 1:07:14.851
2369
+ English side and sentences.
2370
+
2371
+ 1:07:15.355 --> 1:07:22.088
2372
+ Of course this doesn't look like good mapping
2373
+ too complex but you try to find something like
2374
+
2375
+ 1:07:22.088 --> 1:07:28.894
2376
+ that where it's a very nice mapping so there's
2377
+ always the mixing things are met to each other
2378
+
2379
+ 1:07:28.894 --> 1:07:32.224
2380
+ and then if you have the English you can try.
2381
+
2382
+ 1:07:32.172 --> 1:07:36.900
2383
+ For another English sentence you can apply
2384
+ the same mapping and hopefully arrive at
2385
+
2386
+ 1:07:36.900 --> 1:07:38.514
2387
+ the right sentence in terms.
2388
+
2389
+ 1:07:38.918 --> 1:07:41.438
2390
+ The big problem here.
2391
+
2392
+ 1:07:41.438 --> 1:07:44.646
2393
+ How can we find this model?
2394
+
2395
+ 1:07:44.646 --> 1:07:50.144
2396
+ How to map English sentences into German sentences?
2397
+
2398
+ 1:07:54.374 --> 1:08:08.492
2399
+ How we do that is that we are trying to maximize
2400
+ the probability, so we have all the parallel data.
2401
+
2402
+ 1:08:09.109 --> 1:08:15.230
2403
+ Then we're having some type of model here
2404
+ which takes the source language and translates
2405
+
2406
+ 1:08:15.230 --> 1:08:16.426
2407
+ it for a target.
2408
+
2409
+ 1:08:16.896 --> 1:08:34.008
2410
+ And then we compare with the reference translation, and we
2411
+ are adjusting our model so that the probability of the correct translation is maximized.
2412
+
2413
+ 1:08:34.554 --> 1:08:48.619
2414
+ How that is the idea behind it, how we are
2415
+ pushed now, implement that is part of the bottle.
2416
+
2417
+ 1:08:51.131 --> 1:09:01.809
2418
+ And then if we want to do translation, what
2419
+ we are doing is we are trying to find the translation.
2420
+
2421
+ 1:09:01.962 --> 1:09:06.297
2422
+ So we are scoring many possible translations.
2423
+
2424
+ 1:09:06.297 --> 1:09:12.046
2425
+ There is an infinite number of sentences that
2426
+ we are trying.
2427
+
2428
+ 1:09:12.552 --> 1:09:18.191
2429
+ That may be a bit of a problem when we talk
2430
+ about confidence because we are always trying
2431
+
2432
+ 1:09:18.191 --> 1:09:19.882
2433
+ to find the most probable.
2434
+
2435
+ 1:09:20.440 --> 1:09:28.241
2436
+ And then, of course, we are not really having
2437
+ intrinsically the possibility to say, oh, I
2438
+
2439
+ 1:09:28.241 --> 1:09:31.015
2440
+ have no idea in this situation.
2441
+
2442
+ 1:09:31.015 --> 1:09:35.782
2443
+ But our general model is always about how
2444
+ can we find?
2445
+
2446
+ 1:09:40.440 --> 1:09:41.816
2447
+ I think it's almost time.
2448
+
2449
+ 1:09:42.963 --> 1:09:44.242
2450
+ I've got four more slides.
2451
+
2452
+ 1:09:46.686 --> 1:09:52.025
2453
+ So just high level, so for a proper space
2454
+ this one we won't cover again.
2455
+
2456
+ 1:09:52.352 --> 1:10:00.808
2457
+ Its example based machine translation was
2458
+ at the beginning of SMT.
2459
+
2460
+ 1:10:00.808 --> 1:10:08.254
2461
+ The idea is that you take subparts and combine
2462
+ them again.
2463
+
2464
+ 1:10:08.568 --> 1:10:11.569
2465
+ So this will not be really covered here.
2466
+
2467
+ 1:10:11.569 --> 1:10:15.228
2468
+ Then the statistical machine translation we
2469
+ will.
2470
+
2471
+ 1:10:17.077 --> 1:10:18.773
2472
+ Yeah, we will cover next week.
2473
+
2474
+ 1:10:19.079 --> 1:10:27.594
2475
+ The idea there is that,
2476
+ if we have the sentence alignment, we automatically align the words
2477
+
2478
+ 1:10:27.527 --> 1:10:34.207
2479
+ In the sentences, and then we can learn statistical
2480
+ models of how probable words are translated
2481
+
2482
+ 1:10:34.207 --> 1:10:39.356
2483
+ to each other, and then the surge is that we
2484
+ create different hypotheses.
2485
+
2486
+ 1:10:39.356 --> 1:10:45.200
2487
+ This could be a translation of this part,
2488
+ this could be a translation of that part.
2489
+
2490
+ 1:10:45.200 --> 1:10:47.496
2491
+ We give a score to each of them.
2492
+
2493
+ 1:10:47.727 --> 1:10:51.584
2494
+ The statistical model is where a
2495
+ lot of work is done.
2496
+
2497
+ 1:10:51.584 --> 1:10:54.155
2498
+ How can we score how good translation is?
2499
+
2500
+ 1:10:54.494 --> 1:11:04.764
2501
+ The words can recur this type of structure,
2502
+ how is it reordered, and then based on that
2503
+
2504
+ 1:11:04.764 --> 1:11:08.965
2505
+ we search for the best translation.
2506
+
2507
+ 1:11:12.252 --> 1:11:19.127
2508
+ Then yeah, that one what we'll cover most
2509
+ of the time is is a neural, a model where we
2510
+
2511
+ 1:11:19.127 --> 1:11:21.102
2512
+ can use neural networks.
2513
+
2514
+ 1:11:21.102 --> 1:11:27.187
2515
+ The nice thing is everything is trained together;
2516
+ before, we had several components.
2517
+
2518
+ 1:11:27.187 --> 1:11:30.269
2519
+ Each of them was trained independently.
2520
+
2521
+ 1:11:30.210 --> 1:11:34.349
2522
+ Which of course has a disadvantage that they
2523
+ might not best work together.
2524
+
2525
+ 1:11:34.694 --> 1:11:36.601
2526
+ Here everything is trained together.
2527
+
2528
+ 1:11:36.601 --> 1:11:39.230
2529
+ The continuous representation will look into
2530
+ that.
2531
+
2532
+ 1:11:39.339 --> 1:11:41.846
2533
+ That's very helpful soft.
2534
+
2535
+ 1:11:41.846 --> 1:11:50.426
2536
+ The neural networks are able to learn somehow
2537
+ the relation between words and that's very
2538
+
2539
+ 1:11:50.426 --> 1:11:57.753
2540
+ helpful because then we can more easily deal
2541
+ with words which didn't occur.
2542
+
2543
+ 1:12:00.000 --> 1:12:05.240
2544
+ One thing just to correlate that to interlingua
2545
+ based.
2546
+
2547
+ 1:12:05.345 --> 1:12:07.646
2548
+ So we have this as an actual language.
2549
+
2550
+ 1:12:07.627 --> 1:12:11.705
2551
+ And if you do an interlingual based approach
2552
+ but don't take an artificial.
2553
+
2554
+ 1:12:11.731 --> 1:12:17.814
2555
+ With no ambiguities, but with a natural language
2556
+ that's referred to as pivot-based MT, and
2557
+
2558
+ 1:12:17.814 --> 1:12:20.208
2559
+ can be done with all the approaches.
2560
+
2561
+ 1:12:20.208 --> 1:12:25.902
2562
+ So the ideas instead of directly translating
2563
+ from German to French, you first translate
2564
+
2565
+ 1:12:25.902 --> 1:12:29.073
2566
+ from German to English and then from English
2567
+ to.
2568
+
2569
+ 1:12:29.409 --> 1:12:40.954
2570
+ French where the big advantage is that you
2571
+ might have a lot more data for these two directions
2572
+
2573
+ 1:12:40.954 --> 1:12:43.384
2574
+ than you have here.
2575
+
2576
+ 1:12:44.864 --> 1:12:54.666
2577
+ With this, thank you; if there are more questions, I'm
2578
+ a bit late, I'm sorry, and otherwise I'll see
2579
+
2580
+ 1:12:54.666 --> 1:12:55.864
2581
+ you again.
2582
+
demo_data/lectures/Lecture-01-18.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f95bffd5a310af38b1ee51daef47a0af905687cbee799c161515f743cb30d0c
3
+ size 103388000
demo_data/lectures/Lecture-02-20.04.2023/English.vtt ADDED
@@ -0,0 +1,2984 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.561 --> 0:00:05.186
4
+ Okay So Um.
5
+
6
+ 0:00:08.268 --> 0:00:17.655
7
+ Welcome to today's presentation of the second
8
+ class on machine translation, where we'll today
9
+
10
+ 0:00:17.655 --> 0:00:25.044
11
+ do a bit of a specific topic and we'll talk
12
+ about linguistic backgrounds.
13
+
14
+ 0:00:26.226 --> 0:00:34.851
15
+ We will cover three different parts in
16
+ the lecture.
17
+
18
+ 0:00:35.615 --> 0:00:42.538
19
+ We'll do first a very, very brief introduction
20
+ about linguistic background in a way that what
21
+
22
+ 0:00:42.538 --> 0:00:49.608
23
+ is language, what are ways of describing language,
24
+ what are the theories behind it, very, very
25
+
26
+ 0:00:49.608 --> 0:00:50.123
27
+ short.
28
+
29
+ 0:00:50.410 --> 0:00:57.669
30
+ I don't know whether some of you listened, I think,
31
+ to the NLP lecture in the last semester or so.
32
+
33
+ 0:00:58.598 --> 0:01:02.553
34
+ So there we did a lot longer explanation.
35
+
36
+ 0:01:02.553 --> 0:01:08.862
37
+ Here is just because we are not talking about
38
+ machine translation.
39
+
40
+ 0:01:09.109 --> 0:01:15.461
41
+ So it's really focused on the parts which
42
+ are important when we talk about machine translation.
43
+
44
+ 0:01:15.755 --> 0:01:19.377
45
+ Though for everybody who has listened to that
46
+ already, it's a bit of a repetition.
47
+
48
+ 0:01:19.377 --> 0:01:19.683
49
+ Maybe.
50
+
51
+ 0:01:19.980 --> 0:01:23.415
52
+ But it's really trying to look.
53
+
54
+ 0:01:23.415 --> 0:01:31.358
55
+ These are properties of languages and how
56
+ can they influence translation.
57
+
58
+ 0:01:31.671 --> 0:01:38.928
59
+ We'll use that in the second part to discuss
60
+ why machine translation is hard, given what we
61
+
62
+ 0:01:38.928 --> 0:01:40.621
63
+ know about language.
64
+
65
+ 0:01:40.940 --> 0:01:47.044
66
+ We will see that I mean there's two main things
67
+ is that the language might express ideas and
68
+
69
+ 0:01:47.044 --> 0:01:53.279
70
+ information differently, and if they are expressed
71
+ different in different languages we have to
72
+
73
+ 0:01:53.279 --> 0:01:54.920
74
+ do somehow the transfer.
75
+
76
+ 0:01:55.135 --> 0:02:02.771
77
+ And it's not purely that we know there's words
78
+ used for it, but it's not that simple and very
79
+
80
+ 0:02:02.771 --> 0:02:03.664
81
+ different.
82
+
83
+ 0:02:04.084 --> 0:02:10.088
84
+ And the other problem we mentioned last time
85
+ about biases is that there's not always the
86
+
87
+ 0:02:10.088 --> 0:02:12.179
88
+ same amount of information in.
89
+
90
+ 0:02:12.592 --> 0:02:18.206
91
+ So it can be that there's some more information
92
+ in the one or you can't express that few information
93
+
94
+ 0:02:18.206 --> 0:02:19.039
95
+ on the target.
96
+
97
+ 0:02:19.039 --> 0:02:24.264
98
+ We had that also, for example, with the example
99
+ with the rice plant: in German, we would just
100
+
101
+ 0:02:24.264 --> 0:02:24.820
102
+ say rice.
103
+
104
+ 0:02:24.904 --> 0:02:33.178
105
+ Or in English, while in other languages you
106
+ have to distinguish between rice plant or rice
107
+
108
+ 0:02:33.178 --> 0:02:33.724
109
+ as a.
110
+
111
+ 0:02:34.194 --> 0:02:40.446
112
+ And then it's not always possible to directly
113
+ infer this on the surface.
114
+
115
+ 0:02:41.781 --> 0:02:48.501
116
+ And if we make it to the last point otherwise
117
+ we'll do that next Tuesday or we'll partly
118
+
119
+ 0:02:48.501 --> 0:02:55.447
120
+ do it only here is like we'll describe briefly
121
+ the three main approaches on a rule based so
122
+
123
+ 0:02:55.447 --> 0:02:59.675
124
+ linguistic motivated ways of doing machine
125
+ translation.
126
+
127
+ 0:02:59.779 --> 0:03:03.680
128
+ We mentioned them last time like the direct
129
+ translation.
130
+
131
+ 0:03:03.680 --> 0:03:10.318
132
+ The translation by transfer and the interlingua-
133
+ based one; we will do that a bit more in detail today.
134
+
135
+ 0:03:10.590 --> 0:03:27.400
136
+ But very briefly because this is not a focus
137
+ of this class and then next week because.
138
+
139
+ 0:03:29.569 --> 0:03:31.757
140
+ Why do we think this is important?
141
+
142
+ 0:03:31.757 --> 0:03:37.259
143
+ On the one hand, of course, we are dealing
144
+ with natural language, so therefore it might
145
+
146
+ 0:03:37.259 --> 0:03:43.074
147
+ be good to spend a bit of time in understanding
148
+ what we are really dealing with because this
149
+
150
+ 0:03:43.074 --> 0:03:45.387
151
+ is challenging these other problems.
152
+
153
+ 0:03:45.785 --> 0:03:50.890
154
+ And on the other hand, this was the first
155
+ way of how we're doing machine translation.
156
+
157
+ 0:03:51.271 --> 0:04:01.520
158
+ Therefore, it's interesting to understand
159
+ what was the idea behind that and also to later
160
+
161
+ 0:04:01.520 --> 0:04:08.922
162
+ see what is done differently and to understand
163
+ when some models.
164
+
165
+ 0:04:13.453 --> 0:04:20.213
166
+ When we're talking about linguistics, we can
167
+ of course do that on different levels and there's
168
+
169
+ 0:04:20.213 --> 0:04:21.352
170
+ different ways.
171
+
172
+ 0:04:21.521 --> 0:04:26.841
173
+ On the right side here you are seeing the
174
+ basic levels of linguistics.
175
+
176
+ 0:04:27.007 --> 0:04:31.431
177
+ So we have at the bottom the phonetics and
178
+ phonology.
179
+
180
+ 0:04:31.431 --> 0:04:38.477
181
+ Phones will not cover this year because we
182
+ are mainly focusing on text input where we
183
+
184
+ 0:04:38.477 --> 0:04:42.163
185
+ are directly having directors and then work.
186
+
187
+ 0:04:42.642 --> 0:04:52.646
188
+ Then what we touch today, at least mention
189
+ what it is, is a morphology which is the first
190
+
191
+ 0:04:52.646 --> 0:04:53.424
192
+ level.
193
+
194
+ 0:04:53.833 --> 0:04:59.654
195
+ Already mentioned it a bit on Tuesday that
196
+ of course there are some languages where this
197
+
198
+ 0:04:59.654 --> 0:05:05.343
199
+ is very, very basic and there is not really
200
+ a lot of rules of how you can build words.
201
+
202
+ 0:05:05.343 --> 0:05:11.099
203
+ But since I assume you all have some basic
204
+ knowledge of German there is like a lot more
205
+
206
+ 0:05:11.099 --> 0:05:12.537
207
+ challenges than that.
208
+
209
+ 0:05:13.473 --> 0:05:20.030
210
+ You know, maybe if you're a native speaker
211
+ that's quite easy and everything is clear,
212
+
213
+ 0:05:20.030 --> 0:05:26.969
214
+ but if you have to learn it like the endings
215
+ of a word, we are famous for doing compositar
216
+
217
+ 0:05:26.969 --> 0:05:29.103
218
+ and putting words together.
219
+
220
+ 0:05:29.103 --> 0:05:31.467
221
+ So this is like the first lab.
222
+
223
+ 0:05:32.332 --> 0:05:40.268
224
+ Then we have the syntax, which is both on
225
+ the word and on the sentence level, and that's
226
+
227
+ 0:05:40.268 --> 0:05:43.567
228
+ about the structure of the sentence.
229
+
230
+ 0:05:43.567 --> 0:05:46.955
231
+ What are the functions of some words?
232
+
233
+ 0:05:47.127 --> 0:05:51.757
234
+ You might remember part of speech text from
235
+ From Your High School Time.
236
+
237
+ 0:05:51.757 --> 0:05:57.481
238
+ There is like noun and adjective and and things
239
+ like that and this is something helpful.
240
+
241
+ 0:05:57.737 --> 0:06:03.933
242
+ Just imagine in the beginning that it was
243
+ not only used for rule based but for statistical
244
+
245
+ 0:06:03.933 --> 0:06:10.538
246
+ machine translation, for example, the reordering
247
+ between languages was quite a challenging task.
248
+
249
+ 0:06:10.770 --> 0:06:16.330
250
+ Especially if you have long range reorderings
251
+ and their part of speech information is very
252
+
253
+ 0:06:16.330 --> 0:06:16.880
254
+ helpful.
255
+
256
+ 0:06:16.880 --> 0:06:20.301
257
+ You know, in German you have to move the word
258
+ the verb.
259
+
260
+ 0:06:20.260 --> 0:06:26.599
261
+ To the second position, if you have Spanish
262
+ you have to change the noun and the adjective
263
+
264
+ 0:06:26.599 --> 0:06:30.120
265
+ so information from part of speech could be
266
+ very.
267
+
268
+ 0:06:30.410 --> 0:06:38.621
269
+ Then you have a syntax base structure where
270
+ you have a full syntax tree in the beginning
271
+
272
+ 0:06:38.621 --> 0:06:43.695
273
+ and then it came into statistical machine translation.
274
+
275
+ 0:06:44.224 --> 0:06:50.930
276
+ And it got more and more important for statistical
277
+ machine translation that you are really trying
278
+
279
+ 0:06:50.930 --> 0:06:53.461
280
+ to model the whole syntax tree of a.
281
+
282
+ 0:06:53.413 --> 0:06:57.574
283
+ Sentence in order to better match how to do
284
+ that in UM.
285
+
286
+ 0:06:57.574 --> 0:07:04.335
287
+ In the target language, a bit yeah, the syntax
288
+ based statistical machine translation had a
289
+
290
+ 0:07:04.335 --> 0:07:05.896
291
+ bit of a problem.
292
+
293
+ 0:07:05.896 --> 0:07:08.422
294
+ It got better and better and was.
295
+
296
+ 0:07:08.368 --> 0:07:13.349
297
+ Just on the way of getting better in some
298
+ languages than traditional statistical models.
299
+
300
+ 0:07:13.349 --> 0:07:18.219
301
+ But then the neural models came up and they
302
+ were just so much better in modelling that
303
+
304
+ 0:07:18.219 --> 0:07:19.115
305
+ all implicitly.
306
+
307
+ 0:07:19.339 --> 0:07:23.847
308
+ So that they are never were used in practice
309
+ so much.
310
+
311
+ 0:07:24.304 --> 0:07:34.262
312
+ And then we'll talk about the semantics, so
313
+ what is the meaning of the words?
314
+
315
+ 0:07:34.262 --> 0:07:40.007
316
+ Last time words can have different meanings.
317
+
318
+ 0:07:40.260 --> 0:07:46.033
319
+ And yeah, how you represent meaning of cause
320
+ is very challenging.
321
+
322
+ 0:07:45.966 --> 0:07:53.043
323
+ And normally that like formalizing this is
324
+ typically done in quite limited domains because
325
+
326
+ 0:07:53.043 --> 0:08:00.043
327
+ like doing that for like all possible words
328
+ has not really been achieved yet in this very
329
+
330
+ 0:08:00.043 --> 0:08:00.898
331
+ challenge.
332
+
333
+ 0:08:02.882 --> 0:08:09.436
334
+ About pragmatics, so pragmatics is then what
335
+ is meaning in the context of the current situation.
336
+
337
+ 0:08:09.789 --> 0:08:16.202
338
+ So one famous example is there, for example,
339
+ if you say the light is red.
340
+
341
+ 0:08:16.716 --> 0:08:21.795
342
+ The traffic light is red so that typically
343
+ not you don't want to tell the other person
344
+
345
+ 0:08:21.795 --> 0:08:27.458
346
+ if you're sitting in a car that it's surprising
347
+ oh the light is red but typically you're meaning
348
+
349
+ 0:08:27.458 --> 0:08:30.668
350
+ okay you should stop and you shouldn't pass
351
+ the light.
352
+
353
+ 0:08:30.850 --> 0:08:40.994
354
+ So the meaning of this sentence, the light,
355
+ is red in the context of sitting in the car.
356
+
357
+ 0:08:42.762 --> 0:08:51.080
358
+ So let's start with the morphology so that
359
+ with the things we are starting there and one
360
+
361
+ 0:08:51.080 --> 0:08:53.977
362
+ easy and first thing is there.
363
+
364
+ 0:08:53.977 --> 0:09:02.575
365
+ Of course we have to split the sentence into
366
+ words or join characters so that we have words.
367
+
368
+ 0:09:02.942 --> 0:09:09.017
369
+ Because in most of our work we'll deal like
370
+ machine translation with some type of words.
371
+
372
+ 0:09:09.449 --> 0:09:15.970
373
+ In neural machine translation, people are working
374
+ also on character-based models and subwords, but a
375
+ basic unit like the words of the sentence is a very
376
+ 0:09:15.970 --> 0:09:20.772
377
+ basic unique words of the sentence is a very
378
+ important first step.
379
+
380
+ 0:09:21.421 --> 0:09:32.379
381
+ And for many languages that is quite simple
382
+ in German, it's not that hard to determine
383
+
384
+ 0:09:32.379 --> 0:09:33.639
385
+ the word.
386
+
387
+ 0:09:34.234 --> 0:09:46.265
388
+ In tokenization, the main challenge is if
389
+ we are doing corpus-based methods that we are
390
+
391
+ 0:09:46.265 --> 0:09:50.366
392
+ also dealing as normal words.
393
+
394
+ 0:09:50.770 --> 0:10:06.115
395
+ And there of course it's getting a bit more
396
+ challenging.
397
+
398
+ 0:10:13.173 --> 0:10:17.426
399
+ So that is maybe the main thing where, for
400
+ example, in Germany, if you think of German
401
+
402
+ 0:10:17.426 --> 0:10:19.528
403
+ tokenization, it's easy to get every word.
404
+
405
+ 0:10:19.779 --> 0:10:26.159
406
+ You split it at a space, but then you would
407
+ have the dots at the end join to the last word,
408
+
409
+ 0:10:26.159 --> 0:10:30.666
410
+ and of course that you don't want because it's
411
+ a different word.
412
+
413
+ 0:10:30.666 --> 0:10:37.046
414
+ The last word would not be go, but go dot,
415
+ but what you can do is split up the dots always.
416
+
417
+ 0:10:37.677 --> 0:10:45.390
418
+ Can you really do that always or it might
419
+ be sometimes better to keep the dot as a point?
420
+
421
+ 0:10:47.807 --> 0:10:51.001
422
+ For example, email addresses or abbreviations
423
+ here.
424
+
425
+ 0:10:51.001 --> 0:10:56.284
426
+ For example, doctor, maybe it doesn't make
427
+ sense to split up the dot because then you
428
+
429
+ 0:10:56.284 --> 0:11:01.382
430
+ would assume all year starts a new sentence,
431
+ but it's just the DR dot from doctor.
432
+
433
+ 0:11:01.721 --> 0:11:08.797
434
+ Or if you have numbers like he's a seventh
435
+ person like the zipter, then you don't want
436
+
437
+ 0:11:08.797 --> 0:11:09.610
438
+ to split.
439
+
440
+ 0:11:09.669 --> 0:11:15.333
441
+ So there are some things where it could be
442
+ a bit more difficult, but it's not really challenging.
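A minimal sketch of this kind of dot-splitting heuristic (an illustrative assumption, not taken from the lecture; the abbreviation list and example are made up):

```python
import re

# Toy rule-based tokenizer: split off sentence-final punctuation, but keep the
# dot attached to known abbreviations ("Dr.") and to ordinals written as "7.".
ABBREVIATIONS = {"dr.", "prof.", "etc.", "z.b."}  # assumed, incomplete list

def tokenize(sentence):
    tokens = []
    for raw in sentence.split():
        if raw.lower() in ABBREVIATIONS or re.fullmatch(r"\d+\.", raw):
            tokens.append(raw)                  # keep "Dr." or "7." intact
        elif raw.endswith((".", ",", "!", "?")):
            tokens.extend([raw[:-1], raw[-1]])  # split "go." -> "go", "."
        else:
            tokens.append(raw)
    return tokens

print(tokenize("Dr. Smith arrived on the 7. of May."))
# ['Dr.', 'Smith', 'arrived', 'on', 'the', '7.', 'of', 'May', '.']
```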
443
+
444
+ 0:11:16.796 --> 0:11:23.318
445
+ In other languages it's getting a lot more
446
+ challenging, especially in Asian languages
447
+
448
+ 0:11:23.318 --> 0:11:26.882
449
+ where often there are no spaces between words.
450
+
451
+ 0:11:27.147 --> 0:11:32.775
452
+ So you just have the sequence of characters.
453
+
454
+ 0:11:32.775 --> 0:11:38.403
455
+ The quick brown fox jumps over the lazy dog.
456
+
457
+ 0:11:38.999 --> 0:11:44.569
458
+ And then it still might be helpful to work
459
+ on something like words.
460
+
461
+ 0:11:44.569 --> 0:11:48.009
462
+ Then you need to have a bit more complex.
463
+
464
+ 0:11:48.328 --> 0:11:55.782
465
+ And here you see we are again having our typical
466
+ problem.
467
+
468
+ 0:11:55.782 --> 0:12:00.408
469
+ That means that there is ambiguity.
470
+
471
+ 0:12:00.600 --> 0:12:02.104
472
+ So you're seeing here.
473
+
474
+ 0:12:02.104 --> 0:12:08.056
475
+ We have exactly the same sequence of characters
476
+ or here, but depending on how we split it,
477
+
478
+ 0:12:08.056 --> 0:12:12.437
479
+ it means he is your servant or he is the one
480
+ who used your things.
481
+
482
+ 0:12:12.437 --> 0:12:15.380
483
+ Or here we have round eyes and take the air.
484
+
485
+ 0:12:15.895 --> 0:12:22.953
486
+ So then of course yeah this type of tokenization
487
+ gets more important because you could introduce
488
+
489
+ 0:12:22.953 --> 0:12:27.756
490
+ already arrows and you can imagine if you're
491
+ doing it here wrong.
492
+
493
+ 0:12:27.756 --> 0:12:34.086
494
+ If you once do a wrong decision it's quite
495
+ difficult to recover from a wrong decision.
496
+
497
+ 0:12:34.634 --> 0:12:47.088
498
+ And so in these cases looking about how we're
499
+ doing tokenization is an important issue.
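One common baseline for unspaced scripts, shown here only as a hedged sketch (toy dictionary, Latin letters instead of real Chinese characters), is greedy longest-match segmentation; its weakness is exactly the ambiguity above, since one wrong early split is hard to recover from:

```python
# Illustrative greedy "maximum matching" segmenter over a toy dictionary.
VOCAB = {"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog"}

def max_match(text, vocab, max_len=10):
    tokens, i = [], 0
    while i < len(text):
        # Try the longest dictionary entry starting at position i.
        for j in range(min(len(text), i + max_len), i, -1):
            if text[i:j] in vocab:
                tokens.append(text[i:j])
                i = j
                break
        else:
            # Unknown character: emit it as a single-character token.
            tokens.append(text[i])
            i += 1
    return tokens

print(max_match("thequickbrownfoxjumpsoverthelazydog", VOCAB))
# -> ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
```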
500
+
501
+ 0:12:47.127 --> 0:12:54.424
502
+ And then it might be helpful to do things
503
+ like character-based models where we treat each
504
+ character as a symbol.
505
+ 0:12:54.424 --> 0:12:56.228
506
+ director as a symbol.
507
+
508
+ 0:12:56.228 --> 0:13:01.803
509
+ For example, do this decision in the later
510
+ or never really do this?
511
+
512
+ 0:13:06.306 --> 0:13:12.033
513
+ The other thing is that if we have words we
514
+ might, it might not be the optimal unit to
515
+
516
+ 0:13:12.033 --> 0:13:18.155
517
+ work with because it can be that we should
518
+ look into the internal structure of words because
519
+
520
+ 0:13:18.155 --> 0:13:20.986
521
+ if we have a morphological rich language,.
522
+
523
+ 0:13:21.141 --> 0:13:27.100
524
+ That means we have a lot of different types
525
+ of words, and if you have a lot of many different
526
+
527
+ 0:13:27.100 --> 0:13:32.552
528
+ types of words, it on the other hand means
529
+ of course each of these words we have seen
530
+
531
+ 0:13:32.552 --> 0:13:33.757
532
+ very infrequently.
533
+
534
+ 0:13:33.793 --> 0:13:39.681
535
+ So if you only have ten words and you have
536
+ a large corpus, each word occurs more often.
537
+
538
+ 0:13:39.681 --> 0:13:45.301
539
+ If you have three million different words,
540
+ then each of them will occur less often.
541
+
542
+ 0:13:45.301 --> 0:13:51.055
543
+ Hopefully you know, from machine learning,
544
+ it's helpful if you have seen each example
545
+
546
+ 0:13:51.055 --> 0:13:51.858
547
+ very often.
548
+
549
+ 0:13:52.552 --> 0:13:54.524
550
+ And so why does it help?
551
+
552
+ 0:13:54.524 --> 0:13:56.495
553
+ Why does it help happen?
554
+
555
+ 0:13:56.495 --> 0:14:02.410
556
+ Yeah, in some languages we have quite a complex
557
+ information inside a word.
558
+
559
+ 0:14:02.410 --> 0:14:09.271
560
+ So here's a word from a finish talosanikiko
561
+ or something like that, and it means in my
562
+
563
+ 0:14:09.271 --> 0:14:10.769
564
+ house to question.
565
+
566
+ 0:14:11.491 --> 0:14:15.690
567
+ So you have all these information attached
568
+ to the word.
569
+
570
+ 0:14:16.036 --> 0:14:20.326
571
+ And that of course in extreme case that's
572
+ why typically, for example, Finnish is the
573
+
574
+ 0:14:20.326 --> 0:14:20.831
575
+ language.
576
+
577
+ 0:14:20.820 --> 0:14:26.725
578
+ Where machine translation quality is less
579
+ good because generating all these different
580
+
581
+ 0:14:26.725 --> 0:14:33.110
582
+ morphological variants is is a challenge and
583
+ the additional challenge is typically in finish
584
+
585
+ 0:14:33.110 --> 0:14:39.564
586
+ not really low resource but for in low resource
587
+ languages you quite often have more difficult
588
+
589
+ 0:14:39.564 --> 0:14:40.388
590
+ morphology.
591
+
592
+ 0:14:40.440 --> 0:14:43.949
593
+ Mean English is an example of a relatively
594
+ easy one.
595
+
596
+ 0:14:46.066 --> 0:14:54.230
597
+ And so in general we can say that words are
598
+ composed of more themes, and more themes are
599
+
600
+ 0:14:54.230 --> 0:15:03.069
601
+ the smallest meaning carrying unit, so normally
602
+ it means: All morphine should have some type
603
+
604
+ 0:15:03.069 --> 0:15:04.218
605
+ of meaning.
606
+
607
+ 0:15:04.218 --> 0:15:09.004
608
+ For example, here does not really have a meaning.
609
+
610
+ 0:15:09.289 --> 0:15:12.005
611
+ Bian has some type of meaning.
612
+
613
+ 0:15:12.005 --> 0:15:14.371
614
+ It's changing the meaning.
615
+
616
+ 0:15:14.371 --> 0:15:21.468
617
+ The NES has the meaning that it's making out
618
+ of an adjective, a noun, and happy.
619
+
620
+ 0:15:21.701 --> 0:15:31.215
621
+ So each of these parts conveys some meaning,
622
+ but you cannot split them further up and have
623
+
624
+ 0:15:31.215 --> 0:15:32.156
625
+ somehow.
626
+
627
+ 0:15:32.312 --> 0:15:36.589
628
+ You see that of course a little bit more is
629
+ happening.
630
+
631
+ 0:15:36.589 --> 0:15:43.511
632
+ Typically the Y is going into an E so there
633
+ can be some variation, but these are typical
634
+
635
+ 0:15:43.511 --> 0:15:46.544
636
+ examples of what we have as morphines.
637
+
638
+ 0:16:02.963 --> 0:16:08.804
639
+ That is, of course, a problem and that's the
640
+ question why how you do your splitting.
641
+
642
+ 0:16:08.804 --> 0:16:15.057
643
+ But that problem we have anyway always because
644
+ even full words can have different meanings
645
+
646
+ 0:16:15.057 --> 0:16:17.806
647
+ depending on the context they're using.
648
+
649
+ 0:16:18.038 --> 0:16:24.328
650
+ So we always have to somewhat have a model
651
+ which can infer or represent the meaning of
652
+
653
+ 0:16:24.328 --> 0:16:25.557
654
+ the word in the.
655
+
656
+ 0:16:25.825 --> 0:16:30.917
657
+ But you are right that this problem might
658
+ get even more severe if you're splitting up.
659
+
660
+ 0:16:30.917 --> 0:16:36.126
661
+ Therefore, it might not be the best to go
662
+ for the very extreme and represent each letter
663
+
664
+ 0:16:36.126 --> 0:16:41.920
665
+ and have a model which is only on letters because,
666
+ of course, a letter can have a lot of different
667
+
668
+ 0:16:41.920 --> 0:16:44.202
669
+ meanings depending on where it's used.
670
+
671
+ 0:16:44.524 --> 0:16:50.061
672
+ And yeah, there is no right solution like
673
+ what is the right splitting.
674
+
675
+ 0:16:50.061 --> 0:16:56.613
676
+ It depends on the language and the application
677
+ on the amount of data you're having.
678
+
679
+ 0:16:56.613 --> 0:17:01.058
680
+ For example, typically it means the fewer
681
+ data you have.
682
+
683
+ 0:17:01.301 --> 0:17:12.351
684
+ The more splitting you should do, if you have
685
+ more data, then you can be better distinguish.
686
+
687
+ 0:17:13.653 --> 0:17:19.065
688
+ Then there are different types of morphines:
689
+ So we have typically one stemmed theme: It's
690
+
691
+ 0:17:19.065 --> 0:17:21.746
692
+ like house or tish, so the main meaning.
693
+
694
+ 0:17:21.941 --> 0:17:29.131
695
+ And then you can have functional or bound
696
+ morphemes which can be f which can be prefix,
697
+
698
+ 0:17:29.131 --> 0:17:34.115
699
+ suffix, infix or circumfix so it can be before
700
+ can be after.
701
+
702
+ 0:17:34.114 --> 0:17:39.416
703
+ It can be inside or it can be around it, something
704
+ like a coughed there.
705
+
706
+ 0:17:39.416 --> 0:17:45.736
707
+ Typically you would say that it's not like
708
+ two more themes, G and T, because they both
709
+
710
+ 0:17:45.736 --> 0:17:50.603
711
+ describe the function, but together G and T
712
+ are marking the cough.
713
+
714
+ 0:17:53.733 --> 0:18:01.209
715
+ For what are people using them you can use
716
+ them for inflection to describe something like
717
+
718
+ 0:18:01.209 --> 0:18:03.286
719
+ tense count person case.
720
+
721
+ 0:18:04.604 --> 0:18:09.238
722
+ That is yeah, if you know German, this is
723
+ commonly used in German.
724
+
725
+ 0:18:10.991 --> 0:18:16.749
726
+ But of course there is a lot more complicated
727
+ things: I think in in some languages it also.
728
+
729
+ 0:18:16.749 --> 0:18:21.431
730
+ I mean, in Germany it only depends counting
731
+ person on the subject.
732
+
733
+ 0:18:21.431 --> 0:18:27.650
734
+ For the word, for example, in other languages
735
+ it can also determine the first and on the
736
+
737
+ 0:18:27.650 --> 0:18:28.698
738
+ second object.
739
+
740
+ 0:18:28.908 --> 0:18:35.776
741
+ So that it like if you buy an apple or an
742
+ house, that not only the, the, the.
743
+
744
+ 0:18:35.776 --> 0:18:43.435
745
+ Kauft depends on on me like in German, but
746
+ it can also depend on whether it's an apple
747
+
748
+ 0:18:43.435 --> 0:18:44.492
749
+ or a house.
750
+
751
+ 0:18:44.724 --> 0:18:48.305
752
+ And then of course you have an exploding number
753
+ of word forms.
754
+
755
+ 0:18:49.409 --> 0:19:04.731
756
+ Furthermore, it can be used to do derivations
757
+ so you can make other types of words from it.
758
+
759
+ 0:19:05.165 --> 0:19:06.254
760
+ And then yeah.
761
+
762
+ 0:19:06.254 --> 0:19:12.645
763
+ This is like creating new words by joining
764
+ them like rainbow waterproof but for example
765
+
766
+ 0:19:12.645 --> 0:19:19.254
767
+ in German like 'Einkaufswagen', 'eiskalt' and
768
+ so on where you can join where you can do that
769
+
770
+ 0:19:19.254 --> 0:19:22.014
771
+ with nouns and German adjectives and.
772
+
773
+ 0:19:22.282 --> 0:19:29.077
774
+ Then of course you might have additional challenges
775
+ like the Fugan where you have to add this one.
776
+
777
+ 0:19:32.452 --> 0:19:39.021
778
+ Yeah, then there is a yeah of course additional
779
+ special things.
780
+
781
+ 0:19:39.639 --> 0:19:48.537
782
+ You have to sometimes put extra stuff because
783
+ of phonology, so it's dig the plural, not plural.
784
+
785
+ 0:19:48.537 --> 0:19:56.508
786
+ The third person singular, as in English,
787
+ is normally S, but by Goes, for example, is
788
+
789
+ 0:19:56.508 --> 0:19:57.249
790
+ an E S.
791
+
792
+ 0:19:57.277 --> 0:20:04.321
793
+ In German you can also have other things that
794
+ like 'Mutter' becomes 'Mütter', so you're changing
795
+ the umlaut in order to express the plural, and
796
+ 0:20:04.321 --> 0:20:11.758
797
+ the Umlaud in order to express the plural and
798
+ in other languages for example the vowel harmony
799
+
800
+ 0:20:11.758 --> 0:20:17.315
801
+ where the vowels inside are changing depending
802
+ on which form you have.
803
+
804
+ 0:20:17.657 --> 0:20:23.793
805
+ Which makes things more difficult than splitting
806
+ a word into its part doesn't really work anymore.
807
+
808
+ 0:20:23.793 --> 0:20:28.070
809
+ So like for 'Mutter' and 'Mütter', for example, that
810
+ is not really possible.
811
+
812
+ 0:20:28.348 --> 0:20:36.520
813
+ The nice thing is, of course, more like a
814
+ general thing, but often irregular things are
815
+
816
+ 0:20:36.520 --> 0:20:39.492
817
+ happening as words which occur.
818
+
819
+ 0:20:39.839 --> 0:20:52.177
820
+ So that you can have enough examples, while
821
+ the regular things you can do by some type
822
+
823
+ 0:20:52.177 --> 0:20:53.595
824
+ of rules.
825
+
826
+ 0:20:55.655 --> 0:20:57.326
827
+ Yeah, This Can Be Done.
828
+
829
+ 0:20:57.557 --> 0:21:02.849
830
+ So there are tasks on this: how to do automatic
831
+ inflection, how to analyze them.
832
+
833
+ 0:21:02.849 --> 0:21:04.548
834
+ So you give it a word to.
835
+
836
+ 0:21:04.548 --> 0:21:10.427
837
+ It's telling you what are the possible forms
838
+ of that, like how they are built, and so on.
839
+
840
+ 0:21:10.427 --> 0:21:15.654
841
+ And at least for high-resource languages,
842
+ there are a lot of tools for that.
843
+
844
+ 0:21:15.654 --> 0:21:18.463
845
+ Of course, if you now want to do that for.
846
+
847
+ 0:21:18.558 --> 0:21:24.281
848
+ Some language which is very low resourced
849
+ might be very difficult and there might be
850
+
851
+ 0:21:24.281 --> 0:21:25.492
852
+ no tool for them.
853
+
854
+ 0:21:28.368 --> 0:21:37.652
855
+ Good before we are going for the next part
856
+ about part of speech, are there any questions
857
+
858
+ 0:21:37.652 --> 0:21:38.382
859
+ about?
860
+
861
+ 0:22:01.781 --> 0:22:03.187
862
+ Yeah, we'll come to that a bit.
863
+
864
+ 0:22:03.483 --> 0:22:09.108
865
+ So it's a very good question and difficult
866
+ and especially we'll see that later if you
867
+
868
+ 0:22:09.108 --> 0:22:14.994
869
+ just put in words it would be very bad because
870
+ words are put into neural networks just as
871
+
872
+ 0:22:14.994 --> 0:22:15.844
873
+ some digits.
874
+
875
+ 0:22:15.844 --> 0:22:21.534
876
+ Each word is mapped onto an integer index and you
877
+ put it in so it doesn't really know any more
878
+
879
+ 0:22:21.534 --> 0:22:22.908
880
+ about the structure.
881
+
882
+ 0:22:23.543 --> 0:22:29.898
883
+ What we will see therefore the most successful
884
+ approach which is mostly done is a subword
885
+
886
+ 0:22:29.898 --> 0:22:34.730
887
+ unit where we split: But we will do this.
888
+
889
+ 0:22:34.730 --> 0:22:40.154
890
+ Don't know if you have been in advanced.
891
+
892
+ 0:22:40.154 --> 0:22:44.256
893
+ We'll cover this on a Tuesday.
894
+
895
+ 0:22:44.364 --> 0:22:52.316
896
+ So there is an algorithm called byte pair
897
+ encoding, which is about splitting words into
898
+
899
+ 0:22:52.316 --> 0:22:52.942
900
+ parts.
901
+
902
+ 0:22:53.293 --> 0:23:00.078
903
+ So it's doing the splitting of words but not
904
+ morphologically motivated but more based on
905
+
906
+ 0:23:00.078 --> 0:23:00.916
907
+ frequency.
908
+
909
+ 0:23:00.940 --> 0:23:11.312
910
+ However, it performs very good and that's
911
+ why it's used and there is a bit of correlation.
912
+
913
+ 0:23:11.312 --> 0:23:15.529
914
+ Sometimes they agree on count based.
915
+
916
+ 0:23:15.695 --> 0:23:20.709
917
+ So we're splitting words and we're splitting
918
+ especially words which are infrequent and that's
919
+
920
+ 0:23:20.709 --> 0:23:23.962
921
+ maybe a good motivation why that's good for
922
+ neural networks.
923
+
924
+ 0:23:23.962 --> 0:23:28.709
925
+ That means if you have seen a word very often
926
+ you don't need to split it and it's easier
927
+
928
+ 0:23:28.709 --> 0:23:30.043
929
+ to just process it fast.
930
+
931
+ 0:23:30.690 --> 0:23:39.218
932
+ While if you have seen the words infrequently,
933
+ it is good to split it into parts so it can
934
+
935
+ 0:23:39.218 --> 0:23:39.593
936
+ do.
937
+
938
+ 0:23:39.779 --> 0:23:47.729
939
+ So there is some way of doing it, but linguists
940
+ would say this is not a morphological analysis.
941
+
942
+ 0:23:47.729 --> 0:23:53.837
943
+ That is true, but we are splitting words into
944
+ parts if they are not seen.
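A hedged sketch of the idea behind byte pair encoding (the algorithm itself is covered in a later lecture; the toy corpus below is made up): count adjacent symbol pairs over the vocabulary, merge the most frequent pair, and repeat, so frequent words end up as single units while rare words stay split into smaller, better-observed pieces.

```python
from collections import Counter

def learn_bpe(word_freqs, num_merges):
    """Toy BPE: word_freqs maps a word (as a tuple of symbols) to its count."""
    vocab = dict(word_freqs)
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for word, freq in vocab.items():
            for a, b in zip(word, word[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)  # most frequent adjacent pair
        merges.append(best)
        new_vocab = {}
        for word, freq in vocab.items():
            merged, i = [], 0
            while i < len(word):
                if i + 1 < len(word) and (word[i], word[i + 1]) == best:
                    merged.append(word[i] + word[i + 1])
                    i += 2
                else:
                    merged.append(word[i])
                    i += 1
            new_vocab[tuple(merged)] = freq
        vocab = new_vocab
    return merges

# Made-up toy corpus: frequent words get merged into single symbols quickly.
corpus = {tuple("low"): 5, tuple("lower"): 2, tuple("newest"): 6, tuple("widest"): 3}
print(learn_bpe(corpus, 4))
```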
945
+
946
+ 0:23:59.699 --> 0:24:06.324
947
+ Yes, so another important thing about words
948
+ are the part-of-speech tags.
949
+
950
+ 0:24:06.324 --> 0:24:14.881
951
+ These are the common ones: noun, verb, adjective,
952
+ adverb, determiner, pronoun, preposition, and
953
+
954
+ 0:24:14.881 --> 0:24:16.077
955
+ conjunction.
956
+
957
+ 0:24:16.077 --> 0:24:26.880
958
+ There are some more: They are not the same
959
+ in all language, but for example there is this
960
+
961
+ 0:24:26.880 --> 0:24:38.104
962
+ universal grammar which tries to do this type
963
+ of part of speech text for many languages.
964
+
965
+ 0:24:38.258 --> 0:24:42.018
966
+ And then, of course, it's helping you for
967
+ generalization.
968
+
969
+ 0:24:42.018 --> 0:24:48.373
970
+ There are some language deals with verbs and
971
+ nouns, especially if you look at sentence structure.
972
+
973
+ 0:24:48.688 --> 0:24:55.332
974
+ And so if you know the part of speech tag
975
+ you can easily generalize and do get these
976
+
977
+ 0:24:55.332 --> 0:24:58.459
978
+ rules or apply these rules as you know.
979
+
980
+ 0:24:58.459 --> 0:25:02.680
981
+ The verb in English is always at the second
982
+ position.
983
+
984
+ 0:25:03.043 --> 0:25:10.084
985
+ So you know how to deal with verbs independently
986
+ of which words you are now really looking at.
987
+
988
+ 0:25:12.272 --> 0:25:18.551
989
+ And that again can be done is ambiguous.
990
+
991
+ 0:25:18.598 --> 0:25:27.171
992
+ So there are some words which can have several
993
+ part-of-speech tags.
994
+
995
+ 0:25:27.171 --> 0:25:38.686
996
+ Example are the word can, for example, which
997
+ can be the can of beans or can do something.
998
+
999
+ 0:25:38.959 --> 0:25:46.021
1000
+ Often is also in English related work.
1001
+
1002
+ 0:25:46.021 --> 0:25:55.256
1003
+ Access can be to excess or to access to something.
1004
+
1005
+ 0:25:56.836 --> 0:26:02.877
1006
+ Most words have only one single part of speech
1007
+ tag, but they are some where it's a bit more
1008
+
1009
+ 0:26:02.877 --> 0:26:03.731
1010
+ challenging.
1011
+
1012
+ 0:26:03.731 --> 0:26:09.640
1013
+ The nice thing is the ones which are in big
1014
+ are often more words, which occur more often,
1015
+
1016
+ 0:26:09.640 --> 0:26:12.858
1017
+ while for really ware words it's not that often.
1018
+
1019
+ 0:26:13.473 --> 0:26:23.159
1020
+ If you look at these classes you can distinguish
1021
+ open classes where new words can happen so
1022
+
1023
+ 0:26:23.159 --> 0:26:25.790
1024
+ we can invent new nouns.
1025
+
1026
+ 0:26:26.926 --> 0:26:31.461
1027
+ But then there are the close classes which
1028
+ I think are determined or pronoun.
1029
+
1030
+ 0:26:31.461 --> 0:26:35.414
1031
+ For example, it's not that you can easily
1032
+ develop your new pronoun.
1033
+
1034
+ 0:26:35.414 --> 0:26:38.901
1035
+ So there is a fixed list of pronouns and we
1036
+ are using that.
1037
+
1038
+ 0:26:38.901 --> 0:26:44.075
1039
+ So it's not like that or tomorrow there is
1040
+ something happening and then people are using
1041
+
1042
+ 0:26:44.075 --> 0:26:44.482
1043
+ a new.
1044
+
1045
+ 0:26:45.085 --> 0:26:52.426
1046
+ Pronouns or new conjunctions, so it's like 'and',
1047
+ because it's not that you normally invent a
1048
+
1049
+ 0:26:52.426 --> 0:26:52.834
1050
+ new.
1051
+
1052
+ 0:27:00.120 --> 0:27:03.391
1053
+ And additional to part of speech text.
1054
+
1055
+ 0:27:03.391 --> 0:27:09.012
1056
+ Then some of these part of speech texts have
1057
+ different properties.
1058
+
1059
+ 0:27:09.389 --> 0:27:21.813
1060
+ So, for example, for nouns and adjectives
1061
+ we can have a singular plural: In other languages,
1062
+
1063
+ 0:27:21.813 --> 0:27:29.351
1064
+ there is a duel so that a word is not only
1065
+ like a single or in plural, but also like a
1066
+
1067
+ 0:27:29.351 --> 0:27:31.257
1068
+ duel if it's meaning.
1069
+
1070
+ 0:27:31.631 --> 0:27:36.246
1071
+ You have the gender and masculine feminine
1072
+ neutre we know.
1073
+
1074
+ 0:27:36.246 --> 0:27:43.912
1075
+ In other language there is animated and inanimated
1076
+ and you have the cases like in German you have
1077
+
1078
+ 0:27:43.912 --> 0:27:46.884
1079
+ no maternative guinetive acquisitive.
1080
+
1081
+ 0:27:47.467 --> 0:27:57.201
1082
+ So here and then in other languages you also
1083
+ have Latin with the upper teeth.
1084
+
1085
+ 0:27:57.497 --> 0:28:03.729
1086
+ So there's like more, it's just like yeah,
1087
+ and there you have no one to one correspondence,
1088
+
1089
+ 0:28:03.729 --> 0:28:09.961
1090
+ so it can be that there are some cases which
1091
+ are only in the one language and do not happen
1092
+
1093
+ 0:28:09.961 --> 0:28:11.519
1094
+ in the other language.
1095
+
1096
+ 0:28:13.473 --> 0:28:20.373
1097
+ For verbs we have tenses of course like walk
1098
+ is walking walked have walked head walked will
1099
+
1100
+ 0:28:20.373 --> 0:28:21.560
1101
+ walk and so on.
1102
+
1103
+ 0:28:21.560 --> 0:28:28.015
1104
+ Interestingly for example in Japanese this
1105
+ can also happen for adjectives though there
1106
+
1107
+ 0:28:28.015 --> 0:28:32.987
1108
+ is a difference between something is white
1109
+ or something was white.
1110
+
1111
+ 0:28:35.635 --> 0:28:41.496
1112
+ There is this continuous thing which should
1113
+ not really have that commonly in German and
1114
+
1115
+ 0:28:41.496 --> 0:28:47.423
1116
+ I guess that's if you're German and learning
1117
+ English that's something like she sings and
1118
+
1119
+ 0:28:47.423 --> 0:28:53.350
1120
+ she is singing and of course we can express
1121
+ that but it's not commonly used and normally
1122
+
1123
+ 0:28:53.350 --> 0:28:55.281
1124
+ we're not doing this aspect.
1125
+
1126
+ 0:28:55.455 --> 0:28:57.240
1127
+ Also about tenses.
1128
+
1129
+ 0:28:57.240 --> 0:29:05.505
1130
+ If you use pasts in English you will also
1131
+ use past tenses in German, so we have similar
1132
+
1133
+ 0:29:05.505 --> 0:29:09.263
1134
+ tenses, but the use might be different.
1135
+
1136
+ 0:29:14.214 --> 0:29:20.710
1137
+ There is uncertainty like the mood in there
1138
+ indicative.
1139
+
1140
+ 0:29:20.710 --> 0:29:26.742
1141
+ If he were here, there's voices active and
1142
+ passive.
1143
+
1144
+ 0:29:27.607 --> 0:29:34.024
1145
+ That you know, that is like both in German
1146
+ and English there, but there is something in
1147
+
1148
+ 0:29:34.024 --> 0:29:35.628
1149
+ the Middle and Greek.
1150
+
1151
+ 0:29:35.628 --> 0:29:42.555
1152
+ I get myself taught, so there is other phenomens
1153
+ than which might only happen in one language.
1154
+
1155
+ 0:29:42.762 --> 0:29:50.101
1156
+ This is, like yeah, the different syntactic
1157
+ structures that you can have in the language,
1158
+
1159
+ 0:29:50.101 --> 0:29:57.361
1160
+ and where there's the two things, so it might
1161
+ be that some only are in some language, others
1162
+
1163
+ 0:29:57.361 --> 0:29:58.376
1164
+ don't exist.
1165
+
1166
+ 0:29:58.358 --> 0:30:05.219
1167
+ And on the other hand there is also matching,
1168
+ so it might be that in some situations you
1169
+
1170
+ 0:30:05.219 --> 0:30:07.224
1171
+ use different structures.
1172
+
1173
+ 0:30:10.730 --> 0:30:13.759
1174
+ The next would be then about semantics.
1175
+
1176
+ 0:30:13.759 --> 0:30:16.712
1177
+ Do you have any questions before that?
1178
+
1179
+ 0:30:19.819 --> 0:30:31.326
1180
+ I'll just continue, but if something is unclear
1181
+ beside the structure, we typically have more
1182
+
1183
+ 0:30:31.326 --> 0:30:39.863
1184
+ ambiguities, so it can be that words itself
1185
+ have different meanings.
1186
+
1187
+ 0:30:40.200 --> 0:30:48.115
1188
+ And we are typically talking about polysemy
1189
+ and homonymy, where polysemy means that a word
1190
+
1191
+ 0:30:48.115 --> 0:30:50.637
1192
+ can have different meanings.
1193
+
1194
+ 0:30:50.690 --> 0:30:58.464
1195
+ So if you have the English word interest,
1196
+ it can be that you are interested in something.
1197
+
1198
+ 0:30:58.598 --> 0:31:07.051
1199
+ Or it can be like the interest rate financial,
1200
+ but it is somehow related because if you are
1201
+
1202
+ 0:31:07.051 --> 0:31:11.002
1203
+ getting some interest rates there is some.
1204
+
1205
+ 0:31:11.531 --> 0:31:18.158
1206
+ But there is also homonymy, where they
1207
+ really are not related.
1208
+
1209
+ 0:31:18.458 --> 0:31:24.086
1210
+ So you can and can doesn't really have anything
1211
+ in common, so it's really very different.
1212
+
1213
+ 0:31:24.324 --> 0:31:29.527
1214
+ And of course that's not completely clear
1215
+ so there is not a clear definition so for example
1216
+
1217
+ 0:31:29.527 --> 0:31:34.730
1218
+ for the bank it can be that you say it's related
1219
+ but it can also be other can argue that so
1220
+
1221
+ 0:31:34.730 --> 0:31:39.876
1222
+ there are some clear things which is interest
1223
+ there are some which is vague and then there
1224
+
1225
+ 0:31:39.876 --> 0:31:43.439
1226
+ are some where it's very clear again that there
1227
+ are different.
1228
+
1229
+ 0:31:45.065 --> 0:31:49.994
1230
+ And in order to translate them, of course,
1231
+ we might need the context to disambiguate.
1232
+
1233
+ 0:31:49.994 --> 0:31:54.981
1234
+ That's typically where we can disambiguate,
1235
+ and that's not only for lexical semantics,
1236
+
1237
+ 0:31:54.981 --> 0:32:00.198
1238
+ that's generally very often that if you want
1239
+ to disambiguate, context can be very helpful.
1240
+
1241
+ 0:32:00.198 --> 0:32:03.981
1242
+ So in which sentence and which general knowledge
1243
+ who is speaking?
1244
+
1245
+ 0:32:04.944 --> 0:32:09.867
1246
+ You can do that externally by some disambiguation
1247
+ task.
1248
+
1249
+ 0:32:09.867 --> 0:32:14.702
1250
+ Machine translation system will also do it
1251
+ internally.
1252
+
1253
+ 0:32:16.156 --> 0:32:21.485
1254
+ And sometimes you're lucky and you don't need
1255
+ to do it because you just have the same ambiguity
1256
+
1257
+ 0:32:21.485 --> 0:32:23.651
1258
+ in the source and the target language.
1259
+
1260
+ 0:32:23.651 --> 0:32:26.815
1261
+ And then it doesn't matter if you think about
1262
+ the mouse.
1263
+
1264
+ 0:32:26.815 --> 0:32:31.812
1265
+ As I said, you don't really need to know if
1266
+ it's a computer mouse or the living mouse you
1267
+
1268
+ 0:32:31.812 --> 0:32:36.031
1269
+ translate from German to English because it
1270
+ has exactly the same ambiguity.
1271
+
1272
+ 0:32:40.400 --> 0:32:46.764
1273
+ There's also relations between words like
1274
+ synonyms, antonyms, hyponyms, like the is-a
1275
+ relation and the part-of relation, like door and house.
1276
+ Big and small are antonyms, and a synonym is a word
1277
+ which means something similar.
1278
+
1279
+ 0:32:50.019 --> 0:32:55.569
1280
+ Big small is an antonym and synonym is like
1281
+ which needs something similar.
1282
+
1283
+ 0:32:56.396 --> 0:33:03.252
1284
+ There are resources which try to express all
1285
+ this linguistic information, like WordNet
1286
+ or GermaNet, where you have a graph with words
1287
+ 0:33:03.252 --> 0:33:10.107
1288
+ or German net where you have a graph with words
1289
+ and how they are related to each other.
1290
+
1291
+ 0:33:11.131 --> 0:33:12.602
1292
+ Which can be helpful.
1293
+
1294
+ 0:33:12.602 --> 0:33:18.690
1295
+ Typically these things were more used in tasks
1296
+ where there is fewer data, so there's a lot
1297
+
1298
+ 0:33:18.690 --> 0:33:24.510
1299
+ of tasks in NLP where you have very limited
1300
+ data because you really need to hand align
1301
+
1302
+ 0:33:24.510 --> 0:33:24.911
1303
+ that.
1304
+
1305
+ 0:33:25.125 --> 0:33:28.024
1306
+ Machine translation has a big advantage.
1307
+
1308
+ 0:33:28.024 --> 0:33:31.842
1309
+ There's naturally a lot of text translated
1310
+ out there.
1311
+
1312
+ 0:33:32.212 --> 0:33:39.519
1313
+ Typically in machine translation we have compared
1314
+ to other tasks significantly amount of data.
1315
+
1316
+ 0:33:39.519 --> 0:33:46.212
1317
+ People have looked into integrating wordnet
1318
+ or things like that, but it is rarely used
1319
+
1320
+ 0:33:46.212 --> 0:33:49.366
1321
+ in like commercial systems or something.
1322
+
1323
+ 0:33:52.692 --> 0:33:55.626
1324
+ So this was based on the words.
1325
+
1326
+ 0:33:55.626 --> 0:34:03.877
1327
+ We have morphology, syntax, and semantics,
1328
+ and then of course it makes sense to also look
1329
+
1330
+ 0:34:03.877 --> 0:34:06.169
1331
+ at the bigger structure.
1332
+
1333
+ 0:34:06.169 --> 0:34:08.920
1334
+ That means information about.
1335
+
1336
+ 0:34:08.948 --> 0:34:17.822
1337
+ Of course, we don't have a really morphology
1338
+ there because morphology about the structure
1339
+
1340
+ 0:34:17.822 --> 0:34:26.104
1341
+ of words, but we have syntax on the sentence
1342
+ level and the semantic representation.
1343
+
1344
+ 0:34:28.548 --> 0:34:35.637
1345
+ When we are thinking about the sentence structure,
1346
+ then the sentence is, of course, first a sequence
1347
+
1348
+ 0:34:35.637 --> 0:34:37.742
1349
+ of words terminated by a dot.
1350
+
1351
+ 0:34:37.742 --> 0:34:42.515
1352
+ Jane bought the house and we can say something
1353
+ about the structure.
1354
+
1355
+ 0:34:42.515 --> 0:34:47.077
1356
+ It's typically its subject work and then one
1357
+ or several objects.
1358
+
1359
+ 0:34:47.367 --> 0:34:51.996
1360
+ And the number of objects, for example, is
1361
+ then determined by the word.
1362
+
1363
+ 0:34:52.232 --> 0:34:54.317
1364
+ It's Called the Valency.
1365
+
1366
+ 0:34:54.354 --> 0:35:01.410
1367
+ So you have intransitive verbs which don't
1368
+ get any object, it's just to sleep.
1369
+
1370
+ 0:35:02.622 --> 0:35:05.912
1371
+ For example, there is no object sleep beds.
1372
+
1373
+ 0:35:05.912 --> 0:35:14.857
1374
+ You cannot say that: And there are transitive
1375
+ verbs where you have to put one or more objects,
1376
+
1377
+ 0:35:14.857 --> 0:35:16.221
1378
+ and you always.
1379
+
1380
+ 0:35:16.636 --> 0:35:19.248
1381
+ Sentence is not correct if you don't put the
1382
+ object.
1383
+
1384
+ 0:35:19.599 --> 0:35:33.909
1385
+ So if you have to buy something you have to
1386
+ say bought this or give someone something then.
1387
+
1388
+ 0:35:34.194 --> 0:35:40.683
1389
+ Here you see a bit that may be interesting
1390
+ the relation between word order and morphology.
1391
+
1392
+ 0:35:40.683 --> 0:35:47.243
1393
+ Of course it's not that strong, but for example
1394
+ in English you always have to first say who
1395
+
1396
+ 0:35:47.243 --> 0:35:49.453
1397
+ you gave it and what you gave.
1398
+
1399
+ 0:35:49.453 --> 0:35:53.304
1400
+ So the structure is very clear and cannot
1401
+ be changed.
1402
+
1403
+ 0:35:54.154 --> 0:36:00.801
1404
+ German, for example, has a possibility of
1405
+ determining what you gave and whom you gave
1406
+
1407
+ 0:36:00.801 --> 0:36:07.913
1408
+ it because there is a morphology and you can
1409
+ do what you gave a different form than to whom
1410
+
1411
+ 0:36:07.913 --> 0:36:08.685
1412
+ you gave.
1413
+
1414
+ 0:36:11.691 --> 0:36:18.477
1415
+ And that is a general tendency that if you
1416
+ have morphology then typically the word order
1417
+
1418
+ 0:36:18.477 --> 0:36:25.262
1419
+ is more free and possible, while in English
1420
+ you cannot express these information through
1421
+
1422
+ 0:36:25.262 --> 0:36:26.482
1423
+ the morphology.
1424
+
1425
+ 0:36:26.706 --> 0:36:30.238
1426
+ You typically have to express them through
1427
+ the word order.
1428
+
1429
+ 0:36:30.238 --> 0:36:32.872
1430
+ It's not as free, but it's more restricted.
1431
+
1432
+ 0:36:35.015 --> 0:36:40.060
1433
+ Yeah, the first part is typically the noun
1434
+ phrase, the subject, and that can not only
1435
+
1436
+ 0:36:40.060 --> 0:36:43.521
1437
+ be a single noun, but of course it can be a
1438
+ longer phrase.
1439
+
1440
+ 0:36:43.521 --> 0:36:48.860
1441
+ So if you have Jane the woman, it can be Jane,
1442
+ it can be the woman, it can a woman, it can
1443
+
1444
+ 0:36:48.860 --> 0:36:52.791
1445
+ be the young woman or the young woman who lives
1446
+ across the street.
1447
+
1448
+ 0:36:53.073 --> 0:36:56.890
1449
+ All of these are the subjects, so this can
1450
+ be already very, very long.
1451
+
1452
+ 0:36:57.257 --> 0:36:58.921
1453
+ And they also put this.
1454
+
1455
+ 0:36:58.921 --> 0:37:05.092
1456
+ The verb is on the second position in a bit
1457
+ more complicated way because if you have now
1458
+
1459
+ 0:37:05.092 --> 0:37:11.262
1460
+ the young woman who lives across the street
1461
+ runs to somewhere or so then yeah runs is at
1462
+
1463
+ 0:37:11.262 --> 0:37:16.185
1464
+ the second position in this tree but the first
1465
+ position is quite long.
1466
+
1467
+ 0:37:16.476 --> 0:37:19.277
1468
+ And so it's not just counting okay.
1469
+
1470
+ 0:37:19.277 --> 0:37:22.700
1471
+ The second word is always is always a word.
1472
+
1473
+ 0:37:26.306 --> 0:37:32.681
1474
+ Additional to these simple things, there's
1475
+ more complex stuff.
1476
+
1477
+ 0:37:32.681 --> 0:37:43.104
1478
+ Jane bought the house from Jim without hesitation,
1479
+ or Jane bought the house in the pushed neighborhood
1480
+
1481
+ 0:37:43.104 --> 0:37:44.925
1482
+ across the river.
1483
+
1484
+ 0:37:45.145 --> 0:37:51.694
1485
+ And these often lead to additional ambiguities
1486
+ because it's not always completely clear to
1487
+
1488
+ 0:37:51.694 --> 0:37:53.565
1489
+ which this prepositional.
1490
+
1491
+ 0:37:54.054 --> 0:37:59.076
1492
+ So that we'll see and you have, of course,
1493
+ subclasses and so on.
1494
+
1495
+ 0:38:01.061 --> 0:38:09.926
1496
+ And then there is a theory behind it which
1497
+ was very important for rule based machine translation
1498
+
1499
+ 0:38:09.926 --> 0:38:14.314
1500
+ because that's exactly what you're doing there.
1501
+
1502
+ 0:38:14.314 --> 0:38:18.609
1503
+ You would take the sentence, do the syntactic.
1504
+
1505
+ 0:38:18.979 --> 0:38:28.432
1506
+ So that we can have this constituents which
1507
+ like describe the basic parts of the language.
1508
+
1509
+ 0:38:28.468 --> 0:38:35.268
1510
+ And we can create the sentence structure as
1511
+ a context free grammar, which you hopefully
1512
+
1513
+ 0:38:35.268 --> 0:38:42.223
1514
+ remember from basic computer science, which
1515
+ is a pair of non terminals, terminal symbols,
1516
+
1517
+ 0:38:42.223 --> 0:38:44.001
1518
+ production rules, and.
1519
+
1520
+ 0:38:43.943 --> 0:38:50.218
1521
+ And the star symbol, and you can then describe
1522
+ a sentence by this phrase structure grammar:
1523
+
1524
+ 0:38:51.751 --> 0:38:59.628
1525
+ So a simple example would be something like
1526
+ that: you have a lexicon, Jane is a noun, Frays
1527
+
1528
+ 0:38:59.628 --> 0:39:02.367
1529
+ is a noun, Telescope is a noun.
1530
+
1531
+ 0:39:02.782 --> 0:39:10.318
1532
+ And then you have these production rules sentences:
1533
+ a noun phrase in the web phrase.
1534
+
1535
+ 0:39:10.318 --> 0:39:18.918
1536
+ The noun phrase can either be a determinized
1537
+ noun or it can be a noun phrase and a propositional
1538
+
1539
+ 0:39:18.918 --> 0:39:19.628
1540
+ phrase.
1541
+
1542
+ 0:39:19.919 --> 0:39:25.569
1543
+ Or a prepositional phrase, and a prepositional
1544
+ phrase is a preposition and a noun phrase.
1545
+
1546
+ 0:39:26.426 --> 0:39:27.622
1547
+ We're looking at this.
1548
+
1549
+ 0:39:27.622 --> 0:39:30.482
1550
+ What is the valency of the word we're describing
1551
+ here?
1552
+
1553
+ 0:39:33.513 --> 0:39:36.330
1554
+ How many objects would in this case the world
1555
+ have?
1556
+
1557
+ 0:39:46.706 --> 0:39:48.810
1558
+ We're looking at the verb phrase.
1559
+ The verb phrase is a verb and a noun phrase,
1560
+ so one object here, so this would be a
1561
+ valency of one.
1562
+ so one object here, so this would be for a
1563
+
1564
+ 0:39:54.358 --> 0:39:55.378
1565
+ balance of one.
1566
+
1567
+ 0:39:55.378 --> 0:40:00.925
1568
+ If you have intransitive verbs, it would be
1569
+ verb phrases, just a word, and if you have
1570
+
1571
+ 0:40:00.925 --> 0:40:03.667
1572
+ two, it would be noun phrase, noun phrase.
1573
+
1574
+ 0:40:08.088 --> 0:40:15.348
1575
+ And yeah, then the, the, the challenge or
1576
+ what you have to do is like this: Given a natural
1577
+
1578
+ 0:40:15.348 --> 0:40:23.657
1579
+ language sentence, you want to parse it to
1580
+ get this type of parse tree, like from programming languages
1581
+
1582
+ 0:40:23.657 --> 0:40:30.198
1583
+ where you also need to parse the code in order
1584
+ to get the representation.
1585
+
1586
+ 0:40:30.330 --> 0:40:39.356
1587
+ However, there is one challenge if you parse
1588
+ natural language compared to computer language.
1589
+
1590
+ 0:40:43.823 --> 0:40:56.209
1591
+ So there are different ways of how you can
1592
+ express things and there are different parse trees
1593
+
1594
+ 0:40:56.209 --> 0:41:00.156
1595
+ belonging to the same input.
1596
+
1597
+ 0:41:00.740 --> 0:41:05.241
1598
+ So if you have 'Jane buys a house', that's
1599
+ an easy example.
1600
+
1601
+ 0:41:05.241 --> 0:41:07.491
1602
+ So you do the lexicon look up.
1603
+
1604
+ 0:41:07.491 --> 0:41:13.806
1605
+ Jane can be a noun phrase, a bias is a verb,
1606
+ a is a determiner, and a house is a noun.
1607
+
1608
+ 0:41:15.215 --> 0:41:18.098
1609
+ And then you can now use the grammar rules
1610
+ of here.
1611
+
1612
+ 0:41:18.098 --> 0:41:19.594
1613
+ There is no rule for that.
1614
+
1615
+ 0:41:20.080 --> 0:41:23.564
1616
+ Here we have no rules, but here we have a
1617
+ rule.
1618
+
1619
+ 0:41:23.564 --> 0:41:27.920
1620
+ A noun is a noun phrase, so we have mapped
1621
+ that to a noun phrase.
1622
+
1623
+ 0:41:28.268 --> 0:41:34.012
1624
+ Then we can map this to the verb phrase.
1625
+
1626
+ 0:41:34.012 --> 0:41:47.510
1627
+ We have a verb plus noun phrase to verb phrase, and
1628
+ then we can map this to a sentence representation.
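The same bottom-up derivation can be written down with a small context-free grammar; the sketch below uses NLTK's toy CFG tools, which is an assumption made for illustration, not what the lecture used:

```python
import nltk

# The lexicon and production rules described above, as a context-free grammar.
grammar = nltk.CFG.fromstring("""
S   -> NP VP
NP  -> DET N | 'Jane'
VP  -> V NP
DET -> 'a'
N   -> 'house'
V   -> 'buys'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse("Jane buys a house".split()):
    print(tree)
# (S (NP Jane) (VP (V buys) (NP (DET a) (N house))))
```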
1629
+
1630
+ 0:41:49.069 --> 0:41:53.042
1631
+ We can have that even more complex.
1632
+
1633
+ 0:41:53.042 --> 0:42:01.431
1634
+ The woman who won the lottery yesterday bought
1635
+ the house across the street.
1636
+
1637
+ 0:42:01.431 --> 0:42:05.515
1638
+ The structure gets more complicated.
1639
+
1640
+ 0:42:05.685 --> 0:42:12.103
1641
+ You now see that the word phrase is at the
1642
+ second position, but the noun phrase is quite.
1643
+
1644
+ 0:42:12.052 --> 0:42:18.655
1645
+ Quite big in here and the p p phrases, it's
1646
+ sometimes difficult where to put them because
1647
+
1648
+ 0:42:18.655 --> 0:42:25.038
1649
+ they can be put to the noun phrase, but in
1650
+ other sentences they can also be put to the
1651
+
1652
+ 0:42:25.038 --> 0:42:25.919
1653
+ web phrase.
1654
+
1655
+ 0:42:36.496 --> 0:42:38.250
1656
+ Yeah.
1657
+
1658
+ 0:42:43.883 --> 0:42:50.321
1659
+ Yes, so then either it can have two tags,
1660
+ noun or noun phrase, or you can have the extra
1661
+
1662
+ 0:42:50.321 --> 0:42:50.755
1663
+ rule.
1664
+
1665
+ 0:42:50.755 --> 0:42:57.409
1666
+ The noun phrase can not only be a determiner
1667
+ in the noun, but it can also be a noun phrase.
1668
+
1669
+ 0:42:57.717 --> 0:43:04.360
1670
+ Then of course either you introduce additional
1671
+ rules when what is possible or the problem
1672
+
1673
+ 0:43:04.360 --> 0:43:11.446
1674
+ that you produce parses which are not correct,
1675
+ and then you have to add some type of probability
1676
+
1677
+ 0:43:11.446 --> 0:43:13.587
1678
+ which type is more probable.
1679
+
1680
+ 0:43:16.876 --> 0:43:23.280
1681
+ But of course some things also can't really
1682
+ model easily with this type of cheese.
1683
+
1684
+ 0:43:23.923 --> 0:43:32.095
1685
+ There, for example, the agreement is not straightforward
1686
+ to do so that in subject and work you can check
1687
+
1688
+ 0:43:32.095 --> 0:43:38.866
1689
+ that the person, the agreement, the number
1690
+ in person, the number agreement is correct,
1691
+
1692
+ 0:43:38.866 --> 0:43:41.279
1693
+ but if it's a singular object.
1694
+
1695
+ 0:43:41.561 --> 0:43:44.191
1696
+ A singular verb, it's also a singular.
1697
+
1698
+ 0:43:44.604 --> 0:43:49.242
1699
+ Non-subject, and if it's a plural subject,
1700
+ it's a plural work.
1701
+
1702
+ 0:43:49.489 --> 0:43:56.519
1703
+ Things like that are yeah, the agreement in
1704
+ determining action driven now, so they also
1705
+
1706
+ 0:43:56.519 --> 0:43:57.717
1707
+ have to agree.
1708
+
1709
+ 0:43:57.877 --> 0:44:05.549
1710
+ Things like that cannot be easily done with
1711
+ this type of grammar or this subcategorization
1712
+
1713
+ 0:44:05.549 --> 0:44:13.221
1714
+ that you check whether the verb is transitive
1715
+ or intransitive, and that Jane sleeps is OK,
1716
+
1717
+ 0:44:13.221 --> 0:44:16.340
1718
+ but Jane sleeps the house is not OK.
1719
+
1720
+ 0:44:16.436 --> 0:44:21.073
1721
+ And Jane Walterhouse is okay, but Jane Walterhouse
1722
+ is not okay.
1723
+
1724
+ 0:44:23.183 --> 0:44:29.285
1725
+ Furthermore, this long range dependency might
1726
+ be difficult and which word orders are allowed
1727
+
1728
+ 0:44:29.285 --> 0:44:31.056
1729
+ and which are not allowed.
1730
+
1731
+ 0:44:31.571 --> 0:44:40.011
1732
+ This is also not direct, so you can say 'Maria
1733
+ gibt dem Mann das Buch', 'Dem Mann gibt Maria das
1734
+ Buch', 'Das Buch gibt Maria dem Mann', aber 'Maria
1735
+ dem Mann gibt das Buch' is somewhat odd.
1736
+ bourg, das bourg give Maria, de man aber Maria,
1737
+ de man give des bourg is some.
1738
+
1739
+ 0:44:47.227 --> 0:44:55.191
1740
+ One yeah, which one from this one is possible
1741
+ and not is sometimes not possible to model,
1742
+
1743
+ 0:44:55.191 --> 0:44:56.164
1744
+ is simple.
1745
+
1746
+ 0:44:56.876 --> 0:45:05.842
1747
+ Therefore, people have done more complex stuff
1748
+ like this unification grammar and tried to
1749
+
1750
+ 0:45:05.842 --> 0:45:09.328
1751
+ model both the categories of verb.
1752
+
1753
+ 0:45:09.529 --> 0:45:13.367
1754
+ The agreement has to be that it's person and
1755
+ single.
1756
+
1757
+ 0:45:13.367 --> 0:45:20.028
1758
+ You're joining that so you're annotating this
1759
+ thing with more information and then you have
1760
+
1761
+ 0:45:20.028 --> 0:45:25.097
1762
+ more complex synthetic structures in order
1763
+ to model also these types.
1764
+
1765
+ 0:45:28.948 --> 0:45:33.137
1766
+ Yeah, why is this difficult?
1767
+
1768
+ 0:45:33.873 --> 0:45:39.783
1769
+ We have different ambiguities and that makes
1770
+ it different, so words have different part
1771
+
1772
+ 0:45:39.783 --> 0:45:43.610
1773
+ of speech tags, and if you have 'time flies like
1774
+ an arrow'.
1775
+
1776
+ 0:45:43.583 --> 0:45:53.554
1777
+ It can mean that sometimes the animal L look
1778
+ like an arrow and or it can mean that the time
1779
+
1780
+ 0:45:53.554 --> 0:45:59.948
1781
+ is flying very fast is going away very fast
1782
+ like an arrow.
1783
+
1784
+ 0:46:00.220 --> 0:46:10.473
1785
+ And if you want to do a pastry, these two
1786
+ meanings have a different part of speech text,
1787
+
1788
+ 0:46:10.473 --> 0:46:13.008
1789
+ so flies is the verb.
1790
+
1791
+ 0:46:13.373 --> 0:46:17.999
1792
+ And of course that is a different semantic,
1793
+ and so that is very different.
1794
+
1795
+ 0:46:19.499 --> 0:46:23.361
1796
+ And otherwise a structural.
1797
+
1798
+ 0:46:23.243 --> 0:46:32.419
1799
+ Ambiguity so that like some part of the sentence
1800
+ can have different rules, so the famous thing
1801
+
1802
+ 0:46:32.419 --> 0:46:34.350
1803
+ is this attachment.
1804
+
1805
+ 0:46:34.514 --> 0:46:39.724
1806
+ So the cop saw the burglar with the binoculars.
1807
+
1808
+ 0:46:39.724 --> 0:46:48.038
1809
+ Then with a binocular can be attached to saw
1810
+ or it can be attached to the burglar.
1811
+
1812
+ 0:46:48.448 --> 0:46:59.897
1813
+ And so in the first two it's more probable
1814
+ that he saw the thief, and not that the thief
1815
+ had the binoculars.
1816
+ 0:46:59.897 --> 0:47:01.570
1817
+ has the one.
1818
+
1819
+ 0:47:01.982 --> 0:47:13.356
1820
+ And this, of course, makes things difficult
1821
+ while parsing and doing structure implicitly
1822
+
1823
+ 0:47:13.356 --> 0:47:16.424
1824
+ defining the semantics.
1825
+
1826
+ 0:47:20.120 --> 0:47:29.736
1827
+ Therefore, we would then go directly to semantics,
1828
+ but maybe some questions about spintax and
1829
+
1830
+ 0:47:29.736 --> 0:47:31.373
1831
+ how that works.
1832
+
1833
+ 0:47:33.113 --> 0:47:46.647
1834
+ Then we'll do a bit more about semantics,
1835
+ so now we only describe the structure of the
1836
+
1837
+ 0:47:46.647 --> 0:47:48.203
1838
+ sentence.
1839
+
1840
+ 0:47:48.408 --> 0:47:55.584
1841
+ And for the meaning of the sentence we typically
1842
+ have the compositionality of meaning.
1843
+
1844
+ 0:47:55.584 --> 0:48:03.091
1845
+ The meaning of the full sentence is determined
1846
+ by the meaning of the individual words, and
1847
+
1848
+ 0:48:03.091 --> 0:48:06.308
1849
+ they together form the meaning of the.
1850
+
1851
+ 0:48:06.686 --> 0:48:17.936
1852
+ For words that is partly true but not always
1853
+ mean for things like rainbow, jointly rain
1854
+
1855
+ 0:48:17.936 --> 0:48:19.086
1856
+ and bow.
1857
+
1858
+ 0:48:19.319 --> 0:48:26.020
1859
+ But this is not always a case, while for sentences
1860
+ typically that is happening because you can't
1861
+
1862
+ 0:48:26.020 --> 0:48:30.579
1863
+ directly determine the full meaning, but you
1864
+ split it into parts.
1865
+
1866
+ 0:48:30.590 --> 0:48:36.164
1867
+ Sometimes only in some parts like kick the
1868
+ bucket the expression.
1869
+
1870
+ 0:48:36.164 --> 0:48:43.596
1871
+ Of course you cannot get the meaning of kick
1872
+ the bucket by looking at the individual or
1873
+
1874
+ 0:48:43.596 --> 0:48:46.130
1875
+ in German abyss in its grass.
1876
+
1877
+ 0:48:47.207 --> 0:48:53.763
1878
+ You cannot get that he died by looking at
1879
+ the individual words of Bis ins grass, but
1880
+
1881
+ 0:48:53.763 --> 0:48:54.611
1882
+ they have.
1883
+
1884
+ 0:48:55.195 --> 0:49:10.264
1885
+ And there are different ways of describing
1886
+ that some people have tried that more commonly
1887
+
1888
+ 0:49:10.264 --> 0:49:13.781
1889
+ used for some tasks.
1890
+
1891
+ 0:49:14.654 --> 0:49:20.073
1892
+ Will come to so the first thing would be something
1893
+ like first order logic.
1894
+
1895
+ 0:49:20.073 --> 0:49:27.297
1896
+ If you have Peter loves Jane then you have
1897
+ this meaning and you're having the end of representation
1898
+
1899
+ 0:49:27.297 --> 0:49:33.005
1900
+ that you have a love property between Peter
1901
+ and Jane and you try to construct.
1902
+
1903
+ 0:49:32.953 --> 0:49:40.606
1904
+ That you will see this a lot more complex
1905
+ than only doing syntax, when you are also
1906
+
1907
+ 0:49:40.606 --> 0:49:43.650
1908
+ doing this type of representation.
1909
+
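As a rough illustration of this idea (a toy sketch in Python, not the exact formalism used in the lecture): the sentence "Peter loves Jane" can be mapped to a predicate-argument structure like loves(Peter, Jane).

```python
# Illustrative sketch only (assumed toy representation, not the lecture's formalism):
# map a simple subject-verb-object sentence to a predicate-argument structure,
# roughly in the spirit of first-order logic.
from collections import namedtuple

Predicate = namedtuple("Predicate", ["name", "args"])

def simple_svo_semantics(sentence: str) -> Predicate:
    """Toy semantic construction: assumes the sentence is exactly 'Subject Verb Object'."""
    subject, verb, obj = sentence.split()
    return Predicate(name=verb.lower(), args=(subject, obj))

print(simple_svo_semantics("Peter loves Jane"))
# Predicate(name='loves', args=('Peter', 'Jane'))   i.e. loves(Peter, Jane)
```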
1910
+ 0:49:44.164 --> 0:49:47.761
1911
+ The other thing is to try to do frame semantics.
1912
+
1913
+ 0:49:47.867 --> 0:49:55.094
1914
+ That means that you try to represent the knowledge
1915
+ about the world and you have these ah frames.
1916
+
1917
+ 0:49:55.094 --> 0:49:58.372
1918
+ For example, you might have a frame to buy.
1919
+
1920
+ 0:49:58.418 --> 0:50:05.030
1921
+ And the meaning is that you have a commercial
1922
+ transaction.
1923
+
1924
+ 0:50:05.030 --> 0:50:08.840
1925
+ You have a person who is selling.
1926
+
1927
+ 0:50:08.969 --> 0:50:10.725
1928
+ You Have a Person Who's Buying.
1929
+
1930
+ 0:50:11.411 --> 0:50:16.123
1931
+ You have something that is priced, you might
1932
+ have a price, and so on.
1933
+
1934
+ 0:50:17.237 --> 0:50:22.698
1935
+ And then what you are doing in semantic parsing
1936
+ with frame semantics you first try to determine.
1937
+
1938
+ 0:50:22.902 --> 0:50:30.494
1939
+ Which frames are happening in the sentence,
1940
+ so if it's something with buying you
1941
+
1942
+ 0:50:30.494 --> 0:50:33.025
1943
+ would try to first identify.
1944
+
1945
+ 0:50:33.025 --> 0:50:40.704
1946
+ Oh, here we have the frame 'buy', which does
1947
+ not always have to be indicated by the verb
1948
+
1949
+ 0:50:40.704 --> 0:50:42.449
1950
+ 'sell' or other ways.
1951
+
1952
+ 0:50:42.582 --> 0:50:52.515
1953
+ And then you try to find out which elements
1954
+ of this frame are in the sentence and try
1955
+
1956
+ 0:50:52.515 --> 0:50:54.228
1957
+ to align them.
1958
+
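A minimal sketch of what such a frame and its elements could look like in code; the frame and role names below are assumptions loosely inspired by FrameNet-style commerce frames, not the lecture's exact inventory.

```python
# Minimal sketch, assuming a FrameNet-style frame; names are illustrative only.
commerce_buy_frame = {
    "name": "Commerce_buy",
    "roles": ["Buyer", "Seller", "Goods", "Money"],
    # words that can evoke the frame (not only the verb 'buy')
    "trigger_words": {"buy", "buys", "bought", "sell", "sells", "sold", "purchase"},
}

def evoked_frames(tokens, frames):
    """Return the frames whose trigger words appear in the tokenized sentence."""
    return [f["name"] for f in frames if set(tokens) & f["trigger_words"]]

tokens = "Peter bought a book from Jane".lower().split()
print(evoked_frames(tokens, [commerce_buy_frame]))  # ['Commerce_buy']
```

After the frame is identified, the second step described above is to align sentence spans to the roles, e.g. Buyer = Peter, Seller = Jane, Goods = a book.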
1959
+ 0:50:56.856 --> 0:51:01.121
1960
+ Yeah, you have, for example, to buy and sell.
1961
+
1962
+ 0:51:01.121 --> 0:51:07.239
1963
+ If you have a model that has frames, they
1964
+ have the same elements.
1965
+
1966
+ 0:51:09.829 --> 0:51:15.018
1967
+ In addition, you then also have
1968
+ phenomena beyond the sentence level.
1969
+
1970
+ 0:51:15.018 --> 0:51:20.088
1971
+ We're coming to this later because it's a
1972
+ special challenge for machine translation.
1973
+
1974
+ 0:51:20.088 --> 0:51:22.295
1975
+ There is, for example, coreference.
1976
+
1977
+ 0:51:22.295 --> 0:51:27.186
1978
+ That means if you first mention it, it's like
1979
+ the President of the United States.
1980
+
1981
+ 0:51:27.467 --> 0:51:30.107
1982
+ And later you would refer to him maybe as
1983
+ he.
1984
+
1985
+ 0:51:30.510 --> 0:51:36.966
1986
+ And that is especially challenging in machine
1987
+ translation because you're not always using
1988
+
1989
+ 0:51:36.966 --> 0:51:38.114
1990
+ the same thing.
1991
+
1992
+ 0:51:38.114 --> 0:51:44.355
1993
+ Of course, for the president, it's he and
1994
+ 'er' in German, but for other things it might
1995
+
1996
+ 0:51:44.355 --> 0:51:49.521
1997
+ be different depending on the gender in languages
1998
+ that you refer to it.
1999
+
2000
+ 0:51:55.435 --> 0:52:03.866
2001
+ So much for the background and the next, we
2002
+ want to look based on the knowledge we have
2003
+
2004
+ 0:52:03.866 --> 0:52:04.345
2005
+ now.
2006
+
2007
+ 0:52:04.345 --> 0:52:10.285
2008
+ Why is machine translation difficult before
2009
+ we have any more?
2010
+
2011
+ 0:52:16.316 --> 0:52:22.471
2012
+ The first type of problem is what we refer
2013
+ to as translation divergences.
2014
+
2015
+ 0:52:22.471 --> 0:52:30.588
2016
+ That means that we have the same information
2017
+ in source and target, but the problem is that
2018
+
2019
+ 0:52:30.588 --> 0:52:33.442
2020
+ they are expressed differently.
2021
+
2022
+ 0:52:33.713 --> 0:52:42.222
2023
+ So it is not expressed the same way, and we cannot
2024
+ translate these things as easily; we have to do
2025
+
2026
+ 0:52:42.222 --> 0:52:44.924
2027
+ something a bit more complex.
2028
+
2029
+ 0:52:45.325 --> 0:52:51.324
2030
+ So an example is, if it's only about structure: in
2031
+ English, 'the delicious soup'.
2032
+
2033
+ 0:52:51.324 --> 0:52:59.141
2034
+ The adjective is before the noun, while in
2035
+ Spanish you have to put it after the noun,
2036
+
2037
+ 0:52:59.141 --> 0:53:02.413
2038
+ and so you have to change the word order.
2039
+
2040
+ 0:53:02.983 --> 0:53:10.281
2041
+ So there are different ways of divergence,
2042
+ so there can be structural divergence, which
2043
+
2044
+ 0:53:10.281 --> 0:53:10.613
2045
+ is.
2046
+
2047
+ 0:53:10.550 --> 0:53:16.121
2048
+ The word order, so that the order is different,
2049
+ so in German we have that especially in the
2050
+
2051
+ 0:53:16.121 --> 0:53:19.451
2052
+ subclause, while in English in the
2053
+ subclause
2054
+
2055
+ 0:53:19.451 --> 0:53:24.718
2056
+ The verb is also at the second position, in
2057
+ German it's at the end, and so you have to
2058
+
2059
+ 0:53:24.718 --> 0:53:25.506
2060
+ move it all
2061
+
2062
+ 0:53:25.465 --> 0:53:27.222
2063
+ over.
2064
+
2065
+ 0:53:27.487 --> 0:53:32.978
2066
+ It can be that it's a completely different
2067
+ grammatical role.
2068
+
2069
+ 0:53:33.253 --> 0:53:35.080
2070
+ So,.
2071
+
2072
+ 0:53:35.595 --> 0:53:37.458
2073
+ You have 'you like her'.
2074
+
2075
+ 0:53:38.238 --> 0:53:41.472
2076
+ And eh in in.
2077
+
2078
+ 0:53:41.261 --> 0:53:47.708
2079
+ English. In Spanish it's 'ella te gusta', which
2080
+ means 'she pleases you', so now she is no longer the object
2081
+
2082
+ 0:53:47.708 --> 0:53:54.509
2083
+ but she is the subject here, and you are now in the dative,
2084
+ and then 'pleases' rather than 'like', so you really
2085
+
2086
+ 0:53:54.509 --> 0:53:58.689
2087
+ use a different sentence structure and you
2088
+ have to change.
2089
+
2090
+ 0:53:59.139 --> 0:54:03.624
2091
+ Can also be the head switch.
2092
+
2093
+ 0:54:03.624 --> 0:54:09.501
2094
+ In English you say the baby just ate.
2095
+
2096
+ 0:54:09.501 --> 0:54:16.771
2097
+ In Spanish you literally say the baby finishes eating.
2098
+
2099
+ 0:54:16.997 --> 0:54:20.803
2100
+ So the eating is no longer the verb, but the finishing
2101
+ is the verb.
2102
+
2103
+ 0:54:21.241 --> 0:54:30.859
2104
+ So you have to learn so you cannot always
2105
+ have the same structures in your input and
2106
+
2107
+ 0:54:30.859 --> 0:54:31.764
2108
+ output.
2109
+
2110
+ 0:54:36.856 --> 0:54:42.318
2111
+ Lexical things like to swim across or to cross
2112
+ swimming.
2113
+
2114
+ 0:54:43.243 --> 0:54:57.397
2115
+ You have categorical like an adjective gets
2116
+ into a noun, so you have a little bread to
2117
+
2118
+ 0:54:57.397 --> 0:55:00.162
2119
+ make a decision.
2120
+
2121
+ 0:55:00.480 --> 0:55:15.427
2122
+ That is the one challenge and the even bigger
2123
+ challenge is referred to as translation mismatches.
2124
+
2125
+ 0:55:17.017 --> 0:55:19.301
2126
+ That can be their lexical mismatch.
2127
+
2128
+ 0:55:19.301 --> 0:55:21.395
2129
+ That's the fish we talked about.
2130
+
2131
+ 0:55:21.395 --> 0:55:27.169
2132
+ If it's like the, the fish you eat or the
2133
+ fish which is living; these are two different words
2134
+
2135
+ 0:55:27.169 --> 0:55:27.931
2136
+ in Spanish.
2137
+
2138
+ 0:55:28.108 --> 0:55:34.334
2139
+ And then that's partly sometimes even not
2140
+ known, so even the human might not be able
2141
+
2142
+ 0:55:34.334 --> 0:55:34.627
2143
+ to.
2144
+
2145
+ 0:55:34.774 --> 0:55:40.242
2146
+ Infer that you maybe need to see the context
2147
+ you maybe need to have the sentences around,
2148
+
2149
+ 0:55:40.242 --> 0:55:45.770
2150
+ so one problem is that at least traditional
2151
+ machine translation works on a sentence level,
2152
+
2153
+ 0:55:45.770 --> 0:55:51.663
2154
+ so we take each sentence and translate it independent
2155
+ of everything else, but that's, of course,
2156
+
2157
+ 0:55:51.663 --> 0:55:52.453
2158
+ not correct.
2159
+
2160
+ 0:55:52.532 --> 0:55:59.901
2161
+ Will look into some ways of looking at and
2162
+ doing document-based machine translation, but.
2163
+
2164
+ 0:56:00.380 --> 0:56:06.793
2165
+ Then gender information might be a problem,
2166
+ so in English it's player and you don't know
2167
+
2168
+ 0:56:06.793 --> 0:56:10.139
2169
+ if it's Spieler Spielerin or if it's not known.
2170
+
2171
+ 0:56:10.330 --> 0:56:15.770
2172
+ But in the English, if you now generate German,
2173
+ you should know is the reader.
2174
+
2175
+ 0:56:15.770 --> 0:56:21.830
2176
+ Does he know the gender or does he not know
2177
+ the gender and then generate the right one?
2178
+
2179
+ 0:56:22.082 --> 0:56:38.333
2180
+ So just imagine a commentator if he's talking
2181
+ about the player and you can see if it's male
2182
+
2183
+ 0:56:38.333 --> 0:56:40.276
2184
+ or female.
2185
+
2186
+ 0:56:40.540 --> 0:56:47.801
2187
+ So in general the problem is that if you
2188
+ have less information and you need more information
2189
+
2190
+ 0:56:47.801 --> 0:56:51.928
2191
+ in your target, this translation doesn't really
2192
+ work.
2193
+
2194
+ 0:56:55.175 --> 0:56:59.180
2195
+ Another problem is we just talked about the
2196
+ the.
2197
+
2198
+ 0:56:59.119 --> 0:57:01.429
2199
+ The co reference.
2200
+
2201
+ 0:57:01.641 --> 0:57:08.818
2202
+ So if you refer to an object and that can
2203
+ be across sentence boundaries then you have
2204
+
2205
+ 0:57:08.818 --> 0:57:14.492
2206
+ to use the right pronoun and you cannot just
2207
+ translate the pronoun.
2208
+
2209
+ 0:57:14.492 --> 0:57:18.581
2210
+ If the baby does not thrive on raw milk boil
2211
+ it.
2212
+
2213
+ 0:57:19.079 --> 0:57:28.279
2214
+ And if you are now using it and just take
2215
+ the typical translation, it will be: And That
2216
+
2217
+ 0:57:28.279 --> 0:57:31.065
2218
+ Will Be Ah Wrong.
2219
+
2220
+ 0:57:31.291 --> 0:57:35.784
2221
+ No, that will be even right because it is
2222
+ 'das Baby'.
2223
+
2224
+ 0:57:35.784 --> 0:57:42.650
2225
+ Yes, but I mean, you have to determine that
2226
+ and it might be wrong at some point.
2227
+
2228
+ 0:57:42.650 --> 0:57:48.753
2229
+ So getting this this um yeah, it will be wrong
2230
+ yes, that is right yeah.
2231
+
2232
+ 0:57:48.908 --> 0:57:55.469
2233
+ Because in English both are baby and milk,
2234
+ and baby are both referred to it, so if you
2235
+
2236
+ 0:57:55.469 --> 0:58:02.180
2237
+ do S it will be to the first one referred to,
2238
+ so it's correct, but in German it will be
2239
+
2240
+ 0:58:02.180 --> 0:58:06.101
2241
+ 'es', and so if you translate it as 'es' it will
2242
+ be baby.
2243
+
2244
+ 0:58:06.546 --> 0:58:13.808
2245
+ But you have to use 'sie' because milk is feminine,
2246
+ although that is really very uncommon because
2247
+
2248
+ 0:58:13.808 --> 0:58:18.037
2249
+ maybe the milk is an object and so it should
2250
+ be more.
2251
+
2252
+ 0:58:18.358 --> 0:58:25.176
2253
+ Of course, I agree there might be a situation
2254
+ which is a bit created and not a common thing,
2255
+
2256
+ 0:58:25.176 --> 0:58:29.062
2257
+ but you can see that these things are not that
2258
+ easy.
2259
+
2260
+ 0:58:29.069 --> 0:58:31.779
2261
+ Another example is this: Dr.
2262
+
2263
+ 0:58:31.779 --> 0:58:37.855
2264
+ McLean often brings his dog Champion to visit
2265
+ with his patients.
2266
+
2267
+ 0:58:37.855 --> 0:58:41.594
2268
+ He loves to give big wet sloppy kisses.
2269
+
2270
+ 0:58:42.122 --> 0:58:58.371
2271
+ And there, of course, it's also important
2272
+ if he refers to the dog or to the doctor.
2273
+
2274
+ 0:58:59.779 --> 0:59:11.260
2275
+ Another example of challenging is that we
2276
+ don't have a fixed language and that was referred
2277
+
2278
+ 0:59:11.260 --> 0:59:16.501
2279
+ to morphology and we can build new words.
2280
+
2281
+ 0:59:16.496 --> 0:59:23.787
2282
+ So we can in all languages build new words
2283
+ by just concatenating parts, like 'Brexit',
2284
+
2285
+ 0:59:23.787 --> 0:59:30.570
2286
+ some things like: And then, of course, also
2287
+ words don't exist in languages, don't exist
2288
+
2289
+ 0:59:30.570 --> 0:59:31.578
2290
+ in isolations.
2291
+
2292
+ 0:59:32.012 --> 0:59:41.591
2293
+ In German you can now use the word download
2294
+ somewhere and you can also use a morphological
2295
+
2296
+ 0:59:41.591 --> 0:59:43.570
2297
+ operation on that.
2298
+
2299
+ 0:59:43.570 --> 0:59:48.152
2300
+ I guess there is even not the correct word.
2301
+
2302
+ 0:59:48.508 --> 0:59:55.575
2303
+ But so you have to deal with these things,
2304
+ and yeah, in social media.
2305
+
2306
+ 0:59:55.996 --> 1:00:00.215
2307
+ This word is maybe most of you have forgotten
2308
+ already.
2309
+
2310
+ 1:00:00.215 --> 1:00:02.517
2311
+ This was ten years ago or so.
2312
+
2313
+ 1:00:02.517 --> 1:00:08.885
2314
+ I don't know there was a volcano in Iceland
2315
+ which stopped Europeans flying around.
2316
+
2317
+ 1:00:09.929 --> 1:00:14.706
2318
+ So there is always new words coming up and
2319
+ you have to deal with.
2320
+
2321
+ 1:00:18.278 --> 1:00:24.041
2322
+ Yeah, one last thing, so some of these examples
2323
+ we have seen are a bit artificial.
2324
+
2325
+ 1:00:24.041 --> 1:00:30.429
2326
+ So one example what is very common with machine
2327
+ translation doesn't really work is this box
2328
+
2329
+ 1:00:30.429 --> 1:00:31.540
2330
+ was in the pen.
2331
+
2332
+ 1:00:32.192 --> 1:00:36.887
2333
+ And maybe you would be surprised, at least
2334
+ when read it.
2335
+
2336
+ 1:00:36.887 --> 1:00:39.441
2337
+ How can a box be inside a pen?
2338
+
2339
+ 1:00:40.320 --> 1:00:44.175
2340
+ Does anybody have a solution for that while
2341
+ the sentence is still correct?
2342
+
2343
+ 1:00:47.367 --> 1:00:51.692
2344
+ Maybe it's directly clear for you, maybe your
2345
+ English was aside, yeah.
2346
+
2347
+ 1:00:54.654 --> 1:01:07.377
2348
+ Yes, like at a farm or for small children,
2349
+ and that is also called a pen or a pen on a
2350
+
2351
+ 1:01:07.377 --> 1:01:08.254
2352
+ farm.
2353
+
2354
+ 1:01:08.368 --> 1:01:12.056
2355
+ And then this is, and so you can mean okay.
2356
+
2357
+ 1:01:12.056 --> 1:01:16.079
2358
+ To infer these two meanings is quite difficult.
2359
+
2360
+ 1:01:16.436 --> 1:01:23.620
2361
+ But at least when I saw it, I wasn't completely
2362
+ convinced because it's maybe not the sentence
2363
+
2364
+ 1:01:23.620 --> 1:01:29.505
2365
+ you're using in your daily life, and some of
2366
+ these constructions seem to be.
2367
+
2368
+ 1:01:29.509 --> 1:01:35.155
2369
+ They are very good in showing where the problem
2370
+ is, but the question is, does it really imply
2371
+
2372
+ 1:01:35.155 --> 1:01:35.995
2373
+ in real life?
2374
+
2375
+ 1:01:35.996 --> 1:01:42.349
2376
+ And therefore here some examples also that
2377
+ we had here with a lecture translator that
2378
+
2379
+ 1:01:42.349 --> 1:01:43.605
2380
+ really occurred.
2381
+
2382
+ 1:01:43.605 --> 1:01:49.663
2383
+ They maybe looked simple, but you will see
2384
+ that some of them still are happening.
2385
+
2386
+ 1:01:50.050 --> 1:01:53.948
2387
+ And they are partly about splitting words,
2388
+ and then they are happening.
2389
+
2390
+ 1:01:54.294 --> 1:01:56.816
2391
+ So Um.
2392
+
2393
+ 1:01:56.596 --> 1:02:03.087
2394
+ We had a text about the numeral system in
2395
+ German, the 'Zahlensystem', which got split
2396
+
2397
+ 1:02:03.087 --> 1:02:07.041
2398
+ into sub parts because otherwise we can't translate.
2399
+
2400
+ 1:02:07.367 --> 1:02:14.927
2401
+ And then he did only a proximate match and
2402
+ was talking about the binary payment system
2403
+
2404
+ 1:02:14.927 --> 1:02:23.270
2405
+ because the payment system was a lot more common
2406
+ in the training data than the 'Zahlensystem'.
2407
+
2408
+ 1:02:23.823 --> 1:02:29.900
2409
+ And so there you see like rare words, which
2410
+ don't occur that often.
2411
+
2412
+ 1:02:29.900 --> 1:02:38.211
2413
+ They are very challenging to deal with because
2414
+ we are good at inferring that sometimes, but
2415
+
2416
+ 1:02:38.211 --> 1:02:41.250
2417
+ for others that's very difficult.
2418
+
2419
+ 1:02:44.344 --> 1:02:49.605
2420
+ Another challenge is that, of course, the
2421
+ context is very difficult.
2422
+
2423
+ 1:02:50.010 --> 1:02:56.448
2424
+ This is also an example a bit older from also
2425
+ the lecture translators we were translating
2426
+
2427
+ 1:02:56.448 --> 1:03:01.813
2428
+ in a math lecture, and he was always talking
2429
+ about the omens of the numbers.
2430
+
2431
+ 1:03:02.322 --> 1:03:11.063
2432
+ Which doesn't make any sense at all, but the
2433
+ German word 'Vorzeichen' can of course mean the
2434
+
2435
+ 1:03:11.063 --> 1:03:12.408
2436
+ sign and the omen.
2437
+
2438
+ 1:03:12.732 --> 1:03:22.703
2439
+ And if you do not have the right domain knowledge
2440
+ encoded in there, it might not use the right domain
2441
+
2442
+ 1:03:22.703 --> 1:03:23.869
2443
+ knowledge.
2444
+
2445
+ 1:03:25.705 --> 1:03:31.205
2446
+ A more recent version of that is like here
2447
+ from a paper where it's about translating.
2448
+
2449
+ 1:03:31.205 --> 1:03:36.833
2450
+ We had this pivot based translation where
2451
+ you translate maybe to English and then to another language
2452
+
2453
+ 1:03:36.833 --> 1:03:39.583
2454
+ because you have not enough training data.
2455
+
2456
+ 1:03:40.880 --> 1:03:48.051
2457
+ And we did that from Dutch to German; I guess
2458
+ you can get it even if you don't understand Dutch, if you speak
2459
+
2460
+ 1:03:48.051 --> 1:03:48.710
2461
+ German.
2462
+
2463
+ 1:03:48.908 --> 1:03:56.939
2464
+ So we have this Dutch 'voorbeeld geven', which means
2465
+ to give an example.
2466
+
2467
+ 1:03:56.939 --> 1:04:05.417
2468
+ It's correctly translated as 'setting an example'. However,
2469
+ if we then translate to German, he didn't
2470
+
2471
+ 1:04:05.417 --> 1:04:11.524
2472
+ get the full context, and in German you normally
2473
+ don't set an example, but you give an example,
2474
+
2475
+ 1:04:11.524 --> 1:04:16.740
2476
+ and so yes, going through another language
2477
+ you introduce additional errors there.
2478
+
2479
+ 1:04:19.919 --> 1:04:27.568
2480
+ Good, so much for this; are there more questions
2481
+ about why this is difficult?
2482
+
2483
+ 1:04:30.730 --> 1:04:35.606
2484
+ Then we'll start with this one.
2485
+
2486
+ 1:04:35.606 --> 1:04:44.596
2487
+ I have to leave a bit early today in a quarter
2488
+ of an hour.
2489
+
2490
+ 1:04:44.904 --> 1:04:58.403
2491
+ If you look at linguistic approaches to
2492
+ machine translation, they are typically described
2493
+
2494
+ 1:04:58.403 --> 1:05:03.599
2495
+ by: So we can do a direct translation, so you
2496
+ take the source language.
2497
+
2498
+ 1:05:03.599 --> 1:05:09.452
2499
+ You do not apply a lot of the analysis we were
2500
+ discussing today about syntax representation,
2501
+
2502
+ 1:05:09.452 --> 1:05:11.096
2503
+ semantic representation.
2504
+
2505
+ 1:05:11.551 --> 1:05:14.678
2506
+ But you directly translate to your target
2507
+ text.
2508
+
2509
+ 1:05:14.678 --> 1:05:16.241
2510
+ That's here the direct.
2511
+
2512
+ 1:05:16.516 --> 1:05:19.285
2513
+ Then there is a transfer based approach.
2514
+
2515
+ 1:05:19.285 --> 1:05:23.811
2516
+ Then you transfer everything over and you
2517
+ do the text translation.
2518
+
2519
+ 1:05:24.064 --> 1:05:28.354
2520
+ And you can do that at two levels, more at
2521
+ the syntax level.
2522
+
2523
+ 1:05:28.354 --> 1:05:34.683
2524
+ That means you only do syntactic analysis
2525
+ like you run a parser or so, or at the semantic
2526
+
2527
+ 1:05:34.683 --> 1:05:37.848
2528
+ level where you do semantic parsing with frames.
2529
+
2530
+ 1:05:38.638 --> 1:05:51.489
2531
+ Then there is an interlingua based approach
2532
+ where you don't do any transfer anymore, but
2533
+
2534
+ 1:05:51.489 --> 1:05:55.099
2535
+ you only do an analysis.
2536
+
2537
+ 1:05:57.437 --> 1:06:02.790
2538
+ So how does now the direct transfer, the direct
2539
+ translation?
2540
+
2541
+ 1:06:03.043 --> 1:06:07.031
2542
+ Look like it's one of the earliest approaches.
2543
+
2544
+ 1:06:07.327 --> 1:06:18.485
2545
+ So you do maybe some morphological analysis,
2546
+ but not a lot, and then you do this bilingual
2547
+
2548
+ 1:06:18.485 --> 1:06:20.202
2549
+ word mapping.
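A toy sketch of such a direct, word-by-word translation with a bilingual dictionary (the dictionary entries below are illustrative assumptions, not a real lexicon); it also shows why this breaks down as soon as word order has to change.

```python
# Toy direct translation: little analysis, just bilingual word-by-word mapping.
# Dictionary entries are illustrative assumptions, not a real lexicon.
bilingual_dict = {"a": "una", "delicious": "deliciosa", "soup": "sopa"}

def direct_translate(sentence: str) -> str:
    # keep unknown words as-is; no reordering, no agreement handling
    return " ".join(bilingual_dict.get(tok, tok) for tok in sentence.lower().split())

print(direct_translate("a delicious soup"))
# -> "una deliciosa sopa": every word is translated, but the adjective stays in
#    front of the noun, so the required reordering to "una sopa deliciosa" is missing.
```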
2550
+
2551
+ 1:06:20.540 --> 1:06:25.067
2552
+ You might do some here in generations.
2553
+
2554
+ 1:06:25.067 --> 1:06:32.148
2555
+ These two things are not really big, but you
2556
+ are working on.
2557
+
2558
+ 1:06:32.672 --> 1:06:39.237
2559
+ And of course this might be a first easy solution
2560
+ about all the challenges we have seen that
2561
+
2562
+ 1:06:39.237 --> 1:06:41.214
2563
+ the structure is different.
2564
+
2565
+ 1:06:41.214 --> 1:06:45.449
2566
+ That you have to reorder, look at the agreement,
2567
+ then work.
2568
+
2569
+ 1:06:45.449 --> 1:06:47.638
2570
+ That's why the first approach.
2571
+
2572
+ 1:06:47.827 --> 1:06:54.618
2573
+ So if we have different word order, structural
2574
+ shifts or idiomatic expressions that doesn't
2575
+
2576
+ 1:06:54.618 --> 1:06:55.208
2577
+ really work.
2578
+
2579
+ 1:06:57.797 --> 1:07:05.034
2580
+ Then there are these rule based approaches
2581
+ which were more commonly used.
2582
+
2583
+ 1:07:05.034 --> 1:07:15.249
2584
+ They might still be used somewhere: I mean, most commonly
2585
+ neural networks are used now, but I wouldn't
2586
+
2587
+ 1:07:15.249 --> 1:07:19.254
2588
+ be sure there is no such system out there.
2589
+
2590
+ 1:07:19.719 --> 1:07:25.936
2591
+ And in this transfer based approach we have
2592
+ these steps there nicely visualized in the.
2593
+
2594
+ 1:07:26.406 --> 1:07:32.397
2595
+ Triangle, so we have the analysis of the source
2596
+ sentence where we then get some type of abstract
2597
+
2598
+ 1:07:32.397 --> 1:07:33.416
2599
+ representation.
2600
+
2601
+ 1:07:33.693 --> 1:07:40.010
2602
+ Then we are doing the transfer of the representation
2603
+ of the source sentence into the representation
2604
+
2605
+ 1:07:40.010 --> 1:07:40.263
2606
+ of.
2607
+
2608
+ 1:07:40.580 --> 1:07:46.754
2609
+ And then we have the generation where we take
2610
+ this abstract representation and do then the
2611
+
2612
+ 1:07:46.754 --> 1:07:47.772
2613
+ surface forms.
2614
+
2615
+ 1:07:47.772 --> 1:07:54.217
2616
+ For example, it might be that there is no
2617
+ morphological variants in the abstract representation
2618
+
2619
+ 1:07:54.217 --> 1:07:56.524
2620
+ and we have to do this agreement.
2621
+
2622
+ 1:07:56.656 --> 1:08:00.077
2623
+ Which components do you need for that?
2624
+
2625
+ 1:08:01.061 --> 1:08:08.854
2626
+ You need monolingual source and target lexicon
2627
+ and the corresponding grammars in order to
2628
+
2629
+ 1:08:08.854 --> 1:08:12.318
2630
+ do both the analysis and the generation.
2631
+
2632
+ 1:08:12.412 --> 1:08:18.584
2633
+ Then you need the bilingual dictionary in
2634
+ order to do the lexical translation and the
2635
+
2636
+ 1:08:18.584 --> 1:08:25.116
2637
+ bilingual transfer rules in order to transfer
2638
+ the grammar, for example in German, into the
2639
+
2640
+ 1:08:25.116 --> 1:08:28.920
2641
+ grammar in English, and that enables you to
2642
+ do that.
2643
+
2644
+ 1:08:29.269 --> 1:08:32.579
2645
+ So an example is is something like this here.
2646
+
2647
+ 1:08:32.579 --> 1:08:38.193
2648
+ So if you're doing a syntactic transfer it
2649
+ means you're starting with 'John eats
2650
+
2651
+ 1:08:38.193 --> 1:08:38.408
2652
+ an
2653
+
2654
+ 1:08:38.408 --> 1:08:43.014
2655
+ apple'. You do the analysis, then you have this
2656
+ type of graph here.
2657
+
2658
+ 1:08:43.014 --> 1:08:48.340
2659
+ Therefore you need your monolingual lexicon
2660
+ and your monolingual grammar.
2661
+
2662
+ 1:08:48.748 --> 1:08:59.113
2663
+ Then you're doing the transfer where you're
2664
+ transferring this representation into this
2665
+
2666
+ 1:08:59.113 --> 1:09:01.020
2667
+ representation.
2668
+
2669
+ 1:09:01.681 --> 1:09:05.965
2670
+ So how could this type of translation then
2671
+ look like?
2672
+
2673
+ 1:09:07.607 --> 1:09:08.276
2674
+ Style.
2675
+
2676
+ 1:09:08.276 --> 1:09:14.389
2677
+ We have the example of a delicious soup and
2678
+ 'una sopa deliciosa'.
2679
+
2680
+ 1:09:14.894 --> 1:09:22.173
2681
+ This is your source language tree and this
2682
+ is your target language tree and then the rules
2683
+
2684
+ 1:09:22.173 --> 1:09:26.092
2685
+ that you need are these ones to do the transfer.
2686
+
2687
+ 1:09:26.092 --> 1:09:31.211
2688
+ So if you have a noun phrase that also goes
2689
+ to the noun phrase.
2690
+
2691
+ 1:09:31.691 --> 1:09:44.609
2692
+ You see here that the switch is happening,
2693
+ so the second position is here at the first
2694
+
2695
+ 1:09:44.609 --> 1:09:46.094
2696
+ position.
2697
+
2698
+ 1:09:46.146 --> 1:09:52.669
2699
+ Then you have the translation of the determiner
2700
+ and of the words, so the dictionary entries.
2701
+
2702
+ 1:09:53.053 --> 1:10:07.752
2703
+ And with these types of rules you can then
2704
+ do these mappings and do the transfer between
2705
+
2706
+ 1:10:07.752 --> 1:10:11.056
2707
+ the representation.
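A hedged sketch of how such transfer rules could be applied to the 'a delicious soup' example; the tree format, rule and dictionary below are assumptions for illustration, not the exact rules from the slide.

```python
# Sketch of syntactic transfer for NP -> DET ADJ N  =>  NP -> DET N ADJ.
# The tree format, rule and dictionary are illustrative assumptions.
dictionary = {"a": "una", "delicious": "deliciosa", "soup": "sopa"}

def transfer_np(tree):
    """tree = ('NP', [('DET', 'a'), ('ADJ', 'delicious'), ('N', 'soup')])"""
    label, children = tree
    # structural transfer: move the adjective behind the noun inside the noun phrase
    reordered = [c for c in children if c[0] != "ADJ"] + [c for c in children if c[0] == "ADJ"]
    # lexical transfer: dictionary lookup on the leaves
    translated = [(pos, dictionary.get(word, word)) for pos, word in reordered]
    return (label, translated)

src = ("NP", [("DET", "a"), ("ADJ", "delicious"), ("N", "soup")])
print(transfer_np(src))
# ('NP', [('DET', 'una'), ('N', 'sopa'), ('ADJ', 'deliciosa')])  -> "una sopa deliciosa"
```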
2708
+
2709
+ 1:10:25.705 --> 1:10:32.505
2710
+ Think it more depends on the amount of expertise
2711
+ you have in representing them.
2712
+
2713
+ 1:10:32.505 --> 1:10:35.480
2714
+ The rules will get more difficult.
2715
+
2716
+ 1:10:36.136 --> 1:10:42.445
2717
+ For example, these rule based were, so I think
2718
+ it more depends on how difficult the structure
2719
+
2720
+ 1:10:42.445 --> 1:10:42.713
2721
+ is.
2722
+
2723
+ 1:10:42.713 --> 1:10:48.619
2724
+ So for German generating German they were
2725
+ quite long, quite successful because modeling
2726
+
2727
+ 1:10:48.619 --> 1:10:52.579
2728
+ all the German phenomena which are in there
2729
+ was difficult.
2730
+
2731
+ 1:10:52.953 --> 1:10:56.786
2732
+ And that can be done there, and it wasn't
2733
+ easy to learn that just from data.
2734
+
2735
+ 1:10:59.019 --> 1:11:07.716
2736
+ Think even if you think about Chinese and
2737
+ English or so, if you have the trees there
2738
+
2739
+ 1:11:07.716 --> 1:11:10.172
2740
+ is quite some rule and.
2741
+
2742
+ 1:11:15.775 --> 1:11:23.370
2743
+ Another thing is you can also try to do something
2744
+ like that on the semantic, which means this
2745
+
2746
+ 1:11:23.370 --> 1:11:24.905
2747
+ gets more complex.
2748
+
2749
+ 1:11:25.645 --> 1:11:31.047
2750
+ This gets maybe a bit easier because this
2751
+ representation, the semantic representation
2752
+
2753
+ 1:11:31.047 --> 1:11:36.198
2754
+ between languages, are more similar and therefore
2755
+ this gets more difficult again.
2756
+
2757
+ 1:11:36.496 --> 1:11:45.869
2758
+ So typically if you go higher in your triangle
2759
+ this is more work while this is less work.
2760
+
2761
+ 1:11:49.729 --> 1:11:56.023
2762
+ So it can be then, for example, like in the gustar example,
2763
+ we have again that the order changes.
2764
+
2765
+ 1:11:56.023 --> 1:12:02.182
2766
+ So you see the transfer rule for like is that
2767
+ the first argument is here and the second is
2768
+
2769
+ 1:12:02.182 --> 1:12:06.514
2770
+ there, while on the gustar side here
2771
+ the second argument.
2772
+
2773
+ 1:12:06.466 --> 1:12:11.232
2774
+ It is in the first position and the first
2775
+ argument is in the second position.
2776
+
2777
+ 1:12:11.511 --> 1:12:14.061
2778
+ So that you do yeah, and also there you're
2779
+ ordering,.
2780
+
2781
+ 1:12:14.354 --> 1:12:20.767
2782
+ From the principle it is more like you have
2783
+ a different type of formalism of representing
2784
+
2785
+ 1:12:20.767 --> 1:12:27.038
2786
+ your sentence and therefore you need to do
2787
+ more on one side and less on the other side.
2788
+
2789
+ 1:12:32.852 --> 1:12:42.365
2790
+ Then so in general transfer based approaches
2791
+ are you have to first select how to represent
2792
+
2793
+ 1:12:42.365 --> 1:12:44.769
2794
+ a syntactic structure.
2795
+
2796
+ 1:12:45.165 --> 1:12:55.147
2797
+ There are these various abstraction levels
2798
+ and then you have the three components: The
2799
+
2800
+ 1:12:55.147 --> 1:13:04.652
2801
+ disadvantage is that on the one hand you need
2802
+ normally a lot of experts monolingual experts
2803
+
2804
+ 1:13:04.652 --> 1:13:08.371
2805
+ who analyze how to do the transfer.
2806
+
2807
+ 1:13:08.868 --> 1:13:18.860
2808
+ And if you're doing a new language, you have
2809
+ to do analysis, transfer and generation, and the
2810
+
2811
+ 1:13:18.860 --> 1:13:19.970
2812
+ transfer.
2813
+
2814
+ 1:13:20.400 --> 1:13:27.074
2815
+ So if you need one language, add one language
2816
+ in existing systems, of course you have to
2817
+
2818
+ 1:13:27.074 --> 1:13:29.624
2819
+ do transfer to all the languages.
2820
+
2821
+ 1:13:32.752 --> 1:13:39.297
2822
+ Therefore, the other idea which people were
2823
+ interested in is the interlingua based machine
2824
+
2825
+ 1:13:39.297 --> 1:13:40.232
2826
+ translation.
2827
+
2828
+ 1:13:40.560 --> 1:13:47.321
2829
+ Where the idea is that we have this intermediate
2830
+ language with this abstract language independent
2831
+
2832
+ 1:13:47.321 --> 1:13:53.530
2833
+ representation and so the important thing is
2834
+ it's language independent so it's really the
2835
+
2836
+ 1:13:53.530 --> 1:13:59.188
2837
+ same for all language and it's a pure meaning
2838
+ and there is no ambiguity in there.
2839
+
2840
+ 1:14:00.100 --> 1:14:05.833
2841
+ That allows this nice translation without
2842
+ transfer, so you just do an analysis into your
2843
+
2844
+ 1:14:05.833 --> 1:14:11.695
2845
+ representation, and there afterwards you do
2846
+ the generation into the other target language.
2847
+
2848
+ 1:14:13.293 --> 1:14:16.953
2849
+ And that of course makes especially multilingual.
2850
+
2851
+ 1:14:16.953 --> 1:14:19.150
2852
+ It's like somehow is a dream.
2853
+
2854
+ 1:14:19.150 --> 1:14:25.519
2855
+ If you want to add a language you just need
2856
+ to add one analysis tool and one generation
2857
+
2858
+ 1:14:25.519 --> 1:14:25.959
2859
+ tool.
2860
+
2861
+ 1:14:29.249 --> 1:14:32.279
2862
+ Which is not the case in the other scenario.
2863
+
2864
+ 1:14:33.193 --> 1:14:40.547
2865
+ However, the big challenge is in this case
2866
+ the interlingua based representation because
2867
+
2868
+ 1:14:40.547 --> 1:14:47.651
2869
+ you need to represent all different types of
2870
+ knowledge in there in order to do that.
2871
+
2872
+ 1:14:47.807 --> 1:14:54.371
2873
+ And also like world knowledge, so something
2874
+ like an apple is a fruit and property is a
2875
+
2876
+ 1:14:54.371 --> 1:14:57.993
2877
+ fruit, so they are eatable and stuff like that.
2878
+
2879
+ 1:14:58.578 --> 1:15:06.286
2880
+ So that is why this is typically always only
2881
+ done for small amounts of data.
2882
+
2883
+ 1:15:06.326 --> 1:15:13.106
2884
+ So what people have done for special applications
2885
+ like hotel reservation people have looked into
2886
+
2887
+ 1:15:13.106 --> 1:15:18.348
2888
+ that, but they have typically not done it for
2889
+ any possibility of doing it.
2890
+
2891
+ 1:15:18.718 --> 1:15:31.640
2892
+ So the disadvantage is you need to represent
2893
+ all the world knowledge in your interlingua.
2894
+
2895
+ 1:15:32.092 --> 1:15:40.198
2896
+ And that is not possible at the moment or
2897
+ never was possible so far.
2898
+
2899
+ 1:15:40.198 --> 1:15:47.364
2900
+ Typically they were for small domains for
2901
+ hotel reservation.
2902
+
2903
+ 1:15:51.431 --> 1:15:57.926
2904
+ But of course this idea of doing that and
2905
+ that's why some people are interested in is
2906
+
2907
+ 1:15:57.926 --> 1:16:04.950
2908
+ like if you now do a neural system where you
2909
+ learn the representation in your neural network
2910
+
2911
+ 1:16:04.950 --> 1:16:07.442
2912
+ is that some type of artificial.
2913
+
2914
+ 1:16:08.848 --> 1:16:09.620
2915
+ Interlingua.
2916
+
2917
+ 1:16:09.620 --> 1:16:15.025
2918
+ However, what we at least found out until
2919
+ now is that there's often very language specific
2920
+
2921
+ 1:16:15.025 --> 1:16:15.975
2922
+ information in.
2923
+
2924
+ 1:16:16.196 --> 1:16:19.648
2925
+ And they might be important and essential.
2926
+
2927
+ 1:16:19.648 --> 1:16:26.552
2928
+ You don't have all the information in your
2929
+ input, so you typically can't do resolving
2930
+
2931
+ 1:16:26.552 --> 1:16:32.412
2932
+ all ambiguities inside there because you might
2933
+ not have all information.
2934
+
2935
+ 1:16:32.652 --> 1:16:37.870
2936
+ So in English you don't know if it's a living
2937
+ fish or the fish which you're eating, and if
2938
+
2939
+ 1:16:37.870 --> 1:16:43.087
2940
+ you're translating to Germany you also don't
2941
+ have to resolve this problem because you have
2942
+
2943
+ 1:16:43.087 --> 1:16:45.610
2944
+ the same ambiguity in your target language.
2945
+
2946
+ 1:16:45.610 --> 1:16:50.828
2947
+ So why would you put in the effort of finding
2948
+ out if it's the one fish or the other fish if it's
2949
+
2950
+ 1:16:50.828 --> 1:16:52.089
2951
+ not necessary at all?
2952
+
2953
+ 1:16:54.774 --> 1:16:59.509
2954
+ Yeah Yeah.
2955
+
2956
+ 1:17:05.585 --> 1:17:15.019
2957
+ The semantic transfer is not the same for
2958
+ both languages, so you still represent the
2959
+
2960
+ 1:17:15.019 --> 1:17:17.127
2961
+ semantics per language.
2962
+
2963
+ 1:17:17.377 --> 1:17:23.685
2964
+ So you have the like semantic representation
2965
+ in the gustar example, but that's not the same semantic
2966
+
2967
+ 1:17:23.685 --> 1:17:28.134
2968
+ representation for both languages, and that's
2969
+ the main difference.
2970
+
2971
+ 1:17:35.515 --> 1:17:44.707
2972
+ Okay, then these are the most important things
2973
+ for today: what is language and how rule
2974
+
2975
+ 1:17:44.707 --> 1:17:46.205
2976
+ based systems work.
2977
+
2978
+ 1:17:46.926 --> 1:17:59.337
2979
+ And if there is no more questions thank you
2980
+ for joining, we have today a bit of a shorter
2981
+
2982
+ 1:17:59.337 --> 1:18:00.578
2983
+ lecture.
2984
+
demo_data/lectures/Lecture-02-20.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0ac15772e9e528ff3f7fb957401be410fcdf4a4ad54542e96916fe654443eb3
3
+ size 111655016
demo_data/lectures/Lecture-03-25.04.2023/English.vtt ADDED
@@ -0,0 +1,3102 @@
1
+ WEBVTT
2
+
3
+ 0:00:02.822 --> 0:00:07.880
4
+ We look into more linguistic approaches.
5
+
6
+ 0:00:07.880 --> 0:00:14.912
7
+ We can do machine translation in a more traditional
8
+ way.
9
+
10
+ 0:00:14.912 --> 0:00:21.224
11
+ It should be: Translation should be generated
12
+ this way.
13
+
14
+ 0:00:21.224 --> 0:00:27.933
15
+ We can analyze first the source sentence, what
16
+ is the meaning or the syntax.
17
+
18
+ 0:00:27.933 --> 0:00:35.185
19
+ Then we transfer this information to the target
20
+ side and then we then generate.
21
+
22
+ 0:00:36.556 --> 0:00:42.341
23
+ And this was the strong and common used approach
24
+ for yeah several years.
25
+
26
+ 0:00:44.024 --> 0:00:50.839
27
+ However, we saw already at the beginning there
28
+ some challenges with that: Language is very
29
+
30
+ 0:00:50.839 --> 0:00:57.232
31
+ ambiguous and it's often very difficult to really
32
+ get hand-coded rules.
33
+
34
+ 0:00:57.232 --> 0:01:05.336
35
+ What are the different meanings and we have
36
+ to do that also with a living language so new
37
+
38
+ 0:01:05.336 --> 0:01:06.596
39
+ things occur.
40
+
41
+ 0:01:07.007 --> 0:01:09.308
42
+ And that's why people look into.
43
+
44
+ 0:01:09.308 --> 0:01:13.282
45
+ Can we maybe do it differently and use machine
46
+ learning?
47
+
48
+ 0:01:13.333 --> 0:01:24.849
49
+ So we are no longer giving rules of how to
50
+ do it, but we just give examples and the system learns from them.
51
+
52
+ 0:01:25.045 --> 0:01:34.836
53
+ And one important thing then is these examples:
54
+ how can we learn how to translate one sentence?
55
+
56
+ 0:01:35.635 --> 0:01:42.516
57
+ And therefore these yeah, the data is now
58
+ really a very important issue.
59
+
60
+ 0:01:42.582 --> 0:01:50.021
61
+ And that is what we want to look into today.
62
+
63
+ 0:01:50.021 --> 0:01:58.783
64
+ What type of data do we use for machine translation?
65
+
66
+ 0:01:59.019 --> 0:02:08.674
67
+ So the idea in preprocessing is always: Can
68
+ we make the task somehow a bit easier so that
69
+
70
+ 0:02:08.674 --> 0:02:13.180
71
+ the MT system will be in a way better?
72
+
73
+ 0:02:13.493 --> 0:02:28.309
74
+ So one example could be if it has problems
75
+ dealing with numbers because they are occurring.
76
+
77
+ 0:02:28.648 --> 0:02:35.479
78
+ Or think about so one problem which still
79
+ might be is there in some systems think about
80
+
81
+ 0:02:35.479 --> 0:02:36.333
82
+ different.
83
+
84
+ 0:02:36.656 --> 0:02:44.897
85
+ So a system might learn that of course if
86
+ there's a German over in English there should.
87
+
88
+ 0:02:45.365 --> 0:02:52.270
89
+ However, if it's in parallel text, it will see
90
+ that in German there is often km, and in English
91
+
92
+ 0:02:52.270 --> 0:02:54.107
93
+ typically various miles.
94
+
95
+ 0:02:54.594 --> 0:03:00.607
96
+ Might just translate three hundred and fifty
97
+ five miles into three hundred and fifty-five
98
+
99
+ 0:03:00.607 --> 0:03:04.348
100
+ kilometers, which of course is not right, and
101
+ so forth.
102
+
103
+ 0:03:04.348 --> 0:03:06.953
104
+ It might make things to look into the.
105
+
106
+ 0:03:07.067 --> 0:03:13.072
107
+ Therefore, first step when you build your
108
+ machine translation system is normally to look
109
+
110
+ 0:03:13.072 --> 0:03:19.077
111
+ at the data, to check it, to see if there is
112
+ anything happening which you should address
113
+
114
+ 0:03:19.077 --> 0:03:19.887
115
+ beforehand.
116
+
117
+ 0:03:20.360 --> 0:03:29.152
118
+ And then the second part is how do you represent
119
+ no works machine learning normally?
120
+
121
+ 0:03:29.109 --> 0:03:35.404
122
+ So the question is how do we get out from
123
+ the words into numbers and I've seen some of
124
+
125
+ 0:03:35.404 --> 0:03:35.766
126
+ you?
127
+
128
+ 0:03:35.766 --> 0:03:42.568
129
+ For example, in advance there we have introduced
130
+ to an algorithm which we also shortly repeat
131
+
132
+ 0:03:42.568 --> 0:03:43.075
133
+ today.
134
+
135
+ 0:03:43.303 --> 0:03:53.842
136
+ The subword unit approach which was first
137
+ introduced in machine translation and now used
138
+
139
+ 0:03:53.842 --> 0:04:05.271
140
+ for other tasks in order to represent words: Now you've learned
141
+ about morphology, so you know that maybe in
142
+
143
+ 0:04:05.271 --> 0:04:09.270
144
+ English it's not that important.
145
+
146
+ 0:04:09.429 --> 0:04:22.485
147
+ In German you have all these different word
148
+ forms, and you have to learn an independent representation for each.
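As a rough sketch of the subword idea mentioned here (a simplified illustration of byte-pair-encoding-style splitting with an assumed toy vocabulary, not the exact algorithm or vocabulary from the lecture):

```python
# Simplified illustration of subword splitting with a fixed, assumed subword vocabulary.
# Real BPE learns the vocabulary from corpus statistics; this only shows the effect.
subword_vocab = {"spiel", "er", "in", "n"}

def greedy_subword_split(word: str, vocab) -> list[str]:
    """Greedily split a word into the longest known subwords (toy version)."""
    pieces, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):          # try longest piece first
            if word[i:j] in vocab or j == i + 1:   # fall back to single characters
                pieces.append(word[i:j])
                i = j
                break
    return pieces

print(greedy_subword_split("spielerinnen", subword_vocab))
# ['spiel', 'er', 'in', 'n', 'e', 'n'] -- different forms of 'Spieler' share pieces
```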
149
+
150
+ 0:04:24.024 --> 0:04:26.031
151
+ And then, of course, they are more extreme.
152
+
153
+ 0:04:27.807 --> 0:04:34.387
154
+ So how are we doing?
155
+
156
+ 0:04:34.975 --> 0:04:37.099
157
+ Machine translation.
158
+
159
+ 0:04:37.099 --> 0:04:46.202
160
+ So hopefully you remember we had these approaches
161
+ to machine translation, the rule based.
162
+
163
+ 0:04:46.202 --> 0:04:52.473
164
+ We had a big block of corpus-based machine
165
+ translation which.
166
+
167
+ 0:04:52.492 --> 0:05:00.443
168
+ We will on Thursday have an overview of statistical
169
+ models and then afterwards concentrate on the.
170
+
171
+ 0:05:00.680 --> 0:05:08.828
172
+ Both of them are corpus based machine translation
173
+ and therefore it's really essential, and while
174
+
175
+ 0:05:08.828 --> 0:05:16.640
176
+ we are typically training a machine translation
177
+ system is what we refer to as parallel data.
178
+
179
+ 0:05:16.957 --> 0:05:22.395
180
+ We talk a lot about parallel corpora or parallel data,
181
+ and what I mean there is something which you
182
+
183
+ 0:05:22.395 --> 0:05:28.257
184
+ might know from the Rosetta Stone or something
185
+ like that, so it's typically you have one sentence
186
+
187
+ 0:05:28.257 --> 0:05:33.273
188
+ in the one language, and then you have aligned
189
+ to it one sentence in the target language.
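To make the format concrete, a minimal sketch of what such sentence-aligned parallel data looks like in practice (the sentence pairs below are invented for illustration):

```python
# Minimal sketch: a parallel corpus is just a list of aligned sentence pairs.
# The example pairs are invented for illustration.
parallel_corpus = [
    ("Das ist ein Haus.", "This is a house."),
    ("Wie geht es dir?", "How are you?"),
]

for source_sentence, target_sentence in parallel_corpus:
    print(f"{source_sentence}\t{target_sentence}")
```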
190
+
191
+ 0:05:33.833 --> 0:05:38.261
192
+ And this is how we train all our alignments.
193
+
194
+ 0:05:38.261 --> 0:05:43.181
195
+ We'll see today that of course we might not
196
+ have.
197
+
198
+ 0:05:43.723 --> 0:05:51.279
199
+ However, this is relatively easy to create,
200
+ at least for high-quality data.
201
+
202
+ 0:05:51.279 --> 0:06:00.933
203
+ We look into data crawling, so that means how
204
+ we can automatically create this parallel data
205
+
206
+ 0:06:00.933 --> 0:06:02.927
207
+ from the Internet.
208
+
209
+ 0:06:04.144 --> 0:06:13.850
210
+ It's not so difficult to learn these alignments
211
+ if we have some type of dictionary, so which
212
+
213
+ 0:06:13.850 --> 0:06:16.981
214
+ sentence is aligned to which.
215
+
216
+ 0:06:18.718 --> 0:06:25.069
217
+ What would, of course, be a lot more difficult
218
+ is really the word alignment, and that's also
219
+
220
+ 0:06:25.069 --> 0:06:27.476
221
+ often no longer that good possible.
222
+
223
+ 0:06:27.476 --> 0:06:33.360
224
+ We do that automatically in some yes for symbols,
225
+ but it's definitely more challenging.
226
+
227
+ 0:06:33.733 --> 0:06:40.691
228
+ For sentence alignment, of course, it's still
229
+ not always perfect, so there might be that
230
+
231
+ 0:06:40.691 --> 0:06:46.085
232
+ there is two German sentences and one English
233
+ sentence or the other.
234
+
235
+ 0:06:46.085 --> 0:06:53.511
236
+ So there's not always perfect alignment, but
237
+ if you look at text, it still works relatively well.
238
+
239
+ 0:06:54.014 --> 0:07:03.862
240
+ If we have that then we can build a machine
241
+ learning model which tries to map ignition
242
+
243
+ 0:07:03.862 --> 0:07:06.239
244
+ sentences somewhere.
245
+
246
+ 0:07:06.626 --> 0:07:15.932
247
+ So this is the idea of behind statistical
248
+ machine translation and machine translation.
249
+
250
+ 0:07:15.932 --> 0:07:27.098
251
+ The difference is: Statistical machine translation
252
+ is typically a whole box of different models
253
+
254
+ 0:07:27.098 --> 0:07:30.205
255
+ which try to evaluate the.
256
+
257
+ 0:07:30.510 --> 0:07:42.798
258
+ In neural machine translation, it's all one
259
+ large neural network where we use the source sentence as
260
+
261
+ 0:07:42.798 --> 0:07:43.667
262
+ input.
263
+
264
+ 0:07:44.584 --> 0:07:50.971
265
+ And then we can train it by having exactly
266
+ this mapping from our parallel data.
267
+
268
+ 0:07:54.214 --> 0:08:02.964
269
+ So what we want today to look at today is
270
+ we want to first look at general text data.
271
+
272
+ 0:08:03.083 --> 0:08:06.250
273
+ So what is text data?
274
+
275
+ 0:08:06.250 --> 0:08:09.850
276
+ What text data is there?
277
+
278
+ 0:08:09.850 --> 0:08:18.202
279
+ Why is it challenging so that we have large
280
+ vocabularies?
281
+
282
+ 0:08:18.378 --> 0:08:22.003
283
+ It's so that you always have words which you
284
+ haven't seen.
285
+
286
+ 0:08:22.142 --> 0:08:29.053
287
+ If you increase your corpus size, normally
288
+ you will also increase your vocabulary so you
289
+
290
+ 0:08:29.053 --> 0:08:30.744
291
+ always find new words.
292
+
293
+ 0:08:31.811 --> 0:08:39.738
294
+ Then based on that we'll look into pre-processing.
295
+
296
+ 0:08:39.738 --> 0:08:45.333
297
+ So how can we pre-process our data?
298
+
299
+ 0:08:45.333 --> 0:08:46.421
300
+ Maybe.
301
+
302
+ 0:08:46.526 --> 0:08:54.788
303
+ This is a lot about tokenization, for example,
304
+ which we heard is not so challenging in European
305
+
306
+ 0:08:54.788 --> 0:09:02.534
307
+ languages but still important, but might be
308
+ really difficult in Asian languages where you
309
+
310
+ 0:09:02.534 --> 0:09:05.030
311
+ don't have space separation.
312
+
313
+ 0:09:05.986 --> 0:09:12.161
314
+ And this preprocessing typically tries to
315
+ deal with the extreme cases where you have
316
+
317
+ 0:09:12.161 --> 0:09:13.105
318
+ seen things.
319
+
320
+ 0:09:13.353 --> 0:09:25.091
321
+ If you have seen your words three one hundred
322
+ times, it doesn't really matter if you have
323
+
324
+ 0:09:25.091 --> 0:09:31.221
325
+ seen them with them without punctuation or
326
+ so.
327
+
328
+ 0:09:31.651 --> 0:09:38.578
329
+ And then we look into word representation,
330
+ so what is the best way to represent a word?
331
+
332
+ 0:09:38.578 --> 0:09:45.584
333
+ And finally, we look into the other type of
334
+ data we really need for machine translation.
335
+
336
+ 0:09:45.725 --> 0:09:56.842
337
+ So in first we can use for many tasks, and
338
+ later we can also use purely monolingual data
339
+
340
+ 0:09:56.842 --> 0:10:00.465
341
+ to make machine translation.
342
+
343
+ 0:10:00.660 --> 0:10:03.187
344
+ So then the traditional approach was that
345
+ it was easier.
346
+
347
+ 0:10:03.483 --> 0:10:08.697
348
+ We have this type of language model which
349
+ we can train only on the target data to make
350
+
351
+ 0:10:08.697 --> 0:10:12.173
352
+ the text more fluent in neural machine translation
353
+ model.
354
+
355
+ 0:10:12.173 --> 0:10:18.106
356
+ It's partly a bit more complicated to integrate
357
+ this data but still it's very important especially
358
+
359
+ 0:10:18.106 --> 0:10:22.362
360
+ if you think about lower issue languages where
361
+ you have very few data.
362
+
363
+ 0:10:23.603 --> 0:10:26.999
364
+ It's harder to get parallel data than you
365
+ get monolingual data.
366
+
367
+ 0:10:27.347 --> 0:10:33.821
368
+ Because monolingual data you just have out
369
+ there not huge amounts for some languages,
370
+
371
+ 0:10:33.821 --> 0:10:38.113
372
+ but definitely the amount of data is always
373
+ significant.
374
+
375
+ 0:10:40.940 --> 0:10:50.454
376
+ When we talk about data, it's also of course
377
+ important how we use it for machine learning.
378
+
379
+ 0:10:50.530 --> 0:11:05.867
380
+ And that you hopefully learn in some prior
381
+ class, so typically we separate our data into
382
+
383
+ 0:11:05.867 --> 0:11:17.848
384
+ three chunks: So this is really by far the
385
+ largest, and this grows with the data we get.
386
+
387
+ 0:11:17.848 --> 0:11:21.387
388
+ Today we get here millions.
389
+
390
+ 0:11:22.222 --> 0:11:27.320
391
+ Then we have our validation data and that
392
+ is to train some type of parameters.
393
+
394
+ 0:11:27.320 --> 0:11:33.129
395
+ So not only you have some things to configure
396
+ and you don't know what is the right value,
397
+
398
+ 0:11:33.129 --> 0:11:39.067
399
+ so what you can do is train a model and change
400
+ these a bit and try to find the best ones on
401
+
402
+ 0:11:39.067 --> 0:11:40.164
403
+ your validation.
404
+
405
+ 0:11:40.700 --> 0:11:48.531
406
+ For a statistical model, for example data
407
+ in what you want to use if you have several
408
+
409
+ 0:11:48.531 --> 0:11:54.664
410
+ models: You know how to combine it, so how
411
+ much focus should you put on the different
412
+
413
+ 0:11:54.664 --> 0:11:55.186
414
+ models?
415
+
416
+ 0:11:55.186 --> 0:11:59.301
417
+ And if it's like twenty models, so it's only
418
+ twenty per meter.
419
+
420
+ 0:11:59.301 --> 0:12:02.828
421
+ It's not that much, so that is still bigly
422
+ estimated.
423
+
424
+ 0:12:03.183 --> 0:12:18.964
425
+ In your model there's often a question how
426
+ long should train the model before you have
427
+
428
+ 0:12:18.964 --> 0:12:21.322
429
+ overfitting.
430
+
431
+ 0:12:22.902 --> 0:12:28.679
432
+ And then you have your test data, which is
433
+ finally where you report on your test.
434
+
435
+ 0:12:29.009 --> 0:12:33.663
436
+ And therefore it's also important that from
437
+ time to time you get new test data because
438
+
439
+ 0:12:33.663 --> 0:12:38.423
440
+ if you're always through your experiments you
441
+ test on it and then you do new experiments
442
+
443
+ 0:12:38.423 --> 0:12:43.452
444
+ and tests again at some point you have tested
445
+ so many on it that you do some type of training
446
+
447
+ 0:12:43.452 --> 0:12:48.373
448
+ on your test data again because you just select
449
+ the things which is at the end best on your
450
+
451
+ 0:12:48.373 --> 0:12:48.962
452
+ test data.
453
+
454
+ 0:12:49.009 --> 0:12:54.755
455
+ It's important to get a new test data from
456
+ time to time, for example in important evaluation
457
+
458
+ 0:12:54.755 --> 0:12:58.340
459
+ campaigns for machine translation and speech
460
+ translation.
461
+
462
+ 0:12:58.618 --> 0:13:07.459
463
+ There is like every year there should do tests
464
+ that create it so we can see if the model really
465
+
466
+ 0:13:07.459 --> 0:13:09.761
467
+ gets better on new data.
468
+
469
+ 0:13:10.951 --> 0:13:19.629
470
+ And of course it is important that this is
471
+ a representative of the use case you are interested.
472
+
473
+ 0:13:19.879 --> 0:13:36.511
474
+ So if you're building a system for translating
475
+ websites, this should be on websites.
476
+
477
+ 0:13:36.816 --> 0:13:39.356
478
+ So normally a system is good on some tasks.
479
+
480
+ 0:13:40.780 --> 0:13:48.596
481
+ I would solve everything and then your test
482
+ data should be out of everything because if
483
+
484
+ 0:13:48.596 --> 0:13:54.102
485
+ you only have a very small subset you know
486
+ it's good on this.
487
+
488
+ 0:13:54.394 --> 0:14:02.714
489
+ Therefore, the selection of your test data
490
+ is really important in order to ensure that
491
+
492
+ 0:14:02.714 --> 0:14:05.200
493
+ the MP system in the end.
494
+
495
+ 0:14:05.525 --> 0:14:12.646
496
+ Is the greatest system ever you have evaluated
497
+ on translating Bible.
498
+
499
+ 0:14:12.646 --> 0:14:21.830
500
+ The use case is to translate some Twitter
501
+ data and you can imagine the performance might
502
+
503
+ 0:14:21.830 --> 0:14:22.965
504
+ be really.
505
+
506
+ 0:14:23.803 --> 0:14:25.471
507
+ And privately.
508
+
509
+ 0:14:25.471 --> 0:14:35.478
510
+ Of course, in honor to have this and realistic
511
+ evaluation, it's important that there's no
512
+
513
+ 0:14:35.478 --> 0:14:39.370
514
+ overlap between this data because.
515
+
516
+ 0:14:39.799 --> 0:14:51.615
517
+ Because the danger might be is learning by
518
+ heart how to translate the sentences from your
519
+
520
+ 0:14:51.615 --> 0:14:53.584
521
+ training data.
522
+
523
+ 0:14:54.194 --> 0:15:04.430
524
+ That the test data is really different from
525
+ your training data.
526
+
527
+ 0:15:04.430 --> 0:15:16.811
528
+ Therefore, it's important to: So what type
529
+ of data we have?
530
+
531
+ 0:15:16.811 --> 0:15:24.966
532
+ There's a lot of different text data and the
533
+ nice thing is with digitalization.
534
+
535
+ 0:15:25.345 --> 0:15:31.785
536
+ You might think there's a large amount with
537
+ books, but to be honest books and printed things
538
+
539
+ 0:15:31.785 --> 0:15:35.524
540
+ that's by now a minor percentage of the data
541
+ we have.
542
+
543
+ 0:15:35.815 --> 0:15:39.947
544
+ There's like so much data created every day
545
+ on the Internet.
546
+
547
+ 0:15:39.980 --> 0:15:46.223
548
+ With social media and all the other types.
549
+
550
+ 0:15:46.223 --> 0:15:56.821
551
+ This of course is the largest amount of data,
552
+ more of colloquial language.
553
+
554
+ 0:15:56.856 --> 0:16:02.609
555
+ It might be more noisy and harder to process,
556
+ so there is a whole area on how to deal with
557
+
558
+ 0:16:02.609 --> 0:16:04.948
559
+ more social media and outdoor stuff.
560
+
561
+ 0:16:07.347 --> 0:16:20.702
562
+ What type of data is there if you think about
563
+ parallel data: news data, official sites?
564
+
565
+ 0:16:20.900 --> 0:16:26.629
566
+ So the first parallel corpora were things
567
+ like the European Parliament or like some news
568
+
569
+ 0:16:26.629 --> 0:16:27.069
570
+ sites.
571
+
572
+ 0:16:27.227 --> 0:16:32.888
573
+ Nowadays there's quite a large amount of data
574
+ crawled from the Internet, but of course if
575
+
576
+ 0:16:32.888 --> 0:16:38.613
577
+ you crawl parallel data from the Internet,
578
+ a lot of the data is also like company websites
579
+
580
+ 0:16:38.613 --> 0:16:41.884
581
+ or so which gets translated into several languages.
582
+
583
+ 0:16:45.365 --> 0:17:00.613
584
+ Then, of course, there are different levels
585
+ of text and we have to look at what level we
586
+
587
+ 0:17:00.613 --> 0:17:05.118
588
+ want to process our data.
589
+
590
+ 0:17:05.885 --> 0:17:16.140
591
+ It normally doesn't make sense to work
592
+ on full sentences because a lot of sentences
593
+
594
+ 0:17:16.140 --> 0:17:22.899
595
+ have never been seen and you always create
596
+ new sentences.
597
+
598
+ 0:17:23.283 --> 0:17:37.421
599
+ So typically what we take as our basic unit is words, or
600
+ something between words and letters, and that
601
+
602
+ 0:17:37.421 --> 0:17:40.033
603
+ is an essential decision.
604
+
605
+ 0:17:40.400 --> 0:17:47.873
606
+ So we need some of these atomic blocks or
607
+ basic blocks which we can't make any smaller.
608
+
609
+ 0:17:48.128 --> 0:17:55.987
610
+ So if we're building a sentence, for example,
611
+ you can build it out of something and you can
612
+
613
+ 0:17:55.987 --> 0:17:57.268
614
+ either decide.
615
+
616
+ 0:17:57.268 --> 0:18:01.967
617
+ For example, you take words and you split them
618
+ further.
619
+
620
+ 0:18:03.683 --> 0:18:10.178
621
+ Then, of course, the nice thing is not too
622
+ small and therefore building larger things
623
+
624
+ 0:18:10.178 --> 0:18:11.386
625
+ like sentences.
626
+
627
+ 0:18:11.831 --> 0:18:16.690
628
+ So you only have to take your vocabulary and
629
+ put it somewhere together to get your full
630
+
631
+ 0:18:16.690 --> 0:18:17.132
632
+ center.
633
+
634
+ 0:18:19.659 --> 0:18:27.670
635
+ However, if it's too large, these blocks don't
636
+ occur often enough, and you have more blocks
637
+
638
+ 0:18:27.670 --> 0:18:28.715
639
+ that occur only rarely.
640
+
641
+ 0:18:29.249 --> 0:18:34.400
642
+ And that's why, yeah, we can work with blocks
643
+ that are smaller, like subword blocks.
644
+
645
+ 0:18:34.714 --> 0:18:38.183
646
+ Work with neural models.
647
+
648
+ 0:18:38.183 --> 0:18:50.533
649
+ Then you can work on letters so you have a
650
+ system which tries to understand the sentence
651
+
652
+ 0:18:50.533 --> 0:18:53.031
653
+ letter by letter.
654
+
655
+ 0:18:53.313 --> 0:18:57.608
656
+ But that is a design decision which you have
657
+ to take at some point.
658
+
659
+ 0:18:57.608 --> 0:19:03.292
660
+ On which level do you want to split your text
661
+ and what are the basic blocks that you are
662
+
663
+ 0:19:03.292 --> 0:19:04.176
664
+ working with?
665
+
666
+ 0:19:04.176 --> 0:19:06.955
667
+ And that's something we'll look into today.
668
+
669
+ 0:19:06.955 --> 0:19:08.471
670
+ What possibilities are?
671
+
672
+ 0:19:12.572 --> 0:19:14.189
673
+ Any question.
674
+
675
+ 0:19:17.998 --> 0:19:24.456
676
+ Then let's look a bit on what type of data
677
+ there is and how much data there is to process.
678
+
679
+ 0:19:24.824 --> 0:19:34.006
680
+ Is that nowadays, at least for pure text,
681
+ it's no longer for some language.
682
+
683
+ 0:19:34.006 --> 0:19:38.959
684
+ There is so much data we cannot process.
685
+
686
+ 0:19:39.479 --> 0:19:49.384
687
+ That is only true for some languages, but
688
+ there is also interest in other languages and
689
+
690
+ 0:19:49.384 --> 0:19:50.622
691
+ important.
692
+
693
+ 0:19:50.810 --> 0:20:01.483
694
+ So if you want to build a system for Sweden
695
+ or for some dialect in other countries, then
696
+
697
+ 0:20:01.483 --> 0:20:02.802
698
+ of course.
699
+
700
+ 0:20:03.103 --> 0:20:06.888
701
+ Otherwise you have this huge amount here.
702
+
703
+ 0:20:06.888 --> 0:20:11.515
704
+ We are often no longer talking about gigabytes
705
+ or more.
706
+
707
+ 0:20:11.891 --> 0:20:35.788
708
+ The general information that is produced every
709
+ year is: And this is like all the information
710
+
711
+ 0:20:35.788 --> 0:20:40.661
712
+ that are available in the, so there are really.
713
+
714
+ 0:20:41.001 --> 0:20:44.129
715
+ We look at machine translation.
716
+
717
+ 0:20:44.129 --> 0:20:53.027
718
+ We can see these numbers are really like more
719
+ than ten years old, but we see this increase
720
+
721
+ 0:20:53.027 --> 0:20:58.796
722
+ in one billion words we had at that time for
723
+ English data.
724
+
725
+ 0:20:59.019 --> 0:21:01.955
726
+ Then I wore like new shuffle on Google Maps
727
+ and stuff.
728
+
729
+ 0:21:02.382 --> 0:21:05.003
730
+ For this one you could train your system on.
731
+
732
+ 0:21:05.805 --> 0:21:20.457
733
+ And the interesting thing is this one billion
734
+ words is more than any human typically speaks.
735
+
736
+ 0:21:21.001 --> 0:21:25.892
737
+ So these systems they see by now like a magnitude
738
+ of more data.
739
+
740
+ 0:21:25.892 --> 0:21:32.465
741
+ We know I think are a magnitude higher of
742
+ more data than a human has ever seen in his
743
+
744
+ 0:21:32.465 --> 0:21:33.229
745
+ lifetime.
746
+
747
+ 0:21:35.175 --> 0:21:41.808
748
+ And that is maybe the interesting thing why
749
+ it still doesn't work on it because you see
750
+
751
+ 0:21:41.808 --> 0:21:42.637
752
+ they seem.
753
+
754
+ 0:21:43.103 --> 0:21:48.745
755
+ So we are seeing a really impressive result,
756
+ but in most cases it's not that they're really
757
+
758
+ 0:21:48.745 --> 0:21:49.911
759
+ better than human.
760
+
761
+ 0:21:50.170 --> 0:21:56.852
762
+ However, they really have seen more data than
763
+ any human ever has seen in this lifetime.
764
+
765
+ 0:21:57.197 --> 0:22:01.468
766
+ They can just process so much data, so.
767
+
768
+ 0:22:01.501 --> 0:22:08.425
769
+ The question is, can we make them more efficient
770
+ so that they can learn similarly well without
771
+
772
+ 0:22:08.425 --> 0:22:09.592
773
+ that much data?
774
+
775
+ 0:22:09.592 --> 0:22:16.443
776
+ And that is essential if we now go to low-resource
777
+ languages where we might never get that much
778
+
779
+ 0:22:16.443 --> 0:22:21.254
780
+ data, and we should be also able to achieve
781
+ a reasonable performance.
782
+
783
+ 0:22:23.303 --> 0:22:32.399
784
+ On the other hand, this of course links also
785
+ to one topic which we will cover later: If
786
+
787
+ 0:22:32.399 --> 0:22:37.965
788
+ you think about this, it's really important
789
+ that your algorithms are also very efficient
790
+
791
+ 0:22:37.965 --> 0:22:41.280
792
+ in order to process that much data both in
793
+ training.
794
+
795
+ 0:22:41.280 --> 0:22:46.408
796
+ If you have more data, you want to process
797
+ more data so you can make use of that.
798
+
799
+ 0:22:46.466 --> 0:22:54.499
800
+ On the other hand, if more and more data is
801
+ processed, more and more people will use machine
802
+
803
+ 0:22:54.499 --> 0:23:06.816
804
+ translation to generate translations, and it
805
+ will be important to: And there is yeah, there
806
+
807
+ 0:23:06.816 --> 0:23:07.257
808
+ is.
809
+
810
+ 0:23:07.607 --> 0:23:10.610
811
+ More.
812
+
813
+ 0:23:10.170 --> 0:23:17.262
814
+ More data generated every day, we hear just
815
+ some general numbers on how much data there
816
+
817
+ 0:23:17.262 --> 0:23:17.584
818
+ is.
819
+
820
+ 0:23:17.584 --> 0:23:24.595
821
+ It says that a lot of the data we produce
822
+ at least at the moment is text rich, so text
823
+
824
+ 0:23:24.595 --> 0:23:26.046
825
+ that is produced.
826
+
827
+ 0:23:26.026 --> 0:23:29.748
828
+ That is very important to either wise.
829
+
830
+ 0:23:29.748 --> 0:23:33.949
831
+ We can use it as training data in some way.
832
+
833
+ 0:23:33.873 --> 0:23:40.836
834
+ That we want to translate some of that because
835
+ it might not be published in all the languages,
836
+
837
+ 0:23:40.836 --> 0:23:46.039
838
+ and step with the need for machine translation
839
+ is even more important.
840
+
841
+ 0:23:47.907 --> 0:23:51.547
842
+ So what are the challenges with this?
843
+
844
+ 0:23:51.831 --> 0:24:01.360
845
+ So first of all that seems to be very good
846
+ news, so there is more and more data, so we
847
+
848
+ 0:24:01.360 --> 0:24:10.780
849
+ can just wait for three years and have more
850
+ data, and then our system will be better.
851
+
852
+ 0:24:11.011 --> 0:24:22.629
853
+ If you see in competitions, the system performance
854
+ increases.
855
+
856
+ 0:24:24.004 --> 0:24:27.190
857
+ See that here are three different systems.
858
+
859
+ 0:24:27.190 --> 0:24:34.008
860
+ BLEU score is a metric to measure how good an
861
+ MT system is, and we'll talk about evaluation
862
+
863
+ 0:24:34.008 --> 0:24:40.974
864
+ in the next week, so you'll learn how to evaluate
865
+ machine translation, and there is also a practical session.
866
+
867
+ 0:24:41.581 --> 0:24:45.219
868
+ And so.
869
+
870
+ 0:24:44.784 --> 0:24:50.960
871
+ This shows you that this is like how much
872
+ data of the training data you have five percent.
873
+
874
+ 0:24:50.960 --> 0:24:56.117
875
+ You're significantly worse than if you're
876
+ forty percent and eighty percent.
877
+
878
+ 0:24:56.117 --> 0:25:02.021
879
+ You're getting better and you're seeing two
880
+ between this curve, which maybe not really
881
+
882
+ 0:25:02.021 --> 0:25:02.971
883
+ flattens out.
884
+
885
+ 0:25:02.971 --> 0:25:03.311
886
+ But.
887
+
888
+ 0:25:03.263 --> 0:25:07.525
889
+ Of course, the gains you get are normally
890
+ smaller and smaller.
891
+
892
+ 0:25:07.525 --> 0:25:09.216
893
+ The more data you have,.
894
+
895
+ 0:25:09.549 --> 0:25:21.432
896
+ If your improvements are unnormally better,
897
+ if you add the same thing or even double your
898
+
899
+ 0:25:21.432 --> 0:25:25.657
900
+ data late, of course more data.
901
+
902
+ 0:25:26.526 --> 0:25:34.955
903
+ However, you see the clear tendency if you
904
+ need to improve your system.
905
+
906
+ 0:25:34.955 --> 0:25:38.935
907
+ This is possible by just getting more data.
908
+
909
+ 0:25:39.039 --> 0:25:41.110
910
+ But it's not all about data.
911
+
912
+ 0:25:41.110 --> 0:25:45.396
913
+ It can also be the domain of the data that
914
+ you're building the system for.
915
+
916
+ 0:25:45.865 --> 0:25:55.668
917
+ So this was a test on machine translation
918
+ system on translating genome data.
919
+
920
+ 0:25:55.668 --> 0:26:02.669
921
+ We have the like SAI said he's working on
922
+ translating.
923
+
924
+ 0:26:02.862 --> 0:26:06.868
925
+ Here you see the performance given in BLEU score.
926
+
927
+ 0:26:06.868 --> 0:26:12.569
928
+ You see one system which only was trained
929
+ on genome data and it only has.
930
+
931
+ 0:26:12.812 --> 0:26:17.742
932
+ That's very, very few for machine translation.
933
+
934
+ 0:26:18.438 --> 0:26:23.927
935
+ And to compare that to a system which was
936
+ generally trained on used translation data.
937
+
938
+ 0:26:24.104 --> 0:26:34.177
939
+ With four point five million sentences so
940
+ roughly one hundred times as much data you
941
+
942
+ 0:26:34.177 --> 0:26:40.458
943
+ still see that this system doesn't really work
944
+ well.
945
+
946
+ 0:26:40.820 --> 0:26:50.575
947
+ So you see it's not only about data, it's
948
+ also that the data has to somewhat fit to the
949
+
950
+ 0:26:50.575 --> 0:26:51.462
951
+ domain.
952
+
953
+ 0:26:51.831 --> 0:26:58.069
954
+ The more general data you get that you have
955
+ covered up all domains.
956
+
957
+ 0:26:58.418 --> 0:27:07.906
958
+ But that's very difficult and especially for
959
+ more specific domains.
960
+
961
+ 0:27:07.906 --> 0:27:16.696
962
+ It can be really important to get data which
963
+ fits your domain.
964
+
965
+ 0:27:16.716 --> 0:27:18.520
966
+ Maybe if you can do some very much broccoli
967
+ or something like that, maybe if you.
968
+
969
+ 0:27:18.598 --> 0:27:22.341
970
+ To say okay, concentrate this as you like
971
+ for being at better.
972
+
973
+ 0:27:24.564 --> 0:27:28.201
974
+ It's not that easy to prompt it.
975
+
976
+ 0:27:28.201 --> 0:27:35.807
977
+ You can do the prompting in the more traditional
978
+ way of fine tuning.
979
+
980
+ 0:27:35.807 --> 0:27:44.514
981
+ Then, of course, if you select UIV later combine
982
+ this one, you can get better.
983
+
984
+ 0:27:44.904 --> 0:27:52.675
985
+ But it will always be that this type of similar
986
+ data is much more important than the general.
987
+
988
+ 0:27:52.912 --> 0:28:00.705
989
+ So of course it can make the lower system
990
+ a lot better if you search for similar data
991
+
992
+ 0:28:00.705 --> 0:28:01.612
993
+ and find.
994
+
995
+ 0:28:02.122 --> 0:28:08.190
996
+ Will have a lecture on domain adaptation where
997
+ it's exactly the idea how you can make systems
998
+
999
+ 0:28:08.190 --> 0:28:13.935
1000
+ in these situations better so you can adapt
1001
+ it to this data but then you still need this
1002
+
1003
+ 0:28:13.935 --> 0:28:14.839
1004
+ type of data.
1005
+
1006
+ 0:28:15.335 --> 0:28:21.590
1007
+ And in prompting it might work if you have
1008
+ seen it in your data so it can make the system
1009
+
1010
+ 0:28:21.590 --> 0:28:25.134
1011
+ aware and tell it focus more in this type of
1012
+ data.
1013
+
1014
+ 0:28:25.465 --> 0:28:30.684
1015
+ But if you haven't had enough of the really
1016
+ specific good matching data, I think it will
1017
+
1018
+ 0:28:30.684 --> 0:28:31.681
1019
+ always not work.
1020
+
1021
+ 0:28:31.681 --> 0:28:37.077
1022
+ So you need to have this type of data and
1023
+ therefore it's important not only to have general
1024
+
1025
+ 0:28:37.077 --> 0:28:42.120
1026
+ data but also data, at least in your overall
1027
+ system, which really fits to the domain.
1028
+
1029
+ 0:28:45.966 --> 0:28:53.298
1030
+ And then the second thing, of course, is you
1031
+ need to have data that has good quality.
1032
+
1033
+ 0:28:53.693 --> 0:29:00.170
1034
+ In the early stages it might be good to have
1035
+ all the data but later it's especially important
1036
+
1037
+ 0:29:00.170 --> 0:29:06.577
1038
+ that you have somehow good quality and so that
1039
+ you're learning what you really want to learn
1040
+
1041
+ 0:29:06.577 --> 0:29:09.057
1042
+ and not learning some great things.
1043
+
1044
+ 0:29:10.370 --> 0:29:21.551
1045
+ We talked about this with the kilometers and
1046
+ miles, so if you just take in some type of
1047
+
1048
+ 0:29:21.551 --> 0:29:26.253
1049
+ data and don't look at the quality,.
1050
+
1051
+ 0:29:26.766 --> 0:29:30.875
1052
+ But of course, the question here is what is
1053
+ good quality data?
1054
+
1055
+ 0:29:31.331 --> 0:29:35.054
1056
+ It is not yet that easy to define what is
1057
+ a good quality data.
1058
+
1059
+ 0:29:36.096 --> 0:29:43.961
1060
+ That doesn't mean it has to be what people generally
1061
+ assume as high quality text or so, like written
1062
+
1063
+ 0:29:43.961 --> 0:29:47.814
1064
+ by a Nobel Prize winner or something like that.
1065
+
1066
+ 0:29:47.814 --> 0:29:54.074
1067
+ This is not what we mean by this quality,
1068
+ but again the most important again.
1069
+
1070
+ 0:29:54.354 --> 0:30:09.181
1071
+ So if you have Twitter data, high quality
1072
+ data doesn't mean you have now some novels.
1073
+
1074
+ 0:30:09.309 --> 0:30:12.875
1075
+ Test data, but it should also be represented
1076
+ similarly.
1077
+
1078
+ 0:30:12.875 --> 0:30:18.480
1079
+ Don't have, for example, quality definitely
1080
+ as it should be really translating yourself
1081
+
1082
+ 0:30:18.480 --> 0:30:18.862
1083
+ into.
1084
+
1085
+ 0:30:19.199 --> 0:30:25.556
1086
+ So especially if you crawl data you would
1087
+ often have that it's not a direct translation.
1088
+
1089
+ 0:30:25.805 --> 0:30:28.436
1090
+ So then, of course, this is not high quality
1091
+ training data.
1092
+
1093
+ 0:30:29.449 --> 0:30:39.974
1094
+ But in generally that's a very difficult thing
1095
+ to, and it's very difficult to design what
1096
+
1097
+ 0:30:39.974 --> 0:30:41.378
1098
+ is reading.
1099
+
1100
+ 0:30:41.982 --> 0:30:48.333
1101
+ And of course one metric is always: the quality
1102
+ of your data is good if your machine translation gets better.
1103
+
1104
+ 0:30:48.648 --> 0:30:50.719
1105
+ So that is like the indirect.
1106
+
1107
+ 0:30:50.991 --> 0:30:52.447
1108
+ Well, what can we measure?
1109
+
1110
+ 0:30:52.447 --> 0:30:57.210
1111
+ Of course, it's difficult to always try a
1112
+ lot of things and evaluate either of them,
1113
+
1114
+ 0:30:57.210 --> 0:30:59.396
1115
+ build a full MT system and then check.
1116
+
1117
+ 0:30:59.396 --> 0:31:00.852
1118
+ Oh, was this a good idea?
1119
+
1120
+ 0:31:00.852 --> 0:31:01.357
1121
+ I mean,.
1122
+
1123
+ 0:31:01.581 --> 0:31:19.055
1124
+ Say you have two tokenizers which split sentences
1125
+ into words, and you wonder which one you really want to apply.
1126
+
1127
+ 0:31:19.179 --> 0:31:21.652
1128
+ Now you could maybe argue or your idea could
1129
+ be.
1130
+
1131
+ 0:31:21.841 --> 0:31:30.186
1132
+ Just take it there very fast and then get
1133
+ the result, but the problem is there is not
1134
+
1135
+ 0:31:30.186 --> 0:31:31.448
1136
+ always this.
1137
+
1138
+ 0:31:31.531 --> 0:31:36.269
1139
+ One thing that works very well for small data.
1140
+
1141
+ 0:31:36.269 --> 0:31:43.123
1142
+ It's not for sure that the same effect will
1143
+ happen at large scale.
1144
+
1145
+ 0:31:43.223 --> 0:31:50.395
1146
+ This idea really improves on very low resource
1147
+ data if only train on hundred words.
1148
+
1149
+ 0:31:51.271 --> 0:31:58.357
1150
+ But if you use it for a large data set, it
1151
+ doesn't really matter and all your ideas not.
1152
+
1153
+ 0:31:58.598 --> 0:32:01.172
1154
+ So that is also a typical thing.
1155
+
1156
+ 0:32:01.172 --> 0:32:05.383
1157
+ This quality issue is more and more important
1158
+ if you.
1159
+
1160
+ 0:32:06.026 --> 0:32:16.459
1161
+ By one motivation which generally you should
1162
+ have, you want to represent your data in having
1163
+
1164
+ 0:32:16.459 --> 0:32:17.469
1165
+ as many.
1166
+
1167
+ 0:32:17.677 --> 0:32:21.805
1168
+ Why is this the case any idea?
1169
+
1170
+ 0:32:21.805 --> 0:32:33.389
1171
+ Why this could be a motivation that we try
1172
+ to represent the data in a way that we have
1173
+
1174
+ 0:32:33.389 --> 0:32:34.587
1175
+ as many.
1176
+
1177
+ 0:32:38.338 --> 0:32:50.501
1178
+ We also want to learn about the fun text because
1179
+ maybe sometimes some grows in the fun text.
1180
+
1181
+ 0:32:52.612 --> 0:32:54.020
1182
+ The context is here.
1183
+
1184
+ 0:32:54.020 --> 0:32:56.432
1185
+ It's more about the learning first.
1186
+
1187
+ 0:32:56.432 --> 0:33:00.990
1188
+ You can generally learn better if you've seen
1189
+ something more often.
1190
+
1191
+ 0:33:00.990 --> 0:33:06.553
1192
+ So if you have seen an event only once, it's
1193
+ really hard to learn about the event.
1194
+
1195
+ 0:33:07.107 --> 0:33:15.057
1196
+ If you have seen an event a hundred times
1197
+ your bearing estimating which and maybe that
1198
+
1199
+ 0:33:15.057 --> 0:33:18.529
1200
+ is the context, then you can use the.
1201
+
1202
+ 0:33:18.778 --> 0:33:21.331
1203
+ So, for example, if you here have the word
1204
+ towels.
1205
+
1206
+ 0:33:21.761 --> 0:33:28.440
1207
+ If you would just take the data normally you
1208
+ would directly process the data.
1209
+
1210
+ 0:33:28.440 --> 0:33:32.893
1211
+ In the upper case you would have the house with
1212
+ the dot.
1213
+
1214
+ 0:33:32.893 --> 0:33:40.085
1215
+ That's a different word than the house this
1216
+ way and then the house with the comma.
1217
+
1218
+ 0:33:40.520 --> 0:33:48.365
1219
+ So you want to learn how this translates into
1220
+ house, but you translate an upper case.
1221
+
1222
+ 0:33:48.365 --> 0:33:50.281
1223
+ How this translates.
1224
+
1225
+ 0:33:50.610 --> 0:33:59.445
1226
+ You were learning how to translate into house
1227
+ and house, so you have to learn four different
1228
+
1229
+ 0:33:59.445 --> 0:34:00.205
1230
+ things.
1231
+
1232
+ 0:34:00.205 --> 0:34:06.000
1233
+ Instead, we really want to learn that house
1234
+ gets into house.
1235
+
1236
+ 0:34:06.366 --> 0:34:18.796
1237
+ And then imagine if it would be even a beak,
1238
+ it might be like here a house would be into.
1239
+
1240
+ 0:34:18.678 --> 0:34:22.089
1241
+ Good-bye Then.
1242
+
1243
+ 0:34:22.202 --> 0:34:29.512
1244
+ If it's an upper case then I always have to
1245
+ translate it into a boiler while it's a lower
1246
+
1247
+ 0:34:29.512 --> 0:34:34.955
1248
+ case that is translated into house and that's
1249
+ of course not right.
1250
+
1251
+ 0:34:34.955 --> 0:34:39.260
1252
+ We have to use the context to decide what
1253
+ is better.
1254
+
1255
+ 0:34:39.679 --> 0:34:47.086
1256
+ If you have seen an event several times then
1257
+ you are better able to learn your model and
1258
+
1259
+ 0:34:47.086 --> 0:34:51.414
1260
+ that doesn't matter what type of learning you
1261
+ have.
1262
+
1263
+ 0:34:52.392 --> 0:34:58.981
1264
+ I shouldn't say all but for most of these
1265
+ models it's always better to have like seen
1266
+
1267
+ 0:34:58.981 --> 0:35:00.897
1268
+ an event war more often.
1269
+
1270
+ 0:35:00.920 --> 0:35:11.483
1271
+ Therefore, if you preprocess the data, you
1272
+ should ask the question how you can represent the data
1273
+
1274
+ 0:35:11.483 --> 0:35:14.212
1275
+ in order to have seen.
1276
+
1277
+ 0:35:14.514 --> 0:35:17.885
1278
+ Of course you should not remove that information.
1279
+
1280
+ 0:35:18.078 --> 0:35:25.519
1281
+ So you could now, of course, just lowercase
1282
+ everything.
1283
+
1284
+ 0:35:25.519 --> 0:35:30.303
1285
+ Then you've seen things more often.
1286
+
1287
+ 0:35:30.710 --> 0:35:38.443
1288
+ And that might be an issue because in the
1289
+ final application you want to have real text
1290
+
1291
+ 0:35:38.443 --> 0:35:38.887
1292
+ and.
1293
+
1294
+ 0:35:40.440 --> 0:35:44.003
1295
+ And finally, even it's more important than
1296
+ it's consistent.
1297
+
1298
+ 0:35:44.965 --> 0:35:52.630
1299
+ So this is a problem where, for example, things aren't
1300
+ consistent.
1301
+
1302
+ 0:35:52.630 --> 0:35:58.762
1303
+ So 'I am' is written together as 'I'm' in the training
1304
+ data.
1305
+
1306
+ 0:35:58.762 --> 0:36:04.512
1307
+ And if it's written differently in the test data, you have a mismatch.
1308
+
1309
+ 0:36:04.824 --> 0:36:14.612
1310
+ Therefore, most important is to generate preprocessing
1311
+ and represent your data that is most consistent
1312
+
1313
+ 0:36:14.612 --> 0:36:18.413
1314
+ because it's easier to map how similar.
1315
+
1316
+ 0:36:18.758 --> 0:36:26.588
1317
+ If your text is represented very, very differently
1318
+ then your data will be badly be translated.
1319
+
1320
+ 0:36:26.666 --> 0:36:30.664
1321
+ So we once had the case.
1322
+
1323
+ 0:36:30.664 --> 0:36:40.420
1324
+ For example, there is some data who wrote
1325
+ it, but in German.
1326
+
1327
+ 0:36:40.900 --> 0:36:44.187
1328
+ And if you read it as a human you see it.
1329
+
1330
+ 0:36:44.187 --> 0:36:49.507
1331
+ It's even hard to get the difference because
1332
+ it looks very similar.
1333
+
1334
+ 0:36:50.130 --> 0:37:02.997
1335
+ If you use it for a machine translation system,
1336
+ it would not be able to translate anything
1337
+
1338
+ 0:37:02.997 --> 0:37:08.229
1339
+ of it because it's a different word.
1340
+
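The "looks identical but is a different word" problem often comes down to different character encodings of the same glyph. A small sketch of one way to guard against it, using Python's standard `unicodedata` module; the example strings are made up:

```python
import unicodedata

a = "H\u00e4user"    # precomposed "ä"
b = "Ha\u0308user"   # "a" followed by a combining diaeresis: looks the same on screen
print(a == b)                                   # False: different byte sequences
print(unicodedata.normalize("NFC", a) ==
      unicodedata.normalize("NFC", b))          # True after Unicode normalization
```

Running such a normalization over all training and test data is one cheap way to make the representation more consistent.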
1341
+ 0:37:09.990 --> 0:37:17.736
1342
+ And especially on the other hand you should
1343
+ of course not significantly change the training
1344
+
1345
+ 0:37:17.736 --> 0:37:18.968
1346
+ data thereby.
1347
+
1348
+ 0:37:18.968 --> 0:37:27.155
1349
+ For example, removing case information because
1350
+ if your task is to generate case information.
1351
+
1352
+ 0:37:31.191 --> 0:37:41.081
1353
+ One thing which is a good point to look into
1354
+ in order to see the difficulty of your data
1355
+
1356
+ 0:37:41.081 --> 0:37:42.711
1357
+ is to compare.
1358
+
1359
+ 0:37:43.103 --> 0:37:45.583
1360
+ There are types and tokens.
1361
+
1362
+ 0:37:45.583 --> 0:37:57.983
1363
+ By types we mean the number of unique words in the
1364
+ corpus, so your vocabulary; the tokens are the running words.
1365
+
1366
+ 0:37:58.298 --> 0:38:08.628
1367
+ And then you can look at the type token ratio
1368
+ that means a number of types per token.
1369
+
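A minimal sketch of the quantity just defined, in Python; the toy sentence is only for illustration:

```python
from collections import Counter

def type_token_ratio(tokens):
    """Number of distinct words (types) divided by the number of running words (tokens)."""
    counts = Counter(tokens)
    return len(counts) / len(tokens)

text = "the house and the dog and the cat".split()
print(len(set(text)), len(text), type_token_ratio(text))  # 5 types, 8 tokens, 0.625
```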
1370
+ 0:38:15.815 --> 0:38:22.381
1371
+ You have fewer types than tokens because every
1372
+ word appears at least once in the corpus, but most
1373
+
1374
+ 0:38:22.381 --> 0:38:27.081
1375
+ of them will occur more often until this number
1376
+ is bigger, so.
1377
+
1378
+ 0:38:27.667 --> 0:38:30.548
1379
+ And of course this changes if you have more
1380
+ data.
1381
+
1382
+ 0:38:31.191 --> 0:38:38.103
1383
+ Here is an example from an English Wikipedia.
1384
+
1385
+ 0:38:38.103 --> 0:38:45.015
1386
+ That means each word in average occurs times.
1387
+
1388
+ 0:38:45.425 --> 0:38:47.058
1389
+ Of course there's a big difference.
1390
+
1391
+ 0:38:47.058 --> 0:38:51.323
1392
+ There will be some words which occur one hundred
1393
+ times, but therefore most of the words occur
1394
+
1395
+ 0:38:51.323 --> 0:38:51.777
1396
+ only one.
1397
+
1398
+ 0:38:52.252 --> 0:38:55.165
1399
+ However, you see this ratio goes down.
1400
+
1401
+ 0:38:55.165 --> 0:39:01.812
1402
+ That's a good thing, so you have seen each
1403
+ word more often and therefore your model gets
1404
+
1405
+ 0:39:01.812 --> 0:39:03.156
1406
+ typically better.
1407
+
1408
+ 0:39:03.156 --> 0:39:08.683
1409
+ However, the problem is we always have a lot
1410
+ of words which we have seen.
1411
+
1412
+ 0:39:09.749 --> 0:39:15.111
1413
+ Even here there will be a bunch of words which
1414
+ you have only seen once.
1415
+
1416
+ 0:39:15.111 --> 0:39:20.472
1417
+ However, this can give you an indication about
1418
+ the quality of the data.
1419
+
1420
+ 0:39:20.472 --> 0:39:27.323
1421
+ So you should always, of course, try to achieve
1422
+ data where you have a very low type-to-token
1423
+
1424
+ 0:39:27.323 --> 0:39:28.142
1425
+ ratio.
1426
+
1427
+ 0:39:28.808 --> 0:39:39.108
1428
+ For example, if you compare Simple Wikipedia and
1429
+ normal Wikipedia, what would be your expectation?
1430
+
1431
+ 0:39:41.861 --> 0:39:49.842
1432
+ Yes, that's exactly, but however it's surprisingly
1433
+ only a little bit lower, but you see that it's
1434
+
1435
+ 0:39:49.842 --> 0:39:57.579
1436
+ lower, so we are using less words to express
1437
+ the same thing, and therefore the task to produce
1438
+
1439
+ 0:39:57.579 --> 0:39:59.941
1440
+ this text is also easier.
1441
+
1442
+ 0:40:01.221 --> 0:40:07.702
1443
+ However, as how many words are there, there
1444
+ is no clear definition.
1445
+
1446
+ 0:40:07.787 --> 0:40:19.915
1447
+ So there will be always more words, especially
1448
+ depending on your dataset, how many different
1449
+
1450
+ 0:40:19.915 --> 0:40:22.132
1451
+ words there are.
1452
+
1453
+ 0:40:22.482 --> 0:40:30.027
1454
+ So if you have million tweets where around
1455
+ fifty million tokens and you have six hundred
1456
+
1457
+ 0:40:30.027 --> 0:40:30.875
1458
+ thousand.
1459
+
1460
+ 0:40:31.251 --> 0:40:40.299
1461
+ If you have times this money teen tweeds you
1462
+ also have significantly more tokens but also.
1463
+
1464
+ 0:40:40.660 --> 0:40:58.590
1465
+ So especially in things like the social media,
1466
+ of course, there's always different types of
1467
+
1468
+ 0:40:58.590 --> 0:40:59.954
1469
+ words.
1470
+
1471
+ 0:41:00.040 --> 0:41:04.028
1472
+ Another example from not social media is here.
1473
+
1474
+ 0:41:04.264 --> 0:41:18.360
1475
+ So yeah, there is a small data set, Switchboard-like
1476
+ phone conversations, two million tokens, and
1477
+
1478
+ 0:41:18.360 --> 0:41:22.697
1479
+ only twenty thousand words.
1480
+
1481
+ 0:41:23.883 --> 0:41:37.221
1482
+ If you think about Shakespeare, it has even
1483
+ fewer tokens, significantly less than a million,
1484
+
1485
+ 0:41:37.221 --> 0:41:40.006
1486
+ but the number of.
1487
+
1488
+ 0:41:40.060 --> 0:41:48.781
1489
+ On the other hand, there is this Google N-gram
1490
+ corpus which has tokens and there is always
1491
+
1492
+ 0:41:48.781 --> 0:41:50.506
1493
+ new words coming.
1494
+
1495
+ 0:41:50.991 --> 0:41:52.841
1496
+ Is English.
1497
+
1498
+ 0:41:52.841 --> 0:42:08.103
1499
+ The nice thing about English is that the vocabulary
1500
+ is relatively small, too small, but relatively
1501
+
1502
+ 0:42:08.103 --> 0:42:09.183
1503
+ small.
1504
+
1505
+ 0:42:09.409 --> 0:42:14.224
1506
+ So here you see the Ted Corpus here.
1507
+
1508
+ 0:42:15.555 --> 0:42:18.144
1509
+ You all know TED lectures.
1510
+
1511
+ 0:42:18.144 --> 0:42:26.429
1512
+ They are transcribed, translated, not a source
1513
+ for us, especially small crocus.
1514
+
1515
+ 0:42:26.846 --> 0:42:32.702
1516
+ You can do a lot of experiments with that
1517
+ and you see that the corpus size is relatively
1518
+
1519
+ 0:42:32.702 --> 0:42:36.782
1520
+ similar so we have around four million tokens
1521
+ in this corpus.
1522
+
1523
+ 0:42:36.957 --> 0:42:44.464
1524
+ However, if you look at the vocabulary, English
1525
+ has half as many words in their different words
1526
+
1527
+ 0:42:44.464 --> 0:42:47.045
1528
+ as German and Dutch and Italian.
1529
+
1530
+ 0:42:47.527 --> 0:42:56.260
1531
+ So this is one influence from positional works
1532
+ like which are more frequent in German, the
1533
+
1534
+ 0:42:56.260 --> 0:43:02.978
1535
+ more important since we have all these different
1536
+ morphological forms.
1537
+
1538
+ 0:43:03.263 --> 0:43:08.170
1539
+ There all leads to new words and they need
1540
+ to be somewhat expressed in there.
1541
+
1542
+ 0:43:11.531 --> 0:43:20.278
1543
+ So to deal with this, the question is how
1544
+ can we normalize the text in order to make
1545
+
1546
+ 0:43:20.278 --> 0:43:22.028
1547
+ the text easier?
1548
+
1549
+ 0:43:22.028 --> 0:43:25.424
1550
+ Can we simplify the task easier?
1551
+
1552
+ 0:43:25.424 --> 0:43:29.231
1553
+ But we need to keep all information.
1554
+
1555
+ 0:43:29.409 --> 0:43:32.239
1556
+ So an example where not all information skipped.
1557
+
1558
+ 0:43:32.239 --> 0:43:35.012
1559
+ Of course you make the task easier if you
1560
+ just.
1561
+
1562
+ 0:43:35.275 --> 0:43:41.141
1563
+ You don't have to deal with different cases.
1564
+
1565
+ 0:43:41.141 --> 0:43:42.836
1566
+ It's easier.
1567
+
1568
+ 0:43:42.836 --> 0:43:52.482
1569
+ However, information gets lost and you might
1570
+ need to generate the target.
1571
+
1572
+ 0:43:52.832 --> 0:44:00.153
1573
+ So the question is always: How can we on the
1574
+ one hand simplify the task but keep all the
1575
+
1576
+ 0:44:00.153 --> 0:44:01.223
1577
+ information?
1578
+
1579
+ 0:44:01.441 --> 0:44:06.639
1580
+ Say necessary because it depends on the task.
1581
+
1582
+ 0:44:06.639 --> 0:44:11.724
1583
+ For some tasks you might find to remove the.
1584
+
1585
+ 0:44:14.194 --> 0:44:23.463
1586
+ So the steps they were typically doing are
1587
+ that you segment the words in a running
1588
+
1589
+ 0:44:23.463 --> 0:44:30.696
1590
+ text, so you can normalize word forms and segmentation
1591
+ into sentences.
1592
+
1593
+ 0:44:30.696 --> 0:44:33.955
1594
+ Also, if you have not a single.
1595
+
1596
+ 0:44:33.933 --> 0:44:38.739
1597
+ If this is not a redundancy point to segments,
1598
+ the text is also into segments.
1599
+
1600
+ 0:44:39.779 --> 0:44:52.609
1601
+ So what are we doing there for European language
1602
+ segmentation into words?
1603
+
1604
+ 0:44:52.609 --> 0:44:57.290
1605
+ It's not that complicated.
1606
+
1607
+ 0:44:57.277 --> 0:45:06.001
1608
+ You have to somehow handle the joint words
1609
+ and by handling joint words the most important.
1610
+
1611
+ 0:45:06.526 --> 0:45:11.331
1612
+ So in most systems it really doesn't matter
1613
+ much.
1614
+
1615
+ 0:45:11.331 --> 0:45:16.712
1616
+ If you write, I'm together as one word or
1617
+ as two words.
1618
+
1619
+ 0:45:17.197 --> 0:45:23.511
1620
+ The nice thing about 'I'm' is maybe that this is
1621
+ so often that it doesn't matter if you do both
1622
+
1623
+ 0:45:23.511 --> 0:45:26.560
1624
+ if they both occur often enough.
1625
+
1626
+ 0:45:26.560 --> 0:45:32.802
1627
+ But you'll have some of these cases where
1628
+ they don't occur there often, so you should
1629
+
1630
+ 0:45:32.802 --> 0:45:35.487
1631
+ have more as consistent as possible.
1632
+
1633
+ 0:45:36.796 --> 0:45:41.662
1634
+ But of course things can get more complicated.
1635
+
1636
+ 0:45:41.662 --> 0:45:48.598
1637
+ If you have 'Finland's capital', do you want to
1638
+ split the 's or not?
1639
+
1640
+ 0:45:48.598 --> 0:45:53.256
1641
+ And 'isn't': do you split it, or do you even write it out?
1642
+
1643
+ 0:45:53.433 --> 0:46:00.468
1644
+ And what about like things with hyphens in
1645
+ the middle and so on?
1646
+
1647
+ 0:46:00.540 --> 0:46:07.729
1648
+ So there is not everything is very easy, but
1649
+ is generally possible to somewhat keep as.
1650
+
1651
+ 0:46:11.791 --> 0:46:25.725
1652
+ Sometimes the most challenging and traditional
1653
+ systems were compounds, or how to deal with
1654
+
1655
+ 0:46:25.725 --> 0:46:28.481
1656
+ things like this.
1657
+
1658
+ 0:46:28.668 --> 0:46:32.154
1659
+ The nice thing is, as said, will come to the
1660
+ later.
1661
+
1662
+ 0:46:32.154 --> 0:46:34.501
1663
+ Nowadays we typically use subword.
1664
+
1665
+ 0:46:35.255 --> 0:46:42.261
1666
+ units, so we don't have to deal with this in
1667
+ the preprocessing directly, but in the subword
1668
+
1669
+ 0:46:42.261 --> 0:46:47.804
1670
+ splitting we're doing it, and then we can learn
1671
+ how to best split these.
1672
+
1673
+ 0:46:52.392 --> 0:46:56.974
1674
+ Things Get More Complicated.
1675
+
1676
+ 0:46:56.977 --> 0:46:59.934
1677
+ About non European languages.
1678
+
1679
+ 0:46:59.934 --> 0:47:08.707
1680
+ Because in non European languages, not all
1681
+ of them, there is no space between the words.
1682
+
1683
+ 0:47:09.029 --> 0:47:18.752
1684
+ Nowadays you can also download word segmentation
1685
+ models where you put in the full sentence and
1686
+
1687
+ 0:47:18.752 --> 0:47:22.744
1688
+ then it's getting splitted into parts.
1689
+
1690
+ 0:47:22.963 --> 0:47:31.814
1691
+ And then, of course, it's even that you have
1692
+ different writing systems, sometimes in Japanese.
1693
+
1694
+ 0:47:31.814 --> 0:47:40.385
1695
+ For example, they have these katakana, hiragana
1696
+ and kanji symbols in there, and you have to
1697
+
1698
+ 0:47:40.385 --> 0:47:42.435
1699
+ some idea with these.
1700
+
1701
+ 0:47:49.669 --> 0:47:54.560
1702
+ Then the next thing is we can do some
1703
+ normalization.
1704
+
1705
+ 0:47:54.874 --> 0:48:00.376
1706
+ So the idea is that you map several words
1707
+ onto the same.
1708
+
1709
+ 0:48:00.460 --> 0:48:07.877
1710
+ And that is task dependent, and the idea is
1711
+ to define something like equivalence classes so
1712
+
1713
+ 0:48:07.877 --> 0:48:15.546
1714
+ that words, which have the same meaning where
1715
+ it's not in order to have the difference, to
1716
+
1717
+ 0:48:15.546 --> 0:48:19.423
1718
+ map onto the same thing in order to make the.
1719
+
1720
+ 0:48:19.679 --> 0:48:27.023
1721
+ The most important thing there is about casing,
1722
+ and then there is something like sometimes
1723
+
1724
+ 0:48:27.023 --> 0:48:27.508
1725
+ word.
1726
+
1727
+ 0:48:28.048 --> 0:48:37.063
1728
+ For casing you can do two things, and it
1729
+ depends on the task.
1730
+
1731
+ 0:48:37.063 --> 0:48:44.769
1732
+ You can lowercase everything, maybe some exceptions.
1733
+
1734
+ 0:48:45.045 --> 0:48:47.831
1735
+ For the target side, it's
1736
+ normally not done.
1737
+
1738
+ 0:48:48.188 --> 0:48:51.020
1739
+ Why is it not done?
1740
+
1741
+ 0:48:51.020 --> 0:48:56.542
1742
+ Why should you only do it for the source side?
1743
+
1744
+ 0:48:56.542 --> 0:49:07.729
1745
+ Yes, so you have to generate correct text
1746
+ instead of lower case and uppercase.
1747
+
1748
+ 0:49:08.848 --> 0:49:16.370
1749
+ Nowadays we always do true casing on both
1750
+ sides, also on the source side, that means you
1751
+
1752
+ 0:49:16.370 --> 0:49:17.610
1753
+ keep the case.
1754
+
1755
+ 0:49:17.610 --> 0:49:24.966
1756
+ The only thing where people try to work on
1757
+ or sometimes do that is that at the beginning
1758
+
1759
+ 0:49:24.966 --> 0:49:25.628
1760
+ of the.
1761
+
1762
+ 0:49:25.825 --> 0:49:31.115
1763
+ For words like this, this is not that important
1764
+ because you will have seen otherwise a lot
1765
+
1766
+ 0:49:31.115 --> 0:49:31.696
1767
+ of times.
1768
+
1769
+ 0:49:31.696 --> 0:49:36.928
1770
+ But if you know have rare words, which you
1771
+ only have seen maybe three times, and you have
1772
+
1773
+ 0:49:36.928 --> 0:49:42.334
1774
+ only seen in the middle of the sentence, and
1775
+ now it occurs at the beginning of the sentence,
1776
+
1777
+ 0:49:42.334 --> 0:49:45.763
1778
+ which is upper case, then you don't know how
1779
+ to deal with.
1780
+
1781
+ 0:49:46.146 --> 0:49:50.983
1782
+ So then it might be good to do a true casing.
1783
+
1784
+ 0:49:50.983 --> 0:49:56.241
1785
+ That means you recase each word on the beginning.
1786
+
1787
+ 0:49:56.576 --> 0:49:59.830
1788
+ The only question, of course, is how do you
1789
+ recase it?
1790
+
1791
+ 0:49:59.830 --> 0:50:01.961
1792
+ So what case would you always know?
1793
+
1794
+ 0:50:02.162 --> 0:50:18.918
1795
+ Word of the senders, or do you have a better
1796
+ solution, especially not English, maybe German.
1797
+
1798
+ 0:50:18.918 --> 0:50:20.000
1799
+ It's.
1800
+
1801
+ 0:50:25.966 --> 0:50:36.648
1802
+ The fancy solution would be to count hope
1803
+ and decide based on this, the unfancy running
1804
+
1805
+ 0:50:36.648 --> 0:50:43.147
1806
+ would: Think it's not really good because most
1807
+ of the cane boards are lower paced.
1808
+
1809
+ 0:50:43.683 --> 0:50:53.657
1810
+ That is one idea to count and definitely better
1811
+ because as a word more often occurs upper case.
1812
+
1813
+ 0:50:53.653 --> 0:50:57.934
1814
+ Otherwise you only have a lower case at the
1815
+ beginning where you have again.
1816
+
1817
+ 0:50:58.338 --> 0:51:03.269
1818
+ Haven't gained anything, you can make it even
1819
+ a bit better when counting.
1820
+
1821
+ 0:51:03.269 --> 0:51:09.134
1822
+ You're ignoring the first position so that
1823
+ you don't count the word beginning and yeah,
1824
+
1825
+ 0:51:09.134 --> 0:51:12.999
1826
+ that's typically how it's done to do this type
1827
+ of casing.
1828
+
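A minimal sketch of the counting heuristic just described, assuming sentences are already tokenized into lists of words; skipping the sentence-initial token keeps the counts from being biased by capitalization at sentence starts:

```python
from collections import Counter, defaultdict

def learn_truecase_model(sentences):
    # Count surface forms of each word, ignoring the first token of every sentence.
    counts = defaultdict(Counter)
    for sent in sentences:
        for token in sent[1:]:
            counts[token.lower()][token] += 1
    return {w: c.most_common(1)[0][0] for w, c in counts.items()}

def truecase_first_word(sentence, model):
    # Recase only the sentence-initial token to its most frequent casing.
    first = sentence[0]
    sentence[0] = model.get(first.lower(), first)
    return sentence
```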
1829
+ 0:51:13.273 --> 0:51:23.907
1830
+ And that's the easy thing you can't even use
1831
+ like then bygram teachers who work pairs.
1832
+
1833
+ 0:51:23.907 --> 0:51:29.651
1834
+ There's very few words which occur more often.
1835
+
1836
+ 0:51:29.970 --> 0:51:33.163
1837
+ It's OK to have them boast because you can
1838
+ otherwise learn it.
1839
+
1840
+ 0:51:36.376 --> 0:51:52.305
1841
+ Another thing about these classes is to use
1842
+ word classes that were partly done, for example,
1843
+
1844
+ 0:51:52.305 --> 0:51:55.046
1845
+ and more often.
1846
+
1847
+ 0:51:55.375 --> 0:51:57.214
1848
+ Ten Thousand One Hundred Books.
1849
+
1850
+ 0:51:57.597 --> 0:52:07.397
1851
+ And then for an system that might not be important
1852
+ you can do something at number books.
1853
+
1854
+ 0:52:07.847 --> 0:52:16.450
1855
+ However, you see here already that it's not
1856
+ that easy because if you have one book you
1857
+
1858
+ 0:52:16.450 --> 0:52:19.318
1859
+ don't have to do with a pro.
1860
+
1861
+ 0:52:20.020 --> 0:52:21.669
1862
+ Always be careful.
1863
+
1864
+ 0:52:21.669 --> 0:52:28.094
1865
+ It's very fast to ignore some exceptions and
1866
+ make more things worse than better.
1867
+
1868
+ 0:52:28.488 --> 0:52:37.879
1869
+ So it's always difficult to decide when to
1870
+ do this and when to better not do it and keep
1871
+
1872
+ 0:52:37.879 --> 0:52:38.724
1873
+ things.
1874
+
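A sketch of the number-class idea, keeping the caveat above in mind: only digit sequences are replaced, so spelled-out words like "one" stay untouched. The "@num" token is just an illustrative placeholder name:

```python
import re

NUM = re.compile(r"\b\d+\b")

def normalize_numbers(text):
    # Replace digit sequences with a class token; "one" stays untouched,
    # which sidesteps the singular/plural exception mentioned above.
    return NUM.sub("@num", text)

print(normalize_numbers("He sold 10100 books and one book."))
# -> "He sold @num books and one book."
```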
1875
+ 0:52:43.483 --> 0:52:56.202
1876
+ Then the next step is sentence segmentation,
1877
+ so we are typically working on sentences.
1878
+
1879
+ 0:52:56.476 --> 0:53:11.633
1880
+ However, with dots things are a bit more complicated,
1881
+ so you may need to do a bit more.
1882
+
1883
+ 0:53:11.731 --> 0:53:20.111
1884
+ You can even have some type of classifier
1885
+ with features by then generally.
1886
+
1887
+ 0:53:20.500 --> 0:53:30.731
1888
+ Is not too complicated, so you can have different
1889
+ types of classifiers to do that, but in generally.
1890
+
1891
+ 0:53:30.650 --> 0:53:32.537
1892
+ I Didn't Know It.
1893
+
1894
+ 0:53:33.393 --> 0:53:35.583
1895
+ It's not a super complicated task.
1896
+
1897
+ 0:53:35.583 --> 0:53:39.461
1898
+ There are nowadays also a lot of libraries
1899
+ which you can use.
1900
+
1901
+ 0:53:39.699 --> 0:53:45.714
1902
+ To do that normally if you're doing the normalization
1903
+ beforehand that can be done there so you only
1904
+
1905
+ 0:53:45.714 --> 0:53:51.126
1906
+ split up the dot if it's like the sentence
1907
+ boundary and otherwise you keep it to the word
1908
+
1909
+ 0:53:51.126 --> 0:53:54.194
1910
+ so you can do that a bit jointly with the segmentation.
1911
+
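A rough sketch of such a sentence splitter in Python, assuming a toy abbreviation list; real libraries ship much longer lists and trained classifiers:

```python
import re

ABBREVIATIONS = {"dr.", "e.g.", "etc.", "prof."}   # toy list, for illustration only

def split_sentences(text):
    # Split after ., ! or ? followed by whitespace and an uppercase letter,
    # unless the dot belongs to a known abbreviation.
    sentences, start = [], 0
    for match in re.finditer(r"[.!?]\s+(?=[A-Z])", text):
        candidate = text[start:match.end()].strip()
        if candidate.split()[-1].lower() in ABBREVIATIONS:
            continue                                # dot is not a sentence boundary
        sentences.append(candidate)
        start = match.end()
    if text[start:].strip():
        sentences.append(text[start:].strip())
    return sentences

print(split_sentences("Dr. Smith arrived. He was late."))
# -> ['Dr. Smith arrived.', 'He was late.']
```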
1912
+ 0:53:54.634 --> 0:54:06.017
1913
+ It's something to think about to care because
1914
+ it's where errors happen.
1915
+
1916
+ 0:54:06.017 --> 0:54:14.712
1917
+ However, on the one end you can still do it
1918
+ very well.
1919
+
1920
+ 0:54:14.834 --> 0:54:19.740
1921
+ You will never get data which is perfectly
1922
+ clean and where everything is great.
1923
+
1924
+ 0:54:20.340 --> 0:54:31.020
1925
+ There's just too much data and it will never
1926
+ happen, so therefore it's important to be aware
1927
+
1928
+ 0:54:31.020 --> 0:54:35.269
1929
+ of that during the full development.
1930
+
1931
+ 0:54:37.237 --> 0:54:42.369
1932
+ And one last thing about the preprocessing,
1933
+ we'll get into the representation.
1934
+
1935
+ 0:54:42.369 --> 0:54:47.046
1936
+ If you're working on that, you'll get a friend
1937
+ with regular expression.
1938
+
1939
+ 0:54:47.046 --> 0:54:50.034
1940
+ That's not only how you do all this matching.
1941
+
1942
+ 0:54:50.430 --> 0:55:03.811
1943
+ And if you look into the scripts of how to
1944
+ deal with punctuation marks and stuff like
1945
+
1946
+ 0:55:03.811 --> 0:55:04.900
1947
+ that,.
1948
+
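A small example of the kind of regular-expression tokenization such scripts do; the pattern is a simplification, and how it treats contractions like "I'm" is exactly the design decision discussed earlier:

```python
import re

# Split off punctuation as separate tokens while keeping dotted abbreviations
# such as "e.g." and decimal numbers intact; a rough sketch only.
TOKEN = re.compile(r"\w+(?:\.\w+)*\.?|[^\w\s]")

def tokenize(text):
    return TOKEN.findall(text)

print(tokenize("I'm going home, aren't you?"))
# -> ['I', "'", 'm', 'going', 'home', ',', 'aren', "'", 't', 'you', '?']
```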
1949
+ 0:55:11.011 --> 0:55:19.025
1950
+ So if we have now the data of our next step
1951
+ to build, the system is to represent our words.
1952
+
1953
+ 0:55:19.639 --> 0:55:27.650
1954
+ Before we start with this, any more questions
1955
+ about preprocessing.
1956
+
1957
+ 0:55:27.650 --> 0:55:32.672
1958
+ While we work on the pure text, I'm sure.
1959
+
1960
+ 0:55:33.453 --> 0:55:40.852
1961
+ The idea is again to make things more simple
1962
+ because if you think about the production mark
1963
+
1964
+ 0:55:40.852 --> 0:55:48.252
1965
+ at the beginning of a sentence, it might be
1966
+ that you haven't seen the word or, for example,
1967
+
1968
+ 0:55:48.252 --> 0:55:49.619
1969
+ think of titles.
1970
+
1971
+ 0:55:49.619 --> 0:55:56.153
1972
+ In newspaper articles there's: So you then
1973
+ have seen the word now in the title before,
1974
+
1975
+ 0:55:56.153 --> 0:55:58.425
1976
+ and the text you have never seen.
1977
+
1978
+ 0:55:58.898 --> 0:56:03.147
1979
+ But there is always the decision.
1980
+
1981
+ 0:56:03.123 --> 0:56:09.097
1982
+ Do I gain more because I've seen things more
1983
+ often or do I lose because now I remove information
1984
+
1985
+ 0:56:09.097 --> 0:56:11.252
1986
+ which helps me to the same degree?
1987
+
1988
+ 0:56:11.571 --> 0:56:21.771
1989
+ Because if we, for example, do that in German
1990
+ and remove the case, this might be an important
1991
+
1992
+ 0:56:21.771 --> 0:56:22.531
1993
+ issue.
1994
+
1995
+ 0:56:22.842 --> 0:56:30.648
1996
+ So there is not the perfect solution, but
1997
+ generally you can get some arrows to make things
1998
+
1999
+ 0:56:30.648 --> 0:56:32.277
2000
+ look more similar.
2001
+
2002
+ 0:56:35.295 --> 0:56:43.275
2003
+ What you can do about products like the state
2004
+ of the area or the trends that are more or
2005
+
2006
+ 0:56:43.275 --> 0:56:43.813
2007
+ less.
2008
+
2009
+ 0:56:44.944 --> 0:56:50.193
2010
+ It matters even less because models get more
2011
+ powerful, so it's not that important, but be
2012
+
2013
+ 0:56:50.193 --> 0:56:51.136
2014
+ careful partly.
2015
+
2016
+ 0:56:51.136 --> 0:56:56.326
2017
+ It's also the evaluation thing because these
2018
+ things which are problematic are happening
2019
+
2020
+ 0:56:56.326 --> 0:56:57.092
2021
+ very rarely.
2022
+
2023
+ 0:56:57.092 --> 0:57:00.159
2024
+ If you take average performance, it doesn't
2025
+ matter.
2026
+
2027
+ 0:57:00.340 --> 0:57:06.715
2028
+ However, in between it's doing the stupid
2029
+ mistakes that don't count on average, but they
2030
+
2031
+ 0:57:06.715 --> 0:57:08.219
2032
+ are not really good.
2033
+
2034
+ 0:57:09.089 --> 0:57:15.118
2035
+ Done you do some type of tokenization?
2036
+
2037
+ 0:57:15.118 --> 0:57:19.911
2038
+ You can do true casing or not.
2039
+
2040
+ 0:57:19.911 --> 0:57:28.723
2041
+ Some people nowadays don't do it, but that's
2042
+ still done.
2043
+
2044
+ 0:57:28.948 --> 0:57:34.441
2045
+ Then it depends a bit on the type
2046
+ of domain.
2047
+
2048
+ 0:57:34.441 --> 0:57:37.437
2049
+ Again we have so translation.
2050
+
2051
+ 0:57:37.717 --> 0:57:46.031
2052
+ So in the text sometimes there is mark in
2053
+ the menu, later the shortcut.
2054
+
2055
+ 0:57:46.031 --> 0:57:49.957
2056
+ This letter is used for shortcut.
2057
+
2058
+ 0:57:49.957 --> 0:57:57.232
2059
+ You cannot mistake the word because it's no
2060
+ longer a file but.
2061
+
2062
+ 0:57:58.018 --> 0:58:09.037
2063
+ Then you cannot deal with it, so then it might
2064
+ make sense to remove this.
2065
+
2066
+ 0:58:12.032 --> 0:58:17.437
2067
+ Now the next step is how to match words into
2068
+ numbers.
2069
+
2070
+ 0:58:17.437 --> 0:58:22.142
2071
+ Machine learning models only deal with numbers.
2072
+
2073
+ 0:58:22.342 --> 0:58:27.091
2074
+ The first idea is to use words as our basic
2075
+ components.
2076
+
2077
+ 0:58:27.247 --> 0:58:40.695
2078
+ And then you have a large vocabulary where
2079
+ each word gets mapped to an index.
2080
+
2081
+ 0:58:40.900 --> 0:58:49.059
2082
+ So your sentence 'go home' is now a sequence of numbers, and that is
2083
+ your input.
2084
+
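A minimal word-to-index vocabulary along the lines of the "go home" example above; reserving index 0 for unknown words anticipates the problem discussed next:

```python
UNK = 0

def build_vocab(corpus_tokens):
    vocab = {"<unk>": UNK}
    for token in corpus_tokens:
        vocab.setdefault(token, len(vocab))
    return vocab

vocab = build_vocab("go home now go home".split())
sentence = [vocab.get(w, UNK) for w in "go home please".split()]
print(vocab)      # {'<unk>': 0, 'go': 1, 'home': 2, 'now': 3}
print(sentence)   # [1, 2, 0]  -- "please" was never seen, so it maps to <unk>
```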
2085
+ 0:58:52.052 --> 0:59:00.811
2086
+ So the nice thing is you have very short sequences
2087
+ so that you can deal with them.
2088
+
2089
+ 0:59:00.811 --> 0:59:01.867
2090
+ However,.
2091
+
2092
+ 0:59:01.982 --> 0:59:11.086
2093
+ So you have not really understood how words
2094
+ are processed.
2095
+
2096
+ 0:59:11.086 --> 0:59:16.951
2097
+ Why is this or can that be a problem?
2098
+
2099
+ 0:59:17.497 --> 0:59:20.741
2100
+ And there is an easy solution to deal with
2101
+ unknown words.
2102
+
2103
+ 0:59:20.741 --> 0:59:22.698
2104
+ You just have one token, which is.
2105
+
2106
+ 0:59:23.123 --> 0:59:25.906
2107
+ Every word that is rare or missing in your training
2108
+ data, you deal with this way.
2109
+
2110
+ 0:59:26.206 --> 0:59:34.938
2111
+ That's working a bit for some problems, but
2112
+ in general it's not good because you know nothing
2113
+
2114
+ 0:59:34.938 --> 0:59:35.588
2115
+ about.
2116
+
2117
+ 0:59:35.895 --> 0:59:38.770
2118
+ Can at least deal with this and maybe map
2119
+ it.
2120
+
2121
+ 0:59:38.770 --> 0:59:44.269
2122
+ So an easy solution in machine translation
2123
+ is always if it's an unknown word or we just
2124
+
2125
+ 0:59:44.269 --> 0:59:49.642
2126
+ copy it to the target side because unknown
2127
+ words are often named entities and in many
2128
+
2129
+ 0:59:49.642 --> 0:59:52.454
2130
+ languages the good solution is just to keep.
2131
+
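A toy sketch of this copy-through trick; real systems use attention-based alignment to decide which source word an unknown corresponds to, while this sketch naively assumes the same position:

```python
def copy_unknowns(source_tokens, translated_tokens):
    # Wherever the system emitted the <unk> placeholder, copy the "aligned"
    # source word instead (named entities often survive translation unchanged).
    output = []
    for i, token in enumerate(translated_tokens):
        if token == "<unk>" and i < len(source_tokens):
            output.append(source_tokens[i])
        else:
            output.append(token)
    return output
```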
2132
+ 0:59:53.013 --> 1:00:01.203
2133
+ So that is somehow a trick, trick, but yeah,
2134
+ that's of course not a good thing.
2135
+
2136
+ 1:00:01.821 --> 1:00:08.959
2137
+ It's also a problem if you deal with full
2138
+ words is that you have very few examples for
2139
+
2140
+ 1:00:08.959 --> 1:00:09.451
2141
+ some.
2142
+
2143
+ 1:00:09.949 --> 1:00:17.696
2144
+ And of course if you've seen a word once you
2145
+ can, someone may be translated, but we will
2146
+
2147
+ 1:00:17.696 --> 1:00:24.050
2148
+ learn that in your networks you represent words
2149
+ with continuous vectors.
2150
+
2151
+ 1:00:24.264 --> 1:00:26.591
2152
+ You have seen them two, three or four times.
2153
+
2154
+ 1:00:26.591 --> 1:00:31.246
2155
+ It is not really well learned, and you are
2156
+ typically doing most Arabs and words with your
2157
+
2158
+ 1:00:31.246 --> 1:00:31.763
2159
+ crow rap.
2160
+
2161
+ 1:00:33.053 --> 1:00:40.543
2162
+ And yeah, you cannot deal with things which
2163
+ are inside the word.
2164
+
2165
+ 1:00:40.543 --> 1:00:50.303
2166
+ So if you know that 'house' is index one hundred
2167
+ and twelve and you now see 'houses', you have
2168
+
2169
+ 1:00:50.303 --> 1:00:51.324
2170
+ no idea.
2171
+
2172
+ 1:00:51.931 --> 1:00:55.533
2173
+ Of course, not really convenient, so humans
2174
+ are better.
2175
+
2176
+ 1:00:55.533 --> 1:00:58.042
2177
+ They can use the internal information.
2178
+
2179
+ 1:00:58.498 --> 1:01:04.080
2180
+ So if we have houses you'll know that it's
2181
+ like the plural form of house.
2182
+
2183
+ 1:01:05.285 --> 1:01:16.829
2184
+ And for the ones who weren't in advance, ay,
2185
+ you have this night worth here and guess.
2186
+
2187
+ 1:01:16.716 --> 1:01:20.454
2188
+ Don't know the meaning of these words.
2189
+
2190
+ 1:01:20.454 --> 1:01:25.821
2191
+ However, all of you will know is the fear
2192
+ of something.
2193
+
2194
+ 1:01:26.686 --> 1:01:39.437
2195
+ From the ending, the phobia phobia is always
2196
+ the fear of something, but you don't know how.
2197
+
2198
+ 1:01:39.879 --> 1:01:46.618
2199
+ So we can split words into some parts that
2200
+ is helpful to deal with.
2201
+
2202
+ 1:01:46.618 --> 1:01:49.888
2203
+ This, for example, is a fear of.
2204
+
2205
+ 1:01:50.450 --> 1:02:04.022
2206
+ It's not very important, it doesn't happen
2207
+ very often, but yeah, it's also not important
2208
+
2209
+ 1:02:04.022 --> 1:02:10.374
2210
+ for understanding that you know everything.
2211
+
2212
+ 1:02:15.115 --> 1:02:18.791
2213
+ So what can we do instead?
2214
+
2215
+ 1:02:18.791 --> 1:02:29.685
2216
+ One thing which we could do instead is to
2217
+ represent words by the other extreme.
2218
+
2219
+ 1:02:29.949 --> 1:02:42.900
2220
+ So you really do like if you have a person's
2221
+ eye and a and age, then you need a space symbol.
2222
+
2223
+ 1:02:43.203 --> 1:02:55.875
2224
+ So you have now a representation for each
2225
+ character that enables you to implicitly learn
2226
+
2227
+ 1:02:55.875 --> 1:03:01.143
2228
+ morphology because words which have.
2229
+
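A minimal sketch of such a character-level representation; the "▁" space marker is one common convention, used here only for illustration:

```python
def to_characters(sentence):
    # Character-level representation: every letter is a symbol, plus an
    # explicit space symbol so word boundaries are not lost.
    return ["▁" if ch == " " else ch for ch in sentence]

print(to_characters("go home"))
# -> ['g', 'o', '▁', 'h', 'o', 'm', 'e']
```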
2230
+ 1:03:01.541 --> 1:03:05.517
2231
+ And you can then deal with unknown words.
2232
+
2233
+ 1:03:05.517 --> 1:03:10.344
2234
+ There's still not everything you can process,
2235
+ but.
2236
+
2237
+ 1:03:11.851 --> 1:03:16.953
2238
+ So if you would go on character level what might
2239
+ still be a problem?
2240
+
2241
+ 1:03:18.598 --> 1:03:24.007
2242
+ So all characters which you haven't seen,
2243
+ but that's nowadays a little bit more often
2244
+
2245
+ 1:03:24.007 --> 1:03:25.140
2246
+ with new emojis.
2247
+
2248
+ 1:03:25.140 --> 1:03:26.020
2249
+ You couldn't.
2250
+
2251
+ 1:03:26.020 --> 1:03:31.366
2252
+ It could also be that you have translated
2253
+ from Germany and German, and then there is
2254
+
2255
+ 1:03:31.366 --> 1:03:35.077
2256
+ a Japanese character or Chinese that you cannot
2257
+ translate.
2258
+
2259
+ 1:03:35.435 --> 1:03:43.938
2260
+ But most of the time all characters that occur
2261
+ have been seen, so that somewhat works very well.
2262
+
2263
+ 1:03:44.464 --> 1:03:58.681
2264
+ This is first a nice thing, so you have a
2265
+ very small vocabulary size, so one big part
2266
+
2267
+ 1:03:58.681 --> 1:04:01.987
2268
+ of the calculation.
2269
+
2270
+ 1:04:02.222 --> 1:04:11.960
2271
+ Neural networks is the calculation of the
2272
+ vocabulary size, so if you are efficient there
2273
+
2274
+ 1:04:11.960 --> 1:04:13.382
2275
+ it's better.
2276
+
2277
+ 1:04:14.914 --> 1:04:26.998
2278
+ On the other hand, the problem is you have
2279
+ now very long sequences, so if you think about
2280
+
2281
+ 1:04:26.998 --> 1:04:29.985
2282
+ this before you have.
2283
+
2284
+ 1:04:30.410 --> 1:04:43.535
2285
+ Your computation often depends on your input
2286
+ size, and not only linearly but quadratically or
2287
+
2288
+ 1:04:43.535 --> 1:04:44.410
2289
+ more.
2290
+
2291
+ 1:04:44.504 --> 1:04:49.832
2292
+ And of course it might also be that you just
2293
+ generally make things more complicated than
2294
+
2295
+ 1:04:49.832 --> 1:04:50.910
2296
+ they were before.
2297
+
2298
+ 1:04:50.951 --> 1:04:58.679
2299
+ We said before make things easy, but now if
2300
+ we really have to analyze each character independently,
2301
+
2302
+ 1:04:58.679 --> 1:05:05.003
2303
+ we cannot directly learn that university is
2304
+ the same, but we have to learn that.
2305
+
2306
+ 1:05:05.185 --> 1:05:12.179
2307
+ Is beginning and then there is an I and then
2308
+ there is an E and then all this together means
2309
+
2310
+ 1:05:12.179 --> 1:05:17.273
2311
+ university but another combination of these
2312
+ letters is a completely different word.
2313
+
2314
+ 1:05:17.677 --> 1:05:24.135
2315
+ So of course you make everything here a lot
2316
+ more complicated than you have on word basis.
2317
+
2318
+ 1:05:24.744 --> 1:05:32.543
2319
+ Character based models work very well in conditions
2320
+ with few data because you have seen the words
2321
+
2322
+ 1:05:32.543 --> 1:05:33.578
2323
+ very rarely.
2324
+
2325
+ 1:05:33.578 --> 1:05:38.751
2326
+ That's not good for learning, but you have seen all
2327
+ letters more often.
2328
+
2329
+ 1:05:38.751 --> 1:05:44.083
2330
+ So if you have scenarios with very few data
2331
+ this is like one good option.
2332
+
2333
+ 1:05:46.446 --> 1:05:59.668
2334
+ The other idea is to split now not doing the
2335
+ extreme, so either taking full words or taking
2336
+
2337
+ 1:05:59.668 --> 1:06:06.573
2338
+ only characters, but doing something in between.
2339
+
2340
+ 1:06:07.327 --> 1:06:12.909
2341
+ And one of these ideas has been done for a
2342
+ long time.
2343
+
2344
+ 1:06:12.909 --> 1:06:17.560
2345
+ It's called compound splitting, but we only.
2346
+
2347
+ 1:06:17.477 --> 1:06:18.424
2348
+ Bounce them.
2349
+
2350
+ 1:06:18.424 --> 1:06:24.831
2351
+ You see that 'Baum' and 'Stamm' occur very often,
2352
+ then maybe more often than 'Baumstamm'.
2353
+
2354
+ 1:06:24.831 --> 1:06:28.180
2355
+ Then you split it into 'Baum' and 'Stamm' and you use
2356
+ it.
2357
+
2358
+ 1:06:29.509 --> 1:06:44.165
2359
+ But it's even not so easy it will learn wrong
2360
+ splits so we did that in all the systems and
2361
+
2362
+ 1:06:44.165 --> 1:06:47.708
2363
+ there is a word Asia.
2364
+
2365
+ 1:06:48.288 --> 1:06:56.137
2366
+ And the business, of course, is not a really
2367
+ good way of dealing it because it is non-semantic.
2368
+
2369
+ 1:06:56.676 --> 1:07:05.869
2370
+ The good thing is we didn't really care that
2371
+ much about it because the system wasn't learned
2372
+
2373
+ 1:07:05.869 --> 1:07:09.428
2374
+ if you have Asia and Tish together.
2375
+
2376
+ 1:07:09.729 --> 1:07:17.452
2377
+ So you can of course learn all that the compound
2378
+ split doesn't really help you to get a deeper
2379
+
2380
+ 1:07:17.452 --> 1:07:18.658
2381
+ understanding.
2382
+
2383
+ 1:07:21.661 --> 1:07:23.364
2384
+ The Thing of Course.
2385
+
2386
+ 1:07:23.943 --> 1:07:30.475
2387
+ Yeah, there was one paper where this doesn't
2388
+ work like they report, but it's called Burning
2389
+
2390
+ 1:07:30.475 --> 1:07:30.972
2391
+ Ducks.
2392
+
2393
+ 1:07:30.972 --> 1:07:37.503
2394
+ I think because it was like if you had German
2395
+ NS Branter, you could split it in NS Branter,
2396
+
2397
+ 1:07:37.503 --> 1:07:43.254
2398
+ and sometimes you have to add an E to make
2399
+ the compounds that was Enter Branter.
2400
+
2401
+ 1:07:43.583 --> 1:07:48.515
2402
+ So it translated Esperanto into burning duck.
2403
+
2404
+ 1:07:48.888 --> 1:07:56.127
2405
+ So of course you can introduce there some
2406
+ type of additional errors, but in general
2407
+
2408
+ 1:07:56.127 --> 1:07:57.221
2409
+ it's a good.
2410
+
2411
+ 1:07:57.617 --> 1:08:03.306
2412
+ Of course there is a trade off between vocabulary
2413
+ size so you want to have a lower vocabulary
2414
+
2415
+ 1:08:03.306 --> 1:08:08.812
2416
+ size so you've seen everything more often but
2417
+ the length of the sequence should not be too
2418
+
2419
+ 1:08:08.812 --> 1:08:13.654
2420
+ long because if you split more often you get
2421
+ fewer different types but you have longer sequences.
2422
+
2423
+ 1:08:16.896 --> 1:08:25.281
2424
+ The motivation of the advantage of compared
2425
+ to Character based models is that you can directly
2426
+
2427
+ 1:08:25.281 --> 1:08:33.489
2428
+ learn the representation for words that occur
2429
+ very often while still being able to represent
2430
+
2431
+ 1:08:33.489 --> 1:08:35.783
2432
+ words that are rare by splitting them into parts.
2433
+
2434
+ 1:08:36.176 --> 1:08:42.973
2435
+ And while first this was only done for compounds,
2436
+ nowadays there's an algorithm which really
2437
+
2438
+ 1:08:42.973 --> 1:08:49.405
2439
+ tries to do it on everything and there are
2440
+ different ways, to be honest: compound splitting
2441
+
2442
+ 1:08:49.405 --> 1:08:50.209
2443
+ and so on.
2444
+
2445
+ 1:08:50.209 --> 1:08:56.129
2446
+ But the most successful one which is commonly
2447
+ used is based on data compression.
2448
+
2449
+ 1:08:56.476 --> 1:08:59.246
2450
+ And there the idea is okay.
2451
+
2452
+ 1:08:59.246 --> 1:09:06.765
2453
+ Can we find an encoding so that parts are
2454
+ compressed in the most efficient?
2455
+
2456
+ 1:09:07.027 --> 1:09:22.917
2457
+ And the compression algorithm is called the
2458
+ byte pair encoding, and this is also then used
2459
+
2460
+ 1:09:22.917 --> 1:09:25.625
2461
+ for splitting.
2462
+
2463
+ 1:09:26.346 --> 1:09:39.164
2464
+ And the idea is we recursively represent the
2465
+ most frequent pair of bytes by a new byte.
2466
+
2467
+ 1:09:39.819 --> 1:09:51.926
2468
+ For language, you now split first all your
2469
+ words into letters, and then you look at what
2470
+
2471
+ 1:09:51.926 --> 1:09:59.593
2472
+ is the most frequent bigrams of which two letters
2473
+ occur.
2474
+
2475
+ 1:10:00.040 --> 1:10:04.896
2476
+ And then you replace and repeat until you
2477
+ have a fixed vocabulary.
2478
+
2479
+ 1:10:04.985 --> 1:10:08.031
2480
+ So that's a nice thing.
2481
+
2482
+ 1:10:08.031 --> 1:10:16.663
2483
+ Now you can predefine your vocabulary as want
2484
+ to represent my text.
2485
+
2486
+ 1:10:16.936 --> 1:10:28.486
2487
+ And then you can represent any text
2488
+ with these symbols, and of course the shorter
2489
+
2490
+ 1:10:28.486 --> 1:10:30.517
2491
+ your text will be.
2492
+
2493
+ 1:10:32.772 --> 1:10:36.543
2494
+ So the original idea was something like that.
2495
+
2496
+ 1:10:36.543 --> 1:10:39.411
2497
+ We have to sequence A, B, A, B, C.
2498
+
2499
+ 1:10:39.411 --> 1:10:45.149
2500
+ For example, a common biogram is A, B, so
2501
+ you can face A, B, B, I, D.
2502
+
2503
+ 1:10:45.149 --> 1:10:46.788
2504
+ Then the text gets.
2505
+
2506
+ 1:10:48.108 --> 1:10:53.615
2507
+ Then you can make to and then you have eating
2508
+ beet and so on, so this is then your text.
2509
+
2510
+ 1:10:54.514 --> 1:11:00.691
2511
+ Similarly, we can do it now for tanking.
2512
+
2513
+ 1:11:01.761 --> 1:11:05.436
2514
+ Let's assume you have these sentences.
2515
+
2516
+ 1:11:05.436 --> 1:11:11.185
2517
+ I go, he goes, she goes, so your vocabulary
2518
+ is go, goes, he.
2519
+
2520
+ 1:11:11.851 --> 1:11:30.849
2521
+ And the first thing you're doing is split
2522
+ your corpus into single characters.
2523
+
2524
+ 1:11:30.810 --> 1:11:34.692
2525
+ So thereby you can split words again like
2526
+ split senses into words.
2527
+
2528
+ 1:11:34.692 --> 1:11:38.980
2529
+ Because now you only have characters, you
2530
+ don't know the word boundaries.
2531
+
2532
+ 1:11:38.980 --> 1:11:44.194
2533
+ You introduce the word boundaries by having
2534
+ a special symbol at the end of each word, and
2535
+
2536
+ 1:11:44.194 --> 1:11:46.222
2537
+ then you know this symbol happens.
2538
+
2539
+ 1:11:46.222 --> 1:11:48.366
2540
+ I can split it and have it in a new.
2541
+
2542
+ 1:11:48.708 --> 1:11:55.245
2543
+ So you have the corpus I go, he goes, and
2544
+ she goes, and then you have now here the sequences
2545
+
2546
+ 1:11:55.245 --> 1:11:56.229
2547
+ of Character.
2548
+
2549
+ 1:11:56.229 --> 1:12:02.625
2550
+ So then the Character based per presentation,
2551
+ and now you calculate the bigram statistics.
2552
+
2553
+ 1:12:02.625 --> 1:12:08.458
2554
+ So I and the end-of-word occur one time, G
2555
+ and O occur together three times, and so on.
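To make the counting step just described concrete, here is a minimal Python sketch (an illustration, not the lecturer's code) that splits the toy corpus "I go, he goes, she goes" into characters with an end-of-word marker and counts the bigram statistics; the marker symbol "</w>" is only an assumed choice.

```python
from collections import Counter

# Toy corpus from the lecture: "I go", "he goes", "she goes"
corpus = ["i", "go", "he", "goes", "she", "goes"]

# Split every word into characters and append an end-of-word marker
# ("</w>" is an assumed symbol; any unused marker works).
words = [list(w) + ["</w>"] for w in corpus]

# Count how often each adjacent pair of symbols (bigram) occurs.
pair_counts = Counter()
for symbols in words:
    for pair in zip(symbols, symbols[1:]):
        pair_counts[pair] += 1

print(pair_counts.most_common(3))  # ('g', 'o') occurs three times here
```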
2556
+
2557
+ 1:12:09.189 --> 1:12:18.732
2558
+ And these are all the others, and now you
2559
+ look, which is the most common happening.
2560
+
2561
+ 1:12:19.119 --> 1:12:26.046
2562
+ So then you have known the rules.
2563
+
2564
+ 1:12:26.046 --> 1:12:39.235
2565
+ If and have them together you have these new
2566
+ words: Now is no longer two symbols, but it's
2567
+
2568
+ 1:12:39.235 --> 1:12:41.738
2569
+ one single symbol because if you join that.
2570
+
2571
+ 1:12:42.402 --> 1:12:51.175
2572
+ And then you have here now the new number
2573
+ of biceps, steel and wood, and and so on.
2574
+
2575
+ 1:12:52.092 --> 1:13:01.753
2576
+ In small examples now you have a lot of rules
2577
+ which occur the same number of times.
2578
+
2579
+ 1:13:01.753 --> 1:13:09.561
2580
+ In reality that is happening sometimes but
2581
+ not that often.
2582
+
2583
+ 1:13:10.370 --> 1:13:21.240
2584
+ You add the end of words to him, and so this
2585
+ way you go on until you have your vocabulary.
2586
+
2587
+ 1:13:21.601 --> 1:13:38.242
2588
+ And your vocabulary is in these rules, so
2589
+ people speak about the vocabulary of the rules.
2590
+
2591
+ 1:13:38.658 --> 1:13:43.637
2592
+ And these are the rules, and if you now have
2593
+ a different sentence, something like they tell.
2594
+
2595
+ 1:13:44.184 --> 1:13:53.600
2596
+ Then your final output looks like something
2597
+ like that.
2598
+
2599
+ 1:13:53.600 --> 1:13:59.250
2600
+ These two words represent by by.
2601
+
2602
+ 1:14:00.940 --> 1:14:06.398
2603
+ And that is your algorithm.
2604
+
2605
+ 1:14:06.398 --> 1:14:18.873
2606
+ Now you can represent any type of text with
2607
+ a fixed vocabulary.
2608
+
2609
+ 1:14:20.400 --> 1:14:23.593
2610
+ So think that's defined in the beginning.
2611
+
2612
+ 1:14:23.593 --> 1:14:27.243
2613
+ Fill how many egos have won and that has spent.
2614
+
2615
+ 1:14:28.408 --> 1:14:35.253
2616
+ It's nearly correct that it writes a number
2617
+ of characters.
2618
+
2619
+ 1:14:35.253 --> 1:14:38.734
2620
+ It can be that in additional.
2621
+
2622
+ 1:14:38.878 --> 1:14:49.162
2623
+ So on the one end all three of the right side
2624
+ of the rules can occur, and then additionally
2625
+
2626
+ 1:14:49.162 --> 1:14:49.721
2627
+ all.
2628
+
2629
+ 1:14:49.809 --> 1:14:55.851
2630
+ In reality it can even happen that
2631
+ your vocabulary is smaller, because it might
2632
+
2633
+ 1:14:55.851 --> 1:15:01.960
2634
+ happen that like for example go never occurs
2635
+ singular at the end but you always like merge
2636
+
2637
+ 1:15:01.960 --> 1:15:06.793
2638
+ all occurrences so there are not all right
2639
+ sides really happen because.
2640
+
2641
+ 1:15:06.746 --> 1:15:11.269
2642
+ This rule is never only applied, but afterwards
2643
+ another rule is also applied.
2644
+
2645
+ 1:15:11.531 --> 1:15:15.621
2646
+ So it's more an upper bound on your vocabulary
2647
+ size than the exact size.
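As a rough sketch of the whole merge-learning loop described above (assuming the usual byte pair encoding recipe, not any particular toolkit), the following repeats the pair counting and merging until a chosen number of merge rules has been learned; `num_merges` is a stand-in for the predefined vocabulary budget.

```python
from collections import Counter

def pair_counts(words):
    # words maps a tuple of symbols to its corpus frequency
    counts = Counter()
    for symbols, freq in words.items():
        for pair in zip(symbols, symbols[1:]):
            counts[pair] += freq
    return counts

def learn_bpe(corpus_words, num_merges):
    # Start from characters plus an end-of-word marker.
    words = Counter(tuple(w) + ("</w>",) for w in corpus_words)
    rules = []
    for _ in range(num_merges):
        counts = pair_counts(words)
        if not counts:
            break
        best = max(counts, key=counts.get)      # most frequent pair
        rules.append(best)
        merged = Counter()
        for symbols, freq in words.items():     # apply the merge everywhere
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    out.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            merged[tuple(out)] += freq
        words = merged
    return rules

print(learn_bpe(["i", "go", "he", "goes", "she", "goes"], num_merges=5))
# The first learned rule merges ('g', 'o'), as in the lecture example.
```

Applying the rules to new text then just means splitting it into characters and replaying the merges in the same order.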
2648
+
2649
+ 1:15:20.480 --> 1:15:29.014
2650
+ Then we come to the last part, which is about
2651
+ parallel data, but we have some questions beforehand.
2652
+
2653
+ 1:15:36.436 --> 1:15:38.824
2654
+ So what is parallel data?
2655
+
2656
+ 1:15:38.824 --> 1:15:47.368
2657
+ So as we said, for machine translation it is really,
2658
+ really important that we are dealing with parallel
2659
+
2660
+ 1:15:47.368 --> 1:15:52.054
2661
+ data, that means we have aligned input and
2662
+ output.
2663
+
2664
+ 1:15:52.054 --> 1:15:54.626
2665
+ You have this type of data.
2666
+
2667
+ 1:15:55.015 --> 1:16:01.773
2668
+ However, in machine translation we have one
2669
+ very big advantage that is somewhat naturally
2670
+
2671
+ 1:16:01.773 --> 1:16:07.255
2672
+ occurring, so you have a lot of parallel data
2673
+ which you can summar gaps.
2674
+
2675
+ 1:16:07.255 --> 1:16:13.788
2676
+ In many NLP tasks you need to manually annotate
2677
+ your data and generate the aligned data.
2678
+
2679
+ 1:16:14.414 --> 1:16:22.540
2680
+ We have to manually create translations, and
2681
+ of course that is very expensive, but it's
2682
+
2683
+ 1:16:22.540 --> 1:16:29.281
2684
+ really expensive to pay for like one million
2685
+ sentences to be translated.
2686
+
2687
+ 1:16:29.889 --> 1:16:36.952
2688
+ The nice thing is that in there is data normally
2689
+ available because other people have done machine
2690
+
2691
+ 1:16:36.952 --> 1:16:37.889
2692
+ translation.
2693
+
2694
+ 1:16:40.120 --> 1:16:44.672
2695
+ So there is this data and of course process
2696
+ it.
2697
+
2698
+ 1:16:44.672 --> 1:16:51.406
2699
+ We'll have a full lecture on how to deal with
2700
+ more complex situations.
2701
+
2702
+ 1:16:52.032 --> 1:16:56.645
2703
+ The idea is really you don't do really much
2704
+ human work.
2705
+
2706
+ 1:16:56.645 --> 1:17:02.825
2707
+ You really just start the caller with some
2708
+ initials, start pages and then.
2709
+
2710
+ 1:17:03.203 --> 1:17:07.953
2711
+ But a lot of high-quality parallel data is really
2712
+ targeted on some scenarios.
2713
+
2714
+ 1:17:07.953 --> 1:17:13.987
2715
+ So, for example, think of the European Parliament
2716
+ as one website where you can easily extract
2717
+
2718
+ 1:17:13.987 --> 1:17:17.581
2719
+ these information from and there you have a
2720
+ large data.
2721
+
2722
+ 1:17:17.937 --> 1:17:22.500
2723
+ Or like we have the TED data, which is also
2724
+ you can get from the TED website.
2725
+
2726
+ 1:17:23.783 --> 1:17:33.555
2727
+ So in general a parallel corpus is a collection
2728
+ of texts with translations into one or several languages.
2729
+
2730
+ 1:17:34.134 --> 1:17:42.269
2731
+ And this data is important because there is
2732
+ no general empty normally, but you work secured.
2733
+
2734
+ 1:17:42.222 --> 1:17:46.732
2735
+ It works especially good if your training
2736
+ and test conditions are similar.
2737
+
2738
+ 1:17:46.732 --> 1:17:50.460
2739
+ So if the topic is similar, the style of modality
2740
+ is similar.
2741
+
2742
+ 1:17:50.460 --> 1:17:55.391
2743
+ So if you want to translate speech, it's often
2744
+ better to train all to own speech.
2745
+
2746
+ 1:17:55.391 --> 1:17:58.818
2747
+ If you want to translate text, it's better
2748
+ to translate.
2749
+
2750
+ 1:17:59.379 --> 1:18:08.457
2751
+ And there is a lot of these data available
2752
+ nowadays for common languages.
2753
+
2754
+ 1:18:08.457 --> 1:18:12.014
2755
+ You normally can start with.
2756
+
2757
+ 1:18:12.252 --> 1:18:15.298
2758
+ It's really available.
2759
+
2760
+ 1:18:15.298 --> 1:18:27.350
2761
+ For example, Opus is a big website collecting
2762
+ different types of parallel corpus where you
2763
+
2764
+ 1:18:27.350 --> 1:18:29.601
2765
+ can select them.
2766
+
2767
+ 1:18:29.529 --> 1:18:33.276
2768
+ You have this document alignment will come
2769
+ to that later.
2770
+
2771
+ 1:18:33.553 --> 1:18:39.248
2772
+ There are things like comparable data where
2773
+ you have not full sentences but only some parts
2774
+
2775
+ 1:18:39.248 --> 1:18:40.062
2776
+ of parallel.
2777
+
2778
+ 1:18:40.220 --> 1:18:48.700
2779
+ But now first let's assume we have easy tasks
2780
+ like European Parliament where we have the speech
2781
+
2782
+ 1:18:48.700 --> 1:18:55.485
2783
+ in German and the speech in English and you
2784
+ need to generate parallel data.
2785
+
2786
+ 1:18:55.485 --> 1:18:59.949
2787
+ That means you have to align the source sentences.
2788
+
2789
+ 1:19:00.000 --> 1:19:01.573
2790
+ And doing this right.
2791
+
2792
+ 1:19:05.905 --> 1:19:08.435
2793
+ How can we do that?
2794
+
2795
+ 1:19:08.435 --> 1:19:19.315
2796
+ And that is what people refer to as sentence
2797
+ alignment, so we have parallel documents in
2798
+
2799
+ 1:19:19.315 --> 1:19:20.707
2800
+ languages.
2801
+
2802
+ 1:19:22.602 --> 1:19:32.076
2803
+ This is so you cannot normally do that word
2804
+ by word because there is no direct correlation
2805
+
2806
+ 1:19:32.076 --> 1:19:34.158
2807
+ between, but it is.
2808
+
2809
+ 1:19:34.074 --> 1:19:39.837
2810
+ Relatively possible to do it on sentence level,
2811
+ it will not be perfect, so you sometimes have
2812
+
2813
+ 1:19:39.837 --> 1:19:42.535
2814
+ two sentences in English and one in German.
2815
+
2816
+ 1:19:42.535 --> 1:19:47.992
2817
+ German like to have these long sentences with
2818
+ sub clauses and so on, so there you can do
2819
+
2820
+ 1:19:47.992 --> 1:19:51.733
2821
+ it, but with long sentences it might not be
2822
+ really possible.
2823
+
2824
+ 1:19:55.015 --> 1:19:59.454
2825
+ And for some we saw that sentence markers aren't
2826
+ there, so it's more complicated.
2827
+
2828
+ 1:19:59.819 --> 1:20:10.090
2829
+ So how can we formalize this sentence alignment
2830
+ problem?
2831
+
2832
+ 1:20:10.090 --> 1:20:16.756
2833
+ So we have a set of source sentences.
2834
+
2835
+ 1:20:17.377 --> 1:20:22.167
2836
+ And machine translation relatively often.
2837
+
2838
+ 1:20:22.167 --> 1:20:32.317
2839
+ Sometimes source sentences nowadays are and,
2840
+ but traditionally it was and because people
2841
+
2842
+ 1:20:32.317 --> 1:20:34.027
2843
+ started using.
2844
+
2845
+ 1:20:34.594 --> 1:20:45.625
2846
+ And then the idea is to find this alignment
2847
+ where we have alignment.
2848
+
2849
+ 1:20:46.306 --> 1:20:50.421
2850
+ And of course you want these sequences to
2851
+ be as short as possible.
2852
+
2853
+ 1:20:50.421 --> 1:20:56.400
2854
+ Of course an easy solution is here all my
2855
+ source sentences and here all my target sentences.
2856
+
2857
+ 1:20:56.756 --> 1:21:07.558
2858
+ So want to have short sequences there, typically
2859
+ one sentence or maximum two or three sentences,
2860
+
2861
+ 1:21:07.558 --> 1:21:09.340
2862
+ so that really.
2863
+
2864
+ 1:21:13.913 --> 1:21:21.479
2865
+ Then there is different ways of restriction
2866
+ to this type of alignment, so first of all
2867
+
2868
+ 1:21:21.479 --> 1:21:29.131
2869
+ it should be a monotone alignment, so that
2870
+ means that each segment on the source should
2871
+
2872
+ 1:21:29.131 --> 1:21:31.218
2873
+ start after each other.
2874
+
2875
+ 1:21:31.431 --> 1:21:36.428
2876
+ So we assume that in document there's really
2877
+ a monotone and it's going the same way in source.
2878
+
2879
+ 1:21:36.957 --> 1:21:41.965
2880
+ Course for a very free translation that might
2881
+ not be valid anymore.
2882
+
2883
+ 1:21:41.965 --> 1:21:49.331
2884
+ But this algorithm, the first one, the Church
2885
+ and Gale algorithm, is more for translations
2886
+
2887
+ 1:21:49.331 --> 1:21:51.025
2888
+ which are very direct.
2889
+
2890
+ 1:21:51.025 --> 1:21:54.708
2891
+ So each segment should be like coming after
2892
+ each.
2893
+
2894
+ 1:21:55.115 --> 1:22:04.117
2895
+ Then we want to translate the full sequence,
2896
+ and of course each segment should start before
2897
+
2898
+ 1:22:04.117 --> 1:22:04.802
2899
+ it is.
2900
+
2901
+ 1:22:05.525 --> 1:22:22.654
2902
+ And then you want to have something like that,
2903
+ but you have to alignments or alignments.
2904
+
2905
+ 1:22:25.525 --> 1:22:41.851
2906
+ The alignment types are: You then, of course,
2907
+ sometimes have insertions and deletions where there
2908
+
2909
+ 1:22:41.851 --> 1:22:43.858
2910
+ is some information added.
2911
+
2912
+ 1:22:44.224 --> 1:22:50.412
2913
+ Hand be, for example, explanation, so it can
2914
+ be that some term is known in the one language
2915
+
2916
+ 1:22:50.412 --> 1:22:51.018
2917
+ but not.
2918
+
2919
+ 1:22:51.111 --> 1:22:53.724
2920
+ Think of things like Deutschland ticket.
2921
+
2922
+ 1:22:53.724 --> 1:22:58.187
2923
+ In Germany everybody will by now know what
2924
+ the Deutschland ticket is.
2925
+
2926
+ 1:22:58.187 --> 1:23:03.797
2927
+ But if you translate it to English it might
2928
+ be important to explain it and other things
2929
+
2930
+ 1:23:03.797 --> 1:23:04.116
2931
+ are.
2932
+
2933
+ 1:23:04.116 --> 1:23:09.853
2934
+ So sometimes you have to explain things and
2935
+ then you have more sentences with insertions.
2936
+
2937
+ 1:23:10.410 --> 1:23:15.956
2938
+ Then you have two to one and one to two alignment,
2939
+ and that is, for example, in Germany you have
2940
+
2941
+ 1:23:15.956 --> 1:23:19.616
2942
+ a lot of sub-clauses that are then expressed
2943
+ by two sentences.
2944
+
2945
+ 1:23:20.580 --> 1:23:37.725
2946
+ Of course, it might be more complex, but typically
2947
+ to make it simple and only allow for this type
2948
+
2949
+ 1:23:37.725 --> 1:23:40.174
2950
+ of alignment.
2951
+
2952
+ 1:23:41.301 --> 1:23:56.588
2953
+ Then it is about finding the alignment and
2954
+ that is, we try to score where we just take
2955
+
2956
+ 1:23:56.588 --> 1:23:59.575
2957
+ a general score.
2958
+
2959
+ 1:24:00.000 --> 1:24:04.011
2960
+ That is true like gala algorithms and the
2961
+ matching of one segment.
2962
+
2963
+ 1:24:04.011 --> 1:24:09.279
2964
+ If you have one segment now so this is one
2965
+ of the global things so the global alignment
2966
+
2967
+ 1:24:09.279 --> 1:24:13.828
2968
+ is as good as the product of all single steps
2969
+ and then you have two scores.
2970
+
2971
+ 1:24:13.828 --> 1:24:18.558
2972
+ First of all you say one to one alignments
2973
+ are much better than all the others.
2974
+
2975
+ 1:24:19.059 --> 1:24:26.884
2976
+ And then you have a lexical similarity, which
2977
+ is, for example, based on an initial dictionary
2978
+
2979
+ 1:24:26.884 --> 1:24:30.713
2980
+ which counts how many dictionary entries are.
2981
+
2982
+ 1:24:31.091 --> 1:24:35.407
2983
+ So this is a very simple algorithm.
2984
+
2985
+ 1:24:35.407 --> 1:24:41.881
2986
+ Typically violates like your first step and
2987
+ you want.
2988
+
2989
+ 1:24:43.303 --> 1:24:54.454
2990
+ And that is like with this one you can get
2991
+ an initial one you can have better parallel
2992
+
2993
+ 1:24:54.454 --> 1:24:55.223
2994
+ data.
2995
+
2996
+ 1:24:55.675 --> 1:25:02.369
2997
+ No, it is an optimization problem and you
2998
+ are now based on the scores you can calculate
2999
+
3000
+ 1:25:02.369 --> 1:25:07.541
3001
+ for each possible alignment and score and then
3002
+ select the best one.
3003
+
3004
+ 1:25:07.541 --> 1:25:14.386
3005
+ Of course, you won't try all possibilities
3006
+ out but you can do a good search and then find
3007
+
3008
+ 1:25:14.386 --> 1:25:15.451
3009
+ the best one.
3010
+
3011
+ 1:25:15.815 --> 1:25:18.726
3012
+ Can typically be automatically.
3013
+
3014
+ 1:25:18.726 --> 1:25:25.456
3015
+ Of course, you should do some checks like
3016
+ aligning sentences as possible.
3017
+
3018
+ 1:25:26.766 --> 1:25:32.043
3019
+ A bill like typically for training data is
3020
+ done this way.
3021
+
3022
+ 1:25:32.043 --> 1:25:35.045
3023
+ Maybe if you have test data you.
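A minimal sketch of how such a monotone sentence alignment can be searched with dynamic programming is given below; the `score` function is a placeholder for the Gale and Church style scores discussed above (a prior over alignment types plus a length- or dictionary-based similarity), and the step types correspond to the 1-1, insertion, deletion, 1-2 and 2-1 cases.

```python
import math

def align(src, tgt, score):
    """Monotone sentence alignment by dynamic programming (a sketch).

    score(src_chunk, tgt_chunk) should return a log-score for aligning the
    two chunks; it is an assumed callback, not part of the lecture slides.
    """
    steps = [(1, 1), (1, 0), (0, 1), (1, 2), (2, 1)]  # allowed alignment types
    I, J = len(src), len(tgt)
    best = {(0, 0): 0.0}   # best total score for each pair of prefixes (i, j)
    back = {}
    for i in range(I + 1):
        for j in range(J + 1):
            if (i, j) not in best:
                continue
            for di, dj in steps:
                ni, nj = i + di, j + dj
                if ni > I or nj > J:
                    continue
                s = best[(i, j)] + score(src[i:ni], tgt[j:nj])
                if s > best.get((ni, nj), -math.inf):
                    best[(ni, nj)] = s
                    back[(ni, nj)] = (i, j)
    # Trace back the best monotone segmentation from (I, J).
    path, node = [], (I, J)
    while node != (0, 0):
        prev = back[node]
        path.append((src[prev[0]:node[0]], tgt[prev[1]:node[1]]))
        node = prev
    return list(reversed(path))
```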
3024
+
3025
+ 1:25:40.000 --> 1:25:47.323
3026
+ Sorry, I'm a bit late because originally wanted
3027
+ to do a quiz at the end.
3028
+
3029
+ 1:25:47.323 --> 1:25:49.129
3030
+ Can we go a quiz?
3031
+
3032
+ 1:25:49.429 --> 1:25:51.833
3033
+ We'll do it somewhere else.
3034
+
3035
+ 1:25:51.833 --> 1:25:56.813
3036
+ We had a bachelor project about making quiz
3037
+ for lectures.
3038
+
3039
+ 1:25:56.813 --> 1:25:59.217
3040
+ And I still want to try it.
3041
+
3042
+ 1:25:59.217 --> 1:26:04.197
3043
+ So let's see I hope in some other lecture
3044
+ we can do that.
3045
+
3046
+ 1:26:04.197 --> 1:26:09.435
3047
+ Then we can at the end of the lecture do
3048
+ some quiz about.
3049
+
3050
+ 1:26:09.609 --> 1:26:13.081
3051
+ All We Can Do Is Is the Practical Thing Let's
3052
+ See.
3053
+
3054
+ 1:26:13.533 --> 1:26:24.719
3055
+ And: Today, so what you should remember is
3056
+ what is parallel data and how we can.
3057
+
3058
+ 1:26:25.045 --> 1:26:29.553
3059
+ Create parallel data like how to generally
3060
+ process data.
3061
+
3062
+ 1:26:29.553 --> 1:26:36.435
3063
+ What you think about data is really important
3064
+ if you build systems and different ways.
3065
+
3066
+ 1:26:36.696 --> 1:26:46.857
3067
+ The three main options: using full words, working directly
3068
+ on character level, or using subword units.
3069
+
3070
+ 1:26:47.687 --> 1:26:49.634
3071
+ Is there any question?
3072
+
3073
+ 1:26:52.192 --> 1:26:57.768
3074
+ Yes, this alignment thing, is it like dynamic
3075
+ time warping?
3076
+
3077
+ 1:27:00.000 --> 1:27:05.761
3078
+ It's not directly using dynamic time warping,
3079
+ but the idea is similar and you can use all
3080
+
3081
+ 1:27:05.761 --> 1:27:11.771
3082
+ this type of similar algorithms, which is the
3083
+ main thing which is the question of the difficulty
3084
+
3085
+ 1:27:11.771 --> 1:27:14.807
3086
+ is to define me at your your loss function
3087
+ here.
3088
+
3089
+ 1:27:14.807 --> 1:27:16.418
3090
+ What is a good alignment?
3091
+
3092
+ 1:27:16.736 --> 1:27:24.115
3093
+ But as you do not have a time walk on, you
3094
+ have a monotone alignment in there, and you
3095
+
3096
+ 1:27:24.115 --> 1:27:26.150
3097
+ cannot have reordering.
3098
+
3099
+ 1:27:30.770 --> 1:27:40.121
3100
+ There then thanks a lot and on first day we
3101
+ will then start with or discuss.
3102
+
demo_data/lectures/Lecture-03-25.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b241226dacb56a88fcbccaecb2639c3b5765fbea6f60e4758715c6941fbc512
3
+ size 117644511
demo_data/lectures/Lecture-04-27.04.2023/English.vtt ADDED
@@ -0,0 +1,2919 @@
1
+ WEBVTT
2
+
3
+ 0:00:03.663 --> 0:00:07.970
4
+ Okay, then I should switch back to English,
5
+ sorry,.
6
+
7
+ 0:00:08.528 --> 0:00:18.970
8
+ So welcome to today's lecture in the cross
9
+ machine translation and today we're planning
10
+
11
+ 0:00:18.970 --> 0:00:20.038
12
+ to talk.
13
+
14
+ 0:00:20.880 --> 0:00:31.845
15
+ Which will be without our summary of power
16
+ translation was done from around till.
17
+
18
+ 0:00:32.872 --> 0:00:38.471
19
+ Fourteen, so this was an approach which was
20
+ quite long.
21
+
22
+ 0:00:38.471 --> 0:00:47.070
23
+ It was the first approach where at the end
24
+ the quality was really so good that it was
25
+
26
+ 0:00:47.070 --> 0:00:49.969
27
+ used as a commercial system.
28
+
29
+ 0:00:49.990 --> 0:00:56.482
30
+ Or something like that, so the first systems
31
+ there was using the statistical machine translation.
32
+
33
+ 0:00:57.937 --> 0:01:02.706
34
+ So when I came into the field this was the
35
+ main part of the lecture, so there would be
36
+
37
+ 0:01:02.706 --> 0:01:07.912
38
+ not be one lecture, but in more detail than
39
+ half of the full course would be about statistical
40
+
41
+ 0:01:07.912 --> 0:01:09.063
42
+ machine translation.
43
+
44
+ 0:01:09.369 --> 0:01:23.381
45
+ So what we try to do today is like get the
46
+ most important things, which think our part
47
+
48
+ 0:01:23.381 --> 0:01:27.408
49
+ is still very important.
50
+
51
+ 0:01:27.267 --> 0:01:31.196
52
+ Four State of the Art Box.
53
+
54
+ 0:01:31.952 --> 0:01:45.240
55
+ Then we'll have the presentation about how
56
+ to evaluate the other part of the machine translation.
57
+
58
+ 0:01:45.505 --> 0:01:58.396
59
+ The other important thing is the language
60
+ modeling part will explain later how they combine.
61
+
62
+ 0:01:59.539 --> 0:02:04.563
63
+ Shortly mentioned this one already.
64
+
65
+ 0:02:04.824 --> 0:02:06.025
66
+ On Tuesday.
67
+
68
+ 0:02:06.246 --> 0:02:21.849
69
+ So in a lot of these explanations, how we
70
+ model translation process, it might be surprising:
71
+
72
+ 0:02:22.082 --> 0:02:27.905
73
+ Later some people say it's for four eight words
74
+ traditionally came because the first models
75
+
76
+ 0:02:27.905 --> 0:02:32.715
77
+ which you'll discuss here also when they are
78
+ referred to as the IVM models.
79
+
80
+ 0:02:32.832 --> 0:02:40.043
81
+ They were trained on French to English translation
82
+ directions and that's why they started using
83
+
84
+ 0:02:40.043 --> 0:02:44.399
85
+ F and E and then this was done for the next
86
+ twenty years.
87
+
88
+ 0:02:44.664 --> 0:02:52.316
89
+ So while we are trying to wait, the source
90
+ words is: We have a big eye, typically the
91
+
92
+ 0:02:52.316 --> 0:03:02.701
93
+ lengths of the sewer sentence in small eye,
94
+ the position, and similarly in the target and
95
+
96
+ 0:03:02.701 --> 0:03:05.240
97
+ the lengths of small.
98
+
99
+ 0:03:05.485 --> 0:03:13.248
100
+ Things will get a bit complicated in this
101
+ way because it is not always clear what is
102
+
103
+ 0:03:13.248 --> 0:03:13.704
104
+ the.
105
+
106
+ 0:03:14.014 --> 0:03:21.962
107
+ See that there is this noisy channel model
108
+ which switches the direction in your model,
109
+
110
+ 0:03:21.962 --> 0:03:25.616
111
+ but in the application it's the target.
112
+
113
+ 0:03:26.006 --> 0:03:37.077
114
+ So that is why if you especially read these
115
+ papers, it might sometimes be a bit disturbing.
116
+
117
+ 0:03:37.437 --> 0:03:40.209
118
+ Try to keep it here always.
119
+
120
+ 0:03:40.209 --> 0:03:48.427
121
+ The source is, and even if we use a model
122
+ where it's inverse, we'll keep this way.
123
+
124
+ 0:03:48.468 --> 0:03:55.138
125
+ Don't get disturbed by that, and I think it's
126
+ possible to understand all that without this
127
+
128
+ 0:03:55.138 --> 0:03:55.944
129
+ confusion.
130
+
131
+ 0:03:55.944 --> 0:04:01.734
132
+ But in some of the papers you might get confused
133
+ because they switched to the.
134
+
135
+ 0:04:04.944 --> 0:04:17.138
136
+ In general, in statistics and machine translation,
137
+ the goal is how we do translation.
138
+
139
+ 0:04:17.377 --> 0:04:25.562
140
+ But first we are seeing all our possible target
141
+ sentences as possible translations.
142
+
143
+ 0:04:26.726 --> 0:04:37.495
144
+ And we are assigning some probability to the
145
+ combination, so we are modeling.
146
+
147
+ 0:04:39.359 --> 0:04:49.746
148
+ And then we are doing a search over all possible
149
+ things or at least theoretically, and we are
150
+
151
+ 0:04:49.746 --> 0:04:56.486
152
+ trying to find the translation with the highest
153
+ probability.
154
+
155
+ 0:04:56.936 --> 0:05:05.116
156
+ And this general idea is also true for neuromachine
157
+ translation.
158
+
159
+ 0:05:05.116 --> 0:05:07.633
160
+ They differ in how.
161
+
162
+ 0:05:08.088 --> 0:05:10.801
163
+ So these were then of course the two big challenges.
164
+
165
+ 0:05:11.171 --> 0:05:17.414
166
+ On the one hand, how can we estimate this
167
+ probability?
168
+
169
+ 0:05:17.414 --> 0:05:21.615
170
+ How is the translation of the other?
171
+
172
+ 0:05:22.262 --> 0:05:32.412
173
+ The other challenge is the search, so we cannot,
174
+ of course, say we want to find the most probable
175
+
176
+ 0:05:32.412 --> 0:05:33.759
177
+ translation.
178
+
179
+ 0:05:33.759 --> 0:05:42.045
180
+ We cannot go over all possible English sentences
181
+ and calculate the probability.
182
+
183
+ 0:05:43.103 --> 0:05:45.004
184
+ So,.
185
+
186
+ 0:05:45.165 --> 0:05:53.423
187
+ What we have to do there is some are doing
188
+ intelligent search and look for the ones and
189
+
190
+ 0:05:53.423 --> 0:05:54.268
191
+ compare.
192
+
193
+ 0:05:54.734 --> 0:05:57.384
194
+ That will be done.
195
+
196
+ 0:05:57.384 --> 0:06:07.006
197
+ This process of finding them is called the
198
+ decoding process because.
199
+
200
+ 0:06:07.247 --> 0:06:09.015
201
+ They will be covered well later.
202
+
203
+ 0:06:09.015 --> 0:06:11.104
204
+ Today we will concentrate on the mile.
205
+
206
+ 0:06:11.451 --> 0:06:23.566
207
+ The model is trained using data, so in the
208
+ first step we're having data, we're somehow
209
+
210
+ 0:06:23.566 --> 0:06:30.529
211
+ having a definition of what the model looks
212
+ like.
213
+
214
+ 0:06:34.034 --> 0:06:42.913
215
+ And in statistical machine translation the
216
+ common model is behind.
217
+
218
+ 0:06:42.913 --> 0:06:46.358
219
+ That is what is referred.
220
+
221
+ 0:06:46.786 --> 0:06:55.475
222
+ And this is motivated by the initial idea
223
+ from Shannon.
224
+
225
+ 0:06:55.475 --> 0:07:02.457
226
+ We have this that you can think of decoding.
227
+
228
+ 0:07:02.722 --> 0:07:10.472
229
+ So think of it as we have this text in maybe
230
+ German.
231
+
232
+ 0:07:10.472 --> 0:07:21.147
233
+ Originally it was an English text, but somebody
234
+ used some nice decoding.
235
+
236
+ 0:07:21.021 --> 0:07:28.579
237
+ Task is to decipher it again, this crazy cyborg
238
+ expressing things in German, and to decipher
239
+
240
+ 0:07:28.579 --> 0:07:31.993
241
+ the meaning again and doing that between.
242
+
243
+ 0:07:32.452 --> 0:07:35.735
244
+ And that is the idea about this noisy channel
245
+ when it.
246
+
247
+ 0:07:36.236 --> 0:07:47.209
248
+ It goes through some type of channel which
249
+ adds noise to the source and then you receive
250
+
251
+ 0:07:47.209 --> 0:07:48.811
252
+ the message.
253
+
254
+ 0:07:49.429 --> 0:08:00.190
255
+ And then the idea is, can we now construct
256
+ the original message out of these messages
257
+
258
+ 0:08:00.190 --> 0:08:05.070
259
+ by modeling some of the channels here?
260
+
261
+ 0:08:06.726 --> 0:08:15.797
262
+ There you know to see a bit the surface of
263
+ the source message with English.
264
+
265
+ 0:08:15.797 --> 0:08:22.361
266
+ It went through some channel and received
267
+ the message.
268
+
269
+ 0:08:22.682 --> 0:08:31.381
270
+ If you're not looking at machine translation,
271
+ your source language is English.
272
+
273
+ 0:08:31.671 --> 0:08:44.388
274
+ Here you see now a bit of this where the confusion
275
+ starts while English as a target language is
276
+
277
+ 0:08:44.388 --> 0:08:47.700
278
+ also the source message.
279
+
280
+ 0:08:47.927 --> 0:08:48.674
281
+ You can see.
282
+
283
+ 0:08:48.674 --> 0:08:51.488
284
+ There is also a mathematics of how we model
285
+ the.
286
+
287
+ 0:08:52.592 --> 0:08:56.888
288
+ It's a noisy channel model from a mathematic
289
+ point of view.
290
+
291
+ 0:08:56.997 --> 0:09:00.245
292
+ So this is again our general formula.
293
+
294
+ 0:09:00.245 --> 0:09:08.623
295
+ We are looking for the most probable translation
296
+ and that is the translation that has the highest
297
+
298
+ 0:09:08.623 --> 0:09:09.735
299
+ probability.
300
+
301
+ 0:09:09.809 --> 0:09:19.467
302
+ We are not interested in the probability itself,
303
+ but we are interesting in this target sentence
304
+
305
+ 0:09:19.467 --> 0:09:22.082
306
+ E where this probability.
307
+
308
+ 0:09:23.483 --> 0:09:33.479
309
+ And: Therefore, we can use them twice definition
310
+ of conditional probability and using the base
311
+
312
+ 0:09:33.479 --> 0:09:42.712
313
+ rules, so this probability equals the probability
314
+ of f giving any kind of probability of e divided
315
+
316
+ 0:09:42.712 --> 0:09:44.858
317
+ by the probability of.
318
+
319
+ 0:09:45.525 --> 0:09:48.218
320
+ Now see mathematically this confusion.
321
+
322
+ 0:09:48.218 --> 0:09:54.983
323
+ Originally we are interested in the probability
324
+ of the target sentence given the search sentence.
325
+
326
+ 0:09:55.295 --> 0:10:00.742
327
+ And if we are modeling things now, we are
328
+ looking here at the inverse direction, so the
329
+
330
+ 0:10:00.742 --> 0:10:06.499
331
+ probability of F given E to the probability
332
+ of the source sentence given the target sentence
333
+
334
+ 0:10:06.499 --> 0:10:10.832
335
+ is the probability of the target sentence divided
336
+ by the probability.
337
+
338
+ 0:10:13.033 --> 0:10:15.353
339
+ Why are we doing this?
340
+
341
+ 0:10:15.353 --> 0:10:24.333
342
+ Maybe I mean, of course, once it's motivated
343
+ by our model, that we were saying this type
344
+
345
+ 0:10:24.333 --> 0:10:27.058
346
+ of how we are modeling it.
347
+
348
+ 0:10:27.058 --> 0:10:30.791
349
+ The other interesting thing is that.
350
+
351
+ 0:10:31.231 --> 0:10:40.019
352
+ So we are looking at this probability up there,
353
+ which we had before we formulate that we can
354
+
355
+ 0:10:40.019 --> 0:10:40.775
356
+ remove.
357
+
358
+ 0:10:41.181 --> 0:10:46.164
359
+ If we are searching for the highest translation,
360
+ this is fixed.
361
+
362
+ 0:10:46.164 --> 0:10:47.800
363
+ This doesn't change.
364
+
365
+ 0:10:47.800 --> 0:10:52.550
366
+ We have an input, the source sentence, and
367
+ we cannot change.
368
+
369
+ 0:10:52.812 --> 0:11:02.780
370
+ Is always the same, so we can ignore it in
371
+ the ACMAX because the lower one is exactly
372
+
373
+ 0:11:02.780 --> 0:11:03.939
374
+ the same.
375
+
376
+ 0:11:04.344 --> 0:11:06.683
377
+ And then we have p o f.
378
+
379
+ 0:11:06.606 --> 0:11:13.177
380
+ E times P of E and that is so we are modeling
381
+ the translation process on the one hand with
382
+
383
+ 0:11:13.177 --> 0:11:19.748
384
+ the translation model which models how probable
385
+ is the sentence F given E and on the other
386
+
387
+ 0:11:19.748 --> 0:11:25.958
388
+ hand with the language model which models only
389
+ how probable is this English sentence.
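Written out, the decomposition just described looks roughly like this, keeping the lecture's convention of F for the source sentence and E for the target sentence:

```latex
\hat{E} = \operatorname*{argmax}_{E} P(E \mid F)
        = \operatorname*{argmax}_{E} \frac{P(F \mid E)\,P(E)}{P(F)}
        = \operatorname*{argmax}_{E} \underbrace{P(F \mid E)}_{\text{translation model}} \; \underbrace{P(E)}_{\text{language model}}
```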
390
+
391
+ 0:11:26.586 --> 0:11:39.366
392
+ That somebody wrote this language or translation
393
+ point of view, this is about fluency.
394
+
395
+ 0:11:40.200 --> 0:11:44.416
396
+ You should have in German, for example, agreement.
397
+
398
+ 0:11:44.416 --> 0:11:50.863
399
+ If the agreement is not right, that's properly
400
+ not said by anybody in German.
401
+
402
+ 0:11:50.863 --> 0:11:58.220
403
+ Nobody would say that's Schönest's house because
404
+ it's not according to the German rules.
405
+
406
+ 0:11:58.598 --> 0:12:02.302
407
+ So this can be modeled by the language model.
408
+
409
+ 0:12:02.542 --> 0:12:09.855
410
+ And you have the translation model which models
411
+ housings get translated between the.
412
+
413
+ 0:12:10.910 --> 0:12:18.775
414
+ And here you see again our confusion again,
415
+ and now here put the translation model: Wage
416
+
417
+ 0:12:18.775 --> 0:12:24.360
418
+ is a big income counterintuitive because the
419
+ probability of a sewer sentence giving the
420
+
421
+ 0:12:24.360 --> 0:12:24.868
422
+ target.
423
+
424
+ 0:12:26.306 --> 0:12:35.094
425
+ We have to do that for the Bayes formula, but in
426
+ the following slides I'll talk again about.
427
+
428
+ 0:12:35.535 --> 0:12:45.414
429
+ Because yeah, that's more intuitive that you
430
+ model the translation of the target sentence
431
+
432
+ 0:12:45.414 --> 0:12:48.377
433
+ given the source sentence.
434
+
435
+ 0:12:50.930 --> 0:12:55.668
436
+ And this is what we want to talk about today.
437
+
438
+ 0:12:55.668 --> 0:13:01.023
439
+ We later talk about language models how to
440
+ do that.
441
+
442
+ 0:13:00.940 --> 0:13:04.493
443
+ And maybe also how to combine them.
444
+
445
+ 0:13:04.493 --> 0:13:13.080
446
+ But the focus on today would be how can we
447
+ model this probability to how to generate a
448
+
449
+ 0:13:13.080 --> 0:13:16.535
450
+ translation from source to target?
451
+
452
+ 0:13:19.960 --> 0:13:24.263
453
+ How can we do that and the easiest thing?
454
+
455
+ 0:13:24.263 --> 0:13:33.588
456
+ Maybe if you think about statistics, you count
457
+ how many examples you have, how many target
458
+
459
+ 0:13:33.588 --> 0:13:39.121
460
+ sentences go occur, and that gives you an estimation.
461
+
462
+ 0:13:40.160 --> 0:13:51.632
463
+ However, like in another model that is not
464
+ possible because most sentences you will never
465
+
466
+ 0:13:51.632 --> 0:13:52.780
467
+ see, so.
468
+
469
+ 0:13:53.333 --> 0:14:06.924
470
+ So what we have to do is break up the translation
471
+ process into smaller models and model each
472
+
473
+ 0:14:06.924 --> 0:14:09.555
474
+ of the decisions.
475
+
476
+ 0:14:09.970 --> 0:14:26.300
477
+ So this simple solution with how you throw
478
+ a dice is like you have a and that gives you
479
+
480
+ 0:14:26.300 --> 0:14:29.454
481
+ the probability.
482
+
483
+ 0:14:29.449 --> 0:14:40.439
484
+ But here's the principle because each event
485
+ is so rare that most of them never have helped.
486
+
487
+ 0:14:43.063 --> 0:14:48.164
488
+ Although it might be that in all your training
489
+ data you have never seen this title of set.
490
+
491
+ 0:14:49.589 --> 0:14:52.388
492
+ How can we do that?
493
+
494
+ 0:14:52.388 --> 0:15:04.845
495
+ We look in statistical machine translation
496
+ into two different models, a generative model
497
+
498
+ 0:15:04.845 --> 0:15:05.825
499
+ where.
500
+
501
+ 0:15:06.166 --> 0:15:11.736
502
+ So the idea was to really model model like
503
+ each individual translation between words.
504
+
505
+ 0:15:12.052 --> 0:15:22.598
506
+ So you break down the translation of a full
507
+ sentence into the translation of each individual's
508
+
509
+ 0:15:22.598 --> 0:15:23.264
510
+ word.
511
+
512
+ 0:15:23.264 --> 0:15:31.922
513
+ So you say if you have the black cat, if you
514
+ translate it, the full sentence.
515
+
516
+ 0:15:32.932 --> 0:15:38.797
517
+ Of course, this has some challenges, any ideas
518
+ where this type of model could be very challenging.
519
+
520
+ 0:15:40.240 --> 0:15:47.396
521
+ Vocabularies and videos: Yes, we're going
522
+ to be able to play in the very color.
523
+
524
+ 0:15:47.867 --> 0:15:51.592
525
+ Yes, but you could at least use a bit of the
526
+ context around it.
527
+
528
+ 0:15:51.592 --> 0:15:55.491
529
+ It will not only depend on the word, but it's
530
+ already challenging.
531
+
532
+ 0:15:55.491 --> 0:15:59.157
533
+ You make things very hard, so that's definitely
534
+ one challenge.
535
+
536
+ 0:16:00.500 --> 0:16:07.085
537
+ One other, what did you talk about that we
538
+ just don't want to say?
539
+
540
+ 0:16:08.348 --> 0:16:11.483
541
+ Yes, they are challenging.
542
+
543
+ 0:16:11.483 --> 0:16:21.817
544
+ You have to do something like words, but the
545
+ problem is that you might introduce errors.
546
+
547
+ 0:16:21.841 --> 0:16:23.298
548
+ Later and makes things very comfortable.
549
+
550
+ 0:16:25.265 --> 0:16:28.153
551
+ Wrong splitting is the worst things that are
552
+ very complicated.
553
+
554
+ 0:16:32.032 --> 0:16:35.580
555
+ Saints, for example, and also maybe Japanese
556
+ medicine.
557
+
558
+ 0:16:35.735 --> 0:16:41.203
559
+ In German, yes, especially like these are
560
+ all right.
561
+
562
+ 0:16:41.203 --> 0:16:46.981
563
+ The first thing is maybe the one which is
564
+ most obvious.
565
+
566
+ 0:16:46.981 --> 0:16:49.972
567
+ It is raining cats and dogs.
568
+
569
+ 0:16:51.631 --> 0:17:01.837
570
+ To German, the cat doesn't translate this
571
+ whole chunk into something because there is
572
+
573
+ 0:17:01.837 --> 0:17:03.261
574
+ not really.
575
+
576
+ 0:17:03.403 --> 0:17:08.610
577
+ Mean, of course, in generally there is this
578
+ type of alignment, so there is a correspondence
579
+
580
+ 0:17:08.610 --> 0:17:11.439
581
+ between words in English and the words in German.
582
+
583
+ 0:17:11.439 --> 0:17:16.363
584
+ However, that's not true for all sentences,
585
+ so in some sentences you cannot really say
586
+
587
+ 0:17:16.363 --> 0:17:18.174
588
+ this word translates into that.
589
+
590
+ 0:17:18.498 --> 0:17:21.583
591
+ But you can only let more locate this whole
592
+ phrase.
593
+
594
+ 0:17:21.583 --> 0:17:23.482
595
+ This model into something else.
596
+
597
+ 0:17:23.563 --> 0:17:30.970
598
+ If you think about the don't in English, the
599
+ do is not really clearly where should that
600
+
601
+ 0:17:30.970 --> 0:17:31.895
602
+ be allied.
603
+
604
+ 0:17:32.712 --> 0:17:39.079
605
+ Then for a long time the most successful approach
606
+ was this phrase based translation model where
607
+
608
+ 0:17:39.079 --> 0:17:45.511
609
+ the idea is your block is not a single word
610
+ but a longer phrase if you try to build translations
611
+
612
+ 0:17:45.511 --> 0:17:46.572
613
+ based on these.
614
+
615
+ 0:17:48.768 --> 0:17:54.105
616
+ But let's start with a word based and what
617
+ you need.
618
+
619
+ 0:17:54.105 --> 0:18:03.470
620
+ There is two main knowledge sources, so on
621
+ the one hand we have a lexicon where we translate
622
+
623
+ 0:18:03.470 --> 0:18:05.786
624
+ possible translations.
625
+
626
+ 0:18:06.166 --> 0:18:16.084
627
+ The main difference between the lexicon and
628
+ statistical machine translation and lexicon
629
+
630
+ 0:18:16.084 --> 0:18:17.550
631
+ as you know.
632
+
633
+ 0:18:17.837 --> 0:18:23.590
634
+ Traditional lexicon: You know how word is
635
+ translated and mainly it's giving you two or
636
+
637
+ 0:18:23.590 --> 0:18:26.367
638
+ three examples with any example sentence.
639
+
640
+ 0:18:26.367 --> 0:18:30.136
641
+ So in this context it gets translated like
642
+ that henceon.
643
+
644
+ 0:18:30.570 --> 0:18:38.822
645
+ In order to model that and work with probabilities
646
+ what we need in a machine translation is these:
647
+
648
+ 0:18:39.099 --> 0:18:47.962
649
+ So if we have the German word Wagen, it sends
650
+ me out with a probability of zero point five.
651
+
652
+ 0:18:47.962 --> 0:18:51.545
653
+ Maybe it's translated into a vehicle.
654
+
655
+ 0:18:52.792 --> 0:18:58.876
656
+ And of course this is not easy to be created
657
+ by a shoveman.
658
+
659
+ 0:18:58.876 --> 0:19:07.960
660
+ If ask you and give probabilities for how
661
+ probable this vehicle is, there might: So how
662
+
663
+ 0:19:07.960 --> 0:19:12.848
664
+ we are doing is again that the lexicon is automatically
665
+ will be created from a corpus.
666
+
667
+ 0:19:13.333 --> 0:19:18.754
668
+ And we're just counting here, so we count
669
+ how often does it work, how often does it co
670
+
671
+ 0:19:18.754 --> 0:19:24.425
672
+ occur with vehicle, and then we're taking the
673
+ ratio and saying in the house of time on the
674
+
675
+ 0:19:24.425 --> 0:19:26.481
676
+ English side there was vehicles.
677
+
678
+ 0:19:26.481 --> 0:19:31.840
679
+ There was a probability of vehicles given
680
+ back, and there's something like zero point
681
+
682
+ 0:19:31.840 --> 0:19:32.214
683
+ five.
684
+
685
+ 0:19:33.793 --> 0:19:46.669
686
+ That we need another concept, and that is
687
+ this concept of alignment, and now you can
688
+
689
+ 0:19:46.669 --> 0:19:47.578
690
+ have.
691
+
692
+ 0:19:47.667 --> 0:19:53.113
693
+ Since this is quite complicated, the alignment
694
+ in general can be complex.
695
+
696
+ 0:19:53.113 --> 0:19:55.689
697
+ It can be that it's not only like.
698
+
699
+ 0:19:55.895 --> 0:20:04.283
700
+ It can be that two words of a surrender target
701
+ sign and it's also imbiguous.
702
+
703
+ 0:20:04.283 --> 0:20:13.761
704
+ It can be that you say all these two words
705
+ only are aligned together and our words are
706
+
707
+ 0:20:13.761 --> 0:20:15.504
708
+ aligned or not.
709
+
710
+ 0:20:15.875 --> 0:20:21.581
711
+ Is should the do be aligned to the knot in
712
+ German?
713
+
714
+ 0:20:21.581 --> 0:20:29.301
715
+ It's only there because in German it's not,
716
+ so it should be aligned.
717
+
718
+ 0:20:30.510 --> 0:20:39.736
719
+ However, typically it's formalized and it's
720
+ formalized by a function from the target language.
721
+
722
+ 0:20:40.180 --> 0:20:44.051
723
+ And that is to make these models get easier
724
+ and clearer.
725
+
726
+ 0:20:44.304 --> 0:20:49.860
727
+ That means what means does it mean that you
728
+ have a fence that means that each.
729
+
730
+ 0:20:49.809 --> 0:20:58.700
731
+ A sewer's word gives target word and the alliance
732
+ to only one source word because the function
733
+
734
+ 0:20:58.700 --> 0:21:00.384
735
+ is also directly.
736
+
737
+ 0:21:00.384 --> 0:21:05.999
738
+ However, a source word can be hit or like
739
+ by signal target.
740
+
741
+ 0:21:06.286 --> 0:21:11.332
742
+ So you are allowing for one to many alignments,
743
+ but not for many to one alignment.
744
+
745
+ 0:21:11.831 --> 0:21:17.848
746
+ That is a bit of a challenge because you assume
747
+ a lightning should be symmetrical.
748
+
749
+ 0:21:17.848 --> 0:21:24.372
750
+ So if you look at a parallel sentence, it
751
+ should not matter if you look at it from German
752
+
753
+ 0:21:24.372 --> 0:21:26.764
754
+ to English or English to German.
755
+
756
+ 0:21:26.764 --> 0:21:34.352
757
+ So however, it makes these models: Yea possible
758
+ and we'll like to see yea for the phrase bass
759
+
760
+ 0:21:34.352 --> 0:21:36.545
761
+ until we need these alignments.
762
+
763
+ 0:21:36.836 --> 0:21:41.423
764
+ So this alignment was the most important of
765
+ the world based models.
766
+
767
+ 0:21:41.423 --> 0:21:47.763
768
+ For the next twenty years you need the world
769
+ based models to generate this type of alignment,
770
+
771
+ 0:21:47.763 --> 0:21:50.798
772
+ which is then the first step for the phrase.
773
+
774
+ 0:21:51.931 --> 0:21:59.642
775
+ Approach, and there you can then combine them
776
+ again like both directions into one we'll see.
777
+
778
+ 0:22:00.280 --> 0:22:06.850
779
+ This alignment is very important and allows
780
+ us to do this type of separation.
781
+
782
+ 0:22:08.308 --> 0:22:15.786
783
+ And yet the most commonly used word based
784
+ models are these models referred to as IBM
785
+
786
+ 0:22:15.786 --> 0:22:25.422
787
+ models, and there is a sequence of them with
788
+ great names: And they were like yeah very commonly
789
+
790
+ 0:22:25.422 --> 0:22:26.050
791
+ used.
792
+
793
+ 0:22:26.246 --> 0:22:31.719
794
+ We'll mainly focus on the simple one here
795
+ and look how this works and then not do all
796
+
797
+ 0:22:31.719 --> 0:22:34.138
798
+ the details about the further models.
799
+
800
+ 0:22:34.138 --> 0:22:38.084
801
+ The interesting thing is also that all of
802
+ them are important.
803
+
804
+ 0:22:38.084 --> 0:22:43.366
805
+ So if you want to train this alignment what
806
+ you normally do is train an IBM model one.
807
+
808
+ 0:22:43.743 --> 0:22:50.940
809
+ Then you take that as your initialization
810
+ to then train the IBM model too and so on.
811
+
812
+ 0:22:50.940 --> 0:22:53.734
813
+ The motivation for that is yeah.
814
+
815
+ 0:22:53.734 --> 0:23:00.462
816
+ The first model gives you: Is so simple that
817
+ you can even find a global optimum, so it gives
818
+
819
+ 0:23:00.462 --> 0:23:06.403
820
+ you a good starting point for the next one
821
+ where the optimization in finding the right
822
+
823
+ 0:23:06.403 --> 0:23:12.344
824
+ model is more difficult and therefore like
825
+ the defore technique was to make your model
826
+
827
+ 0:23:12.344 --> 0:23:13.641
828
+ step by step more.
829
+
830
+ 0:23:15.195 --> 0:23:27.333
831
+ In these models we are breaking down the probability
832
+ into smaller steps and then we can define:
833
+
834
+ 0:23:27.367 --> 0:23:38.981
835
+ You see it's not a bit different, so it's not
836
+ the curability and one specific alignment given.
837
+
838
+ 0:23:39.299 --> 0:23:42.729
839
+ We'll let us learn how we can then go from
840
+ one alignment to the full set.
841
+
842
+ 0:23:43.203 --> 0:23:52.889
843
+ The probability of target sentences and one
844
+ alignment between the source and target sentences
845
+
846
+ 0:23:52.889 --> 0:23:56.599
847
+ alignment is this type of function.
848
+
849
+ 0:23:57.057 --> 0:24:14.347
850
+ That every word is aligned in order to ensure
851
+ that every word is aligned.
852
+
853
+ 0:24:15.835 --> 0:24:28.148
854
+ So first of all you do some epsilon, the epsilon
855
+ is just a normalization factor that everything
856
+
857
+ 0:24:28.148 --> 0:24:31.739
858
+ is somehow to inferability.
859
+
860
+ 0:24:31.631 --> 0:24:37.539
861
+ Of source sentences plus one to the power
862
+ of the length of the targets.
863
+
864
+ 0:24:37.937 --> 0:24:50.987
865
+ And this is somehow the probability of this
866
+ alignment.
867
+
868
+ 0:24:51.131 --> 0:24:53.224
869
+ So is this alignment probable or not?
870
+
871
+ 0:24:53.224 --> 0:24:55.373
872
+ Of course you can have some intuition.
873
+
874
+ 0:24:55.373 --> 0:24:58.403
875
+ So if there's a lot of crossing, it may be
876
+ not a good.
877
+
878
+ 0:24:58.403 --> 0:25:03.196
879
+ If all of the words align to the same one
880
+ might be not a good alignment, but generally
881
+
882
+ 0:25:03.196 --> 0:25:06.501
883
+ it's difficult to really describe what is a
884
+ good alignment.
885
+
886
+ 0:25:07.067 --> 0:25:11.482
887
+ Say for the first model that's the most simple
888
+ thing.
889
+
890
+ 0:25:11.482 --> 0:25:18.760
891
+ What can be the most simple thing if you think
892
+ about giving a probability to some event?
893
+
894
+ 0:25:21.401 --> 0:25:25.973
895
+ Yes exactly, so just take the uniform distribution.
896
+
897
+ 0:25:25.973 --> 0:25:33.534
898
+ If we don't really know the best thing of
899
+ modeling is all equally probable, of course
900
+
901
+ 0:25:33.534 --> 0:25:38.105
902
+ that is not true, but it's giving you a good
903
+ study.
904
+
905
+ 0:25:38.618 --> 0:25:44.519
906
+ And so this one is just a number of all possible
907
+ alignments for this sentence.
908
+
909
+ 0:25:44.644 --> 0:25:53.096
910
+ So how many alignments are possible, so the
911
+ first target word can be allied to all sources
912
+
913
+ 0:25:53.096 --> 0:25:53.746
914
+ words.
915
+
916
+ 0:25:54.234 --> 0:26:09.743
917
+ The second one can also be aligned to all
918
+ source work, and the third one also to source.
919
+
920
+ 0:26:10.850 --> 0:26:13.678
921
+ This is the number of alignments.
922
+
923
+ 0:26:13.678 --> 0:26:19.002
924
+ The second part is to model the probability
925
+ of the translation.
926
+
927
+ 0:26:19.439 --> 0:26:31.596
928
+ And there it's not nice to have this function,
929
+ so now we are making the product over all target.
930
+
931
+ 0:26:31.911 --> 0:26:40.068
932
+ And we are making a very strong independence
933
+ assumption because in these models we normally
934
+
935
+ 0:26:40.068 --> 0:26:45.715
936
+ assume the translation probability of one word
937
+ is independent.
938
+
939
+ 0:26:46.126 --> 0:26:49.800
940
+ So how you translate and visit it is independent
941
+ of all the other parts.
942
+
943
+ 0:26:50.290 --> 0:26:52.907
944
+ That is very strong and very bad.
945
+
946
+ 0:26:52.907 --> 0:26:55.294
947
+ Yeah, you should do it better.
948
+
949
+ 0:26:55.294 --> 0:27:00.452
950
+ We know that it's wrong because how you translate
951
+ this depends on.
952
+
953
+ 0:27:00.452 --> 0:27:05.302
954
+ However, it's a first easy solution and again
955
+ a good starting point.
956
+
957
+ 0:27:05.966 --> 0:27:14.237
958
+ So what you do is that you take a product
959
+ of all words and take a translation probability
960
+
961
+ 0:27:14.237 --> 0:27:15.707
962
+ on this target.
963
+
964
+ 0:27:16.076 --> 0:27:23.901
965
+ And because we know that there is always one
966
+ source word aligned to it.
967
+
968
+ 0:27:24.344 --> 0:27:37.409
969
+ If the probability of visits in the zoo doesn't
970
+ really work, the good here I'm again.
971
+
972
+ 0:27:38.098 --> 0:27:51.943
973
+ So most only we have it here, so the probability
974
+ is epsilon divided by (I plus one) to the power of J.
975
+
976
+ 0:27:53.913 --> 0:27:58.401
977
+ And then there is somewhere in the last one.
978
+
979
+ 0:27:58.401 --> 0:28:04.484
980
+ There is an error, E and F are switched, so it is the
981
+ other way around.
982
+
983
+ 0:28:04.985 --> 0:28:07.511
984
+ Then you have your translation model.
985
+
986
+ 0:28:07.511 --> 0:28:12.498
987
+ Hopefully, let's assume you have your model
988
+ trained, so that's only assigning probabilities.
989
+
990
+ 0:28:12.953 --> 0:28:25.466
991
+ And then this sentence has the probability
992
+ of generating I visit a friend given that you
993
+
994
+ 0:28:25.466 --> 0:28:31.371
995
+ have the source sentence "ich besuche einen Freund".
996
+
997
+ 0:28:32.012 --> 0:28:34.498
998
+ Times ten to the power of minus five.
999
+
1000
+ 0:28:35.155 --> 0:28:36.098
1001
+ So this is your model.
1002
+
1003
+ 0:28:36.098 --> 0:28:37.738
1004
+ This is how you're applying your model.
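To make this concrete, here is a minimal sketch of the IBM Model 1 score P(e, a | f), assuming a toy lexicon with invented probabilities and the example sentence pair from above; it is an illustration, not the lecturer's code:

```python
# Minimal sketch: P(e, a | f) = epsilon / (I+1)^J * prod_j t(e_j | f_{a_j}).
# The lexicon probabilities below are invented for illustration.
t = {
    ("I", "ich"): 0.8, ("visit", "besuche"): 0.8,
    ("a", "einen"): 0.7, ("friend", "Freund"): 0.8,
}

def ibm1_joint(e, f, alignment, epsilon=1.0):
    # alignment[j] gives the index of the source word that e[j] is aligned to
    prob = epsilon / (len(f) + 1) ** len(e)        # uniform alignment prior
    for j, i in enumerate(alignment):
        prob *= t.get((e[j], f[i]), 1e-6)          # lexical translation probability
    return prob

f = ["ich", "besuche", "einen", "Freund"]
e = ["I", "visit", "a", "friend"]
print(ibm1_joint(e, f, [0, 1, 2, 3]))              # about 5.7e-4 with these numbers
```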
1005
+
1006
+ 0:28:39.479 --> 0:28:44.220
1007
+ As you said, it's the most simple model: you
1008
+ assume that all word translations are.
1009
+
1010
+ 0:28:44.204 --> 0:28:46.540
1011
+ Independent of each other.
1012
+
1013
+ 0:28:46.540 --> 0:28:54.069
1014
+ You assume that all alignments are equally
1015
+ important, and then the only thing you need
1016
+
1017
+ 0:28:54.069 --> 0:29:00.126
1018
+ for this type of model is to have this lexicon
1019
+ in order to calculate.
1020
+
1021
+ 0:29:00.940 --> 0:29:04.560
1022
+ And that is, of course, now the training process.
1023
+
1024
+ 0:29:04.560 --> 0:29:08.180
1025
+ The question is how do we get this type of
1026
+ lexicon?
1027
+
1028
+ 0:29:09.609 --> 0:29:15.461
1029
+ But before we look into the training, do you
1030
+ have any questions about the model itself?
1031
+
1032
+ 0:29:21.101 --> 0:29:26.816
1033
+ The problem in training is that we have incomplete
1034
+ data.
1035
+
1036
+ 0:29:26.816 --> 0:29:32.432
1037
+ So if you want to count, I mean said you want
1038
+ to count.
1039
+
1040
+ 0:29:33.073 --> 0:29:39.348
1041
+ However, if you don't have the alignment,
1042
+ on the other hand, if you would have a lexicon
1043
+
1044
+ 0:29:39.348 --> 0:29:44.495
1045
+ you could maybe generate the alignment, which
1046
+ is the most probable word.
1047
+
1048
+ 0:29:45.225 --> 0:29:55.667
1049
+ And this is the very common problem that you
1050
+ have this type of incomplete data where you
1051
+
1052
+ 0:29:55.667 --> 0:29:59.656
1053
+ have not one type of information.
1054
+
1055
+ 0:30:00.120 --> 0:30:08.767
1056
+ And you can model this by considering the
1057
+ alignment as your hidden variable and then
1058
+
1059
+ 0:30:08.767 --> 0:30:17.619
1060
+ you can use the expectation maximization algorithm
1061
+ in order to generate the alignment.
1062
+
1063
+ 0:30:17.577 --> 0:30:26.801
1064
+ So the nice thing is that you only need your
1065
+ parallel data, which is aligned on sentence
1066
+
1067
+ 0:30:26.801 --> 0:30:29.392
1068
+ level, but you normally.
1069
+
1070
+ 0:30:29.389 --> 0:30:33.720
1071
+ Is just a lot of work we saw last time.
1072
+
1073
+ 0:30:33.720 --> 0:30:39.567
1074
+ Typically what you have is this type of corpus
1075
+ where.
1076
+
1077
+ 0:30:41.561 --> 0:30:50.364
1078
+ And yeah, the EM algorithm sounds very fancy.
1079
+
1080
+ 0:30:50.364 --> 0:30:58.605
1081
+ However, again look at a little high level.
1082
+
1083
+ 0:30:58.838 --> 0:31:05.841
1084
+ So you're initializing a model by uniform
1085
+ distribution.
1086
+
1087
+ 0:31:05.841 --> 0:31:14.719
1088
+ You're just saying, for the lexicon, that all
1089
+ words are equally probable.
1090
+
1091
+ 0:31:15.215 --> 0:31:23.872
1092
+ And then you apply your model to the data,
1093
+ and that is your expectation step.
1094
+
1095
+ 0:31:23.872 --> 0:31:30.421
1096
+ So given this initial lexicon, we are now
1097
+ calculating the.
1098
+
1099
+ 0:31:30.951 --> 0:31:36.043
1100
+ So we can now take all our parallel sentences,
1101
+ and of course ought to check what is the most
1102
+
1103
+ 0:31:36.043 --> 0:31:36.591
1104
+ probable.
1105
+
1106
+ 0:31:38.338 --> 0:31:49.851
1107
+ And then, of course, at the beginning maybe
1108
+ 'house' is most often aligned.
1109
+
1110
+ 0:31:50.350 --> 0:31:58.105
1111
+ Once we have done this expectation step, we
1112
+ can next do the maximization step and based
1113
+
1114
+ 0:31:58.105 --> 0:32:06.036
1115
+ on this guest alignment, which we have, we
1116
+ can now learn better translation probabilities
1117
+
1118
+ 0:32:06.036 --> 0:32:09.297
1119
+ by just counting how often do words.
1120
+
1121
+ 0:32:09.829 --> 0:32:22.289
1122
+ And then you iterate these steps: We can make
1123
+ this whole process even more stable by not only taking
1124
+
1125
+ 0:32:22.289 --> 0:32:26.366
1126
+ the most probable alignment.
1127
+
1128
+ 0:32:26.346 --> 0:32:36.839
1129
+ Second step, but in contrast we calculate
1130
+ for all possible alignments the alignment probability
1131
+
1132
+ 0:32:36.839 --> 0:32:40.009
1133
+ and weigh the co-occurrences.
1134
+
1135
+ 0:32:40.000 --> 0:32:41.593
1136
+ by how probable things are.
1137
+
1138
+ 0:32:42.942 --> 0:32:49.249
1139
+ Why could that be very challenging if we do
1140
+ it in general and really calculate all probabilities
1141
+
1142
+ 0:32:49.249 --> 0:32:49.834
1143
+ for all?
1144
+
1145
+ 0:32:53.673 --> 0:32:55.905
1146
+ How many alignments are there for a sentence?
1147
+
1148
+ 0:32:58.498 --> 0:33:03.344
1149
+ Yes there, we just saw that in the formula
1150
+ if you remember.
1151
+
1152
+ 0:33:03.984 --> 0:33:12.336
1153
+ This was the formula so it's exponential in
1154
+ the lengths of the target sentence.
1155
+
1156
+ 0:33:12.336 --> 0:33:15.259
1157
+ It would calculate all the.
1158
+
1159
+ 0:33:15.415 --> 0:33:18.500
1160
+ Be very inefficient and not really possible.
1161
+
1162
+ 0:33:18.500 --> 0:33:25.424
1163
+ The nice thing is we can again use some type
1164
+ of dynamic programming, so then we can do this
1165
+
1166
+ 0:33:25.424 --> 0:33:27.983
1167
+ without really calculating all of it.
1168
+
1169
+ 0:33:28.948 --> 0:33:40.791
1170
+ We have the next five slides or so with the
1171
+ most equations in the whole lecture, so don't
1172
+
1173
+ 0:33:40.791 --> 0:33:41.713
1174
+ worry.
1175
+
1176
+ 0:33:42.902 --> 0:34:01.427
1177
+ So we said we have first the expectation step, where
1178
+ it is about calculating the alignment.
1179
+
1180
+ 0:34:02.022 --> 0:34:20.253
1181
+ And we can do this with our initial definition
1182
+ of because this formula.
1183
+
1184
+ 0:34:20.160 --> 0:34:25.392
1185
+ So we can define this as P of E and A given F divided by
1186
+ P of E given F.
1187
+
1188
+ 0:34:25.905 --> 0:34:30.562
1189
+ This is just the normal definition of a conditional
1190
+ probability.
1191
+
1192
+ 0:34:31.231 --> 0:34:37.937
1193
+ And what we then need to assume a meter calculate
1194
+ is P of E given.
1195
+
1196
+ 0:34:37.937 --> 0:34:41.441
1197
+ P of E given F is still again quite
1198
+
1199
+ 0:34:41.982 --> 0:34:56.554
1200
+ simple: The probability of the source sentence
1201
+ given the target sentence is quite intuitive.
1202
+
1203
+ 0:34:57.637 --> 0:35:15.047
1204
+ So let's just calculate how to calculate the
1205
+ probability of a event.
1206
+
1207
+ 0:35:15.215 --> 0:35:21.258
1208
+ So in here we can then put in our original
1209
+ form in our soils.
1210
+
1211
+ 0:35:21.201 --> 0:35:28.023
1212
+ There are some of the possible alignments
1213
+ of the first word, and so until the sum of
1214
+
1215
+ 0:35:28.023 --> 0:35:30.030
1216
+ all possible alignments.
1217
+
1218
+ 0:35:29.990 --> 0:35:41.590
1219
+ And then we have the probability here of the
1220
+ alignment type, this product of translation.
1221
+
1222
+ 0:35:42.562 --> 0:35:58.857
1223
+ Now this one is independent of the alignment,
1224
+ so we can put it to the front here.
1225
+
1226
+ 0:35:58.959 --> 0:36:03.537
1227
+ And now this is where dynamic programming
1228
+ works in.
1229
+
1230
+ 0:36:03.537 --> 0:36:08.556
1231
+ We can change that and make thereby things
1232
+ a lot easier.
1233
+
1234
+ 0:36:08.668 --> 0:36:21.783
1235
+ Can reform it like this just as a product
1236
+ over all target positions, and then it's the
1237
+
1238
+ 0:36:21.783 --> 0:36:26.456
1239
+ sum over all source positions.
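The rearrangement can be checked numerically; a small sketch with an arbitrary t-table (the identity is just the distributive law):

```python
from itertools import product
import math

# t_table[j][i] stands for t(e_j | f_i); the values are arbitrary.
t_table = [[0.1, 0.6, 0.3],
           [0.7, 0.2, 0.1],
           [0.2, 0.2, 0.6]]
J, I = len(t_table), len(t_table[0])

# Sum over all I^J alignments of the product of lexical probabilities.
exhaustive = 0.0
for a in product(range(I), repeat=J):
    p = 1.0
    for j, i in enumerate(a):
        p *= t_table[j][i]
    exhaustive += p

# Product over target positions of the sum over source positions.
factored = math.prod(sum(row) for row in t_table)

assert abs(exhaustive - factored) < 1e-12
print(exhaustive, factored)
```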
1240
+
1241
+ 0:36:27.127 --> 0:36:36.454
1242
+ Maybe at least the intuition why this is equal
1243
+ is a lot easier if you look into it as graphic.
1244
+
1245
+ 0:36:36.816 --> 0:36:39.041
1246
+ So what we have here is the table.
1247
+
1248
+ 0:36:39.041 --> 0:36:42.345
1249
+ We have the target position and the source
1250
+ position.
1251
+
1252
+ 0:36:42.862 --> 0:37:03.643
1253
+ And we have to sum up all possible passes
1254
+ through that: The nice thing is that each of
1255
+
1256
+ 0:37:03.643 --> 0:37:07.127
1257
+ these passes these probabilities are independent
1258
+ of each.
1259
+
1260
+ 0:37:07.607 --> 0:37:19.678
1261
+ In order to get the sum of all passes through
1262
+ this table you can use dynamic programming
1263
+
1264
+ 0:37:19.678 --> 0:37:27.002
1265
+ and then say oh this probability is exactly
1266
+ the same.
1267
+
1268
+ 0:37:26.886 --> 0:37:34.618
1269
+ as the sum of this column times the sum
1270
+ of this column, times the sum of this column.
1271
+
1272
+ 0:37:35.255 --> 0:37:41.823
1273
+ That is the same as if you go through all
1274
+ possible passes here and multiply always the
1275
+
1276
+ 0:37:41.823 --> 0:37:42.577
1277
+ elements.
1278
+
1279
+ 0:37:43.923 --> 0:37:54.227
1280
+ And that is a simplification because now we
1281
+ only have quadratic numbers and we don't have
1282
+
1283
+ 0:37:54.227 --> 0:37:55.029
1284
+ to go.
1285
+
1286
+ 0:37:55.355 --> 0:38:12.315
1287
+ Similar to guess you may be seen the same
1288
+ type of algorithm for what is it?
1289
+
1290
+ 0:38:14.314 --> 0:38:19.926
1291
+ Yeah, well yeah, so that is the saying.
1292
+
1293
+ 0:38:19.926 --> 0:38:31.431
1294
+ But yeah, I think graphically this is seeable
1295
+ if you don't know exactly the math.
1296
+
1297
+ 0:38:32.472 --> 0:38:49.786
1298
+ Now put these both together, so if you really
1299
+ want to take a piece of paper and put these two formulas
1300
+
1301
+ 0:38:49.786 --> 0:38:51.750
1302
+ together,.
1303
+
1304
+ 0:38:51.611 --> 0:38:56.661
1305
+ Eliminated and Then You Get Your Final Formula.
1306
+
1307
+ 0:38:56.716 --> 0:39:01.148
1308
+ And that somehow really makes now really intuitively
1309
+ again sense.
1310
+
1311
+ 0:39:01.401 --> 0:39:08.301
1312
+ So the probability of an alignment is the
1313
+ product over all target words, and then it's
1314
+
1315
+ 0:39:08.301 --> 0:39:15.124
1316
+ the probability of to translate a word into
1317
+ the word that is aligned to, divided by the sum
1318
+
1319
+ 0:39:15.124 --> 0:39:17.915
1320
+ of the other words in the sentence.
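Written out, the formula being described appears to be the standard Model 1 alignment posterior (with j running over target positions and i over source positions, including the null word):

```latex
P(a \mid e, f) = \prod_{j=1}^{J} \frac{t(e_j \mid f_{a_j})}{\sum_{i=0}^{I} t(e_j \mid f_i)}
```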
1321
+
1322
+ 0:39:18.678 --> 0:39:31.773
1323
+ If you look at this again, it makes real sense.
1324
+
1325
+ 0:39:31.891 --> 0:39:43.872
1326
+ So you're looking at how probable it is to
1327
+ translate compared to all the other words.
1328
+
1329
+ 0:39:43.872 --> 0:39:45.404
1330
+ So you're.
1331
+
1332
+ 0:39:45.865 --> 0:39:48.543
1333
+ So and that gives you the alignment probability.
1334
+
1335
+ 0:39:48.768 --> 0:39:54.949
1336
+ Somehow it's not only that it's mathematically
1337
+ correct if you look at it this way, it's somehow
1338
+
1339
+ 0:39:54.949 --> 0:39:55.785
1340
+ intuitively.
1341
+
1342
+ 0:39:55.785 --> 0:39:58.682
1343
+ So if you would say how good is it to align?
1344
+
1345
+ 0:39:58.638 --> 0:40:04.562
1346
+ We had to zoo him to visit, or yet it should
1347
+ depend on how good this is the translation
1348
+
1349
+ 0:40:04.562 --> 0:40:10.620
1350
+ probability compared to how good are the other
1351
+ words in the sentence, and how probable is
1352
+
1353
+ 0:40:10.620 --> 0:40:12.639
1354
+ it that I align them to them.
1355
+
1356
+ 0:40:15.655 --> 0:40:26.131
1357
+ Then you have the expectations that the next
1358
+ thing is now the maximization step, so we have
1359
+
1360
+ 0:40:26.131 --> 0:40:30.344
1361
+ now the probability of an alignment.
1362
+
1363
+ 0:40:31.451 --> 0:40:37.099
1364
+ Intuitively, that means how often are words
1365
+ aligned to each other giving this alignment
1366
+
1367
+ 0:40:37.099 --> 0:40:39.281
1368
+ or more in a perverse definition?
1369
+
1370
+ 0:40:39.281 --> 0:40:43.581
1371
+ What is the expectation value that they are
1372
+ aligned to each other?
1373
+
1374
+ 0:40:43.581 --> 0:40:49.613
1375
+ So if there's a lot of alignments with high probability
1376
+ that they're aligned to each other, then.
1377
+
1378
+ 0:40:50.050 --> 0:41:07.501
1379
+ So the count of E and F, given our parallel
1380
+ data is a sum of all possible alignments.
1381
+
1382
+ 0:41:07.968 --> 0:41:14.262
1383
+ That is, this count, and you don't do just
1384
+ count with absolute numbers, but you count
1385
+
1386
+ 0:41:14.262 --> 0:41:14.847
1387
+ always.
1388
+
1389
+ 0:41:15.815 --> 0:41:26.519
1390
+ And to make that translation probability is
1391
+ that you have to normalize it, of course, through:
1392
+
1393
+ 0:41:27.487 --> 0:41:30.584
1394
+ And that's then the whole model.
1395
+
1396
+ 0:41:31.111 --> 0:41:39.512
1397
+ It looks now maybe a bit mathematically complex.
1398
+
1399
+ 0:41:39.512 --> 0:41:47.398
1400
+ The whole training process is described here.
1401
+
1402
+ 0:41:47.627 --> 0:41:53.809
1403
+ So you really, really just have to collect
1404
+ these counts and later normalize that.
1405
+
1406
+ 0:41:54.134 --> 0:42:03.812
1407
+ So repeating that until convergence we have
1408
+ said, the EM algorithm is iterated again and again.
1409
+
1410
+ 0:42:04.204 --> 0:42:15.152
1411
+ Equally, then you go over all sentence pairs
1412
+ and all of words and calculate the translation.
1413
+
1414
+ 0:42:15.355 --> 0:42:17.983
1415
+ And then you go once again over.
1416
+
1417
+ 0:42:17.983 --> 0:42:22.522
1418
+ It counted this count, count given, and totally
1419
+ e-given.
1420
+
1421
+ 0:42:22.702 --> 0:42:35.316
1422
+ Initially how probable is the E translated
1423
+ to something else, and you normalize your translation
1424
+
1425
+ 0:42:35.316 --> 0:42:37.267
1426
+ probabilities.
1427
+
1428
+ 0:42:38.538 --> 0:42:45.761
1429
+ So this is an old training process for this
1430
+ type of model.
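As a sketch of this training loop (IBM Model 1 EM) on an assumed toy corpus -- the lecture's exact example words are only partly recoverable from the transcript:

```python
from collections import defaultdict

# Assumed toy corpus of (source, target) sentence pairs.
corpus = [("das Haus".split(), "the house".split()),
          ("das Buch".split(), "the book".split()),
          ("ein Buch".split(), "a book".split())]

src_vocab = {f for fs, _ in corpus for f in fs}
t = defaultdict(lambda: 1.0 / len(src_vocab))   # uniform initialization of t(e|f)

for _ in range(10):                              # repeat until convergence
    count = defaultdict(float)                   # expected counts c(e, f)
    total = defaultdict(float)                   # expected counts c(f)
    for fs, es in corpus:
        for e in es:
            z = sum(t[(e, f)] for f in fs)       # normalization for this target word
            for f in fs:
                count[(e, f)] += t[(e, f)] / z   # weighted (soft) co-occurrence
                total[f] += t[(e, f)] / z
    for (e, f), c in count.items():              # maximization: renormalize counts
        t[(e, f)] = c / total[f]

print(round(t[("house", "Haus")], 2), round(t[("the", "das")], 2))
```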
1431
+
1432
+ 0:42:46.166 --> 0:43:00.575
1433
+ How that then works is shown here a bit, so
1434
+ we have a very simple corpus.
1435
+
1436
+ 0:43:01.221 --> 0:43:12.522
1437
+ And as we said, you initialize your translation
1438
+ with yes or possible translations, so dusk
1439
+
1440
+ 0:43:12.522 --> 0:43:16.620
1441
+ can be aligned to the bookhouse.
1442
+
1443
+ 0:43:16.997 --> 0:43:25.867
1444
+ And the other ones are missing because only
1445
+ a curse with and book, and then the others
1446
+
1447
+ 0:43:25.867 --> 0:43:26.988
1448
+ will soon.
1449
+
1450
+ 0:43:27.127 --> 0:43:34.316
1451
+ In the initial way your vocabulary is four
1452
+ words, so the initial probabilities are all:
1453
+
1454
+ 0:43:34.794 --> 0:43:50.947
1455
+ And then if you iterate you see that the things
1456
+ which occur often and then get alignments get
1457
+
1458
+ 0:43:50.947 --> 0:43:53.525
1459
+ more and more.
1460
+
1461
+ 0:43:55.615 --> 0:44:01.506
1462
+ In reality, of course, you won't get like
1463
+ zero alignments, but you would normally get
1464
+
1465
+ 0:44:01.506 --> 0:44:02.671
1466
+ there sometimes.
1467
+
1468
+ 0:44:03.203 --> 0:44:05.534
1469
+ But as the probability increases.
1470
+
1471
+ 0:44:05.785 --> 0:44:17.181
1472
+ The training process is also guaranteed that
1473
+ the probability of your training data is always
1474
+
1475
+ 0:44:17.181 --> 0:44:20.122
1476
+ increased in iteration.
1477
+
1478
+ 0:44:21.421 --> 0:44:27.958
1479
+ You see that the model tries to model your
1480
+ training data and give you at least good models.
1481
+
1482
+ 0:44:30.130 --> 0:44:37.765
1483
+ Okay, are there any more questions to the
1484
+ training of these type of word-based models?
1485
+
1486
+ 0:44:38.838 --> 0:44:54.790
1487
+ Initially there are like four words on the source
1488
+ side, so it's just one fourth for the equal distribution.
1489
+
1490
+ 0:44:55.215 --> 0:45:01.888
1491
+ So each target word, the probability of the
1492
+ target word, is at four target words, so the
1493
+
1494
+ 0:45:01.888 --> 0:45:03.538
1495
+ uniform distribution.
1496
+
1497
+ 0:45:07.807 --> 0:45:14.430
1498
+ However, there is problems with this initial
1499
+ order and we have this already mentioned at
1500
+
1501
+ 0:45:14.430 --> 0:45:15.547
1502
+ the beginning.
1503
+
1504
+ 0:45:15.547 --> 0:45:21.872
1505
+ There is for example things that yeah you
1506
+ want to allow for reordering but there are
1507
+
1508
+ 0:45:21.872 --> 0:45:27.081
1509
+ definitely some alignments which should be
1510
+ more probable than others.
1511
+
1512
+ 0:45:27.347 --> 0:45:42.333
1513
+ So a friend visit should have a lower probability
1514
+ than visit a friend.
1515
+
1516
+ 0:45:42.302 --> 0:45:50.233
1517
+ It's not always monotone, there is some
1518
+ reordering happening, but if you just mix it
1519
+
1520
+ 0:45:50.233 --> 0:45:51.782
1521
+ crazy, it's not.
1522
+
1523
+ 0:45:52.252 --> 0:46:11.014
1524
+ You have things like one-to-many alignments
1525
+ and they are not really models.
1526
+
1527
+ 0:46:11.491 --> 0:46:17.066
1528
+ But it shouldn't be that you align one word
1529
+ to all the others, and that is, you don't want
1530
+
1531
+ 0:46:17.066 --> 0:46:18.659
1532
+ this type of probability.
1533
+
1534
+ 0:46:19.199 --> 0:46:27.879
1535
+ You don't want to align to null, so there's
1536
+ nothing about that and how to deal with other
1537
+
1538
+ 0:46:27.879 --> 0:46:30.386
1539
+ words on the source side.
1540
+
1541
+ 0:46:32.272 --> 0:46:45.074
1542
+ And therefore this was only like the initial
1543
+ model in there.
1544
+
1545
+ 0:46:45.325 --> 0:46:47.639
1546
+ Models, which we saw.
1547
+
1548
+ 0:46:47.639 --> 0:46:57.001
1549
+ They only model the translation probability,
1550
+ so how probable is it to translate one word
1551
+
1552
+ 0:46:57.001 --> 0:46:58.263
1553
+ to another?
1554
+
1555
+ 0:46:58.678 --> 0:47:05.915
1556
+ What you could then add is the absolute position.
1557
+
1558
+ 0:47:05.915 --> 0:47:16.481
1559
+ Yeah, the second word should more probable
1560
+ align to the second position.
1561
+
1562
+ 0:47:17.557 --> 0:47:22.767
1563
+ We add a fertility model that means one word
1564
+ is mostly translated into one word.
1565
+
1566
+ 0:47:23.523 --> 0:47:29.257
1567
+ For example, we saw it there that should be
1568
+ translated into two words, but most words should
1569
+
1570
+ 0:47:29.257 --> 0:47:32.463
1571
+ be one to one, and it's even modeled for each
1572
+ word.
1573
+
1574
+ 0:47:32.463 --> 0:47:37.889
1575
+ So for each source word, how probable is it
1576
+ that it is translated to one, two, three or
1577
+
1578
+ 0:47:37.889 --> 0:47:38.259
1579
+ more?
1580
+
1581
+ 0:47:40.620 --> 0:47:50.291
1582
+ Then IBM Model 4 adds relative positions,
1583
+ so it's asks: Maybe instead of modeling, how
1584
+
1585
+ 0:47:50.291 --> 0:47:55.433
1586
+ probable is it that you translate from position
1587
+ five to position twenty five?
1588
+
1589
+ 0:47:55.433 --> 0:48:01.367
1590
+ It's not a very good way, but in a relative
1591
+ position instead of what you try to model it.
1592
+
1593
+ 0:48:01.321 --> 0:48:06.472
1594
+ How probable is it that you are jumping three
1595
+ steps forward or three steps back?
1596
+
1597
+ 0:48:07.287 --> 0:48:15.285
1598
+ However, this makes sense more complex because
1599
+ what is a jump forward and a jump backward
1600
+
1601
+ 0:48:15.285 --> 0:48:16.885
1602
+ is not that easy.
1603
+
1604
+ 0:48:18.318 --> 0:48:30.423
1605
+ You want to have a model that describes reality,
1606
+ so every sentence that is not possible should
1607
+
1608
+ 0:48:30.423 --> 0:48:37.304
1609
+ have the probability zero because that cannot
1610
+ happen.
1611
+
1612
+ 0:48:37.837 --> 0:48:48.037
1613
+ However, with this type of IBM model four
1614
+ this has a positive probability, so it makes
1615
+
1616
+ 0:48:48.037 --> 0:48:54.251
1617
+ a sentence more complex and you can easily
1618
+ check it.
1619
+
1620
+ 0:48:57.457 --> 0:49:09.547
1621
+ So these models were the first models which
1622
+ tried to directly model and where they are
1623
+
1624
+ 0:49:09.547 --> 0:49:14.132
1625
+ the first to do the translation.
1626
+
1627
+ 0:49:14.414 --> 0:49:19.605
1628
+ So in all of these models, the probability
1629
+ of a word translating into another word is
1630
+
1631
+ 0:49:19.605 --> 0:49:25.339
1632
+ always independent of all the other translations,
1633
+ and that is a challenge because we know that
1634
+
1635
+ 0:49:25.339 --> 0:49:26.486
1636
+ this is not right.
1637
+
1638
+ 0:49:26.967 --> 0:49:32.342
1639
+ And therefore we will come now to then the
1640
+ phrase-based translation models.
1641
+
1642
+ 0:49:35.215 --> 0:49:42.057
1643
+ However, this word alignment is the very important
1644
+ concept which was used in phrase based.
1645
+
1646
+ 0:49:42.162 --> 0:49:50.559
1647
+ Even when people use phrase based, they first
1648
+ would always train a word based model not to
1649
+
1650
+ 0:49:50.559 --> 0:49:56.188
1651
+ get the real model but only to get this type
1652
+ of alignment.
1653
+
1654
+ 0:49:57.497 --> 0:50:01.343
1655
+ What was the main idea of a phrase based machine
1656
+ translation?
1657
+
1658
+ 0:50:03.223 --> 0:50:08.898
1659
+ It's not only that things got mathematically
1660
+ a lot more simple here because you don't try
1661
+
1662
+ 0:50:08.898 --> 0:50:13.628
1663
+ to express the whole translation process, but
1664
+ it's a discriminative model.
1665
+
1666
+ 0:50:13.628 --> 0:50:19.871
1667
+ So what you only try to model is this translation
1668
+ probability or is this translation more probable
1669
+
1670
+ 0:50:19.871 --> 0:50:20.943
1671
+ than some other.
1672
+
1673
+ 0:50:24.664 --> 0:50:28.542
1674
+ The main idea is that the basic units
1675
+ are the phrases.
1676
+
1677
+ 0:50:28.542 --> 0:50:31.500
1678
+ That's why it's called phrase-based translation.
1679
+
1680
+ 0:50:31.500 --> 0:50:35.444
1681
+ You have to be aware that these are not linguistic
1682
+ phrases.
1683
+
1684
+ 0:50:35.444 --> 0:50:39.124
1685
+ I guess you have some intuition about what
1686
+ is a phrase.
1687
+
1688
+ 0:50:39.399 --> 0:50:45.547
1689
+ You would express as a phrase.
1690
+
1691
+ 0:50:45.547 --> 0:50:58.836
1692
+ However, you wouldn't say that is a very good
1693
+ phrase because it's.
1694
+
1695
+ 0:50:59.339 --> 0:51:06.529
1696
+ However, in this machine learning-based motivated
1697
+ thing, phrases are just indicative.
1698
+
1699
+ 0:51:07.127 --> 0:51:08.832
1700
+ So it can be any split.
1701
+
1702
+ 0:51:08.832 --> 0:51:12.455
1703
+ We don't consider linguistically motivated
1704
+ or not.
1705
+
1706
+ 0:51:12.455 --> 0:51:15.226
1707
+ It can be any sequence of consecutive words.
1708
+
1709
+ 0:51:15.335 --> 0:51:16.842
1710
+ That's the Only Important Thing.
1711
+
1712
+ 0:51:16.977 --> 0:51:25.955
1713
+ The phrase is always a thing of consecutive
1714
+ words, and the motivation behind that is getting
1715
+
1716
+ 0:51:25.955 --> 0:51:27.403
1717
+ computational.
1718
+
1719
+ 0:51:27.387 --> 0:51:35.912
1720
+ People have looked into how you can also discontinuous
1721
+ phrases, which might be very helpful if you
1722
+
1723
+ 0:51:35.912 --> 0:51:38.237
1724
+ think about German harbor.
1725
+
1726
+ 0:51:38.237 --> 0:51:40.046
1727
+ Has this one phrase?
1728
+
1729
+ 0:51:40.000 --> 0:51:47.068
1730
+ There's two phrases, although there's many
1731
+ things in between, but in order to make things
1732
+
1733
+ 0:51:47.068 --> 0:51:52.330
1734
+ still possible and runnable, it's always
1735
+ like consecutive words.
1736
+
1737
+ 0:51:53.313 --> 0:52:05.450
1738
+ The nice thing is that on the one hand you
1739
+ don't need this word to word correspondence
1740
+
1741
+ 0:52:05.450 --> 0:52:06.706
1742
+ anymore.
1743
+
1744
+ 0:52:06.906 --> 0:52:17.088
1745
+ You now need to invent some type of alignment
1746
+ that in this case doesn't really make sense.
1747
+
1748
+ 0:52:17.417 --> 0:52:21.710
1749
+ So you can just learn okay, you have this
1750
+ phrase and this phrase and their translation.
1751
+
1752
+ 0:52:22.862 --> 0:52:25.989
1753
+ Secondly, we can add a bit of context into
1754
+ that.
1755
+
1756
+ 0:52:26.946 --> 0:52:43.782
1757
+ You're saying, for example, of Ultimate Customs
1758
+ and of My Shift.
1759
+
1760
+ 0:52:44.404 --> 0:52:51.443
1761
+ And this was difficult to model and work based
1762
+ models because they always model the translation.
1763
+
1764
+ 0:52:52.232 --> 0:52:57.877
1765
+ Here you can have phrases where you have more
1766
+ context and just jointly translate the phrases,
1767
+
1768
+ 0:52:57.877 --> 0:53:03.703
1769
+ and if you then have seen all by the question
1770
+ as a phrase you can directly use that to generate.
1771
+
1772
+ 0:53:08.468 --> 0:53:19.781
1773
+ Okay, before we go into how to do that, then
1774
+ we start, so the start is when we start with
1775
+
1776
+ 0:53:19.781 --> 0:53:21.667
1777
+ the alignment.
1778
+
1779
+ 0:53:22.022 --> 0:53:35.846
1780
+ So that is what we get from the word-based
1781
+ model and we are assuming to get the.
1782
+
1783
+ 0:53:36.356 --> 0:53:40.786
1784
+ So that is your starting point.
1785
+
1786
+ 0:53:40.786 --> 0:53:47.846
1787
+ You have a certain sentence and one most probable.
1788
+
1789
+ 0:53:48.989 --> 0:54:11.419
1790
+ The challenge you now have is that these alignments
1791
+ are: On the one hand, a source word like hit
1792
+
1793
+ 0:54:11.419 --> 0:54:19.977
1794
+ several times with one source word can be aligned
1795
+ to several: So in this case you see that for
1796
+
1797
+ 0:54:19.977 --> 0:54:29.594
1798
+ example Bisher is aligned to three words, so
1799
+ this can be the alignment from English to German,
1800
+
1801
+ 0:54:29.594 --> 0:54:32.833
1802
+ but it cannot be the alignment.
1803
+
1804
+ 0:54:33.273 --> 0:54:41.024
1805
+ In order to address for this inconsistency
1806
+ and being able to do that, what you typically
1807
+
1808
+ 0:54:41.024 --> 0:54:49.221
1809
+ then do is: If you have this inconsistency
1810
+ and you get different things in both directions,.
1811
+
1812
+ 0:54:54.774 --> 0:55:01.418
1813
+ In machine translation to do that you just
1814
+ do it in both directions and somehow combine
1815
+
1816
+ 0:55:01.418 --> 0:55:08.363
1817
+ them because both will make errors and the hope
1818
+ is yeah if you know both things you minimize.
1819
+
1820
+ 0:55:08.648 --> 0:55:20.060
1821
+ So you would also do it in the other direction
1822
+ and get a different type of alignment, for example
1823
+
1824
+ 0:55:20.060 --> 0:55:22.822
1825
+ that you now have saw.
1826
+
1827
+ 0:55:23.323 --> 0:55:37.135
1828
+ So in this way you are having two alignments
1829
+ and the question is now how do get one alignment
1830
+
1831
+ 0:55:37.135 --> 0:55:38.605
1832
+ and what?
1833
+
1834
+ 0:55:38.638 --> 0:55:45.828
1835
+ There were a lot of different types of heuristics.
1836
+
1837
+ 0:55:45.828 --> 0:55:55.556
1838
+ They normally start with intersection because
1839
+ you should trust them.
1840
+
1841
+ 0:55:55.996 --> 0:55:59.661
1842
+ And your maximum will could take this, the
1843
+ union thought,.
1844
+
1845
+ 0:55:59.980 --> 0:56:04.679
1846
+ If one of the systems says they are not aligned
1847
+ then maybe you should not align them.
1848
+
1849
+ 0:56:05.986 --> 0:56:12.240
1850
+ The only question they are different is what
1851
+ should I do about things where they don't agree?
1852
+
1853
+ 0:56:12.240 --> 0:56:18.096
1854
+ So where only one of them aligns and then
1855
+ you have heuristics depending on other words
1856
+
1857
+ 0:56:18.096 --> 0:56:22.288
1858
+ around it, you can decide should I align them
1859
+ or should I not.
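A tiny sketch of this combination step, with a made-up pair of alignments: the intersection is what you trust, and the union minus the intersection is what the heuristics have to decide about.

```python
# Alignment points as (source_index, target_index); both sets are invented.
src_to_tgt = {(0, 0), (1, 2), (2, 1), (3, 1)}
tgt_to_src = {(0, 0), (1, 2), (3, 1), (3, 3)}

intersection = src_to_tgt & tgt_to_src      # high-precision points
union = src_to_tgt | tgt_to_src             # upper bound for the final alignment

print(sorted(intersection))
print(sorted(union - intersection))         # points left to the heuristics
```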
1860
+
1861
+ 0:56:24.804 --> 0:56:34.728
1862
+ So that is your first step and then the second
1863
+ step in your model.
1864
+
1865
+ 0:56:34.728 --> 0:56:41.689
1866
+ So now you have one alignment for the process.
1867
+
1868
+ 0:56:42.042 --> 0:56:47.918
1869
+ And the idea is that we will now extract all
1870
+ phrase pairs to combinations of source and
1871
+
1872
+ 0:56:47.918 --> 0:56:51.858
1873
+ target phrases where they are consistent within
1874
+ alignment.
1875
+
1876
+ 0:56:52.152 --> 0:56:57.980
1877
+ The idea is a consistence with an alignment
1878
+ that should be a good example and that we can
1879
+
1880
+ 0:56:57.980 --> 0:56:58.563
1881
+ extract.
1882
+
1883
+ 0:56:59.459 --> 0:57:14.533
1884
+ And there are three conditions where we say
1885
+ an alignment has to be consistent.
1886
+
1887
+ 0:57:14.533 --> 0:57:17.968
1888
+ The first one is.
1889
+
1890
+ 0:57:18.318 --> 0:57:24.774
1891
+ So if you add bisher, then it's in your phrase.
1892
+
1893
+ 0:57:24.774 --> 0:57:32.306
1894
+ All the three words up till and now should
1895
+ be in there.
1896
+
1897
+ 0:57:32.492 --> 0:57:42.328
1898
+ So Bisheret Till would not be a valid phrase
1899
+ pair in this case, but for example Bisheret
1900
+
1901
+ 0:57:42.328 --> 0:57:43.433
1902
+ Till now.
1903
+
1904
+ 0:57:45.525 --> 0:58:04.090
1905
+ Does anybody now have already an idea about
1906
+ the second rule that should be there?
1907
+
1908
+ 0:58:05.325 --> 0:58:10.529
1909
+ Yes, that is exactly the other thing.
1910
+
1911
+ 0:58:10.529 --> 0:58:22.642
1912
+ If a target word is in the phrase pair, there
1913
+ are also: Then there is one very obvious one.
1914
+
1915
+ 0:58:22.642 --> 0:58:28.401
1916
+ If you strike a phrase pair, at least one
1917
+ word in the phrase.
1918
+
1919
+ 0:58:29.069 --> 0:58:32.686
1920
+ And this is a knife with working.
1921
+
1922
+ 0:58:32.686 --> 0:58:40.026
1923
+ However, in reality a captain will select
1924
+ some part of the sentence.
1925
+
1926
+ 0:58:40.380 --> 0:58:47.416
1927
+ You can take any possible combination of source
1928
+ and target words for this part, and that of
1929
+
1930
+ 0:58:47.416 --> 0:58:54.222
1931
+ course is not very helpful because you just
1932
+ have no idea, and therefore it says at least
1933
+
1934
+ 0:58:54.222 --> 0:58:58.735
1935
+ one source word should be aligned to one target word
1936
+ to prevent.
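These three conditions can be written down directly; a sketch with 0-based word positions and an invented example alignment, not the exact extraction code used in practice:

```python
def extract_phrases(n_src, n_tgt, links, max_len=4):
    """links: set of (src_index, tgt_index) word-alignment points."""
    phrases = []
    for s1 in range(n_src):
        for s2 in range(s1, min(s1 + max_len, n_src)):
            for t1 in range(n_tgt):
                for t2 in range(t1, min(t1 + max_len, n_tgt)):
                    # condition 3: at least one alignment point inside the box
                    inside = any(s1 <= s <= s2 and t1 <= t <= t2 for s, t in links)
                    # conditions 1 and 2: no point may cross the box boundary,
                    # i.e. be inside on one side but outside on the other
                    crossing = any((s1 <= s <= s2) != (t1 <= t <= t2) for s, t in links)
                    if inside and not crossing:
                        phrases.append(((s1, s2), (t1, t2)))
    return phrases

# Three source and three target words, aligned 0-0, 1-2, 2-1.
print(extract_phrases(3, 3, {(0, 0), (1, 2), (2, 1)}))
```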
1937
+
1938
+ 0:58:59.399 --> 0:59:09.615
1939
+ But still, it means that if you have normally
1940
+ analyzed words, the more analyzed words you
1941
+
1942
+ 0:59:09.615 --> 0:59:10.183
1943
+ can.
1944
+
1945
+ 0:59:10.630 --> 0:59:13.088
1946
+ That's not true for the very extreme case.
1947
+
1948
+ 0:59:13.088 --> 0:59:17.603
1949
+ If no word is aligned you can extract nothing
1950
+ because you can never fulfill it.
1951
+
1952
+ 0:59:17.603 --> 0:59:23.376
1953
+ However, if only for example one word is aligned
1954
+ then you can align a lot of different possibilities
1955
+
1956
+ 0:59:23.376 --> 0:59:28.977
1957
+ because you can start with this word and then
1958
+ add source words or target words or any combination
1959
+
1960
+ 0:59:28.977 --> 0:59:29.606
1961
+ of source.
1962
+
1963
+ 0:59:30.410 --> 0:59:37.585
1964
+ So there was typically a problem that if you
1965
+ have too few works in light you can really
1966
+
1967
+ 0:59:37.585 --> 0:59:38.319
1968
+ extract.
1969
+
1970
+ 0:59:38.558 --> 0:59:45.787
1971
+ If you think about this already here you can
1972
+ extract very, very many phrase pairs from:
1973
+
1974
+ 0:59:45.845 --> 0:59:55.476
1975
+ So what you can extract is, for example, what
1976
+ we saw up and so on.
1977
+
1978
+ 0:59:55.476 --> 1:00:00.363
1979
+ So all of them will be extracted.
1980
+
1981
+ 1:00:00.400 --> 1:00:08.379
1982
+ In order to limit this you typically have
1983
+ a length limit so you can only extract phrases
1984
+
1985
+ 1:00:08.379 --> 1:00:08.738
1986
+ up.
1987
+
1988
+ 1:00:09.049 --> 1:00:18.328
1989
+ But still there these phrases where you have
1990
+ all these phrases extracted.
1991
+
1992
+ 1:00:18.328 --> 1:00:22.968
1993
+ You have to think about how to deal.
1994
+
1995
+ 1:00:26.366 --> 1:00:34.966
1996
+ Now we have the phrases, so the other question
1997
+ is what is a good phrase pair and not so good.
1998
+
1999
+ 1:00:35.255 --> 1:00:39.933
2000
+ You might be that you sometimes extract one
2001
+ which is explaining this sentence but is not
2002
+
2003
+ 1:00:39.933 --> 1:00:44.769
2004
+ really a good one because there is something
2005
+ ever in there or something special so it might
2006
+
2007
+ 1:00:44.769 --> 1:00:47.239
2008
+ not be a good phrase pair in another situation.
2009
+
2010
+ 1:00:49.629 --> 1:00:59.752
2011
+ And therefore the easiest thing is again just
2012
+ count, and if a phrase pair occurs very often
2013
+
2014
+ 1:00:59.752 --> 1:01:03.273
2015
+ seems to be a good phrase pair.
2016
+
2017
+ 1:01:03.743 --> 1:01:05.185
2018
+ So if we have this one.
2019
+
2020
+ 1:01:05.665 --> 1:01:09.179
2021
+ And if you have the exam up till now,.
2022
+
2023
+ 1:01:09.469 --> 1:01:20.759
2024
+ Then you look how often does up till now to
2025
+ this hair occur?
2026
+
2027
+ 1:01:20.759 --> 1:01:28.533
2028
+ How often does up until now to this hair?
2029
+
2030
+ 1:01:30.090 --> 1:01:36.426
2031
+ So this is one way of yeah describing the
2032
+ quality of the phrase pair.
2033
+
2034
+ 1:01:37.257 --> 1:01:47.456
2035
+ So one difference is now, and that is the
2036
+ advantage of these primitive models.
2037
+
2038
+ 1:01:47.867 --> 1:01:55.442
2039
+ But instead we are trying to have a lot of
2040
+ features describing how good a phrase pair
2041
+
2042
+ 1:01:55.442 --> 1:01:55.786
2043
+ is.
2044
+
2045
+ 1:01:55.786 --> 1:02:04.211
2046
+ One of these features is this one describing:
2047
+ But in this model we'll later see how to combine
2048
+
2049
+ 1:02:04.211 --> 1:02:04.515
2050
+ it.
2051
+
2052
+ 1:02:04.515 --> 1:02:10.987
2053
+ The nice thing is we can invent any other
2054
+ type of features and add that and normally
2055
+
2056
+ 1:02:10.987 --> 1:02:14.870
2057
+ if you have two or three metrics to describe
2058
+ then.
2059
+
2060
+ 1:02:15.435 --> 1:02:18.393
2061
+ And therefore the spray spray sprays.
2062
+
2063
+ 1:02:18.393 --> 1:02:23.220
2064
+ They were not only like evaluated by one type
2065
+ but by several.
2066
+
2067
+ 1:02:23.763 --> 1:02:36.580
2068
+ So this could, for example, have a problem
2069
+ because your target phrase here occurs only
2070
+
2071
+ 1:02:36.580 --> 1:02:37.464
2072
+ once.
2073
+
2074
+ 1:02:38.398 --> 1:02:46.026
2075
+ It will of course only occur with one other
2076
+ source trait, and that probability will be
2077
+
2078
+ 1:02:46.026 --> 1:02:53.040
2079
+ one which might not be a very good estimation
2080
+ because you've only seen it once.
2081
+
2082
+ 1:02:53.533 --> 1:02:58.856
2083
+ Therefore, we use additional ones to better
2084
+ deal with that, and the first thing is we're
2085
+
2086
+ 1:02:58.856 --> 1:02:59.634
2087
+ doing again.
2088
+
2089
+ 1:02:59.634 --> 1:03:01.129
2090
+ Yeah, we know it by now.
2091
+
2092
+ 1:03:01.129 --> 1:03:06.692
2093
+ If you look at it in the one direction, it's
2094
+ helpful to us to look into the other direction.
2095
+
2096
+ 1:03:06.692 --> 1:03:11.297
2097
+ So you take also the inverse probability,
2098
+ so you not only take P of E given
2099
+
2100
+ 1:03:11.297 --> 1:03:11.477
2101
+ G.
2102
+
2103
+ 1:03:11.477 --> 1:03:11.656
2104
+ M.
2105
+
2106
+ 1:03:11.656 --> 1:03:12.972
2107
+ F, but also P of F given E.
2108
+
2109
+ 1:03:13.693 --> 1:03:19.933
2110
+ And then in addition you say maybe for the
2111
+ especially prolonged phrases they occur rarely,
2112
+
2113
+ 1:03:19.933 --> 1:03:25.898
2114
+ and then you have very high probabilities,
2115
+ and that might not be always the right one.
2116
+
2117
+ 1:03:25.898 --> 1:03:32.138
2118
+ So maybe it's good to also look at the word
2119
+ based probabilities to represent how good they
2120
+
2121
+ 1:03:32.138 --> 1:03:32.480
2122
+ are.
2123
+
2124
+ 1:03:32.692 --> 1:03:44.202
2125
+ So in addition you take the word-based probabilities
2126
+ of this phrase pair as an additional model.
2127
+
2128
+ 1:03:44.704 --> 1:03:52.828
2129
+ So then you would have in total four different
2130
+ values describing how good the phrase is.
2131
+
2132
+ 1:03:52.828 --> 1:04:00.952
2133
+ It would be the relative frequencies in
2134
+ both directions and the lexical probabilities.
2135
+
2136
+ 1:04:01.361 --> 1:04:08.515
2137
+ So four values in describing how probable
2138
+ a phrase translation is.
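A sketch of the two relative-frequency scores, computed from extraction counts in both directions (the counts are invented):

```python
from collections import Counter

pair_count = Counter({("bisher", "up till now"): 20,
                      ("bisher", "until now"): 5,
                      ("bis jetzt", "up till now"): 5})
src_total, tgt_total = Counter(), Counter()
for (f, e), c in pair_count.items():
    src_total[f] += c
    tgt_total[e] += c

def phi_e_given_f(f, e):                 # relative frequency, source -> target
    return pair_count[(f, e)] / src_total[f]

def phi_f_given_e(f, e):                 # relative frequency, target -> source
    return pair_count[(f, e)] / tgt_total[e]

print(phi_e_given_f("bisher", "up till now"))   # 20 / 25 = 0.8
print(phi_f_given_e("bisher", "up till now"))   # 20 / 25 = 0.8
```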
2139
+
2140
+ 1:04:11.871 --> 1:04:20.419
2141
+ Then the next challenge is how can we combine
2142
+ these different types of probabilities into
2143
+
2144
+ 1:04:20.419 --> 1:04:23.458
2145
+ a global score saying how good?
2146
+
2147
+ 1:04:24.424 --> 1:04:36.259
2148
+ Model, but before we are doing that give any
2149
+ questions to this phrase extraction and phrase
2150
+
2151
+ 1:04:36.259 --> 1:04:37.546
2152
+ creation.
2153
+
2154
+ 1:04:40.260 --> 1:04:44.961
2155
+ And the motivation for that this was our initial
2156
+ model.
2157
+
2158
+ 1:04:44.961 --> 1:04:52.937
2159
+ If you remember from the beginning of a lecture
2160
+ we had the probability of P of F given E times
2161
+
2162
+ 1:04:52.937 --> 1:04:53.357
2163
+ P of E.
2164
+
2165
+ 1:04:55.155 --> 1:04:57.051
2166
+ Now the problem is here.
2167
+
2168
+ 1:04:57.051 --> 1:04:59.100
2169
+ That is, of course, right.
2170
+
2171
+ 1:04:59.100 --> 1:05:06.231
2172
+ However, we have done a lot of simplification
2173
+ that the translation probability is independent
2174
+
2175
+ 1:05:06.231 --> 1:05:08.204
2176
+ of the other translation.
2177
+
2178
+ 1:05:08.628 --> 1:05:14.609
2179
+ So therefore our estimations of pH give me
2180
+ and pH might not be right, and therefore the
2181
+
2182
+ 1:05:14.609 --> 1:05:16.784
2183
+ combination might not be right.
2184
+
2185
+ 1:05:17.317 --> 1:05:22.499
2186
+ So it can be that, for example, at the edge
2187
+ you have a fluent but not accurate translation.
2188
+
2189
+ 1:05:22.782 --> 1:05:25.909
2190
+ And Then There's Could Be an Easy Way Around
2191
+ It.
2192
+
2193
+ 1:05:26.126 --> 1:05:32.019
2194
+ If it is fluent but not accurate, it might
2195
+ be that we put too much effort on the language
2196
+
2197
+ 1:05:32.019 --> 1:05:36.341
2198
+ model and we are putting too few effort on
2199
+ the translation model.
2200
+
2201
+ 1:05:36.936 --> 1:05:43.016
2202
+ There we can wait a minute so we can do this
2203
+ a bit stronger.
2204
+
2205
+ 1:05:43.016 --> 1:05:46.305
2206
+ This one is more important than.
2207
+
2208
+ 1:05:48.528 --> 1:05:53.511
2209
+ And based on that we can extend this idea
2210
+ to the log-linear model.
2211
+
2212
+ 1:05:53.893 --> 1:06:02.164
2213
+ The log linear model now says all the translation
2214
+ probabilities is just we have.
2215
+
2216
+ 1:06:02.082 --> 1:06:09.230
2217
+ Describing how good this translation process
2218
+ is, these are the features H which depend on
2219
+
2220
+ 1:06:09.230 --> 1:06:09.468
2221
+ E.
2222
+
2223
+ 1:06:09.468 --> 1:06:09.706
2224
+ F.
2225
+
2226
+ 1:06:09.706 --> 1:06:13.280
2227
+ Only one of them, but generally depend on
2228
+ E.
2229
+
2230
+ 1:06:13.280 --> 1:06:13.518
2231
+ E.
2232
+
2233
+ 1:06:13.518 --> 1:06:13.757
2234
+ E.
2235
+
2236
+ 1:06:13.757 --> 1:06:13.995
2237
+ N.
2238
+
2239
+ 1:06:13.995 --> 1:06:14.233
2240
+ F.
2241
+
2242
+ 1:06:14.474 --> 1:06:22.393
2243
+ Each of these features has a weight saying
2244
+ yeah how good does it model it so that if you're
2245
+
2246
+ 1:06:22.393 --> 1:06:29.968
2247
+ asking a lot of people about some opinion it
2248
+ might also be waiting some opinion more so
2249
+
2250
+ 1:06:29.968 --> 1:06:34.100
2251
+ I put more effort on that and he may not be
2252
+ so.
2253
+
2254
+ 1:06:34.314 --> 1:06:39.239
2255
+ If you're saying that it's maybe a good indication,
2256
+ yeah, would trust that much.
2257
+
2258
+ 1:06:39.559 --> 1:06:41.380
2259
+ And exactly you can do that for you too.
2260
+
2261
+ 1:06:41.380 --> 1:06:42.446
2262
+ You can't add no below.
2263
+
2264
+ 1:06:43.423 --> 1:07:01.965
2265
+ It's like depending on how many you want to
2266
+ have and each of the features gives you value.
2267
+
2268
+ 1:07:02.102 --> 1:07:12.655
2269
+ The nice thing is that we can normally ignore
2270
+ because we are not interested in the probability
2271
+
2272
+ 1:07:12.655 --> 1:07:13.544
2273
+ itself.
2274
+
2275
+ 1:07:13.733 --> 1:07:18.640
2276
+ And again, if that's not normalized, that's
2277
+ fine.
2278
+
2279
+ 1:07:18.640 --> 1:07:23.841
2280
+ So if this value is the highest, that's the
2281
+ highest.
2282
+
2283
+ 1:07:26.987 --> 1:07:29.302
2284
+ Can we do that?
2285
+
2286
+ 1:07:29.302 --> 1:07:34.510
2287
+ Let's start with two simple things.
2288
+
2289
+ 1:07:34.510 --> 1:07:39.864
2290
+ Then you have one translation model.
2291
+
2292
+ 1:07:40.000 --> 1:07:43.102
2293
+ Which gives you the P of E given F.
2294
+
2295
+ 1:07:43.383 --> 1:07:49.203
2296
+ It can be typically as a feature it would
2297
+ take the logarithm of this probability, so minus
2298
+
2299
+ 1:07:49.203 --> 1:07:51.478
2300
+ nine point four seven.
2301
+
2302
+ 1:07:51.451 --> 1:07:57.846
2303
+ And the language model which says you how
2304
+ clue in the English side is how you can calculate
2305
+
2306
+ 1:07:57.846 --> 1:07:59.028
2307
+ the probability.
2308
+
2309
+ 1:07:58.979 --> 1:08:03.129
2310
+ In some future lectures we'll give you all
2311
+ superbology.
2312
+
2313
+ 1:08:03.129 --> 1:08:10.465
2314
+ You can feature again the luck of the purbology,
2315
+ then you have minus seven and then give different
2316
+
2317
+ 1:08:10.465 --> 1:08:11.725
2318
+ weights to them.
2319
+
2320
+ 1:08:12.292 --> 1:08:19.243
2321
+ And that means that your probability is one
2322
+ divided by Z to the power of this.
2323
+
2324
+ 1:08:20.840 --> 1:08:38.853
2325
+ You're not really interested in the probability,
2326
+ so you just calculate the score in the exponent.
2327
+
2328
+ 1:08:40.000 --> 1:08:41.668
2329
+ Maximal Maximal I Think.
2330
+
2331
+ 1:08:42.122 --> 1:08:57.445
2332
+ You can, for example, try different translations,
2333
+ calculate all their scores and take in the
2334
+
2335
+ 1:08:57.445 --> 1:09:00.905
2336
+ end the translation.
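A sketch of that selection step: every hypothesis has its feature values (here just a translation-model and a language-model log probability, numbers and weights invented), the score is the weighted sum, and the argmax does not need the normalization constant:

```python
weights = {"tm": 1.0, "lm": 0.6}                      # lambda_m, picked arbitrarily

hypotheses = {
    "I visit a friend":       {"tm": -9.5, "lm": -7.1},
    "I am visiting a friend": {"tm": -11.2, "lm": -6.4},
}

def score(h):
    return sum(weights[m] * value for m, value in h.items())

best = max(hypotheses, key=lambda text: score(hypotheses[text]))
print(best, score(hypotheses[best]))
```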
2337
+
2338
+ 1:09:03.423 --> 1:09:04.661
2339
+ Why to do that.
2340
+
2341
+ 1:09:05.986 --> 1:09:10.698
2342
+ We've done that now for two, but of course
2343
+ you cannot only do it with two.
2344
+
2345
+ 1:09:10.698 --> 1:09:16.352
2346
+ You can do it now with any fixed number, so
2347
+ of course you have to decide in the beginning
2348
+
2349
+ 1:09:16.352 --> 1:09:21.944
2350
+ I want to have ten features or something like
2351
+ that, but you can take all these features.
2352
+
2353
+ 1:09:22.002 --> 1:09:29.378
2354
+ And yeah, based on them, they calculate your
2355
+ model probability or the model score.
2356
+
2357
+ 1:09:31.031 --> 1:09:40.849
2358
+ A big advantage over the initial.
2359
+
2360
+ 1:09:40.580 --> 1:09:45.506
2361
+ A model because now we can add a lot of features
2362
+ and there was diamond machine translation,
2363
+
2364
+ 1:09:45.506 --> 1:09:47.380
2365
+ a statistical machine translation.
2366
+
2367
+ 1:09:47.647 --> 1:09:57.063
2368
+ So how can develop new features, new ways
2369
+ of evaluating them so that can hopefully better
2370
+
2371
+ 1:09:57.063 --> 1:10:00.725
2372
+ describe what is good translation?
2373
+
2374
+ 1:10:01.001 --> 1:10:16.916
2375
+ If you have a new great feature you can calculate
2376
+ these features and then how much better do
2377
+
2378
+ 1:10:16.916 --> 1:10:18.969
2379
+ they model?
2380
+
2381
+ 1:10:21.741 --> 1:10:27.903
2382
+ There is one challenge which haven't touched
2383
+ upon yet.
2384
+
2385
+ 1:10:27.903 --> 1:10:33.505
2386
+ So could you easily build your model if you
2387
+ have.
2388
+
2389
+ 1:10:38.999 --> 1:10:43.016
2390
+ Assumed here something which just gazed, but
2391
+ which might not be that easy.
2392
+
2393
+ 1:10:49.990 --> 1:10:56.333
2394
+ The weight for the translation model is and
2395
+ the weight for the language model is.
2396
+
2397
+ 1:10:56.716 --> 1:11:08.030
2398
+ That's a bit arbitrary, so why should you
2399
+ use this one and guess normally you won't be
2400
+
2401
+ 1:11:08.030 --> 1:11:11.801
2402
+ able to select that by hand?
2403
+
2404
+ 1:11:11.992 --> 1:11:19.123
2405
+ So typically we didn't have like or features
2406
+ in there, but features is very common.
2407
+
2408
+ 1:11:19.779 --> 1:11:21.711
2409
+ So how do you select them?
2410
+
2411
+ 1:11:21.711 --> 1:11:24.645
2412
+ There was a second part of the training.
2413
+
2414
+ 1:11:24.645 --> 1:11:27.507
2415
+ These models were trained in two steps.
2416
+
2417
+ 1:11:27.507 --> 1:11:32.302
2418
+ On the one hand, we had the training of the
2419
+ individual components.
2420
+
2421
+ 1:11:32.302 --> 1:11:38.169
2422
+ We saw that now how to build the phrase based
2423
+ system, how to extract the phrases.
2424
+
2425
+ 1:11:38.738 --> 1:11:46.223
2426
+ But then if you have these different components
2427
+ you need a second training to learn the optimal.
2428
+
2429
+ 1:11:46.926 --> 1:11:51.158
2430
+ And typically this is referred to as the tuning
2431
+ of the system.
2432
+
2433
+ 1:11:51.431 --> 1:12:07.030
2434
+ So now if you have different types of models
2435
+ describing what a good translation is you need
2436
+
2437
+ 1:12:07.030 --> 1:12:10.760
2438
+ to find good weights.
2439
+
2440
+ 1:12:12.312 --> 1:12:14.315
2441
+ So how can you do it?
2442
+
2443
+ 1:12:14.315 --> 1:12:20.871
2444
+ The easiest thing is, of course, you can just
2445
+ try different things out.
2446
+
2447
+ 1:12:21.121 --> 1:12:27.496
2448
+ You can then always select the best
2449
+ hypothesis.
2450
+
2451
+ 1:12:27.496 --> 1:12:38.089
2452
+ You can evaluate it with some metrics saying:
2453
+ You can score all your outputs, always select
2454
+
2455
+ 1:12:38.089 --> 1:12:42.543
2456
+ the best one and then get this translation.
2457
+
2458
+ 1:12:42.983 --> 1:12:45.930
2459
+ And you can do that for a lot of different
2460
+ possible combinations.
2461
+
2462
+ 1:12:47.067 --> 1:12:59.179
2463
+ However, the challenge is the complexity,
2464
+ so if you have only parameters and each of
2465
+
2466
+ 1:12:59.179 --> 1:13:04.166
2467
+ them has values you try for, then.
2468
+
2469
+ 1:13:04.804 --> 1:13:16.895
2470
+ We won't be able to try all of these possible
2471
+ combinations, so what we have to do is some
2472
+
2473
+ 1:13:16.895 --> 1:13:19.313
2474
+ more intelligent.
2475
+
2476
+ 1:13:20.540 --> 1:13:34.027
2477
+ And what has been done there in machine translation
2478
+ is referred to as a minimum error rate training.
2479
+
2480
+ 1:13:34.534 --> 1:13:41.743
2481
+ Whole surge is a very intuitive one, so have
2482
+ all these different parameters, so how do.
2483
+
2484
+ 1:13:42.522 --> 1:13:44.358
2485
+ And the idea is okay.
2486
+
2487
+ 1:13:44.358 --> 1:13:52.121
2488
+ I start with an initial guess and then I optimize
2489
+ one single parameter that's always easier.
2490
+
2491
+ 1:13:52.121 --> 1:13:54.041
2492
+ That's some or linear.
2493
+
2494
+ 1:13:54.041 --> 1:13:58.882
2495
+ So you're searching the best value for the
2496
+ one parameter.
2497
+
2498
+ 1:13:59.759 --> 1:14:04.130
2499
+ Often visualized with a San Francisco map.
2500
+
2501
+ 1:14:04.130 --> 1:14:13.786
2502
+ Just imagine if you want to go to the highest
2503
+ spot in San Francisco, you're standing somewhere
2504
+
2505
+ 1:14:13.786 --> 1:14:14.395
2506
+ here.
2507
+
2508
+ 1:14:14.574 --> 1:14:21.220
2509
+ You are switching your dimensions so you are
2510
+ going in this direction again finding.
2511
+
2512
+ 1:14:21.661 --> 1:14:33.804
2513
+ Now you're on a different street and this
2514
+ one is not a different one so you go in here
2515
+
2516
+ 1:14:33.804 --> 1:14:36.736
2517
+ so you can iterate.
2518
+
2519
+ 1:14:36.977 --> 1:14:56.368
2520
+ The one thing of course is find a local optimum,
2521
+ especially if you start in two different positions.
2522
+
2523
+ 1:14:56.536 --> 1:15:10.030
2524
+ So yeah, there is a heuristic in there, so
2525
+ typically it's done again if you land in different
2526
+
2527
+ 1:15:10.030 --> 1:15:16.059
2528
+ positions with different starting points.
2529
+
2530
+ 1:15:16.516 --> 1:15:29.585
2531
+ What is different or what is like the addition
2532
+ of minimum error rate training compared to the standard?
2533
+
2534
+ 1:15:29.729 --> 1:15:37.806
2535
+ So the question is, like we said, you can
2536
+ now evaluate different values for one parameter.
2537
+
2538
+ 1:15:38.918 --> 1:15:42.857
2539
+ And the question is: Which values should you
2540
+ try out for one parameters?
2541
+
2542
+ 1:15:42.857 --> 1:15:47.281
2543
+ Should you just do zero point one, zero point
2544
+ two, zero point three, or anything?
2545
+
2546
+ 1:15:49.029 --> 1:16:03.880
2547
+ If you change only one parameter then you
2548
+ can define the score of translation as a linear
2549
+
2550
+ 1:16:03.880 --> 1:16:05.530
2551
+ function.
2552
+
2553
+ 1:16:05.945 --> 1:16:17.258
2554
+ That this is the one that possesses, and yet
2555
+ if you change the parameter, the score of this.
2556
+
2557
+ 1:16:17.397 --> 1:16:26.506
2558
+ It may depend so your score is there because
2559
+ the rest you don't change your feature value.
2560
+
2561
+ 1:16:26.826 --> 1:16:30.100
2562
+ And the feature value is therefore the steepness
2563
+ of the curve.
2564
+
2565
+ 1:16:30.750 --> 1:16:38.887
2566
+ And now look at different possible translations.
2567
+
2568
+ 1:16:38.887 --> 1:16:46.692
2569
+ Therefore, how they go up here is differently.
2570
+
2571
+ 1:16:47.247 --> 1:16:59.289
2572
+ So in this case if you look at the minimum
2573
+ score so there should be as minimum.
2574
+
2575
+ 1:17:00.300 --> 1:17:10.642
2576
+ So it's enough to check once here and check
2577
+ once here because if you check here and here.
2578
+
2579
+ 1:17:11.111 --> 1:17:24.941
2580
+ And that is the idea in minimum error rate training
2581
+ when you select different hypotheses.
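A sketch of that observation: with one weight varying, every hypothesis' model score is a line, the best hypothesis can only change where two lines cross, so it is enough to probe one point per interval (all numbers invented; the real procedure would also evaluate the error metric in each interval):

```python
# Each hypothesis: (name, fixed_part, feature_value) so that score = a + b * lam.
hyps = [("hyp1", 2.0, -1.0), ("hyp2", 0.5, 0.5), ("hyp3", 1.0, 0.1)]

def best_at(lam):
    return max(hyps, key=lambda h: h[1] + h[2] * lam)[0]

# Intersection points of all pairs of lines: only there can the best one change.
crossings = sorted((a2 - a1) / (b1 - b2)
                   for i, (_, a1, b1) in enumerate(hyps)
                   for _, a2, b2 in hyps[i + 1:]
                   if b1 != b2)

probes = [crossings[0] - 1.0]
probes += [(x + y) / 2 for x, y in zip(crossings, crossings[1:])]
probes += [crossings[-1] + 1.0]
print([(round(lam, 2), best_at(lam)) for lam in probes])
```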
2582
+
2583
+ 1:17:29.309 --> 1:17:34.378
2584
+ So in yeah, the minimum error rate training
2585
+ is a Powell search.
2586
+
2587
+ 1:17:34.378 --> 1:17:37.453
2588
+ Then we do an intelligent step size.
2589
+
2590
+ 1:17:37.453 --> 1:17:39.364
2591
+ We do random restarts.
2592
+
2593
+ 1:17:39.364 --> 1:17:46.428
2594
+ Then things are still too slow because it
2595
+ might say we would have to decode a lot of
2596
+
2597
+ 1:17:46.428 --> 1:17:47.009
2598
+ times.
2599
+
2600
+ 1:17:46.987 --> 1:17:54.460
2601
+ So what we can do to make things even faster
2602
+ is we are decoding once with the current parameters,
2603
+
2604
+ 1:17:54.460 --> 1:18:01.248
2605
+ but then we are not generating only the most
2606
+ probable translation, but we are generating
2607
+
2608
+ 1:18:01.248 --> 1:18:05.061
2609
+ the most probable ten hundred translations
2610
+ or so.
2611
+
2612
+ 1:18:06.006 --> 1:18:18.338
2613
+ And then we are optimizing our weights by
2614
+ only looking at these one hundred translations
2615
+
2616
+ 1:18:18.338 --> 1:18:23.725
2617
+ and finding the optimal values there.
2618
+
2619
+ 1:18:24.564 --> 1:18:39.284
2620
+ Of course, it might be a problem that at some
2621
+ point you have now good ways to find good translations
2622
+
2623
+ 1:18:39.284 --> 1:18:42.928
2624
+ inside your n-best list.
2625
+
2626
+ 1:18:43.143 --> 1:18:52.357
2627
+ You have to iterate that sometime, but the
2628
+ important thing is you don't have to decode
2629
+
2630
+ 1:18:52.357 --> 1:18:56.382
2631
+ every time you need weights, but you.
2632
+
2633
+ 1:18:57.397 --> 1:19:11.325
2634
+ There is mainly a speed up process in order
2635
+ to make things more, make things even faster.
2636
+
2637
+ 1:19:15.515 --> 1:19:20.160
2638
+ Good Then We'll Finish With.
2639
+
2640
+ 1:19:20.440 --> 1:19:25.289
2641
+ Looking at how do you really calculate the
2642
+ scores and everything?
2643
+
2644
+ 1:19:25.289 --> 1:19:32.121
2645
+ Because what we did look into was a translation
2646
+ of a full sentence doesn't really consist of
2647
+
2648
+ 1:19:32.121 --> 1:19:37.190
2649
+ only one single phrase, but of course you have
2650
+ to combine different.
2651
+
2652
+ 1:19:37.637 --> 1:19:40.855
2653
+ So how does that now really look and how do
2654
+ we have to do?
2655
+
2656
+ 1:19:41.361 --> 1:19:48.252
2657
+ Just think again of the translation we have
2658
+ done before.
2659
+
2660
+ 1:19:48.252 --> 1:19:59.708
2661
+ The sentence must be: What is the probability
2662
+ of translating this one into what we saw after
2663
+
2664
+ 1:19:59.708 --> 1:20:00.301
2665
+ now?
2666
+
2667
+ 1:20:00.301 --> 1:20:03.501
2668
+ We're doing this by using.
2669
+
2670
+ 1:20:03.883 --> 1:20:07.157
2671
+ So we're having the phrase pair.
2672
+
2673
+ 1:20:07.157 --> 1:20:12.911
2674
+ Vasvia is the phrase pair up to now and gazine
2675
+ harm into.
2676
+
2677
+ 1:20:13.233 --> 1:20:18.970
2678
+ In addition, that is important because translation
2679
+ is not monotone.
2680
+
2681
+ 1:20:18.970 --> 1:20:26.311
2682
+ We are not putting phrase pairs in the same
2683
+ order as we are doing it on the source and
2684
+
2685
+ 1:20:26.311 --> 1:20:31.796
2686
+ on the target, but in order to generate the
2687
+ correct translation.
2688
+
2689
+ 1:20:31.771 --> 1:20:34.030
2690
+ So we have to shuffle the phrase pairs.
2691
+
2692
+ 1:20:34.294 --> 1:20:39.747
2693
+ And the blue one is in front on the source
2694
+ side but not in front on the target side.
2695
+
2696
+ 1:20:40.200 --> 1:20:49.709
2697
+ This reordering makes a statistic of the machine
2698
+ translation really complicated because if you
2699
+
2700
+ 1:20:49.709 --> 1:20:53.313
2701
+ would just monotonely do this then.
2702
+
2703
+ 1:20:53.593 --> 1:21:05.288
2704
+ The problem is if you would analyze all possible
2705
+ combinations of reshuffling them, then again.
2706
+
2707
+ 1:21:05.565 --> 1:21:11.508
2708
+ So you again have to use some type of heuristics
2709
+ which shuffle you allow and which you don't
2710
+
2711
+ 1:21:11.508 --> 1:21:11.955
2712
+ allow.
2713
+
2714
+ 1:21:12.472 --> 1:21:27.889
2715
+ That was relatively challenging since, for
2716
+ example, if you think of German you would
2717
+
2718
+ 1:21:27.889 --> 1:21:32.371
2719
+ have to allow very long.
2720
+
2721
+ 1:21:33.033 --> 1:21:52.218
2722
+ But if we have now this, how do we calculate
2723
+ the translation score so the translation score?
2724
+
2725
+ 1:21:52.432 --> 1:21:55.792
2726
+ That's why we sum up the scores at the end.
2727
+
2728
+ 1:21:56.036 --> 1:22:08.524
2729
+ So you said our first feature is the probability
2730
+ of the full sentence.
2731
+
2732
+ 1:22:08.588 --> 1:22:13.932
2733
+ So we say, the translation of each phrase
2734
+ pair is independent of each other, and then
2735
+
2736
+ 1:22:13.932 --> 1:22:19.959
2737
+ we can get the probability of the full sentence
2738
+ as the product of the probabilities of the individual
2739
+
2740
+ 1:22:19.959 --> 1:22:24.246
2741
+ phrase pairs, that is, the probability of each target
2742
+ phrase given its source phrase.
2743
+
2744
+ 1:22:24.664 --> 1:22:29.379
2745
+ Now we can use the laws of logarithms for the calculation.
2746
+
2747
+ 1:22:29.609 --> 1:22:36.563
2748
+ That is, we take the logarithm of the first probability.
2749
+
2750
+ 1:22:36.563 --> 1:22:48.153
2751
+ We'll get our first score, which says the
2752
+ translation model is minus.
2753
+
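A tiny illustration of the step just described: the product of (assumed) independent phrase-pair probabilities turns into a sum of log probabilities, which becomes the translation-model feature score. The probability values below are invented for illustration.

```python
import math

# Hypothetical phrase-pair probabilities for one segmentation of a sentence.
phrase_probs = [0.7, 0.4, 0.9]

# Product of independent phrase translations ...
p_sentence = math.prod(phrase_probs)

# ... equals the exponential of the sum of log probabilities;
# the sum of logs is the translation-model feature score used above.
tm_score = sum(math.log(p) for p in phrase_probs)

print(p_sentence, tm_score)   # ~0.252 and ~-1.38
```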
2754
+ 1:22:49.970 --> 1:22:56.586
2755
+ And that we're not doing only once, but we're
2756
+ exactly doing it with all our translation model.
2757
+
2758
+ 1:22:56.957 --> 1:23:03.705
2759
+ So we said we also have the relative frequency
2760
+ and the inverse directions of the.
2761
+
2762
+ 1:23:03.843 --> 1:23:06.226
2763
+ So in the end you'll have four scores.
2764
+
2765
+ 1:23:06.226 --> 1:23:09.097
2766
+ Here how you combine them is exactly the same.
2767
+
2768
+ 1:23:09.097 --> 1:23:12.824
2769
+ The only thing is how you look them up for
2770
+ each phrase pair.
2771
+
2772
+ 1:23:12.824 --> 1:23:18.139
2773
+ We have said in the beginning we are storing
2774
+ four scores describing how good they are.
2775
+
2776
+ 1:23:19.119 --> 1:23:25.415
2777
+ And these are then the four scores describing
2778
+ how probable the sentence is.
2779
+
2780
+ 1:23:27.427 --> 1:23:31.579
2781
+ Then we can have more scores.
2782
+
2783
+ 1:23:31.579 --> 1:23:37.806
2784
+ For example, we can have a distortion model.
2785
+
2786
+ 1:23:37.806 --> 1:23:41.820
2787
+ How much reordering is done?
2788
+
2789
+ 1:23:41.841 --> 1:23:47.322
2790
+ There were different types of them; we won't
2791
+ go into detail, but just imagine you have now a
2792
+
2793
+ 1:23:47.322 --> 1:23:47.748
2794
+ score.
2795
+
2796
+ 1:23:48.548 --> 1:23:56.651
2797
+ Then you have a language model which is the
2798
+ probability of the sequence "what we saw until now".
2799
+
2800
+ 1:23:56.651 --> 1:24:06.580
2801
+ How we generate this language model probability
2802
+ we will cover later. And there were even more scores.
2803
+
2804
+ 1:24:06.580 --> 1:24:11.841
2805
+ So one, for example, was a phrase count score,
2806
+ which just counts how many phrases you used.
2807
+
2808
+ 1:24:12.072 --> 1:24:19.555
2809
+ In order to learn whether it is better to have more
2810
+ short phrases or to bias towards having fewer
2811
+
2812
+ 1:24:19.555 --> 1:24:20.564
2813
+ and longer.
2814
+
2815
+ 1:24:20.940 --> 1:24:28.885
2816
+ You can easily add this by just counting, so the value
2817
+ here will be the number of phrases used, and such a count tells you
2818
+
2819
+ 1:24:28.885 --> 1:24:32.217
2820
+ typically how good it is to translate with that many phrases.
2821
+
2822
+ 1:24:32.932 --> 1:24:44.887
2823
+ For the language model, the probability normally
2824
+ gets smaller the longer the sequence is, so you need something
2825
+
2826
+ 1:24:44.887 --> 1:24:46.836
2827
+ to counteract that.
2828
+
2829
+ 1:24:47.827 --> 1:24:59.717
2830
+ And then you get your final score by multiplying
2831
+ each of the scores with the weights we had before.
2832
+
2833
+ 1:24:59.619 --> 1:25:07.339
2834
+ Optimization and that gives you a final score
2835
+ maybe of twenty three point seven eight five
2836
+
2837
+ 1:25:07.339 --> 1:25:13.278
2838
+ and then you can do that with several possible
2839
+ translation hypotheses and compare them.
2840
+
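A minimal sketch of the combination just described: each model contributes one (log-domain) feature score, the scores are weighted and summed, and the hypothesis with the best total wins. The feature names, values and placeholder weights below are illustrative assumptions, not the lecture's actual numbers.

```python
# Log-domain feature scores for one candidate translation (invented values).
features = {
    "tm_forward": -1.38,
    "tm_inverse": -2.10,
    "lex_forward": -3.25,
    "lex_inverse": -2.80,
    "language_model": -7.40,
    "distortion": -1.00,
    "phrase_count": 3.0,
    "word_count": 6.0,
}

# Placeholder weights; in practice these come from the n-best-list optimization.
weights = {name: 1.0 for name in features}

final_score = sum(weights[name] * value for name, value in features.items())
print(final_score)   # one number per hypothesis; the decoder keeps the best one
```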
2841
+ 1:25:14.114 --> 1:25:23.949
2842
+ One may be important point here is so the
2843
+ score not only depends on the target side but
2844
+
2845
+ 1:25:23.949 --> 1:25:32.444
2846
+ it also depends on which phrases you have used
2847
+ so you could have generated it in several ways.
2848
+
2849
+ 1:25:32.772 --> 1:25:38.076
2850
+ So you would have the same translation, but
2851
+ you would have a different split into phrases.
2852
+
2853
+ 1:25:38.979 --> 1:25:45.636
2854
+ And this was normally ignored so you would
2855
+ just look at all of them and then select the
2856
+
2857
+ 1:25:45.636 --> 1:25:52.672
2858
+ one which has the highest probability and ignore
2859
+ that this translation could be generated by
2860
+
2861
+ 1:25:52.672 --> 1:25:54.790
2862
+ several splits into phrases.
2863
+
2864
+ 1:25:57.497 --> 1:26:06.097
2865
+ So to summarize what we look into today and
2866
+ what you should hopefully remember is: Statistical
2867
+
2868
+ 1:26:06.097 --> 1:26:11.440
2869
+ models in how to generate machine translation
2870
+ output that were the word based statistical
2871
+
2872
+ 1:26:11.440 --> 1:26:11.915
2873
+ models.
2874
+
2875
+ 1:26:11.915 --> 1:26:16.962
2876
+ There were the IBM models at the beginning and
2877
+ then we have phrase-based MT where
2878
+
2879
+ 1:26:16.962 --> 1:26:22.601
2880
+ it's about building the translation by putting
2881
+ together these blocks of phrases and combining.
2882
+
2883
+ 1:26:23.283 --> 1:26:34.771
2884
+ If you have a model which has several features
2885
+ you can't do that with millions, but with a limited number of features.
2886
+
2887
+ 1:26:34.834 --> 1:26:42.007
2888
+ Then you can combine them with your log-linear
2889
+ model, which allows you to have your variable
2890
+
2891
+ 1:26:42.007 --> 1:26:45.186
2892
+ number of features and easily combine them.
2893
+
2894
+ 1:26:45.365 --> 1:26:47.920
2895
+ Yeah, how much can you trust each of these
2896
+ models?
2897
+
2898
+ 1:26:51.091 --> 1:26:54.584
2899
+ Do you have any further questions for this
2900
+ topic?
2901
+
2902
+ 1:26:58.378 --> 1:27:08.715
2903
+ And there will be on Tuesday a lecture by
2904
+ Tuan about evaluation, and then next Thursday
2905
+
2906
+ 1:27:08.715 --> 1:27:12.710
2907
+ there will be the practical part.
2908
+
2909
+ 1:27:12.993 --> 1:27:21.461
2910
+ So please come to the practical part here, but
2911
+ you can do something yourself if you are not
2912
+
2913
+ 1:27:21.461 --> 1:27:22.317
2914
+ able to.
2915
+
2916
+ 1:27:23.503 --> 1:27:26.848
2917
+ So then please tell us and we'll have to see
2918
+ how we find the difference in this.
2919
+
demo_data/lectures/Lecture-04-27.04.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8786f0bc34cf397879e95757fe367887c5f5d01d0f388aa98f768203cccc5269
3
+ size 116390723
demo_data/lectures/Lecture-05-02.05.2023/English.vtt ADDED
@@ -0,0 +1,1124 @@
1
+ WEBVTT
2
+
3
+ 0:00:56.957 --> 0:01:10.166
4
+ Today we are going to talk about evaluation,
5
+ like how you can tell how well your translation system works.
6
+
7
+ 0:01:11.251 --> 0:01:23.175
8
+ Today we're going to talk about first some
9
+ introduction about the difficulties and also
10
+
11
+ 0:01:23.175 --> 0:01:27.783
12
+ the dimensions of the evaluation.
13
+
14
+ 0:01:28.248 --> 0:01:32.315
15
+ And the second one is on automatic evaluation.
16
+
17
+ 0:01:32.315 --> 0:01:33.960
18
+ The second one is.
19
+
20
+ 0:01:33.893 --> 0:01:40.952
21
+ Would be less human effort costly, but it
22
+ probably is not really as perfect.
23
+
24
+ 0:01:42.702 --> 0:02:01.262
25
+ So on machine translation evaluation, so the
26
+ goal is to measure the quality of translation.
27
+
28
+ 0:02:03.003 --> 0:02:06.949
29
+ We need machine translation evaluation.
30
+
31
+ 0:02:06.949 --> 0:02:14.152
32
+ The first thing is for application scenarios
33
+ and whether it is reliable.
34
+
35
+ 0:02:14.674 --> 0:02:22.911
36
+ Second thing is to guide our research because
37
+ given symmetrics we will be able to find out
38
+
39
+ 0:02:22.911 --> 0:02:30.875
40
+ which improvement direction is valuable for
41
+ our machine translation system and the last
42
+
43
+ 0:02:30.875 --> 0:02:34.224
44
+ thing is for our system development.
45
+
46
+ 0:02:36.116 --> 0:02:42.926
47
+ So now we will come to some difficulties on
48
+ evaluation.
49
+
50
+ 0:02:42.926 --> 0:02:50.952
51
+ The first thing is ambiguity because usually
52
+ for one sentence it.
53
+
54
+ 0:02:51.431 --> 0:03:04.031
55
+ Here you can see that, for example, we have
56
+ the correct reference.
57
+
58
+ 0:03:05.325 --> 0:03:19.124
59
+ The second difficulty is that small changes
60
+ can be very important.
61
+
62
+ 0:03:20.060 --> 0:03:22.531
63
+ The first difficulty is subjective.
64
+
65
+ 0:03:23.123 --> 0:03:39.266
66
+ So it depends on each person's opinion whether
67
+ translation is correct.
68
+
69
+ 0:03:41.041 --> 0:03:49.393
70
+ The last is that evaluation sometimes is application
71
+ dependent.
72
+
73
+ 0:03:49.393 --> 0:03:54.745
74
+ We're not sure how good it's getting up.
75
+
76
+ 0:03:57.437 --> 0:04:04.502
77
+ The first dimension is human versus automatic
78
+ evaluation, which I definitely talked about
79
+
80
+ 0:04:04.502 --> 0:04:06.151
81
+ in the introduction.
82
+
83
+ 0:04:06.151 --> 0:04:13.373
84
+ The second thing is on granulity, so evaluation
85
+ could be on sentence level, document level,
86
+
87
+ 0:04:13.373 --> 0:04:14.472
88
+ or task base.
89
+
90
+ 0:04:15.375 --> 0:04:28.622
91
+ The last thing is whether the translation
92
+ is correct in order to capture the meaning.
93
+
94
+ 0:04:30.630 --> 0:04:33.769
95
+ So on the first dimensions, human verses are
96
+ automatic.
97
+
98
+ 0:04:34.334 --> 0:04:45.069
99
+ So human evaluation education is the goal
100
+ standard because in the end we give our machine
101
+
102
+ 0:04:45.069 --> 0:04:48.647
103
+ translation system to people.
104
+
105
+ 0:04:49.329 --> 0:04:55.040
106
+ And is also expensive and time consuming for
107
+ people to manually evaluate some systems.
108
+
109
+ 0:04:57.057 --> 0:05:05.575
110
+ For automatic evaluation, it is of course
111
+ tupper and faster, and it would use human reference.
112
+
113
+ 0:05:08.168 --> 0:05:16.971
114
+ The next dimension is on granulity.
115
+
116
+ 0:05:16.971 --> 0:05:25.529
117
+ The first level is sentence based.
118
+
119
+ 0:05:25.885 --> 0:05:33.003
120
+ But this is difficult because if you translate
121
+ a single sentence, it will be difficult to
122
+
123
+ 0:05:33.003 --> 0:05:35.454
124
+ tell whether this translation.
125
+
126
+ 0:05:37.537 --> 0:05:40.633
127
+ The second level is document based.
128
+
129
+ 0:05:40.633 --> 0:05:46.051
130
+ This should be the most commonly used in automatic
131
+ evaluation.
132
+
133
+ 0:05:46.286 --> 0:06:00.750
134
+ This should be like the final bowl of our
135
+ machine translation.
136
+
137
+ 0:06:01.061 --> 0:06:02.315
138
+ And slow in general.
139
+
140
+ 0:06:02.315 --> 0:06:07.753
141
+ We are not sure whether the arrows come from
142
+ the machine translation system itself or some
143
+
144
+ 0:06:07.753 --> 0:06:08.828
145
+ other components.
146
+
147
+ 0:06:11.431 --> 0:06:21.300
148
+ The next dimension is adequacy versus
149
+ fluency, so adequacy is whether the meaning is translated correctly.
150
+
151
+ 0:06:22.642 --> 0:06:25.384
152
+ Can see the example here.
153
+
154
+ 0:06:25.384 --> 0:06:32.237
155
+ In hypothesis different is everything now,
156
+ so basically it just.
157
+
158
+ 0:06:32.852 --> 0:06:36.520
159
+ But then you can see it's not fluent.
160
+
161
+ 0:06:36.520 --> 0:06:38.933
162
+ It sounds kind of weird.
163
+
164
+ 0:06:38.933 --> 0:06:41.442
165
+ Nothing is different now.
166
+
167
+ 0:06:41.442 --> 0:06:43.179
168
+ It sounds fluent.
169
+
170
+ 0:06:46.006 --> 0:06:50.650
171
+ Next we come to error analysis.
172
+
173
+ 0:06:50.650 --> 0:07:02.407
174
+ When we value the system and give a score
175
+ we want to have interpretable results.
176
+
177
+ 0:07:03.083 --> 0:07:07.930
178
+ So usually there would be some test suites first
179
+ in order to detect these errors.
180
+
181
+ 0:07:08.448 --> 0:07:21.077
182
+ And usually they would be like quite specific
183
+ to some specific type of error, for example
184
+
185
+ 0:07:21.077 --> 0:07:23.743
186
+ wrong translation.
187
+
188
+ 0:07:24.344 --> 0:07:32.127
189
+ Or morphological agreement, whether the
190
+ word form is correct.
191
+
192
+ 0:07:32.127 --> 0:07:35.031
193
+ If you have the article.
194
+
195
+ 0:07:37.577 --> 0:07:45.904
196
+ So now we come to human evaluation, which
197
+ is the final goal of machine translation.
198
+
199
+ 0:07:47.287 --> 0:07:50.287
200
+ So why do we perform human evaluation?
201
+
202
+ 0:07:51.011 --> 0:08:00.115
203
+ The first thing is that automatic machine
204
+ translation metrics are not sufficient.
205
+
206
+ 0:08:00.480 --> 0:08:06.725
207
+ Existing automated metrics and are sometimes
208
+ biased.
209
+
210
+ 0:08:06.725 --> 0:08:16.033
211
+ For example, the blue spar, but the blue scar
212
+ will usually try to look at the.
213
+
214
+ 0:08:16.496 --> 0:08:24.018
215
+ So it doesn't take into account some deeper
216
+ meaning like cares about word-to-word matching
217
+
218
+ 0:08:24.018 --> 0:08:26.829
219
+ instead of rephrasing or synonym.
220
+
221
+ 0:08:27.587 --> 0:08:34.881
222
+ And bias, as in that metrics like that would
223
+ usually depend a lot on the goal standard reference
224
+
225
+ 0:08:34.881 --> 0:08:41.948
226
+ given from some human, and that person could
227
+ have some specific type or language preferences,
228
+
229
+ 0:08:41.948 --> 0:08:43.979
230
+ and then the metric would.
231
+
232
+ 0:08:47.147 --> 0:08:55.422
233
+ The next thing is that automatic metrics don't
234
+ provide sufficient insights for error analysis.
235
+
236
+ 0:08:57.317 --> 0:09:04.096
237
+ Different types of errors would have different
238
+ implications depending on the underlying task.
239
+
240
+ 0:09:04.644 --> 0:09:09.895
241
+ So, for example, if you use machine translation
242
+ for information with you both,.
243
+
244
+ 0:09:10.470 --> 0:09:20.202
245
+ Then if it makes some error omitting some
246
+ words in translation then it would be very
247
+
248
+ 0:09:20.202 --> 0:09:20.775
249
+ bad.
250
+
251
+ 0:09:21.321 --> 0:09:30.305
252
+ Another example is if you use machine translation
253
+ in chat pop then fluency would be very important
254
+
255
+ 0:09:30.305 --> 0:09:50.253
256
+ because: And we also need human measure in
257
+ order to develop and assess automatic translation
258
+
259
+ 0:09:50.253 --> 0:09:52.324
260
+ evaluation.
261
+
262
+ 0:09:55.455 --> 0:10:01.872
263
+ Okay, so now we will come to the quality measures
264
+ of human evaluation.
265
+
266
+ 0:10:02.402 --> 0:10:05.165
267
+ The first thing is inter-annotator agreement.
268
+
269
+ 0:10:05.825 --> 0:10:25.985
270
+ This is agreement between different annotators.
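A small, self-contained illustration of measuring inter-annotator agreement with Cohen's kappa; the two label sequences below are invented, since the lecture's actual numbers are not recoverable from the transcript.

```python
from collections import Counter

annotator_a = ["good", "bad", "good", "good", "bad", "good"]
annotator_b = ["good", "bad", "bad", "good", "bad", "good"]

n = len(annotator_a)
observed = sum(a == b for a, b in zip(annotator_a, annotator_b)) / n

# Expected agreement by chance, from each annotator's label distribution.
count_a, count_b = Counter(annotator_a), Counter(annotator_b)
labels = set(annotator_a) | set(annotator_b)
expected = sum(count_a[l] * count_b[l] for l in labels) / n ** 2

kappa = (observed - expected) / (1 - expected)
print(observed, expected, kappa)   # 0.83, 0.5, 0.67 for this toy example
```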
271
+
272
+ 0:10:26.126 --> 0:10:31.496
273
+ So as you can see here, this would measure
274
+ the reliability of the other features.
275
+
276
+ 0:10:32.252 --> 0:10:49.440
277
+ And here we have an example of where the pace
278
+ car here is.
279
+
280
+ 0:10:49.849 --> 0:10:57.700
281
+ And this is in contrast to intra-annotator
282
+ agreement, so this is agreement within an annotator.
283
+
284
+ 0:10:58.118 --> 0:11:03.950
285
+ So instead of measuring reliability, here
286
+ it measures consistency of a single annotator.
287
+
288
+ 0:11:04.884 --> 0:11:07.027
289
+ And yep.
290
+
291
+ 0:11:07.027 --> 0:11:22.260
292
+ We also have an example here of the which
293
+ is so which is quite.
294
+
295
+ 0:11:23.263 --> 0:11:42.120
296
+ So now we will come to the main types of human
297
+ assessment: The first thing is direct assessment.
298
+
299
+ 0:11:42.842 --> 0:11:53.826
300
+ The second thing is human ranking of the translation
301
+ at sentence level.
302
+
303
+ 0:11:56.176 --> 0:12:11.087
304
+ So direct assessment given the source and
305
+ translation, and possibly the reference translation.
306
+
307
+ 0:12:12.612 --> 0:12:18.023
308
+ The goal here is to give the scores to evaluate
309
+ performance,adequacy and fluency.
310
+
311
+ 0:12:18.598 --> 0:12:23.619
312
+ The problem here is that we need normalization
313
+ across different judges, different human.
314
+
315
+ 0:12:24.604 --> 0:12:27.043
316
+ And here we have an example.
317
+
318
+ 0:12:27.043 --> 0:12:33.517
319
+ She was treated at the site by an emergency
320
+ doctor and taken to hospital by.
321
+
322
+ 0:12:34.334 --> 0:12:48.444
323
+ The hypothesis here is that she was treated
324
+ on site and emergency medical rescue workers
325
+
326
+ 0:12:48.444 --> 0:12:52.090
327
+ brought to a hospital.
328
+
329
+ 0:12:52.472 --> 0:12:56.267
330
+ Let's say five is best and one is worst.
331
+
332
+ 0:13:00.060 --> 0:13:04.716
333
+ I don't think it's hard because I think there
334
+ should be "brought to a hospital", right?
335
+
336
+ 0:13:05.905 --> 0:13:09.553
337
+ Yes, that is like a crucial error.
338
+
339
+ 0:13:09.553 --> 0:13:19.558
340
+ Yeah, I think I would agree because this sentence
341
+ somehow gives us the idea of what the meaning
342
+
343
+ 0:13:19.558 --> 0:13:21.642
344
+ of the sentence is.
345
+
346
+ 0:13:21.642 --> 0:13:24.768
347
+ But then it lost towards her.
348
+
349
+ 0:13:27.027 --> 0:13:29.298
350
+ The next time of human evaluation is ranking.
351
+
352
+ 0:13:30.810 --> 0:13:38.893
353
+ Which is a great different system according
354
+ to performance like which one is better.
355
+
356
+ 0:13:40.981 --> 0:13:43.914
357
+ So here now we have a second hypothesis.
358
+
359
+ 0:13:43.914 --> 0:13:49.280
360
+ She was hospitalized on the spot and taken
361
+ to hospital by ambulance crews.
362
+
363
+ 0:13:50.630 --> 0:14:01.608
364
+ As you can see here, the second hypothesis
365
+ seems to be more fluent, more smooth.
366
+
367
+ 0:14:01.608 --> 0:14:09.096
368
+ The meaning capture seems to be: So yeah,
369
+ it's difficult to compare different errors
370
+
371
+ 0:14:09.096 --> 0:14:11.143
372
+ in whether which error is more severe.
373
+
374
+ 0:14:13.373 --> 0:14:16.068
375
+ The next type of human evaluation is post
376
+ editing.
377
+
378
+ 0:14:17.817 --> 0:14:29.483
379
+ So we want to measure how much time and effort
380
+ human needs to spend in order to turn it into
381
+
382
+ 0:14:29.483 --> 0:14:32.117
383
+ correct translation.
384
+
385
+ 0:14:32.993 --> 0:14:47.905
386
+ So this area can be measured by time or key
387
+ shop.
388
+
389
+ 0:14:49.649 --> 0:14:52.889
390
+ And the last one is task based evaluation.
391
+
392
+ 0:14:52.889 --> 0:14:56.806
393
+ Here we would want to evaluate the complete
394
+ system.
395
+
396
+ 0:14:56.806 --> 0:15:03.436
397
+ But if you are using the lecture translator
398
+ and you see my lecture in German, the final
399
+
400
+ 0:15:03.436 --> 0:15:05.772
401
+ evaluation here would be like.
402
+
403
+ 0:15:05.772 --> 0:15:08.183
404
+ In the end, can you understand?
405
+
406
+ 0:15:09.769 --> 0:15:15.301
407
+ Their friendship here that we get the overall
408
+ performance, which is our final goal.
409
+
410
+ 0:15:16.816 --> 0:15:25.850
411
+ But the disadvantage here that it could be
412
+ complex and again if the spur is low it might
413
+
414
+ 0:15:25.850 --> 0:15:31.432
415
+ be other problems than the machine translation
416
+ itself.
417
+
418
+ 0:15:33.613 --> 0:15:42.941
419
+ So guess that was about the human evaluation
420
+ part any question so far.
421
+
422
+ 0:15:42.941 --> 0:15:44.255
423
+ Yes, and.
424
+
425
+ 0:16:00.000 --> 0:16:15.655
426
+ Then we will come to our magic matrix here
427
+ to access the quality of the machine translation
428
+
429
+ 0:16:15.655 --> 0:16:26.179
430
+ system by comparing: So the premise here is
431
+ that the more similar translation is to reference,
432
+
433
+ 0:16:26.179 --> 0:16:31.437
434
+ the better and we want some algorithms that
435
+ can approximate.
436
+
437
+ 0:16:34.114 --> 0:16:47.735
438
+ So the most famous measure could be the BLEU
439
+ score, the bilingual evaluation understudy.
440
+
441
+ 0:16:50.930 --> 0:16:56.358
442
+ So if we are given the goal that the more
443
+ similar translation is to the reference, the
444
+
445
+ 0:16:56.358 --> 0:17:01.785
446
+ better I think the most naive way would be
447
+ count the number of people sentenced to the
448
+
449
+ 0:17:01.785 --> 0:17:02.472
450
+ reference.
451
+
452
+ 0:17:02.472 --> 0:17:08.211
453
+ But as you can see, this would be very difficult
454
+ because sentence being exactly the same to
455
+
456
+ 0:17:08.211 --> 0:17:10.332
457
+ the reference would be very rare.
458
+
459
+ 0:17:11.831 --> 0:17:24.222
460
+ You can see the example here in the reference
461
+ and machine translation output.
462
+
463
+ 0:17:24.764 --> 0:17:31.930
464
+ So the idea here is that instead of comparing
465
+ the two whole sentences up, we consider the.
466
+
467
+ 0:17:35.255 --> 0:17:43.333
468
+ Now we can look at an example, so for the
469
+ BLEU score we consider one- up to four-grams.
470
+
471
+ 0:17:44.844 --> 0:17:52.611
472
+ The one ramp of a lap we would have back to
473
+ the future, not at premieres thirty years ago,
474
+
475
+ 0:17:52.611 --> 0:17:59.524
476
+ so it should be like one, two, three, four,
477
+ five, six, seven, eight, so like it.
478
+
479
+ 0:17:59.459 --> 0:18:01.476
480
+ One ram is overlap to the reverence.
481
+
482
+ 0:18:01.921 --> 0:18:03.366
483
+ So you should be over.
484
+
485
+ 0:18:06.666 --> 0:18:08.994
486
+ Is kind of the same.
487
+
488
+ 0:18:08.994 --> 0:18:18.529
489
+ Instead of considering only the word back
490
+ for three, one is to be back to the future.
491
+
492
+ 0:18:19.439 --> 0:18:31.360
493
+ So that is basically the idea of the BLEU
494
+ score, and in the end we calculate the geometric mean.
495
+
496
+ 0:18:32.812 --> 0:18:39.745
497
+ So as you can see here, when we look at the
498
+ A brand overlap you can only look at the machine
499
+
500
+ 0:18:39.745 --> 0:18:40.715
501
+ translation.
502
+
503
+ 0:18:41.041 --> 0:18:55.181
504
+ We only care about how many words in the machine
505
+ translation output appear.
506
+
507
+ 0:18:55.455 --> 0:19:02.370
508
+ So this metric is kind of like a precision
509
+ based and not really recall based.
510
+
511
+ 0:19:04.224 --> 0:19:08.112
512
+ So this would lead to a problem like the example
513
+ here.
514
+
515
+ 0:19:08.112 --> 0:19:14.828
516
+ The reference is back to the future of Premier
517
+ 30 years ago and the machine translation output
518
+
519
+ 0:19:14.828 --> 0:19:16.807
520
+ is only back to the future.
521
+
522
+ 0:19:17.557 --> 0:19:28.722
523
+ The one grab overlap will be formed because
524
+ you can see back to the future is overlap entirely
525
+
526
+ 0:19:28.722 --> 0:19:30.367
527
+ in reference.
528
+
529
+ 0:19:31.231 --> 0:19:38.314
530
+ Is not right because one is the perfect score,
531
+ but this is obviously not a good translation.
532
+
533
+ 0:19:40.120 --> 0:19:47.160
534
+ So in order to tackle this they use something
535
+ called the brevity penalty.
536
+
537
+ 0:19:47.988 --> 0:19:59.910
538
+ So it should be a factor that is multiplied
539
+ to the geometric mean.
540
+
541
+ 0:19:59.910 --> 0:20:04.820
542
+ This form is the length of.
543
+
544
+ 0:20:05.525 --> 0:20:19.901
545
+ So the penalty is e to the power of one minus
546
+ the reference length over the output length.
547
+
548
+ 0:20:21.321 --> 0:20:32.298
549
+ Which is lower than, and if we apply this
550
+ to the example, the BLEU score is going to be
551
+
552
+ 0:20:32.298 --> 0:20:36.462
553
+ which is not a good translation.
554
+
555
+ 0:20:38.999 --> 0:20:42.152
556
+ Yep so any question of this place.
557
+
558
+ 0:20:44.064 --> 0:21:00.947
559
+ Yes exactly that should be a problem as well,
560
+ and it will be mentioned later on.
561
+
562
+ 0:21:00.947 --> 0:21:01.990
563
+ But.
564
+
565
+ 0:21:03.203 --> 0:21:08.239
566
+ Is very sensitive to zero score like that,
567
+ so that is why we usually don't use the blue
568
+
569
+ 0:21:08.239 --> 0:21:13.103
570
+ score sentence level because sentence can be
571
+ short and then there can be no overlap.
572
+
573
+ 0:21:13.103 --> 0:21:16.709
574
+ That is why we usually use it on documents
575
+ as you can imagine.
576
+
577
+ 0:21:16.709 --> 0:21:20.657
578
+ Documents are very long and very little chance
579
+ to have zero overlap.
580
+
581
+ 0:21:23.363 --> 0:21:28.531
582
+ Yeah okay, so the next thing on the BLEU
583
+ score is clipping.
584
+
585
+ 0:21:29.809 --> 0:21:42.925
586
+ So you can see here we have two references,
587
+ the new movie and the new film, and we have
588
+
589
+ 0:21:42.925 --> 0:21:47.396
590
+ a machine translation output.
591
+
592
+ 0:21:47.807 --> 0:21:54.735
593
+ Because the here is also in the reference,
594
+ so yeah two or two books is one, which is:
595
+
596
+ 0:21:56.236 --> 0:22:02.085
597
+ So but then this is not what we want because
598
+ this is just repeating something that appears.
599
+
600
+ 0:22:02.702 --> 0:22:06.058
601
+ So that's why we use clipping.
602
+
603
+ 0:22:06.058 --> 0:22:15.368
604
+ Clipping here is that we consider the mask
605
+ counts in any reference, so as you can see
606
+
607
+ 0:22:15.368 --> 0:22:17.425
608
+ here in reference.
609
+
610
+ 0:22:18.098 --> 0:22:28.833
611
+ So here when we do clipping we will just use
612
+ the maximum opponents in the references.
613
+
614
+ 0:22:29.809 --> 0:22:38.717
615
+ Yeah, just to avoid counting repeated
616
+ words in the translation multiple times.
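A simplified sketch of the BLEU ideas covered in the cues above: clipped n-gram precisions, their geometric mean, and the brevity penalty. This is an illustrative single-sentence, single-reference version, not the exact implementation of standard BLEU tools.

```python
import math
from collections import Counter

def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def clipped_precision(hyp, ref, n):
    hyp_counts = Counter(ngrams(hyp, n))
    ref_counts = Counter(ngrams(ref, n))
    # Clip each hypothesis n-gram count by its maximum count in the reference.
    matched = sum(min(c, ref_counts[g]) for g, c in hyp_counts.items())
    return matched / max(sum(hyp_counts.values()), 1)

def bleu(hyp, ref, max_n=4):
    precisions = [clipped_precision(hyp, ref, n) for n in range(1, max_n + 1)]
    if min(precisions) == 0:      # why scores are usually aggregated over documents
        return 0.0
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_n)
    bp = min(1.0, math.exp(1 - len(ref) / len(hyp)))   # brevity penalty
    return bp * geo_mean

hyp = "back to the future".split()
ref = "back to the future premieres thirty years ago".split()
print(bleu(hyp, ref))   # low despite perfect n-gram precision, due to the penalty
```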
617
+
618
+ 0:22:41.641 --> 0:23:00.599
619
+ It could happen that there is no overlap between
620
+ the machine translation output and reference.
621
+
622
+ 0:23:00.500 --> 0:23:01.917
623
+ Then Everything Is Going To Go To Zero.
624
+
625
+ 0:23:02.402 --> 0:23:07.876
626
+ So that's why for blow score we usually use
627
+ Japanese level score where we arrogate the
628
+
629
+ 0:23:07.876 --> 0:23:08.631
630
+ statistics.
631
+
632
+ 0:23:12.092 --> 0:23:18.589
633
+ Some summary about BLEU: as you can see,
634
+ it matches exact words.
635
+
636
+ 0:23:18.589 --> 0:23:31.751
637
+ It can take several references: It measured
638
+ a depotency by the word precision and if measured
639
+
640
+ 0:23:31.751 --> 0:23:36.656
641
+ the fluency by the gram precision.
642
+
643
+ 0:23:37.437 --> 0:23:47.254
644
+ And as mentioned, it doesn't consider how
645
+ much meaning that is captured in the machine
646
+
647
+ 0:23:47.254 --> 0:23:48.721
648
+ translation.
649
+
650
+ 0:23:49.589 --> 0:23:53.538
651
+ So here they use the brevity penalty to prevent
652
+ short sentences.
653
+
654
+ 0:23:54.654 --> 0:24:04.395
655
+ Will get the spot over the last test set to
656
+ avoid the zero issues.
657
+
658
+ 0:24:04.395 --> 0:24:07.012
659
+ As we mentioned,.
660
+
661
+ 0:24:09.829 --> 0:24:22.387
662
+ Yes, that's mentioned with multiple reference
663
+ translation simultaneously, and it's a precision
664
+
665
+ 0:24:22.387 --> 0:24:24.238
666
+ based matrix.
667
+
668
+ 0:24:24.238 --> 0:24:27.939
669
+ So we are not sure if this.
670
+
671
+ 0:24:29.689 --> 0:24:37.423
672
+ The second thing is that BLEU compensates
673
+ for recall by the brevity penalty, and we
674
+
675
+ 0:24:37.423 --> 0:24:38.667
676
+ are not sure.
677
+
678
+ 0:24:39.659 --> 0:24:50.902
679
+ Matches, so can still improve the similarity
680
+ measure and improve the correlation score to
681
+
682
+ 0:24:50.902 --> 0:24:51.776
683
+ human.
684
+
685
+ 0:24:52.832 --> 0:25:01.673
686
+ The next is that all work will have the same
687
+ importance.
688
+
689
+ 0:25:01.673 --> 0:25:07.101
690
+ What about a scheme for weighting words?
691
+
692
+ 0:25:11.571 --> 0:25:26.862
693
+ And the last witness is that blows for high
694
+ grade order engrams that can confluency dramatically.
695
+
696
+ 0:25:27.547 --> 0:25:32.101
697
+ So the pressure is that can be accounted for
698
+ fluency, and grammatically there's some other.
699
+
700
+ 0:25:35.956 --> 0:25:47.257
701
+ We have some further issues and not created
702
+ equally so we can use stemming or knowledge
703
+
704
+ 0:25:47.257 --> 0:25:48.156
705
+ space.
706
+
707
+ 0:25:50.730 --> 0:26:00.576
708
+ The next way we incorporate information is
709
+ within the metrics.
710
+
711
+ 0:26:01.101 --> 0:26:07.101
712
+ And can be used like a stop list to like somehow
713
+ ignore the non-important words.
714
+
715
+ 0:26:08.688 --> 0:26:12.687
716
+ Text normalization spelling conjugation lower
717
+ case and mix case.
718
+
719
+ 0:26:12.687 --> 0:26:18.592
720
+ The next thing is that for some language like
721
+ Chinese there can be different world segmentation
722
+
723
+ 0:26:18.592 --> 0:26:23.944
724
+ so exact word matching might no longer be a
725
+ good idea so maybe it's ready to cover the
726
+
727
+ 0:26:23.944 --> 0:26:27.388
728
+ score as the character level instead of the
729
+ word level.
730
+
731
+ 0:26:29.209 --> 0:26:33.794
732
+ And the last thing is speech translation.
733
+
734
+ 0:26:33.794 --> 0:26:38.707
735
+ Usually input from speech translation would.
736
+
737
+ 0:26:38.979 --> 0:26:51.399
738
+ And there should be some way to segment into
739
+ sentences so that we can calculate the score
740
+
741
+ 0:26:51.399 --> 0:26:52.090
742
+ and.
743
+
744
+ 0:26:52.953 --> 0:27:01.326
745
+ And the way to solve this is to use some tools
746
+ like mWER segmentation to align the output
747
+
748
+ 0:27:01.326 --> 0:27:01.896
749
+ with.
750
+
751
+ 0:27:06.306 --> 0:27:10.274
752
+ Yes, so I guess that was all about the BLEU
753
+ score any question.
754
+
755
+ 0:27:14.274 --> 0:27:28.292
756
+ Again on automatic metrics we'll talk about
757
+ probably good metrics, strange automatic metrics,
758
+
759
+ 0:27:28.292 --> 0:27:32.021
760
+ use cases on evaluation.
761
+
762
+ 0:27:34.374 --> 0:27:44.763
763
+ How to measure the performance of the matrix,
764
+ so a good matrix would be a.
765
+
766
+ 0:27:49.949 --> 0:28:04.905
767
+ We would want the matrix to be interpretable
768
+ if this is the ranking from a human that somehow
769
+
770
+ 0:28:04.905 --> 0:28:08.247
771
+ can rank the system.
772
+
773
+ 0:28:12.132 --> 0:28:15.819
774
+ We would also want the evaluation metric to
775
+ be sensitive.
776
+
777
+ 0:28:15.819 --> 0:28:21.732
778
+ Like small differences in the machine translation
779
+ can be distinguished, we would not need to
780
+
781
+ 0:28:21.732 --> 0:28:22.686
782
+ be consistent.
783
+
784
+ 0:28:22.686 --> 0:28:28.472
785
+ Like if the same machine translation system
786
+ is used on a similar text, it should reproduce
787
+
788
+ 0:28:28.472 --> 0:28:29.553
789
+ a similar score.
790
+
791
+ 0:28:31.972 --> 0:28:40.050
792
+ Next, we would want the machine translation
793
+ system to be reliable.
794
+
795
+ 0:28:40.050 --> 0:28:42.583
796
+ Machine translation.
797
+
798
+ 0:28:43.223 --> 0:28:52.143
799
+ We want the matrix to be easy to run in general
800
+ and can be applied to multiple different machine.
801
+
802
+ 0:28:55.035 --> 0:29:11.148
803
+ The difficulty of evaluating the metric itself
804
+ is kind of similar to when you evaluate the
805
+
806
+ 0:29:11.148 --> 0:29:13.450
807
+ translation.
808
+
809
+ 0:29:18.638 --> 0:29:23.813
810
+ And here is some components of the automatic
811
+ machine translation matrix.
812
+
813
+ 0:29:23.813 --> 0:29:28.420
814
+ So for the matching matrix the component would
815
+ be the precision.
816
+
817
+ 0:29:28.420 --> 0:29:30.689
818
+ Recall our Levinstein distance.
819
+
820
+ 0:29:30.689 --> 0:29:35.225
821
+ So for the blow sparks you have seen it cares
822
+ mostly about the.
823
+
824
+ 0:29:36.396 --> 0:29:45.613
825
+ And on the features it would be about how
826
+ to measure the matches or character based.
827
+
828
+ 0:29:48.588 --> 0:30:01.304
829
+ Now we will talk about more matrix because
830
+ the blue score is the most common.
831
+
832
+ 0:30:02.082 --> 0:30:10.863
833
+ So it compared the reference and hypothesis
834
+ using edit operations.
835
+
836
+ 0:30:10.863 --> 0:30:14.925
837
+ They count how many insertions, deletions, and substitutions.
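A small sketch of the edit-operation idea mentioned here (as in WER/TER-style metrics): the minimum number of insertions, deletions and substitutions needed to turn the hypothesis into the reference. The example phrases are only loosely adapted from the earlier hospital example, and reordering shifts are not handled.

```python
def edit_distance(hyp, ref):
    d = [[0] * (len(ref) + 1) for _ in range(len(hyp) + 1)]
    for i in range(len(hyp) + 1):
        d[i][0] = i                     # delete everything
    for j in range(len(ref) + 1):
        d[0][j] = j                     # insert everything
    for i in range(1, len(hyp) + 1):
        for j in range(1, len(ref) + 1):
            sub = 0 if hyp[i - 1] == ref[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + sub)    # substitution / match
    return d[-1][-1]

hyp = "she was treated on site".split()
ref = "she was treated at the site".split()
print(edit_distance(hyp, ref))   # number of edits, often normalized by len(ref)
```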
838
+
839
+ 0:30:23.143 --> 0:30:31.968
840
+ We already talked about it beyond what matching
841
+ would care about character based mathematization
842
+
843
+ 0:30:31.968 --> 0:30:34.425
844
+ or linguistic information.
845
+
846
+ 0:30:36.636 --> 0:30:41.502
847
+ The next metric is the meteor metric.
848
+
849
+ 0:30:41.502 --> 0:30:50.978
850
+ This is strong called metric for evaluation
851
+ of translation with explicit.
852
+
853
+ 0:30:51.331 --> 0:31:03.236
854
+ So merely their new idea is that they reintroduce
855
+ recall and combine it with precision as
856
+
857
+ 0:31:03.236 --> 0:31:04.772
858
+ components.
859
+
860
+ 0:31:05.986 --> 0:31:16.700
861
+ The language translation output with each
862
+ reference individually and takes part of the
863
+
864
+ 0:31:16.700 --> 0:31:18.301
865
+ best parent.
866
+
867
+ 0:31:20.940 --> 0:31:27.330
868
+ The next thing is that matching takes into
869
+ counterfection variation by stepping, so it's
870
+
871
+ 0:31:27.330 --> 0:31:28.119
872
+ no longer.
873
+
874
+ 0:31:30.230 --> 0:31:40.165
875
+ When they address fluency, they're a direct
876
+ penalty instead of ink arms so they would care
877
+
878
+ 0:31:40.165 --> 0:31:40.929
879
+ about.
880
+
881
+ 0:31:45.925 --> 0:31:56.287
882
+ The next thing is on two noble metrics, so
883
+ for this metric we want to extract some features.
884
+
885
+ 0:31:56.936 --> 0:32:04.450
886
+ So for example here the nice house is on the
887
+ right and the building is on the right side
888
+
889
+ 0:32:04.450 --> 0:32:12.216
890
+ so we will have to extract some pictures like
891
+ for example here the reference and hypothesis
892
+
893
+ 0:32:12.216 --> 0:32:14.158
894
+ have hypers in common.
895
+
896
+ 0:32:14.714 --> 0:32:19.163
897
+ They have one insertion, two deletions, and
898
+ they have the same verb.
899
+
900
+ 0:32:21.141 --> 0:32:31.530
901
+ So the idea is to use machine translation
902
+ techniques to combine features and this machine
903
+
904
+ 0:32:31.530 --> 0:32:37.532
905
+ translation model will be trained on human
906
+ ranking.
907
+
908
+ 0:32:39.819 --> 0:32:44.788
909
+ A common framework for this is COMET.
910
+
911
+ 0:32:44.684 --> 0:32:48.094
912
+ Which is a narrow model that is used with
913
+ X for.
914
+
915
+ 0:32:48.094 --> 0:32:54.149
916
+ The feature would be created using some prejutant
917
+ model like X, L, M, U, R, A, BO, DA.
918
+
919
+ 0:32:54.149 --> 0:33:00.622
920
+ Here the input would be the source, the reference
921
+ and the hypothesis and then they would try
922
+
923
+ 0:33:00.622 --> 0:33:02.431
924
+ to produce an assessment.
925
+
926
+ 0:33:03.583 --> 0:33:05.428
927
+ Yeah, it's strange to predict human sport.
928
+
929
+ 0:33:06.346 --> 0:33:19.131
930
+ And they also have some additional versions,
931
+ as we train this model in order to tell whether
932
+
933
+ 0:33:19.131 --> 0:33:20.918
934
+ translation.
935
+
936
+ 0:33:21.221 --> 0:33:29.724
937
+ So instead of checking the source and the
938
+ hypothesis as input, they could take only the
939
+
940
+ 0:33:29.724 --> 0:33:38.034
941
+ source and the hypotheses as input and try
942
+ to predict the quality of the translation.
943
+
944
+ 0:33:42.562 --> 0:33:49.836
945
+ So assumptions before machine translation
946
+ systems are often used in larger systems.
947
+
948
+ 0:33:50.430 --> 0:33:57.713
949
+ So the question is how to evaluate the performance
950
+ of the machine translation system in this larger
951
+
952
+ 0:33:57.713 --> 0:34:04.997
953
+ scenario, and an example would be speech translation
954
+ system when you try to translate English audio
955
+
956
+ 0:34:04.997 --> 0:34:05.798
957
+ to German.
958
+
959
+ 0:34:06.506 --> 0:34:13.605
960
+ Then it would usually have two opponents,
961
+ ASR and MT, where ASR is like speech recognition
962
+
963
+ 0:34:13.605 --> 0:34:20.626
964
+ that can describe English audio to English
965
+ text, and then we have the machine translation
966
+
967
+ 0:34:20.626 --> 0:34:24.682
968
+ system that translates English text to German
969
+ text.
970
+
971
+ 0:34:26.967 --> 0:34:33.339
972
+ So in order to have these overall performances
973
+ in this bigger scenario, they are so willing
974
+
975
+ 0:34:33.339 --> 0:34:34.447
976
+ to evaluate it.
977
+
978
+ 0:34:34.447 --> 0:34:41.236
979
+ So the first one is to evaluate the individual
980
+ components like how good is the speech recognizer,
981
+
982
+ 0:34:41.236 --> 0:34:46.916
983
+ how good is the analyzed and generalization
984
+ engines, how good is the synthesizer.
985
+
986
+ 0:34:47.727 --> 0:34:56.905
987
+ The second way is to evaluate translation
988
+ quality from speech input to text output.
989
+
990
+ 0:34:56.905 --> 0:35:00.729
991
+ How good is the final translation?
992
+
993
+ 0:35:02.102 --> 0:35:10.042
994
+ The next thing is to measure the to evaluate
995
+ the architecture effectiveness like: How is
996
+
997
+ 0:35:10.042 --> 0:35:12.325
998
+ the level effects in general?
999
+
1000
+ 0:35:12.325 --> 0:35:19.252
1001
+ The next one is task based evaluation or use
1002
+ a study like we just simply ask the user what
1003
+
1004
+ 0:35:19.252 --> 0:35:24.960
1005
+ is their experience like whether the system
1006
+ works well and how well it is.
1007
+
1008
+ 0:35:27.267 --> 0:35:32.646
1009
+ So here we have an example of the ITF shale
1010
+ test result.
1011
+
1012
+ 0:35:33.153 --> 0:35:38.911
1013
+ So the first block would be the human evaluation
1014
+ like I think they are asked to give a spawl
1015
+
1016
+ 0:35:38.911 --> 0:35:44.917
1017
+ from one to five again where a fight is best
1018
+ and one is worst and the lower one is the blowscore
1019
+
1020
+ 0:35:44.917 --> 0:35:50.490
1021
+ and they find out that the human evaluation
1022
+ is far actually correlated with the blowsfall
1023
+
1024
+ 0:35:50.490 --> 0:35:51.233
1025
+ quite well.
1026
+
1027
+ 0:35:53.193 --> 0:36:02.743
1028
+ Here you can also see that the systems from
1029
+ our university are actually on top many sub-tasts.
1030
+
1031
+ 0:36:05.605 --> 0:36:07.429
1032
+ So Yeah.
1033
+
1034
+ 0:36:08.868 --> 0:36:14.401
1035
+ For this lecture is that machine translation
1036
+ evaluation is difficult.
1037
+
1038
+ 0:36:14.401 --> 0:36:21.671
1039
+ We talk about human versus automatic evaluation
1040
+ that human would be costly, but then is the
1041
+
1042
+ 0:36:21.671 --> 0:36:27.046
1043
+ goal standard automatic evaluation would be
1044
+ a fast and cheaper way.
1045
+
1046
+ 0:36:27.547 --> 0:36:36.441
1047
+ We talk about granulity on sentence level,
1048
+ document level or task level evaluation machine
1049
+
1050
+ 0:36:36.441 --> 0:36:38.395
1051
+ translation system.
1052
+
1053
+ 0:36:39.679 --> 0:36:51.977
1054
+ And we talked about human evaluation versus
1055
+ automatic metrics in details.
1056
+
1057
+ 0:36:54.034 --> 0:36:59.840
1058
+ So we introduced a lot of metric metrics.
1059
+
1060
+ 0:36:59.840 --> 0:37:10.348
1061
+ How do they compare from the quadrating of
1062
+ human assessment so it's better?
1063
+
1064
+ 0:37:12.052 --> 0:37:16.294
1065
+ I don't have the exact score and reference
1066
+ in my head.
1067
+
1068
+ 0:37:16.294 --> 0:37:22.928
1069
+ I would assume that mediators should have
1070
+ a better correlation because here they also
1071
+
1072
+ 0:37:22.928 --> 0:37:30.025
1073
+ consider other aspects like the recall whether
1074
+ the information in the reference is captured
1075
+
1076
+ 0:37:30.025 --> 0:37:31.568
1077
+ in the translation.
1078
+
1079
+ 0:37:32.872 --> 0:37:41.875
1080
+ Like synonyms, so I would assume that mid
1081
+ air is better, but again don't have the reference
1082
+
1083
+ 0:37:41.875 --> 0:37:43.441
1084
+ in my hair, so.
1085
+
1086
+ 0:37:43.903 --> 0:37:49.771
1087
+ But guess the reason people are still using
1088
+ BlueScore is that in most literature, a machine
1089
+
1090
+ 0:37:49.771 --> 0:38:00.823
1091
+ translation system, they report: So now you
1092
+ create a new machine translation system.
1093
+
1094
+ 0:38:00.823 --> 0:38:07.990
1095
+ It might be better to also report the blow.
1096
+
1097
+ 0:38:08.228 --> 0:38:11.472
1098
+ Exactly just slice good, just spread white,
1099
+ and then we're going to go ahead.
1100
+
1101
+ 0:38:12.332 --> 0:38:14.745
1102
+ And don't know what you're doing.
1103
+
1104
+ 0:38:17.457 --> 0:38:18.907
1105
+ I Want to Talk Quickly About.
1106
+
1107
+ 0:38:19.059 --> 0:38:32.902
1108
+ So it is like a language model, so it's kind
1109
+ of the same uses as.
1110
+
1111
+ 0:38:33.053 --> 0:38:39.343
1112
+ So the idea is that we have this layer in
1113
+ order to embed the sauce and the reference
1114
+
1115
+ 0:38:39.343 --> 0:38:39.713
1116
+ and.
1117
+
1118
+ 0:38:40.000 --> 0:38:54.199
1119
+ Into some feature vectors that we can later
1120
+ on use to predict the human sport in the.
1121
+
1122
+ 0:38:58.618 --> 0:39:00.051
1123
+ It If There's Nothing Else.
1124
+
demo_data/lectures/Lecture-05-02.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5014f3570b8db38818ab44ed117dc6d67206c5163b6b87b45df4a2aa426b8222
3
+ size 314238982
demo_data/lectures/Lecture-06-09.05.2023/English.vtt ADDED
@@ -0,0 +1,2970 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.721 --> 0:00:08.584
4
+ Hey, then welcome to today's lecture on language
5
+ modeling.
6
+
7
+ 0:00:09.409 --> 0:00:21.608
8
+ We had now a different view on machine translation,
9
+ which was the evaluation part; it's important
10
+
11
+ 0:00:21.608 --> 0:00:24.249
12
+ to evaluate and see.
13
+
14
+ 0:00:24.664 --> 0:00:33.186
15
+ We want to continue with building the MT system
16
+ and this will be the last part before we are
17
+
18
+ 0:00:33.186 --> 0:00:36.668
19
+ going into a neural step on Thursday.
20
+
21
+ 0:00:37.017 --> 0:00:45.478
22
+ So we had the broader view on statistical
23
+ machine translation and the.
24
+
25
+ 0:00:45.385 --> 0:00:52.977
26
+ Thursday: A week ago we talked about the statistical
27
+ machine translation and mainly the translation
28
+
29
+ 0:00:52.977 --> 0:00:59.355
30
+ model, so how we model how probable is it that
31
+ one word is translated into another.
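As a formula, the noisy-channel view that connects this translation model with the language model introduced below (standard notation, assumed here rather than quoted from the slides):

```latex
% Pick the target sentence y for source x that maximizes
% the translation model times the language model.
\hat{y} = \arg\max_{y} P(y \mid x)
        = \arg\max_{y} \underbrace{P(x \mid y)}_{\text{translation model}}
                       \, \underbrace{P(y)}_{\text{language model}}
```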
32
+
33
+ 0:01:00.800 --> 0:01:15.583
34
+ However, there is another component when doing
35
+ generation tasks in general and machine translation.
36
+
37
+ 0:01:16.016 --> 0:01:23.797
38
+ There are several characteristics which you
39
+ only need to model on the target side in the
40
+
41
+ 0:01:23.797 --> 0:01:31.754
42
+ traditional approach where we talked about
43
+ the generation from more semantic or synthectic
44
+
45
+ 0:01:31.754 --> 0:01:34.902
46
+ representation into the real world.
47
+
48
+ 0:01:35.555 --> 0:01:51.013
49
+ And the challenge is that there's some constructs
50
+ which are only there in the target language.
51
+
52
+ 0:01:52.132 --> 0:01:57.908
53
+ You cannot really get that translation, but
54
+ it's more something that needs to model on
55
+
56
+ 0:01:57.908 --> 0:01:58.704
57
+ the target.
58
+
59
+ 0:01:59.359 --> 0:02:05.742
60
+ And this is done typically by a language model
61
+ and this concept of language model.
62
+
63
+ 0:02:06.326 --> 0:02:11.057
64
+ Guess you can assume nowadays very important.
65
+
66
+ 0:02:11.057 --> 0:02:20.416
67
+ You've read a lot about large language models
68
+ recently and they are all somehow trained or
69
+
70
+ 0:02:20.416 --> 0:02:22.164
71
+ the idea behind.
72
+
73
+ 0:02:25.986 --> 0:02:41.802
74
+ What we'll look at today: we will first look at
75
+ what a language model is, and today's
76
+
77
+ 0:02:41.802 --> 0:02:42.992
78
+ focus.
79
+
80
+ 0:02:43.363 --> 0:02:49.188
81
+ This was the common approach to the language
82
+ model for twenty or thirty years, so a lot
83
+
84
+ 0:02:49.188 --> 0:02:52.101
85
+ of time it was really the state of the art.
86
+
87
+ 0:02:52.101 --> 0:02:58.124
88
+ And people have used that in many applications
89
+ in machine translation and automatic speech
90
+
91
+ 0:02:58.124 --> 0:02:58.985
92
+ recognition.
93
+
94
+ 0:02:59.879 --> 0:03:11.607
95
+ Again you are measuring the performance, but
96
+ this is purely the performance of the language
97
+
98
+ 0:03:11.607 --> 0:03:12.499
99
+ model.
100
+
101
+ 0:03:13.033 --> 0:03:23.137
102
+ And then we will see that the traditional
103
+ language will have a major drawback in how
104
+
105
+ 0:03:23.137 --> 0:03:24.683
106
+ we can deal.
107
+
108
+ 0:03:24.944 --> 0:03:32.422
109
+ So if you model language you will see that
110
+ in most of the sentences and you have not really
111
+
112
+ 0:03:32.422 --> 0:03:39.981
113
+ seen and you're still able to assess if this
114
+ is good language or if this is native language.
115
+
116
+ 0:03:40.620 --> 0:03:45.092
117
+ And this is challenging if you do just like
118
+ parameter estimation.
119
+
120
+ 0:03:45.605 --> 0:03:59.277
121
+ We are using two different techniques to do:
122
+ interpolation, and these are essentially in
123
+
124
+ 0:03:59.277 --> 0:04:01.735
125
+ order to build.
126
+
127
+ 0:04:01.881 --> 0:04:11.941
128
+ It also motivates why things might be easier
129
+ if we are going into neural morals as we will.
130
+
131
+ 0:04:12.312 --> 0:04:18.203
132
+ And at the end we'll talk a bit about some
133
+ additional type of language models which are
134
+
135
+ 0:04:18.203 --> 0:04:18.605
136
+ also.
137
+
138
+ 0:04:20.440 --> 0:04:29.459
139
+ So where our language was used, or how are
140
+ they used in the machine translations?
141
+
142
+ 0:04:30.010 --> 0:04:38.513
143
+ So the idea of a language model is that we
144
+ are modeling what is the fluency of language.
145
+
146
+ 0:04:38.898 --> 0:04:49.381
147
+ So if you have, for example, sentence will,
148
+ then you can estimate that there are some words:
149
+
150
+ 0:04:49.669 --> 0:05:08.929
151
+ For example, the next word is valid, but will
152
+ card's words not?
153
+
154
+ 0:05:09.069 --> 0:05:13.673
155
+ And we can do that.
156
+
157
+ 0:05:13.673 --> 0:05:22.192
158
+ We have seen that the noise channel.
159
+
160
+ 0:05:22.322 --> 0:05:33.991
161
+ That we have seen someone two weeks ago, and
162
+ today we will look into how can we model P
163
+
164
+ 0:05:33.991 --> 0:05:36.909
165
+ of Y or how possible.
166
+
167
+ 0:05:37.177 --> 0:05:44.192
168
+ Now this is completely independent of the
169
+ translation process.
170
+
171
+ 0:05:44.192 --> 0:05:49.761
172
+ How fluent is a sentence and how you can express?
173
+
174
+ 0:05:51.591 --> 0:06:01.699
175
+ And this language model task has one really
176
+ big advantage and assume that is even the big
177
+
178
+ 0:06:01.699 --> 0:06:02.935
179
+ advantage.
180
+
181
+ 0:06:03.663 --> 0:06:16.345
182
+ The big advantage is the data we need to train
183
+ that so normally we are doing supervised learning.
184
+
185
+ 0:06:16.876 --> 0:06:20.206
186
+ So machine translation will talk about.
187
+
188
+ 0:06:20.206 --> 0:06:24.867
189
+ That means we have the source center and target
190
+ center.
191
+
192
+ 0:06:25.005 --> 0:06:27.620
193
+ They need to be aligned.
194
+
195
+ 0:06:27.620 --> 0:06:31.386
196
+ We look into how we can model them.
197
+
198
+ 0:06:31.386 --> 0:06:39.270
199
+ Generally, the problem with this is that:
200
+ Machine translation: You still have the advantage
201
+
202
+ 0:06:39.270 --> 0:06:45.697
203
+ that there's quite huge amounts of this data
204
+ for many languages, not all but many, but other
205
+
206
+ 0:06:45.697 --> 0:06:47.701
207
+ classes even more difficult.
208
+
209
+ 0:06:47.701 --> 0:06:50.879
210
+ There's very few data where you have summary.
211
+
212
+ 0:06:51.871 --> 0:07:02.185
213
+ So the big advantage of language model is
214
+ we're only modeling the centers, so we only
215
+
216
+ 0:07:02.185 --> 0:07:04.103
217
+ need pure text.
218
+
219
+ 0:07:04.584 --> 0:07:11.286
220
+ And pure text, especially since we have the
221
+ Internet face melting large amounts of text.
222
+
223
+ 0:07:11.331 --> 0:07:17.886
224
+ Of course, it's still, it's still maybe only
225
+ for some domains, some type.
226
+
227
+ 0:07:18.198 --> 0:07:23.466
228
+ Want to have data for speech about machine
229
+ translation.
230
+
231
+ 0:07:23.466 --> 0:07:27.040
232
+ Maybe there's only limited data that.
233
+
234
+ 0:07:27.027 --> 0:07:40.030
235
+ There's always and also you go to some more
236
+ exotic languages and then you will have less
237
+
238
+ 0:07:40.030 --> 0:07:40.906
239
+ data.
240
+
241
+ 0:07:41.181 --> 0:07:46.803
242
+ And in language once we can now look, how
243
+ can we make use of these data?
244
+
245
+ 0:07:47.187 --> 0:07:54.326
246
+ And: Nowadays this is often also framed as
247
+ self supervised learning because on the one
248
+
249
+ 0:07:54.326 --> 0:08:00.900
250
+ hand here we'll see it's a type of classification
251
+ task or supervised learning, but we create the
252
+
253
+ 0:08:00.900 --> 0:08:02.730
254
+ labels from the data itself.
255
+
256
+ 0:08:02.742 --> 0:08:13.922
257
+ So it's not that we have this pair of data
258
+ text and labels, but we have only the text.
259
+
260
+ 0:08:15.515 --> 0:08:21.367
261
+ So the question is how can we use this monolingual
262
+ data and how can we train our language model?
263
+
264
+ 0:08:22.302 --> 0:08:35.086
265
+ The main goal is to produce fluent English,
266
+ so we want to somehow model that something
267
+
268
+ 0:08:35.086 --> 0:08:38.024
269
+ is a sentence of a language.
270
+
271
+ 0:08:38.298 --> 0:08:44.897
272
+ So there is no clear separation about semantics
273
+ and syntax, but in this case it is not about
274
+
275
+ 0:08:44.897 --> 0:08:46.317
276
+ a clear separation.
277
+
278
+ 0:08:46.746 --> 0:08:50.751
279
+ So we will monitor them somehow in there.
280
+
281
+ 0:08:50.751 --> 0:08:56.091
282
+ There will be some notion of semantics, some
283
+ notion of.
284
+
285
+ 0:08:56.076 --> 0:09:08.748
286
+ Because you say you want to model how fluent
287
+ or probable is that the native speaker is producing
288
+
289
+ 0:09:08.748 --> 0:09:12.444
290
+ that because of the one in.
291
+
292
+ 0:09:12.512 --> 0:09:17.711
293
+ We are rarely talking like things that are
294
+ semantically wrong, and therefore there is
295
+
296
+ 0:09:17.711 --> 0:09:18.679
297
+ also some type.
298
+
299
+ 0:09:19.399 --> 0:09:24.048
300
+ So, for example, the house is small.
301
+
302
+ 0:09:24.048 --> 0:09:30.455
303
+ It should have a higher probability than the house
304
+ is.
305
+
306
+ 0:09:31.251 --> 0:09:38.112
307
+ Because home and house both mean the same in German,
308
+ they are used differently.
309
+
310
+ 0:09:38.112 --> 0:09:43.234
311
+ For example, it should be more probable that
312
+ the plane.
313
+
314
+ 0:09:44.444 --> 0:09:51.408
315
+ So this is both syntactically correct, but
316
+ semantically not.
317
+
318
+ 0:09:51.408 --> 0:09:58.372
319
+ But still you will see much more often the
320
+ probability that.
321
+
322
+ 0:10:03.883 --> 0:10:14.315
323
+ So more formally, it's about like the language
324
+ should be some type of function, and it gives
325
+
326
+ 0:10:14.315 --> 0:10:18.690
327
+ us the probability that this sentence.
328
+
329
+ 0:10:19.519 --> 0:10:27.312
330
+ Indicating that this is good English or more
331
+ generally English, of course you can do that.
332
+
333
+ 0:10:28.448 --> 0:10:37.609
334
+ And earlier times people have even done try
335
+ to do that deterministic that was especially
336
+
337
+ 0:10:37.609 --> 0:10:40.903
338
+ used for more dialogue systems.
339
+
340
+ 0:10:40.840 --> 0:10:50.660
341
+ You have a very strict syntax so you can only
342
+ use like turn off the, turn off the radio.
343
+
344
+ 0:10:50.690 --> 0:10:56.928
345
+ Something else, but you have a very strict
346
+ deterministic finite state grammar like which
347
+
348
+ 0:10:56.928 --> 0:10:58.107
349
+ type of phrases.
350
+
351
+ 0:10:58.218 --> 0:11:04.791
352
+ The problem of course if we're dealing with
353
+ language is that language is variable, we're
354
+
355
+ 0:11:04.791 --> 0:11:10.183
356
+ not always talking correct sentences, and so
357
+ this type of deterministic.
358
+
359
+ 0:11:10.650 --> 0:11:22.121
360
+ That's why for already many, many years people
361
+ look into statistical language models and try
362
+
363
+ 0:11:22.121 --> 0:11:24.587
364
+ to model something.
365
+
366
+ 0:11:24.924 --> 0:11:35.096
367
+ So something like what is the probability
368
+ of the sequences of to, and that is what.
369
+
370
+ 0:11:35.495 --> 0:11:43.076
371
+ The advantage of doing it statistically is
372
+ that we can train large text databases so we
373
+
374
+ 0:11:43.076 --> 0:11:44.454
375
+ can train them.
376
+
377
+ 0:11:44.454 --> 0:11:52.380
378
+ We don't have to define it and most of these
379
+ cases we don't want to have the hard decision.
380
+
381
+ 0:11:52.380 --> 0:11:55.481
382
+ This is a sentence of the language.
383
+
384
+ 0:11:55.815 --> 0:11:57.914
385
+ Why we want to have some type of probability?
386
+
387
+ 0:11:57.914 --> 0:11:59.785
388
+ How probable is this part of the center?
389
+
390
+ 0:12:00.560 --> 0:12:04.175
391
+ Because yeah, even for a few minutes, it's
392
+ not always clear.
393
+
394
+ 0:12:04.175 --> 0:12:06.782
395
+ Is this a sentence that you can use or not?
396
+
397
+ 0:12:06.782 --> 0:12:12.174
398
+ I mean, I just in this presentation gave several
399
+ sentences, which are not correct English.
400
+
401
+ 0:12:12.174 --> 0:12:17.744
402
+ So it might still happen that people speak
403
+ sentences or write sentences that I'm not correct,
404
+
405
+ 0:12:17.744 --> 0:12:19.758
406
+ and you want to deal with all of.
407
+
408
+ 0:12:20.020 --> 0:12:25.064
409
+ So that is then, of course, a big advantage
410
+ if you use your more statistical models.
411
+
412
+ 0:12:25.705 --> 0:12:35.810
413
+ The disadvantage is that you need a suitably
414
+ large text database, which might exist for
415
+
416
+ 0:12:35.810 --> 0:12:37.567
417
+ many languages.
418
+
419
+ 0:12:37.857 --> 0:12:46.511
420
+ Nowadays you see that there is of course issues
421
+ that you need large computational resources
422
+
423
+ 0:12:46.511 --> 0:12:47.827
424
+ to deal with.
425
+
426
+ 0:12:47.827 --> 0:12:56.198
427
+ You need to collect all these crawlers on
428
+ the internet which can create enormous amounts
429
+
430
+ 0:12:56.198 --> 0:12:57.891
431
+ of training data.
432
+
433
+ 0:12:58.999 --> 0:13:08.224
434
+ So if we want to build this then the question
435
+ is of course how can we estimate the probability?
436
+
437
+ 0:13:08.448 --> 0:13:10.986
438
+ So how probable is the sentence good morning?
439
+
440
+ 0:13:11.871 --> 0:13:15.450
441
+ And you all know basic statistics.
442
+
443
+ 0:13:15.450 --> 0:13:21.483
444
+ So if you see this you have a large database
445
+ of sentences.
446
+
447
+ 0:13:21.901 --> 0:13:28.003
448
+ Made this a real example, so this was from
449
+ the TED talks.
450
+
451
+ 0:13:28.003 --> 0:13:37.050
452
+ I guess most of you have heard about them,
453
+ and if you account for all many sentences,
454
+
455
+ 0:13:37.050 --> 0:13:38.523
456
+ good morning.
457
+
458
+ 0:13:38.718 --> 0:13:49.513
459
+ It happens so the probability of good morning
460
+ is three point something times ten to the power minus.
461
+
462
+ 0:13:50.030 --> 0:13:53.755
463
+ Okay, so this is a very easy thing.
464
+
465
+ 0:13:53.755 --> 0:13:58.101
466
+ We can directly model the language model.
467
+
468
+ 0:13:58.959 --> 0:14:03.489
469
+ Does anybody see a problem why this might
470
+ not be the final solution?
471
+
472
+ 0:14:06.326 --> 0:14:14.962
473
+ Think we would need a folder of more sentences
474
+ to make anything useful of this.
475
+
476
+ 0:14:15.315 --> 0:14:29.340
477
+ Because the probability of the talk starting
478
+ with good morning, good morning is much higher
479
+
480
+ 0:14:29.340 --> 0:14:32.084
481
+ than ten minutes.
482
+
483
+ 0:14:33.553 --> 0:14:41.700
484
+ In all the probability presented in this face,
485
+ not how we usually think about it.
486
+
487
+ 0:14:42.942 --> 0:14:55.038
488
+ The probability is even OK, but you're going
489
+ into the right direction about the large data.
490
+
491
+ 0:14:55.038 --> 0:14:59.771
492
+ Yes, you can't form a new sentence.
493
+
494
+ 0:15:00.160 --> 0:15:04.763
495
+ It's about a large data, so you said it's
496
+ hard to get enough data.
497
+
498
+ 0:15:04.763 --> 0:15:05.931
499
+ It's impossible.
500
+
501
+ 0:15:05.931 --> 0:15:11.839
502
+ I would say we are always saying sentences
503
+ which have never been said and we are able
504
+
505
+ 0:15:11.839 --> 0:15:12.801
506
+ to deal with.
507
+
508
+ 0:15:13.133 --> 0:15:25.485
509
+ The problem with the sparsity of the data
510
+ will have a lot of perfect English sentences.
511
+
512
+ 0:15:26.226 --> 0:15:31.338
513
+ And this is, of course, not what we want to
514
+ deal with.
515
+
516
+ 0:15:31.338 --> 0:15:39.332
517
+ If we want to model that, we need to have
518
+ a model which can really estimate how good.
519
+
520
+ 0:15:39.599 --> 0:15:47.970
521
+ And if we are just like counting this way,
522
+ most of it will get a zero probability, which
523
+
524
+ 0:15:47.970 --> 0:15:48.722
525
+ is not.
526
+
527
+ 0:15:49.029 --> 0:15:56.572
528
+ So we need to make things a bit different.
529
+
530
+ 0:15:56.572 --> 0:16:06.221
531
+ For the models we had already some idea of
532
+ doing that.
533
+
534
+ 0:16:06.486 --> 0:16:08.058
535
+ And that we can do here again.
536
+
537
+ 0:16:08.528 --> 0:16:12.866
538
+ So we can especially use the chain rule.
539
+
540
+ 0:16:12.772 --> 0:16:19.651
541
+ The chain rule and the definition of conditional
542
+ probability solve the conditional probability.
543
+
544
+ 0:16:19.599 --> 0:16:26.369
545
+ Of an event B given in an event A is the probability
546
+ of A and B divided to the probability of A.
547
+
548
+ 0:16:26.369 --> 0:16:32.720
549
+ Yes, I recently had an exam on automatic speech
550
+ recognition and the examiner said this is not
551
+
552
+ 0:16:32.720 --> 0:16:39.629
553
+ called a chain rule because I use this terminology,
554
+ and he said it's just applying Bayes another time.
555
+
556
+ 0:16:40.500 --> 0:16:56.684
557
+ But this is definitely the definition of the
558
+ condition of probability.
559
+
560
+ 0:16:57.137 --> 0:17:08.630
561
+ The probability is defined as P of A and B
562
+ divided by P of A.
563
+
564
+ 0:17:08.888 --> 0:17:16.392
565
+ And that can be easily rewritten into P of A
566
+ times P of B given A.
567
+
568
+ 0:17:16.816 --> 0:17:35.279
569
+ And the nice thing is, we can easily extend
570
+ it, of course, into more variables so we can
571
+
572
+ 0:17:35.279 --> 0:17:38.383
573
+ have: And so on.
574
+
575
+ 0:17:38.383 --> 0:17:49.823
576
+ So more generally you can do that for now
577
+ any length of sequence.
578
+
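As a compact reference, the chain-rule factorization described above can be written (a sketch in my own LaTeX notation, not from the lecture slides) as:

    P(w_1, w_2, \dots, w_n) = \prod_{i=1}^{n} P(w_i \mid w_1, \dots, w_{i-1})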
579
+ 0:17:50.650 --> 0:18:04.802
580
+ So if we are now going back to words, we can
581
+ model that as the probability of the sequence
582
+
583
+ 0:18:04.802 --> 0:18:08.223
584
+ is given its history.
585
+
586
+ 0:18:08.908 --> 0:18:23.717
587
+ Maybe it's more clear if we're looking at
588
+ real words, so if we have P of its water
589
+
590
+ 0:18:23.717 --> 0:18:26.914
591
+ is so transparent.
592
+
593
+ 0:18:26.906 --> 0:18:39.136
594
+ So this way we are able to model the probability
595
+ of the whole sentence given the sequence by
596
+
597
+ 0:18:39.136 --> 0:18:42.159
598
+ looking at each word.
599
+
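A minimal Python sketch of this word-by-word scoring; p_word is a hypothetical stand-in for whatever model supplies P(word | history), it is not something defined in the lecture:

    def sentence_probability(words, p_word):
        # multiply P(w_i | w_1 ... w_{i-1}) over all positions, as in the chain rule
        prob = 1.0
        for i, w in enumerate(words):
            prob *= p_word(w, words[:i])
        return prob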
600
+ 0:18:42.762 --> 0:18:49.206
601
+ And of course the big advantage is that each
602
+ word occurs less often than the full sentence.
603
+
604
+ 0:18:49.206 --> 0:18:54.991
605
+ So hopefully we see that still, of course,
606
+ the problem the word doesn't occur.
607
+
608
+ 0:18:54.991 --> 0:19:01.435
609
+ Then this doesn't work, but let's recover
610
+ most of the lectures today about dealing with
611
+
612
+ 0:19:01.435 --> 0:19:01.874
613
+ this.
614
+
615
+ 0:19:02.382 --> 0:19:08.727
616
+ So by first of all, we generally is at least
617
+ easier as the thing we have before.
618
+
619
+ 0:19:13.133 --> 0:19:23.531
620
+ That we really make sense easier, no, because
621
+ those jumps get utterly long and we have central.
622
+
623
+ 0:19:23.943 --> 0:19:29.628
624
+ Yes exactly, so when we look at the last probability
625
+ here, we still have to have seen the full.
626
+
627
+ 0:19:30.170 --> 0:19:38.146
628
+ So if we want a molecule of transparent, if
629
+ water is so we have to see the food sequence.
630
+
631
+ 0:19:38.578 --> 0:19:48.061
632
+ So in first step we didn't really have to
633
+ have seen the full sentence.
634
+
635
+ 0:19:48.969 --> 0:19:52.090
636
+ However, a little bit of a step nearer.
637
+
638
+ 0:19:52.512 --> 0:19:59.673
639
+ So this is still a problem and we will never
640
+ have seen it for all the time.
641
+
642
+ 0:20:00.020 --> 0:20:08.223
643
+ So you can look at this if you have a vocabulary
644
+ of words.
645
+
646
+ 0:20:08.223 --> 0:20:17.956
647
+ Now, for example, if the average sentence
648
+ is, you would leave to the.
649
+
650
+ 0:20:18.298 --> 0:20:22.394
651
+ And we are quite sure we have never seen that
652
+ much data.
653
+
654
+ 0:20:22.902 --> 0:20:26.246
655
+ So this is, we cannot really compute this
656
+ probability.
657
+
658
+ 0:20:26.786 --> 0:20:37.794
659
+ However, there's a trick how we can do that
660
+ and that's the idea behind most of the language models.
661
+
662
+ 0:20:38.458 --> 0:20:44.446
663
+ So instead of saying how often does this work
664
+ happen to exactly this history, we are trying
665
+
666
+ 0:20:44.446 --> 0:20:50.433
667
+ to do some kind of clustering and cluster a
668
+ lot of different histories into the same class,
669
+
670
+ 0:20:50.433 --> 0:20:55.900
671
+ and then we are modeling the probability of
672
+ the word given this class of histories.
673
+
674
+ 0:20:56.776 --> 0:21:06.245
675
+ And then, of course, the big design decision
676
+ is how to be modeled like how to cluster history.
677
+
678
+ 0:21:06.666 --> 0:21:17.330
679
+ So how do we put all these histories together
680
+ so that we have seen each of one off enough
681
+
682
+ 0:21:17.330 --> 0:21:18.396
683
+ so that.
684
+
685
+ 0:21:20.320 --> 0:21:25.623
686
+ So there is quite different types of things
687
+ people can do.
688
+
689
+ 0:21:25.623 --> 0:21:33.533
690
+ You can add some speech texts, you can do
691
+ semantic words, you can model the similarity,
692
+
693
+ 0:21:33.533 --> 0:21:46.113
694
+ you can model grammatical content, and things
695
+ like: However, like quite often in these statistical
696
+
697
+ 0:21:46.113 --> 0:21:53.091
698
+ models, if you have a very simple solution.
699
+
700
+ 0:21:53.433 --> 0:21:58.455
701
+ And this is what most statistical models do.
702
+
703
+ 0:21:58.455 --> 0:22:09.616
704
+ They are based on the so-called Markov assumption,
705
+ and that means we are assuming all this history
706
+
707
+ 0:22:09.616 --> 0:22:12.183
708
+ is not that important.
709
+
710
+ 0:22:12.792 --> 0:22:25.895
711
+ So we are modeling the probability of transparent given
712
+ is so, or we have maybe only the last two
713
+
714
+ 0:22:25.895 --> 0:22:29.534
715
+ words by having a fixed history length.
716
+
717
+ 0:22:29.729 --> 0:22:38.761
718
+ So the class of all our history from word
719
+ to word minus one is just the last two words.
720
+
721
+ 0:22:39.679 --> 0:22:45.229
722
+ And by doing this classification, which of
723
+ course does not need any additional knowledge.
724
+
725
+ 0:22:45.545 --> 0:22:51.176
726
+ It's very easy to calculate and we have now limited
727
+ our histories.
728
+
729
+ 0:22:51.291 --> 0:23:00.906
730
+ So instead of an arbitrary long one here,
731
+ we have here only like.
732
+
733
+ 0:23:00.906 --> 0:23:10.375
734
+ For example, if we have two grams, a lot of
735
+ them will not occur.
736
+
737
+ 0:23:10.930 --> 0:23:20.079
738
+ So it's a very simple trick to make all these
739
+ classes into a few classes and motivated by,
740
+
741
+ 0:23:20.079 --> 0:23:24.905
742
+ of course, the language the nearest things
743
+ are.
744
+
745
+ 0:23:24.944 --> 0:23:33.043
746
+ Like a lot of sequences, they mainly depend
747
+ on the previous one, and things which are far
748
+
749
+ 0:23:33.043 --> 0:23:33.583
750
+ away.
751
+
752
+ 0:23:38.118 --> 0:23:47.361
753
+ In our product here everything is just modeled
754
+ not by the whole history but by the last n
755
+
756
+ 0:23:47.361 --> 0:23:48.969
757
+ minus one words.
758
+
759
+ 0:23:50.470 --> 0:23:54.322
760
+ So and this is typically expressed by people.
761
+
762
+ 0:23:54.322 --> 0:24:01.776
763
+ They're therefore also talking by an N gram
764
+ language model because we are always looking
765
+
766
+ 0:24:01.776 --> 0:24:06.550
767
+ at these chunks of N words and modeling the
768
+ probability.
769
+
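In symbols (my notation, with n the n-gram order), the Markov assumption described here truncates each history to the last n-1 words:

    P(w_i \mid w_1, \dots, w_{i-1}) \approx P(w_i \mid w_{i-n+1}, \dots, w_{i-1})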
770
+ 0:24:07.527 --> 0:24:10.485
771
+ So again start with the most simple case.
772
+
773
+ 0:24:10.485 --> 0:24:15.485
774
+ Even extreme is the unigram case, so we're
775
+ ignoring the whole history.
776
+
777
+ 0:24:15.835 --> 0:24:24.825
778
+ The probability of a sequence of words is
779
+ just the probability of each of the words in
780
+
781
+ 0:24:24.825 --> 0:24:25.548
782
+ there.
783
+
784
+ 0:24:26.046 --> 0:24:32.129
785
+ And therefore we are removing the whole context.
786
+
787
+ 0:24:32.129 --> 0:24:40.944
788
+ The most probable sequence would be something
789
+ like one of them is the.
790
+
791
+ 0:24:42.162 --> 0:24:44.694
792
+ Most probable wordsuit by itself.
793
+
794
+ 0:24:44.694 --> 0:24:49.684
795
+ It might not make sense, but it, of course,
796
+ can give you a bit of.
797
+
798
+ 0:24:49.629 --> 0:24:52.682
799
+ Intuition like which types of words should
800
+ be more frequent.
801
+
802
+ 0:24:53.393 --> 0:25:00.012
803
+ And if you what you can do is train such a
804
+ button and you can just automatically generate.
805
+
806
+ 0:25:00.140 --> 0:25:09.496
807
+ And this sequence is generated by sampling,
808
+ so we will later come in the lecture too.
809
+
810
+ 0:25:09.496 --> 0:25:16.024
811
+ The sampling is that you randomly pick a word
812
+ but based on.
813
+
814
+ 0:25:16.096 --> 0:25:22.711
815
+ So if the probability of one word is zero
816
+ point two then you'll put it on and if another
817
+
818
+ 0:25:22.711 --> 0:25:23.157
819
+ word.
820
+
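A minimal sketch of this sampling idea, assuming a toy unigram distribution with made-up probabilities; random.choices draws words proportionally to their weights:

    import random

    unigram_probs = {"the": 0.2, "of": 0.1, "is": 0.1, "house": 0.05, "small": 0.02}
    words, weights = zip(*unigram_probs.items())
    # draw ten tokens, each independently of the previous ones
    print(random.choices(words, weights=weights, k=10))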
821
+ 0:25:23.483 --> 0:25:36.996
822
+ And if you see that you'll see here now, for
823
+ example, it seems that these are two occurring
824
+
825
+ 0:25:36.996 --> 0:25:38.024
826
+ posts.
827
+
828
+ 0:25:38.138 --> 0:25:53.467
829
+ But you see there's not really any continuing
830
+ type of structure because each word is modeled
831
+
832
+ 0:25:53.467 --> 0:25:55.940
833
+ independently.
834
+
835
+ 0:25:57.597 --> 0:26:03.037
836
+ But you can do better by going to
837
+ a bigram, so then we're having a bit of context.
838
+
839
+ 0:26:03.037 --> 0:26:08.650
840
+ Of course, it's still very small, so the probability
841
+ of your word of the actual word only depends
842
+
843
+ 0:26:08.650 --> 0:26:12.429
844
+ on the previous word and all the context before
845
+ there is ignored.
846
+
847
+ 0:26:13.133 --> 0:26:18.951
848
+ This of course will come to that wrong, but
849
+ it models a regular language significantly
850
+
851
+ 0:26:18.951 --> 0:26:19.486
852
+ better.
853
+
854
+ 0:26:19.779 --> 0:26:28.094
855
+ Seeing some things here still doesn't really
856
+ make a lot of sense, but you're seeing some
857
+
858
+ 0:26:28.094 --> 0:26:29.682
859
+ typical phrases.
860
+
861
+ 0:26:29.949 --> 0:26:39.619
862
+ In this hope doesn't make sense, but in this
863
+ issue is also frequent.
864
+
865
+ 0:26:39.619 --> 0:26:51.335
866
+ Issue is also: Very nice is this year new
867
+ car parking lot after, so if you have the word
868
+
869
+ 0:26:51.335 --> 0:26:53.634
870
+ new then the word.
871
+
872
+ 0:26:53.893 --> 0:27:01.428
873
+ Is also quite common, but new car they wouldn't
874
+ put parking.
875
+
876
+ 0:27:01.428 --> 0:27:06.369
877
+ Often the continuation is parking lot.
878
+
879
+ 0:27:06.967 --> 0:27:12.417
880
+ And now it's very interesting because here
881
+ we see the two semantic meanings of lot: You
882
+
883
+ 0:27:12.417 --> 0:27:25.889
884
+ have a parking lot, but in general if you just
885
+ think about the history, the most common use
886
+
887
+ 0:27:25.889 --> 0:27:27.353
888
+ is a lot.
889
+
890
+ 0:27:27.527 --> 0:27:33.392
891
+ So you see that he's really not using the
892
+ context before, but he's only using the current
893
+
894
+ 0:27:33.392 --> 0:27:33.979
895
+ context.
896
+
897
+ 0:27:38.338 --> 0:27:41.371
898
+ So in general we can of course do that longer.
899
+
900
+ 0:27:41.371 --> 0:27:43.888
901
+ We can do unigrams, bigrams, trigrams.
902
+
903
+ 0:27:45.845 --> 0:27:52.061
904
+ People typically went up to four or five grams,
905
+ and then it's getting difficult because.
906
+
907
+ 0:27:52.792 --> 0:27:56.671
908
+ There are so many five grams that it's getting
909
+ complicated.
910
+
911
+ 0:27:56.671 --> 0:28:02.425
912
+ Storing all of them and storing these models
913
+ get so big that it's no longer working, and
914
+
915
+ 0:28:02.425 --> 0:28:08.050
916
+ of course at some point the calculation of
917
+ the probabilities again gets too difficult,
918
+
919
+ 0:28:08.050 --> 0:28:09.213
920
+ and each of them.
921
+
922
+ 0:28:09.429 --> 0:28:14.777
923
+ If you have a small corpus, of course you
924
+ will use a smaller n-gram length.
925
+
926
+ 0:28:14.777 --> 0:28:16.466
927
+ You will take a larger.
928
+
929
+ 0:28:18.638 --> 0:28:24.976
930
+ What is important to keep in mind is that,
931
+ of course, this is wrong.
932
+
933
+ 0:28:25.285 --> 0:28:36.608
934
+ So we have long range dependencies, and if
935
+ we really want to model everything in language
936
+
937
+ 0:28:36.608 --> 0:28:37.363
938
+ then.
939
+
940
+ 0:28:37.337 --> 0:28:46.965
941
+ So here is like one of these extreme cases,
942
+ the computer, which has just put into the machine
943
+
944
+ 0:28:46.965 --> 0:28:49.423
945
+ room in the slow crash.
946
+
947
+ 0:28:49.423 --> 0:28:55.978
948
+ Like somehow, there is a dependency between
949
+ computer and crash.
950
+
951
+ 0:28:57.978 --> 0:29:10.646
952
+ However, in most situations these are typically
953
+ rare and normally most important things happen
954
+
955
+ 0:29:10.646 --> 0:29:13.446
956
+ in the near context.
957
+
958
+ 0:29:15.495 --> 0:29:28.408
959
+ But of course it's important to keep that
960
+ in mind that you can't model the thing so you
961
+
962
+ 0:29:28.408 --> 0:29:29.876
963
+ can't do.
964
+
965
+ 0:29:33.433 --> 0:29:50.200
966
+ The next question is again how can we train
967
+ so we have to estimate these probabilities.
968
+
969
+ 0:29:51.071 --> 0:30:00.131
970
+ And the question is how we do that, and again
971
+ the most simple thing.
972
+
973
+ 0:30:00.440 --> 0:30:03.168
974
+ The thing is exactly what's maximum likelihood
975
+ estimation.
976
+
977
+ 0:30:03.168 --> 0:30:12.641
978
+ What gives you the right answer is: So how
979
+ probable is that the word is following minus
980
+
981
+ 0:30:12.641 --> 0:30:13.370
982
+ one?
983
+
984
+ 0:30:13.370 --> 0:30:20.946
985
+ You just count how often does this sequence
986
+ happen?
987
+
988
+ 0:30:21.301 --> 0:30:28.165
989
+ So guess this is what most of you would have
990
+ intuitively done, and this also works best.
991
+
992
+ 0:30:28.568 --> 0:30:39.012
993
+ So it's not a complicated train, so you once
994
+ have to go over your corpus, you have to count
995
+
996
+ 0:30:39.012 --> 0:30:48.662
997
+ our bigrams and unigrams, and then you can
998
+ directly train the basic language model.
999
+
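A minimal sketch of this training procedure in Python, assuming a toy tokenized corpus (the sentences and variable names are illustrative, not the lecture's data): one pass to count unigrams and bigrams, then relative frequencies give the maximum likelihood estimates:

    from collections import Counter

    corpus = [["<s>", "i", "am", "sam", "</s>"],
              ["<s>", "sam", "i", "am", "</s>"]]

    unigrams = Counter(w for sent in corpus for w in sent)
    bigrams = Counter((s[i], s[i + 1]) for s in corpus for i in range(len(s) - 1))

    def p_mle(word, prev):
        # P(word | prev) = count(prev, word) / count(prev)
        return bigrams[(prev, word)] / unigrams[prev]

    print(p_mle("i", "<s>"))  # 0.5: "i" starts one of the two toy sentences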
1000
+ 0:30:49.189 --> 0:30:50.651
1001
+ Who is it difficult?
1002
+
1003
+ 0:30:50.651 --> 0:30:58.855
1004
+ There are two difficulties: The basic language
1005
+ well doesn't work that well because of zero
1006
+
1007
+ 0:30:58.855 --> 0:31:03.154
1008
+ counts and how we address that and the second.
1009
+
1010
+ 0:31:03.163 --> 0:31:13.716
1011
+ Because we saw that especially if you go for
1012
+ larger you have to store all these engrams
1013
+
1014
+ 0:31:13.716 --> 0:31:15.275
1015
+ efficiently.
1016
+
1017
+ 0:31:17.697 --> 0:31:21.220
1018
+ So how we can do that?
1019
+
1020
+ 0:31:21.220 --> 0:31:24.590
1021
+ Here's some examples.
1022
+
1023
+ 0:31:24.590 --> 0:31:33.626
1024
+ For example, if you have the sequence your
1025
+ training curve.
1026
+
1027
+ 0:31:33.713 --> 0:31:41.372
1028
+ You see that the word happens, ascends the
1029
+ star and the sequence happens two times.
1030
+
1031
+ 0:31:42.182 --> 0:31:45.651
1032
+ We have three times.
1033
+
1034
+ 0:31:45.651 --> 0:31:58.043
1035
+ The same starts as the probability is to thirds
1036
+ and the other probability.
1037
+
1038
+ 0:31:58.858 --> 0:32:09.204
1039
+ Here we have what is following so you have
1040
+ twice and once do so again two thirds and one.
1041
+
1042
+ 0:32:09.809 --> 0:32:20.627
1043
+ And this is all that you need to know here
1044
+ about it, so you can do this calculation.
1045
+
1046
+ 0:32:23.723 --> 0:32:35.506
1047
+ So the question then, of course, is what do
1048
+ we really learn in these types of models?
1049
+
1050
+ 0:32:35.506 --> 0:32:45.549
1051
+ Here are examples from the Europarl corpus:
1052
+ The green, the red, and the blue, and here
1053
+
1054
+ 0:32:45.549 --> 0:32:48.594
1055
+ you have the probabilities which is the next.
1056
+
1057
+ 0:32:48.989 --> 0:33:01.897
1058
+ That there is a lot more than just like the
1059
+ syntax because the initial phrase is all the
1060
+
1061
+ 0:33:01.897 --> 0:33:02.767
1062
+ same.
1063
+
1064
+ 0:33:03.163 --> 0:33:10.132
1065
+ For example, you see the green paper in the
1066
+ green group.
1067
+
1068
+ 0:33:10.132 --> 0:33:16.979
1069
+ It's more European Parliament, the red cross,
1070
+ which is by.
1071
+
1072
+ 0:33:17.197 --> 0:33:21.777
1073
+ What you also see that it's like sometimes
1074
+ Indian, sometimes it's more difficult.
1075
+
1076
+ 0:33:22.302 --> 0:33:28.345
1077
+ So, for example, following the rats, in one
1078
+ hundred cases it was a red cross.
1079
+
1080
+ 0:33:28.668 --> 0:33:48.472
1081
+ So it seems to be easier to guess the next
1082
+ word.
1083
+
1084
+ 0:33:48.528 --> 0:33:55.152
1085
+ So there is different types of information
1086
+ coded in that you also know that I guess sometimes
1087
+
1088
+ 0:33:55.152 --> 0:33:58.675
1089
+ you directly know all the speakers will continue.
1090
+
1091
+ 0:33:58.675 --> 0:34:04.946
1092
+ It's not a lot of new information in the next
1093
+ word, but in other cases like blue there's
1094
+
1095
+ 0:34:04.946 --> 0:34:06.496
1096
+ a lot of information.
1097
+
1098
+ 0:34:11.291 --> 0:34:14.849
1099
+ Another example is this Berkeley restaurant
1100
+ sentences.
1101
+
1102
+ 0:34:14.849 --> 0:34:21.059
1103
+ It's collected at Berkeley and you have sentences
1104
+ like can you tell me about any good spaghetti
1105
+
1106
+ 0:34:21.059 --> 0:34:21.835
1107
+ restaurant.
1108
+
1109
+ 0:34:21.835 --> 0:34:27.463
1110
+ Mid-priced Thai food is what I'm looking for, so
1111
+ it's more like a dialogue system and people
1112
+
1113
+ 0:34:27.463 --> 0:34:31.215
1114
+ have collected this data and of course you
1115
+ can also look.
1116
+
1117
+ 0:34:31.551 --> 0:34:46.878
1118
+ Into this and get the counts, so you count
1119
+ the vibrants in the top, so the color is the.
1120
+
1121
+ 0:34:49.409 --> 0:34:52.912
1122
+ This is a bigram which is the first word of
1123
+ West.
1124
+
1125
+ 0:34:52.912 --> 0:34:54.524
1126
+ This one fuzzy is one.
1127
+
1128
+ 0:34:56.576 --> 0:35:12.160
1129
+ One because want to hyperability, but want
1130
+ a lot less, and there where you see it, for
1131
+
1132
+ 0:35:12.160 --> 0:35:17.004
1133
+ example: So here you see after I want.
1134
+
1135
+ 0:35:17.004 --> 0:35:23.064
1136
+ It's very often for I eat, but an island which
1137
+ is not just.
1138
+
1139
+ 0:35:27.347 --> 0:35:39.267
1140
+ The absolute counts of how often each word
1141
+ occurs, and then you can see here the probabilities
1142
+
1143
+ 0:35:39.267 --> 0:35:40.145
1144
+ again.
1145
+
1146
+ 0:35:42.422 --> 0:35:54.519
1147
+ Then do that if you want to do iwan Dutch
1148
+ food you get the sequence you have to multiply
1149
+
1150
+ 0:35:54.519 --> 0:35:55.471
1151
+ olive.
1152
+
1153
+ 0:35:55.635 --> 0:36:00.281
1154
+ And then you of course get a bit of interesting
1155
+ experience on that.
1156
+
1157
+ 0:36:00.281 --> 0:36:04.726
1158
+ For example: Information is there.
1159
+
1160
+ 0:36:04.726 --> 0:36:15.876
1161
+ So, for example, if you compare I want Dutch
1162
+ or I want Chinese, it seems that.
1163
+
1164
+ 0:36:16.176 --> 0:36:22.910
1165
+ That the sentence often starts with eye.
1166
+
1167
+ 0:36:22.910 --> 0:36:31.615
1168
+ You have it after two is possible, but after
1169
+ one it.
1170
+
1171
+ 0:36:31.731 --> 0:36:39.724
1172
+ And you cannot say want, but you have to say
1173
+ want to spend, so there's grammical information.
1174
+
1175
+ 0:36:40.000 --> 0:36:51.032
1176
+ To main information and source: Here before
1177
+ we're going into measuring quality, is there
1178
+
1179
+ 0:36:51.032 --> 0:36:58.297
1180
+ any questions about language model and the
1181
+ idea of modeling?
1182
+
1183
+ 0:37:02.702 --> 0:37:13.501
1184
+ Hope that doesn't mean everybody sleeping,
1185
+ and so when we're doing the training these
1186
+
1187
+ 0:37:13.501 --> 0:37:15.761
1188
+ language models,.
1189
+
1190
+ 0:37:16.356 --> 0:37:26.429
1191
+ You need to model what is the engrum length
1192
+ should we use a trigram or a forkrum.
1193
+
1194
+ 0:37:27.007 --> 0:37:34.040
1195
+ So in order to decide how can you now decide
1196
+ which of the two models are better?
1197
+
1198
+ 0:37:34.914 --> 0:37:40.702
1199
+ And if you would have to do that, how would
1200
+ you decide taking language model or taking
1201
+
1202
+ 0:37:40.702 --> 0:37:41.367
1203
+ language?
1204
+
1205
+ 0:37:43.263 --> 0:37:53.484
1206
+ I take some test text and see which model
1207
+ assigns a higher probability to me.
1208
+
1209
+ 0:37:54.354 --> 0:38:03.978
1210
+ It's very good, so that's even the second
1211
+ thing, so the first thing maybe would have
1212
+
1213
+ 0:38:03.978 --> 0:38:04.657
1214
+ been.
1215
+
1216
+ 0:38:05.925 --> 0:38:12.300
1217
+ The problem is the and then you take the language
1218
+ language language and machine translation.
1219
+
1220
+ 0:38:13.193 --> 0:38:18.773
1221
+ Problems: First of all you have to build a
1222
+ whole system which is very time consuming and
1223
+
1224
+ 0:38:18.773 --> 0:38:21.407
1225
+ it might not only depend on the language.
1226
+
1227
+ 0:38:21.407 --> 0:38:24.730
1228
+ On the other hand, that's of course what the
1229
+ end is.
1230
+
1231
+ 0:38:24.730 --> 0:38:30.373
1232
+ The end want and the pressure will model each
1233
+ component individually or do you want to do
1234
+
1235
+ 0:38:30.373 --> 0:38:31.313
1236
+ an end to end.
1237
+
1238
+ 0:38:31.771 --> 0:38:35.463
1239
+ What can also happen is you'll see your metric
1240
+ model.
1241
+
1242
+ 0:38:35.463 --> 0:38:41.412
1243
+ This is a very good language model, but it
1244
+ somewhat doesn't really work well with your
1245
+
1246
+ 0:38:41.412 --> 0:38:42.711
1247
+ translation model.
1248
+
1249
+ 0:38:43.803 --> 0:38:49.523
1250
+ But of course it's very good to also have
1251
+ this type of intrinsic evaluation where the
1252
+
1253
+ 0:38:49.523 --> 0:38:52.116
1254
+ assumption should be as a pointed out.
1255
+
1256
+ 0:38:52.116 --> 0:38:57.503
1257
+ If we have Good English it shouldn't be a
1258
+ high probability and it's bad English.
1259
+
1260
+ 0:38:58.318 --> 0:39:07.594
1261
+ And this is measured by the take a held out
1262
+ data set, so some data which you don't train
1263
+
1264
+ 0:39:07.594 --> 0:39:12.596
1265
+ on then calculate the probability of this data.
1266
+
1267
+ 0:39:12.912 --> 0:39:26.374
1268
+ Then you're just looking at the language model
1269
+ and you take the language model.
1270
+
1271
+ 0:39:27.727 --> 0:39:33.595
1272
+ You're not directly using the probability,
1273
+ but you're taking the perplexity.
1274
+
1275
+ 0:39:33.595 --> 0:39:40.454
1276
+ The perplexity is two to the power of the
1277
+ cross entropy, and you see in the cross entropy
1278
+
1279
+ 0:39:40.454 --> 0:39:46.322
1280
+ you're doing something like an average probability
1281
+ of always coming to this.
1282
+
1283
+ 0:39:46.846 --> 0:39:54.721
1284
+ Not so how exactly is that define perplexity
1285
+ is typically what people refer to all across.
1286
+
1287
+ 0:39:54.894 --> 0:40:02.328
1288
+ The cross edge is negative and average, and
1289
+ then you have the lock of the probability of
1290
+
1291
+ 0:40:02.328 --> 0:40:03.246
1292
+ the whole.
1293
+
1294
+ 0:40:04.584 --> 0:40:10.609
1295
+ We are modeling this probability as the product
1296
+ of each of the words.
1297
+
1298
+ 0:40:10.609 --> 0:40:18.613
1299
+ That's how the end gram was defined and now
1300
+ you hopefully can remember the rules of logarism
1301
+
1302
+ 0:40:18.613 --> 0:40:23.089
1303
+ so you can get the probability within the logarism.
1304
+
1305
+ 0:40:23.063 --> 0:40:31.036
1306
+ The sum here: so the cross entropy is minus one
1307
+ over N, and the sum over all your words
1308
+
1309
+ 0:40:31.036 --> 0:40:35.566
1310
+ of the logarithm of the probability of each
1311
+ word.
1312
+
1313
+ 0:40:36.176 --> 0:40:39.418
1314
+ And then the perplexity is just like two to
1315
+ the power.
1316
+
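A small Python sketch of the computation just defined, with made-up per-word probabilities: the cross entropy is the negative average log2 probability, and the perplexity is two to that power:

    import math

    word_probs = [0.2, 0.1, 0.05, 0.3]  # P(w_i | history) for each word of a test text
    cross_entropy = -sum(math.log2(p) for p in word_probs) / len(word_probs)
    perplexity = 2 ** cross_entropy
    print(cross_entropy, perplexity)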
1317
+ 0:40:41.201 --> 0:40:44.706
1318
+ Why can this be interpreted as a branching
1319
+ factor?
1320
+
1321
+ 0:40:44.706 --> 0:40:50.479
1322
+ So it gives you a bit like the average thing,
1323
+ like how many possibilities you have.
1324
+
1325
+ 0:40:51.071 --> 0:41:02.249
1326
+ You have a digit task and you have no idea,
1327
+ but the probability of the next digit is like
1328
+
1329
+ 0:41:02.249 --> 0:41:03.367
1330
+ one tenth.
1331
+
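As a quick check of this digit example (my arithmetic): with ten equally likely digits each has probability 1/10, so the cross entropy is -log2(1/10), roughly 3.32 bits, and the perplexity is 2^3.32 = 10, exactly the number of choices.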
1332
+ 0:41:03.783 --> 0:41:09.354
1333
+ And if you then take a later perplexity, it
1334
+ will be exactly ten.
1335
+
1336
+ 0:41:09.849 --> 0:41:24.191
1337
+ And that is like this perplexity gives you
1338
+ a million interpretations, so how much randomness
1339
+
1340
+ 0:41:24.191 --> 0:41:27.121
1341
+ is still in there?
1342
+
1343
+ 0:41:27.307 --> 0:41:32.433
1344
+ Of course, now it's good to have a lower perplexity.
1345
+
1346
+ 0:41:32.433 --> 0:41:36.012
1347
+ We have less ambiguity in there and.
1348
+
1349
+ 0:41:35.976 --> 0:41:48.127
1350
+ If you have a hundred words and you only have
1351
+ to uniformly compare it to ten different, so
1352
+
1353
+ 0:41:48.127 --> 0:41:49.462
1354
+ you have.
1355
+
1356
+ 0:41:49.609 --> 0:41:53.255
1357
+ Yes, think so it should be.
1358
+
1359
+ 0:41:53.255 --> 0:42:03.673
1360
+ You had here logarism and then to the power
1361
+ and that should then be eliminated.
1362
+
1363
+ 0:42:03.743 --> 0:42:22.155
1364
+ So which logarithm you use is not that important
1365
+ because it's a constant factor to reformulate.
1366
+
1367
+ 0:42:23.403 --> 0:42:28.462
1368
+ Yes and Yeah So the Best.
1369
+
1370
+ 0:42:31.931 --> 0:42:50.263
1371
+ The best model is always like you want to
1372
+ have a high probability.
1373
+
1374
+ 0:42:51.811 --> 0:43:04.549
1375
+ Time you see here, so here the probabilities
1376
+ would like to commend the rapporteur on his
1377
+
1378
+ 0:43:04.549 --> 0:43:05.408
1379
+ work.
1380
+
1381
+ 0:43:05.285 --> 0:43:14.116
1382
+ You have then locked two probabilities and
1383
+ then the average, so this is not the perplexity
1384
+
1385
+ 0:43:14.116 --> 0:43:18.095
1386
+ but the cross entropy as mentioned here.
1387
+
1388
+ 0:43:18.318 --> 0:43:26.651
1389
+ And then due to the power of that we'll give
1390
+ you the perplexity of the center.
1391
+
1392
+ 0:43:29.329 --> 0:43:40.967
1393
+ And these metrics of perplexity are essential
1394
+ in modeling that and we'll also see nowadays.
1395
+
1396
+ 0:43:41.121 --> 0:43:47.898
1397
+ You also measure like quality often in perplexity
1398
+ or cross entropy, which gives you how good
1399
+
1400
+ 0:43:47.898 --> 0:43:50.062
1401
+ is it in estimating the same.
1402
+
1403
+ 0:43:50.010 --> 0:43:53.647
1404
+ The better the model is, the more information
1405
+ you have about this.
1406
+
1407
+ 0:43:55.795 --> 0:44:03.106
1408
+ Talked about isomic ability or quit sentences,
1409
+ but don't most have to any much because.
1410
+
1411
+ 0:44:03.463 --> 0:44:12.512
1412
+ You are doing that in this way implicitly
1413
+ because of the correct word.
1414
+
1415
+ 0:44:12.512 --> 0:44:19.266
1416
+ If you are modeling this one, the sun over
1417
+ all next.
1418
+
1419
+ 0:44:20.020 --> 0:44:29.409
1420
+ Therefore, you have that implicitly in there
1421
+ because in each position you're modeling the
1422
+
1423
+ 0:44:29.409 --> 0:44:32.957
1424
+ probability of this witch behind.
1425
+
1426
+ 0:44:35.515 --> 0:44:43.811
1427
+ You have a very large number of negative examples
1428
+ because all the possible extensions which are
1429
+
1430
+ 0:44:43.811 --> 0:44:49.515
1431
+ not there are incorrect, which of course might
1432
+ also be a problem.
1433
+
1434
+ 0:44:52.312 --> 0:45:00.256
1435
+ And the biggest challenge of these types of
1436
+ models is how to model unseen events.
1437
+
1438
+ 0:45:00.840 --> 0:45:04.973
1439
+ So that can be unknown words or it can be
1440
+ unknown vibrants.
1441
+
1442
+ 0:45:05.245 --> 0:45:10.096
1443
+ So that's important also like you've seen
1444
+ all the words.
1445
+
1446
+ 0:45:10.096 --> 0:45:17.756
1447
+ But if you have a bigram language model, if
1448
+ you haven't seen the bigram, you'll still get
1449
+
1450
+ 0:45:17.756 --> 0:45:23.628
1451
+ a zero probability because we know that the
1452
+ bigram's divided by the.
1453
+
1454
+ 0:45:24.644 --> 0:45:35.299
1455
+ If you have unknown words, the problem gets
1456
+ even bigger because one word typically causes
1457
+
1458
+ 0:45:35.299 --> 0:45:37.075
1459
+ a lot of zero.
1460
+
1461
+ 0:45:37.217 --> 0:45:41.038
1462
+ So if you, for example, if your vocabulary
1463
+ is go to and care it,.
1464
+
1465
+ 0:45:41.341 --> 0:45:43.467
1466
+ And you have not a sentence.
1467
+
1468
+ 0:45:43.467 --> 0:45:47.941
1469
+ I want to pay a T, so you have one word, which
1470
+ is here 'an'.
1471
+
1472
+ 0:45:47.887 --> 0:45:54.354
1473
+ It is unknow then you have the proper.
1474
+
1475
+ 0:45:54.354 --> 0:46:02.147
1476
+ It is I get a sentence star and sentence star.
1477
+
1478
+ 0:46:02.582 --> 0:46:09.850
1479
+ To model this probability you always have
1480
+ to take the account from these sequences divided
1481
+
1482
+ 0:46:09.850 --> 0:46:19.145
1483
+ by: Since when does it occur, all of these
1484
+ angrams can also occur because of the word
1485
+
1486
+ 0:46:19.145 --> 0:46:19.961
1487
+ middle.
1488
+
1489
+ 0:46:20.260 --> 0:46:27.800
1490
+ So all of these probabilities are directly
1491
+ zero.
1492
+
1493
+ 0:46:27.800 --> 0:46:33.647
1494
+ You see that just by having a single.
1495
+
1496
+ 0:46:34.254 --> 0:46:47.968
1497
+ Tells you it might not always be better to
1498
+ have larger grams because if you have a gram
1499
+
1500
+ 0:46:47.968 --> 0:46:50.306
1501
+ language more.
1502
+
1503
+ 0:46:50.730 --> 0:46:57.870
1504
+ So sometimes it's better to have a smaller
1505
+ angram counter because the chances that you're
1506
+
1507
+ 0:46:57.870 --> 0:47:00.170
1508
+ seeing the angram is higher.
1509
+
1510
+ 0:47:00.170 --> 0:47:07.310
1511
+ On the other hand, you want to have a larger
1512
+ account because the larger the count is, the
1513
+
1514
+ 0:47:07.310 --> 0:47:09.849
1515
+ longer the context is modeling.
1516
+
1517
+ 0:47:10.670 --> 0:47:17.565
1518
+ So how can we address this type of problem?
1519
+
1520
+ 0:47:17.565 --> 0:47:28.064
1521
+ We address this type of problem by somehow
1522
+ adjusting our counts.
1523
+
1524
+ 0:47:29.749 --> 0:47:40.482
1525
+ We have often, but most of your entries in
1526
+ the table are zero, and if one of these n-grams
1527
+
1528
+ 0:47:40.482 --> 0:47:45.082
1529
+ occurs you'll have a zero probability.
1530
+
1531
+ 0:47:46.806 --> 0:48:06.999
1532
+ So therefore we need to find some of our ways
1533
+ in order to estimate this type of event because:
1534
+
1535
+ 0:48:07.427 --> 0:48:11.619
1536
+ So there are different ways of how to model
1537
+ it and how to adjust it.
1538
+
1539
+ 0:48:11.619 --> 0:48:15.326
1540
+ The one idea here is to do smoothing and that's
1541
+ the first thing.
1542
+
1543
+ 0:48:15.326 --> 0:48:20.734
1544
+ So in smoothing you're saying okay, we take
1545
+ a bit of the probability mass of our seen
1546
+
1547
+ 0:48:20.734 --> 0:48:23.893
1548
+ events and distribute this thing we're taking
1549
+ away.
1550
+
1551
+ 0:48:23.893 --> 0:48:26.567
1552
+ We're distributing to all the other events.
1553
+
1554
+ 0:48:26.946 --> 0:48:33.927
1555
+ The nice thing is in this case oh now each
1556
+ event has a non zero probability and that is
1557
+
1558
+ 0:48:33.927 --> 0:48:39.718
1559
+ of course very helpful because we don't have
1560
+ zero probabilities anymore.
1561
+
1562
+ 0:48:40.180 --> 0:48:48.422
1563
+ It smoothed out, but at least you have some
1564
+ kind of probability everywhere, so you take
1565
+
1566
+ 0:48:48.422 --> 0:48:50.764
1567
+ some of the probability.
1568
+
1569
+ 0:48:53.053 --> 0:49:05.465
1570
+ You can also do that more here when you have
1571
+ the endgram, for example, and this is your
1572
+
1573
+ 0:49:05.465 --> 0:49:08.709
1574
+ original distribution.
1575
+
1576
+ 0:49:08.648 --> 0:49:15.463
1577
+ Then you are taking some mass away from here
1578
+ and distributing this mass to all the other
1579
+
1580
+ 0:49:15.463 --> 0:49:17.453
1581
+ words that you have seen.
1582
+
1583
+ 0:49:18.638 --> 0:49:26.797
1584
+ And thereby you are now making sure that it's
1585
+ yeah, that it's now possible to model that.
1586
+
1587
+ 0:49:28.828 --> 0:49:36.163
1588
+ The other idea we're coming into more detail
1589
+ on how we can do this type of smoking, but
1590
+
1591
+ 0:49:36.163 --> 0:49:41.164
1592
+ one other idea you can do is to do some type
1593
+ of clustering.
1594
+
1595
+ 0:49:41.501 --> 0:49:48.486
1596
+ And that means if we are can't model go Kit's,
1597
+ for example because we haven't seen that.
1598
+
1599
+ 0:49:49.349 --> 0:49:56.128
1600
+ Then we're just looking at the full thing
1601
+ and we're just going to live directly how probable.
1602
+
1603
+ 0:49:56.156 --> 0:49:58.162
1604
+ Go two ways or so.
1605
+
1606
+ 0:49:58.162 --> 0:50:09.040
1607
+ Then we are modeling just only the word interpolation
1608
+ where you're interpolating all the probabilities
1609
+
1610
+ 0:50:09.040 --> 0:50:10.836
1611
+ and thereby can.
1612
+
1613
+ 0:50:11.111 --> 0:50:16.355
1614
+ These are the two things which are helpful
1615
+ in order to better calculate all these types.
1616
+
1617
+ 0:50:19.499 --> 0:50:28.404
1618
+ Let's start with what counts news so the idea
1619
+ is okay.
1620
+
1621
+ 0:50:28.404 --> 0:50:38.119
1622
+ We have not seen an event and then the probability
1623
+ is zero.
1624
+
1625
+ 0:50:38.618 --> 0:50:50.902
1626
+ It's not that high, but you should always
1627
+ be aware that there might be new things happening
1628
+
1629
+ 0:50:50.902 --> 0:50:55.308
1630
+ and somehow be able to estimate.
1631
+
1632
+ 0:50:56.276 --> 0:50:59.914
1633
+ So the idea is okay.
1634
+
1635
+ 0:50:59.914 --> 0:51:09.442
1636
+ We can also assign a positive probability
1637
+ to a higher.
1638
+
1639
+ 0:51:10.590 --> 0:51:23.233
1640
+ We are changing so currently we worked on
1641
+ imperial accounts so how often we have seen
1642
+
1643
+ 0:51:23.233 --> 0:51:25.292
1644
+ the accounts.
1645
+
1646
+ 0:51:25.745 --> 0:51:37.174
1647
+ And now we are going on to expect account
1648
+ how often this would occur in an unseen.
1649
+
1650
+ 0:51:37.517 --> 0:51:39.282
1651
+ So we are directly trying to model that.
1652
+
1653
+ 0:51:39.859 --> 0:51:45.836
1654
+ Of course, the empirical accounts are a good
1655
+ starting point, so if you've seen the world
1656
+
1657
+ 0:51:45.836 --> 0:51:51.880
1658
+ very often in your training data, it's a good
1659
+ estimation of how often you would see it in
1660
+
1661
+ 0:51:51.880 --> 0:51:52.685
1662
+ the future.
1663
+
1664
+ 0:51:52.685 --> 0:51:58.125
1665
+ However, it might make sense to think about
1666
+ it only because you haven't seen it.
1667
+
1668
+ 0:51:58.578 --> 0:52:10.742
1669
+ So does anybody have a very simple idea how
1670
+ you start with smoothing it?
1671
+
1672
+ 0:52:10.742 --> 0:52:15.241
1673
+ What count would you give?
1674
+
1675
+ 0:52:21.281 --> 0:52:32.279
1676
+ Now you have the probability to calculation
1677
+ how often have you seen the biogram with zero
1678
+
1679
+ 0:52:32.279 --> 0:52:33.135
1680
+ count.
1681
+
1682
+ 0:52:33.193 --> 0:52:39.209
1683
+ So what count would you give in order to still
1684
+ do this calculation?
1685
+
1686
+ 0:52:39.209 --> 0:52:41.509
1687
+ We have to smooth, so we.
1688
+
1689
+ 0:52:44.884 --> 0:52:52.151
1690
+ We could clump together all the rare words,
1691
+ for example everywhere we have only seen ones.
1692
+
1693
+ 0:52:52.652 --> 0:52:56.904
1694
+ And then just we can do the massive moment
1695
+ of those and don't.
1696
+
1697
+ 0:52:56.936 --> 0:53:00.085
1698
+ So remove the real ones.
1699
+
1700
+ 0:53:00.085 --> 0:53:06.130
1701
+ Yes, and then every unseen word is one of
1702
+ them.
1703
+
1704
+ 0:53:06.130 --> 0:53:13.939
1705
+ Yeah, but it's not only about unseen words,
1706
+ it's even unseen.
1707
+
1708
+ 0:53:14.874 --> 0:53:20.180
1709
+ You can even start easier and that's what
1710
+ people do at the first thing.
1711
+
1712
+ 0:53:20.180 --> 0:53:22.243
1713
+ That's at one smooth thing.
1714
+
1715
+ 0:53:22.243 --> 0:53:28.580
1716
+ You'll see it's not working good but the variation
1717
+ works fine and we're just as here.
1718
+
1719
+ 0:53:28.580 --> 0:53:30.644
1720
+ We've seen everything once.
1721
+
1722
+ 0:53:31.771 --> 0:53:39.896
1723
+ That's similar to this because you're clustering
1724
+ the one and the zero together and you just
1725
+
1726
+ 0:53:39.896 --> 0:53:45.814
1727
+ say you've seen everything once or have seen
1728
+ them twice and so on.
1729
+
1730
+ 0:53:46.386 --> 0:53:53.249
1731
+ And if you've done that wow, there's no probability
1732
+ because each event has happened once.
1733
+
1734
+ 0:53:55.795 --> 0:54:02.395
1735
+ If you otherwise have seen the bigram five
1736
+ times, you would not now do five times but
1737
+
1738
+ 0:54:02.395 --> 0:54:03.239
1739
+ six times.
1740
+
1741
+ 0:54:03.363 --> 0:54:09.117
1742
+ So the nice thing is to have seen everything.
1743
+
1744
+ 0:54:09.117 --> 0:54:19.124
1745
+ Once the probability of the engrap is now
1746
+ out, you have seen it divided by the.
1747
+
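A hedged Python sketch of the add-one (Laplace) idea described here, reusing bigram and unigram counters like the ones sketched earlier; vocab_size, the number of distinct word types, is an assumption of this sketch:

    def p_add_one(word, prev, bigrams, unigrams, vocab_size):
        # pretend every possible continuation of prev was seen once more
        return (bigrams[(prev, word)] + 1) / (unigrams[prev] + vocab_size)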
1748
+ 0:54:20.780 --> 0:54:23.763
1749
+ How long ago there's one big big problem with
1750
+ it?
1751
+
1752
+ 0:54:24.064 --> 0:54:38.509
1753
+ Just imagine that you have a vocabulary of
1754
+ words, and you have a corpus of thirty million
1755
+
1756
+ 0:54:38.509 --> 0:54:39.954
1757
+ bigrams.
1758
+
1759
+ 0:54:39.954 --> 0:54:42.843
1760
+ So if you have a.
1761
+
1762
+ 0:54:43.543 --> 0:54:46.580
1763
+ Simple Things So You've Seen Them Thirty Million
1764
+ Times.
1765
+
1766
+ 0:54:47.247 --> 0:54:49.818
1767
+ That is your count, your distributing.
1768
+
1769
+ 0:54:49.818 --> 0:54:55.225
1770
+ According to your gain, the problem is yet
1771
+ how many possible bigrams do you have?
1772
+
1773
+ 0:54:55.225 --> 0:55:00.895
1774
+ You have seven point five billion possible
1775
+ bigrams, and each of them you are counting
1776
+
1777
+ 0:55:00.895 --> 0:55:04.785
1778
+ now as give up your ability, like you give
1779
+ account of one.
1780
+
1781
+ 0:55:04.785 --> 0:55:07.092
1782
+ So each of them is saying a curse.
1783
+
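To make the mismatch concrete (my arithmetic; only the 7.5 billion and 30 million figures come from the lecture): with V word types there are V squared possible bigrams, and V of roughly 87,000 gives about 7.5 billion, so the one-count-per-bigram smoothing mass dwarfs the 30 million real counts.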
1784
+ 0:55:07.627 --> 0:55:16.697
1785
+ Then this number of possible vigrams is many
1786
+ times larger than the number you really see.
1787
+
1788
+ 0:55:17.537 --> 0:55:21.151
1789
+ You're mainly doing equal distribution.
1790
+
1791
+ 0:55:21.151 --> 0:55:26.753
1792
+ Everything gets the same because this is much
1793
+ more important.
1794
+
1795
+ 0:55:26.753 --> 0:55:31.541
1796
+ Most of your probability mass is used for
1797
+ smoothing.
1798
+
1799
+ 0:55:32.412 --> 0:55:37.493
1800
+ Because most of the probability miles have
1801
+ to be distributed that you at least give every
1802
+
1803
+ 0:55:37.493 --> 0:55:42.687
1804
+ biogram at least a count of one, and the other
1805
+ counts are only the thirty million, so seven
1806
+
1807
+ 0:55:42.687 --> 0:55:48.219
1808
+ point five billion counts go to like a distribute
1809
+ around all the engrons, and only thirty million
1810
+
1811
+ 0:55:48.219 --> 0:55:50.026
1812
+ are according to your frequent.
1813
+
1814
+ 0:55:50.210 --> 0:56:02.406
1815
+ So you put a lot too much mass on your smoothing
1816
+ and you're doing some kind of extreme smoothing.
1817
+
1818
+ 0:56:02.742 --> 0:56:08.986
1819
+ So that of course is a bit bad then and will
1820
+ give you not the best performance.
1821
+
1822
+ 0:56:10.130 --> 0:56:16.160
1823
+ However, there's a nice thing and that means
1824
+ to do probability calculations.
1825
+
1826
+ 0:56:16.160 --> 0:56:21.800
1827
+ We are doing it based on counts, but to do
1828
+ this division we don't need.
1829
+
1830
+ 0:56:22.302 --> 0:56:32.112
1831
+ So we can also do that with floating point
1832
+ values and there is still a valid type of calculation.
1833
+
1834
+ 0:56:32.392 --> 0:56:39.380
1835
+ So we can have less probability mass to unseen
1836
+ events.
1837
+
1838
+ 0:56:39.380 --> 0:56:45.352
1839
+ We don't have to give one because if we count.
1840
+
1841
+ 0:56:45.785 --> 0:56:50.976
1842
+ But to do our calculation we can also give
1843
+ zero point zero to something like that, so
1844
+
1845
+ 0:56:50.976 --> 0:56:56.167
1846
+ very small value, and thereby we have less
1847
+ value on the smooth thing, and we are more
1848
+
1849
+ 0:56:56.167 --> 0:56:58.038
1850
+ focusing on the actual corpus.
1851
+
1852
+ 0:56:58.758 --> 0:57:03.045
1853
+ And that is what people refer to as Alpha
1854
+ smoothing.
1855
+
1856
+ 0:57:03.223 --> 0:57:12.032
1857
+ You see that we are now adding not one to
1858
+ it but only alpha, and then we are giving less
1859
+
1860
+ 0:57:12.032 --> 0:57:19.258
1861
+ probability to the unseen event and more probability
1862
+ to the really seen.
1863
+
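The same sketch with a small alpha instead of one, as described here; an alpha below one keeps most of the probability mass on events that were actually observed (the default value is only an example):

    def p_add_alpha(word, prev, bigrams, unigrams, vocab_size, alpha=0.02):
        return (bigrams[(prev, word)] + alpha) / (unigrams[prev] + alpha * vocab_size)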
1864
+ 0:57:20.780 --> 0:57:24.713
1865
+ Questions: Of course, how do you find see
1866
+ also?
1867
+
1868
+ 0:57:24.713 --> 0:57:29.711
1869
+ I'm here to either use some help out data
1870
+ and optimize them.
1871
+
1872
+ 0:57:30.951 --> 0:57:35.153
1873
+ So what what does it now really mean?
1874
+
1875
+ 0:57:35.153 --> 0:57:40.130
1876
+ This gives you a bit of an idea behind that.
1877
+
1878
+ 0:57:40.700 --> 0:57:57.751
1879
+ So here you have the grams which occur one
1880
+ time, for example all grams which occur one.
1881
+
1882
+ 0:57:57.978 --> 0:58:10.890
1883
+ So, for example, that means that if you have
1884
+ engrams which occur one time, then.
1885
+
1886
+ 0:58:11.371 --> 0:58:22.896
1887
+ If you look at all the engrams which occur
1888
+ two times, then they occur.
1889
+
1890
+ 0:58:22.896 --> 0:58:31.013
1891
+ If you look at the engrams that occur zero,
1892
+ then.
1893
+
1894
+ 0:58:32.832 --> 0:58:46.511
1895
+ So if you are now doing the smoothing you
1896
+ can look what is the probability estimating
1897
+
1898
+ 0:58:46.511 --> 0:58:47.466
1899
+ them.
1900
+
1901
+ 0:58:47.847 --> 0:59:00.963
1902
+ You see that for all the endbreaks you heavily
1903
+ underestimate how often they occur in the test
1904
+
1905
+ 0:59:00.963 --> 0:59:01.801
1906
+ card.
1907
+
1908
+ 0:59:02.002 --> 0:59:10.067
1909
+ So what you want is very good to estimate
1910
+ this distribution, so for each Enron estimate
1911
+
1912
+ 0:59:10.067 --> 0:59:12.083
1913
+ quite well how often.
1914
+
1915
+ 0:59:12.632 --> 0:59:16.029
1916
+ You're quite bad at that for all of them.
1917
+
1918
+ 0:59:16.029 --> 0:59:22.500
1919
+ You're apparently underestimating only for
1920
+ the top ones which you haven't seen.
1921
+
1922
+ 0:59:22.500 --> 0:59:24.845
1923
+ You'll heavily overestimate.
1924
+
1925
+ 0:59:25.645 --> 0:59:30.887
1926
+ If you're doing alpha smoothing and optimize
1927
+ that to fit on the zero count because that's
1928
+
1929
+ 0:59:30.887 --> 0:59:36.361
1930
+ not completely fair because this alpha is now
1931
+ optimizes the test counter, you see that you're
1932
+
1933
+ 0:59:36.361 --> 0:59:37.526
1934
+ doing a lot better.
1935
+
1936
+ 0:59:37.526 --> 0:59:42.360
1937
+ It's not perfect, but you're a lot better
1938
+ in estimating how often they will occur.
1939
+
1940
+ 0:59:45.545 --> 0:59:49.316
1941
+ So this is one idea of doing it.
1942
+
1943
+ 0:59:49.316 --> 0:59:57.771
1944
+ Of course there's other ways and this is like
1945
+ a large research direction.
1946
+
1947
+ 0:59:58.318 --> 1:00:03.287
1948
+ So there is this needed estimation.
1949
+
1950
+ 1:00:03.287 --> 1:00:11.569
1951
+ What you are doing is filling your trading
1952
+ data into parts.
1953
+
1954
+ 1:00:11.972 --> 1:00:19.547
1955
+ Looking at how many engrams occur exactly
1956
+ are types, which engrams occur are times in
1957
+
1958
+ 1:00:19.547 --> 1:00:20.868
1959
+ your training.
1960
+
1961
+ 1:00:21.281 --> 1:00:27.716
1962
+ And then you look for these ones.
1963
+
1964
+ 1:00:27.716 --> 1:00:36.611
1965
+ How often do they occur in your training data?
1966
+
1967
+ 1:00:36.611 --> 1:00:37.746
1968
+ It's.
1969
+
1970
+ 1:00:38.118 --> 1:00:45.214
1971
+ And then you say oh this engram, the expector
1972
+ counts how often will see.
1973
+
1974
+ 1:00:45.214 --> 1:00:56.020
1975
+ It is divided by: Some type of clustering
1976
+ you're putting all the engrams which occur
1977
+
1978
+ 1:00:56.020 --> 1:01:04.341
1979
+ are at times in your data together and in order
1980
+ to estimate how often.
1981
+
1982
+ 1:01:05.185 --> 1:01:12.489
1983
+ And if you do half your data related to your
1984
+ final estimation by just using those statistics,.
1985
+
1986
+ 1:01:14.014 --> 1:01:25.210
1987
+ So this is called added estimation, and thereby
1988
+ you are not able to estimate better how often
1989
+
1990
+ 1:01:25.210 --> 1:01:25.924
1991
+ does.
1992
+
1993
+ 1:01:28.368 --> 1:01:34.559
1994
+ And again we can do the same look and compare
1995
+ it to the expected counts.
1996
+
1997
+ 1:01:34.559 --> 1:01:37.782
1998
+ Again we have exactly the same table.
1999
+
2000
+ 1:01:38.398 --> 1:01:47.611
2001
+ So then we're having to hear how many engrams
2002
+ that does exist.
2003
+
2004
+ 1:01:47.611 --> 1:01:55.361
2005
+ So, for example, there's like engrams which
2006
+ you can.
2007
+
2008
+ 1:01:55.835 --> 1:02:08.583
2009
+ Then you look into your other half and how
2010
+ often do these N grams occur in your 2nd part
2011
+
2012
+ 1:02:08.583 --> 1:02:11.734
2013
+ of the training data?
2014
+
2015
+ 1:02:12.012 --> 1:02:22.558
2016
+ For example, an unseen N gram I expect to
2017
+ occur, an engram which occurs one time.
2018
+
2019
+ 1:02:22.558 --> 1:02:25.774
2020
+ I expect that it occurs.
2021
+
2022
+ 1:02:27.527 --> 1:02:42.564
2023
+ Yeah, the number of zero counts are if take
2024
+ my one grams and then just calculate how many
2025
+
2026
+ 1:02:42.564 --> 1:02:45.572
2027
+ possible bigrams.
2028
+
2029
+ 1:02:45.525 --> 1:02:50.729
2030
+ Yes, so in this case we are now not assuming
2031
+ about having a more larger cattle because then,
2032
+
2033
+ 1:02:50.729 --> 1:02:52.127
2034
+ of course, it's getting.
2035
+
2036
+ 1:02:52.272 --> 1:02:54.730
2037
+ So you're doing that given the current gram.
2038
+
2039
+ 1:02:54.730 --> 1:03:06.057
2040
+ The cavalry is better to: So yeah, there's
2041
+ another problem in how to deal with them.
2042
+
2043
+ 1:03:06.057 --> 1:03:11.150
2044
+ This is more about how to smuse the engram
2045
+ counts to also deal.
2046
+
2047
+ 1:03:14.394 --> 1:03:18.329
2048
+ Certainly as I Think The.
2049
+
2050
+ 1:03:18.198 --> 1:03:25.197
2051
+ Yes, the last idea of doing this is so called
2052
+ Good-Turing, and the idea here is
2053
+ similar, so there is a typical mathematical proof,
2054
+ 1:03:25.197 --> 1:03:32.747
2055
+ similar, so there is a typical mathematic approve,
2056
+ but you can show that a very good estimation
2057
+
2058
+ 1:03:32.747 --> 1:03:34.713
2059
+ for the expected counts.
2060
+
2061
+ 1:03:34.654 --> 1:03:42.339
2062
+ Is that you take the number of n-grams which
2063
+ occur one time more, divided by the number of
2064
+ n-grams which occur R times, and times R plus one.
2065
+ 1:03:42.339 --> 1:03:46.011
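A minimal Python sketch of the Good-Turing adjusted count r* = (r + 1) * N(r+1) / N(r), where N(r) is the number of distinct n-grams seen exactly r times; as noted just below, in practice this is only applied for small r, with curve fitting for larger r, which this sketch ignores:

    def good_turing_count(r, count_of_counts):
        n_r = count_of_counts.get(r, 0)
        n_r_next = count_of_counts.get(r + 1, 0)
        if n_r == 0:
            return float(r)  # no information for this count, keep the raw value
        return (r + 1) * n_r_next / n_r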
2066
+ engram which occur R times and R plus one.
2067
+
2068
+ 1:03:46.666 --> 1:03:49.263
2069
+ So this is then the estimation of.
2070
+
2071
+ 1:03:49.549 --> 1:04:05.911
2072
+ So if you are looking now at an n-gram which
2073
+ occurs r times, then you are looking at how many
2074
+
2075
+ 1:04:05.911 --> 1:04:08.608
2076
+ n-grams occur r plus one times.
2077
+
2078
+ 1:04:09.009 --> 1:04:18.938
2079
+ It's very simple, so in this one you only
2080
+ have to count all the bigrams, how many different
2081
+
2082
+ 1:04:18.938 --> 1:04:23.471
2083
+ bigrams out there, and that is very good.
2084
+
2085
+ 1:04:23.903 --> 1:04:33.137
2086
+ So if you are thinking now about n-grams which
2087
+ occur r times.
2088
+
2089
+ 1:04:33.473 --> 1:04:46.626
2090
+ It might be that there are some occurring
2091
+ r times, but none r plus one times, and then.
2092
+
2093
+ 1:04:46.866 --> 1:04:54.721
2094
+ So what you normally do is you are doing for
2095
+ small R, and for large R you do some curve
2096
+
2097
+ 1:04:54.721 --> 1:04:55.524
2098
+ fitting.
2099
+
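A minimal sketch of the Good-Turing adjusted counts just described, r* = (r + 1) * N_{r+1} / N_r, where N_r is the number of n-grams occurring exactly r times; the fallback for large or gapped r stands in for the curve fitting mentioned above:

from collections import Counter

def good_turing_adjusted(ngram_counts):
    # N_r: how many distinct n-grams occur exactly r times.
    n = Counter(ngram_counts.values())
    adjusted = {}
    for r in sorted(n):
        if n.get(r + 1, 0) > 0:
            adjusted[r] = (r + 1) * n[r + 1] / n[r]
        else:
            adjusted[r] = float(r)  # keep the raw count where N_{r+1} is missing
    return adjusted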
2100
+ 1:04:56.016 --> 1:05:07.377
2101
+ In general this type of smoothing is important
2102
+ for n-grams which occur rarely.
2103
+
2104
+ 1:05:07.377 --> 1:05:15.719
2105
+ If an n-gram occurs very often it changes little, so this is more important
2106
+ for rare events.
2107
+
2108
+ 1:05:17.717 --> 1:05:25.652
2109
+ So here again you see you have the counts
2110
+ and then based on that you get the adjusted
2111
+
2112
+ 1:05:25.652 --> 1:05:26.390
2113
+ counts.
2114
+
2115
+ 1:05:26.390 --> 1:05:34.786
2116
+ This is here and if you compare it's a test
2117
+ count you see that it really works quite well.
2118
+
2119
+ 1:05:35.035 --> 1:05:41.093
2120
+ But for the low numbers it's a very good modeling
2121
+ of how much how good this works.
2122
+
2123
+ 1:05:45.005 --> 1:05:50.018
2124
+ Then, of course, the question is how good
2125
+ does it work in language modeling?
2126
+
2127
+ 1:05:50.018 --> 1:05:51.516
2128
+ We also want tomorrow.
2129
+
2130
+ 1:05:52.372 --> 1:05:54.996
2131
+ We can measure that perplexity.
2132
+
2133
+ 1:05:54.996 --> 1:05:59.261
2134
+ We learned that before and then we have everyone's.
2135
+
2136
+ 1:05:59.579 --> 1:06:07.326
2137
+ You saw that a lot of too much probability
2138
+ mass is put to the events which have zero probability.
2139
+
2140
+ 1:06:07.667 --> 1:06:11.098
2141
+ Then you have an alpha smoothing.
2142
+
2143
+ 1:06:11.098 --> 1:06:16.042
2144
+ Here's a start because it's not completely
2145
+ fair.
2146
+
2147
+ 1:06:16.042 --> 1:06:20.281
2148
+ The alpha was maximized on the test data.
2149
+
2150
+ 1:06:20.480 --> 1:06:25.904
2151
+ But you see that the deleted estimation
2152
+ or the Good-Turing gives you a similar performance.
2153
+
2154
+ 1:06:26.226 --> 1:06:29.141
2155
+ So they seem to really work quite well.
2156
+
2157
+ 1:06:32.232 --> 1:06:41.552
2158
+ So this is about all assigning probability
2159
+ mass to n-grams, which we have not seen
2160
+
2161
+ 1:06:41.552 --> 1:06:50.657
2162
+ in order to also estimate their probability
2163
+ before we're going to the interpolation.
2164
+
2165
+ 1:06:55.635 --> 1:07:00.207
2166
+ Good, so now we have.
2167
+
2168
+ 1:07:00.080 --> 1:07:11.818
2169
+ Done this estimation, and the problem is we
2170
+ have this general.
2171
+
2172
+ 1:07:11.651 --> 1:07:19.470
2173
+ We want to have a longer context because we
2174
+ can model the language better because of
2175
+
2176
+ 1:07:19.470 --> 1:07:21.468
2177
+ long-range dependencies.
2178
+
2179
+ 1:07:21.701 --> 1:07:26.745
2180
+ On the other hand, we have limited data so
2181
+ we want to have stored angrums because they
2182
+
2183
+ 1:07:26.745 --> 1:07:28.426
2184
+ reach angrums at first more.
2185
+
2186
+ 1:07:29.029 --> 1:07:43.664
2187
+ And about the smoothing and the discounting
2188
+ we did before, it always treats all n-grams the same.
2189
+
2190
+ 1:07:44.024 --> 1:07:46.006
2191
+ So we didn't really look at the n-grams themselves.
2192
+
2193
+ 1:07:46.006 --> 1:07:48.174
2194
+ They were all classed into how often they
2195
+ are.
2196
+
2197
+ 1:07:49.169 --> 1:08:00.006
2198
+ However, sometimes this might not be very
2199
+ helpful, so for example look at the n-grams
2200
+
2201
+ 1:08:00.006 --> 1:08:06.253
2202
+ Scottish beer drinkers and Scottish beer eaters.
2203
+
2204
+ 1:08:06.686 --> 1:08:12.037
2205
+ Because we have not seen the trigram, so you
2206
+ will estimate the trigram probability by the
2207
+
2208
+ 1:08:12.037 --> 1:08:14.593
2209
+ probability you assign to the zero county.
2210
+
2211
+ 1:08:15.455 --> 1:08:26.700
2212
+ However, if you look at the bigram probability
2213
+ that you might have seen and might be helpful,.
2214
+
2215
+ 1:08:26.866 --> 1:08:34.538
2216
+ So beer drinker is more probable to see than
2217
+ Scottish beer drinker, and beer drinker should
2218
+
2219
+ 1:08:34.538 --> 1:08:36.039
2220
+ be more probable.
2221
+
2222
+ 1:08:36.896 --> 1:08:39.919
2223
+ So this type of information is somehow ignored.
2224
+
2225
+ 1:08:39.919 --> 1:08:45.271
2226
+ So if we have the Trigram language model,
2227
+ we are only looking at trigrams divided by
2228
+
2229
+ 1:08:45.271 --> 1:08:46.089
2230
+ the bigrams.
2231
+
2232
+ 1:08:46.089 --> 1:08:49.678
2233
+ But if we have not seen the trigram, we are
2234
+ not looking.
2235
+
2236
+ 1:08:49.678 --> 1:08:53.456
2237
+ Oh, maybe we will have seen the bigram and
2238
+ we can back off.
2239
+
2240
+ 1:08:54.114 --> 1:09:01.978
2241
+ And that is what people do in interpolation
2242
+ and back off.
2243
+
2244
+ 1:09:01.978 --> 1:09:09.164
2245
+ The idea is if we haven't seen the large
2246
+ n-grams.
2247
+
2248
+ 1:09:09.429 --> 1:09:16.169
2249
+ Then we have to go to a shorter sequence
2250
+ and try to estimate the probability on this one.
2251
+
2252
+ 1:09:16.776 --> 1:09:20.730
2253
+ And this is the idea of interpolation.
2254
+
2255
+ 1:09:20.730 --> 1:09:25.291
2256
+ There's like two different ways of doing it.
2257
+
2258
+ 1:09:25.291 --> 1:09:26.507
2259
+ One is the.
2260
+
2261
+ 1:09:26.646 --> 1:09:29.465
2262
+ The easiest thing is like okay.
2263
+
2264
+ 1:09:29.465 --> 1:09:32.812
2265
+ If we have bigrams, we have trigrams.
2266
+
2267
+ 1:09:32.812 --> 1:09:35.103
2268
+ If we have programs, why?
2269
+
2270
+ 1:09:35.355 --> 1:09:46.544
2271
+ Mean, of course, we have the larger ones,
2272
+ the larger context, but the short n-grams are
2273
+
2274
+ 1:09:46.544 --> 1:09:49.596
2275
+ maybe better estimated.
2276
+
2277
+ 1:09:50.090 --> 1:10:00.487
2278
+ Time just by taking the probability of just
2279
+ the word class of probability of and.
2280
+
2281
+ 1:10:01.261 --> 1:10:07.052
2282
+ And of course the weights need to sum to one, because otherwise
2283
+ we don't have a probability distribution, but
2284
+
2285
+ 1:10:07.052 --> 1:10:09.332
2286
+ we can somehow optimize the weights.
2287
+
2288
+ 1:10:09.332 --> 1:10:15.930
2289
+ For example, the health out data set: And
2290
+ thereby we have now a probability distribution
2291
+
2292
+ 1:10:15.930 --> 1:10:17.777
2293
+ which takes both into account.
2294
+
2295
+ 1:10:18.118 --> 1:10:23.705
2296
+ Think again about the Scottish beer drinker example.
2297
+
2298
+ 1:10:23.705 --> 1:10:33.763
2299
+ The trigram probability will be the same for
2300
+ both phrases because they both occur zero
2301
+
2302
+ 1:10:33.763 --> 1:10:34.546
2303
+ times.
2304
+
2305
+ 1:10:36.116 --> 1:10:45.332
2306
+ But the bigram probability will hopefully
2307
+ be different because we might have seen beer
2308
+
2309
+ 1:10:45.332 --> 1:10:47.611
2310
+ drinkers but not beer eaters, and therefore.
2311
+
2312
+ 1:10:48.668 --> 1:10:57.296
2313
+ The idea that sometimes it's better to have
2314
+ different models and combine them instead.
2315
+
2316
+ 1:10:58.678 --> 1:10:59.976
2317
+ Another idea in style.
2318
+
2319
+ 1:11:00.000 --> 1:11:08.506
2320
+ Of this overall interpolation is you can also
2321
+ do this type of recursive interpolation.
2322
+
2323
+ 1:11:08.969 --> 1:11:23.804
2324
+ The probability of the word given its history
2325
+ is lambda times the current n-gram language model probability.
2326
+
2327
+ 1:11:24.664 --> 1:11:30.686
2328
+ Plus one minus lambda, so that these two sum
2329
+ to one, and here it's an interpolated probability
2330
+
2331
+ 1:11:30.686 --> 1:11:36.832
2332
+ from the n minus one gram, and then of course
2333
+ it goes recursively on until you are at the unigram
2334
+
2335
+ 1:11:36.832 --> 1:11:37.639
2336
+ probability.
2337
+
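A minimal sketch of the linear interpolation just described, combining trigram, bigram and unigram relative frequencies with weights that sum to one; the lambda values are placeholders that would be tuned on held-out data:

def interpolated_trigram_prob(w, u, v, uni, bi, tri, num_tokens,
                              lambdas=(0.6, 0.3, 0.1)):
    l3, l2, l1 = lambdas                                        # must sum to 1
    p3 = tri.get((u, v, w), 0) / bi[(u, v)] if bi.get((u, v)) else 0.0
    p2 = bi.get((v, w), 0) / uni[v] if uni.get(v) else 0.0
    p1 = uni.get(w, 0) / num_tokens
    return l3 * p3 + l2 * p2 + l1 * p1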
2338
+ 1:11:38.558 --> 1:11:49.513
2339
+ What you can also do, you can not only do
2340
+ the same weights for all our words, but you
2341
+
2342
+ 1:11:49.513 --> 1:12:06.020
2343
+ can for example: For example, for n-grams,
2344
+ which you have seen very often, you put more
2345
+
2346
+ 1:12:06.020 --> 1:12:10.580
2347
+ weight on the trigrams.
2348
+
2349
+ 1:12:13.673 --> 1:12:29.892
2350
+ The other thing you can do is the back off
2351
+ and the difference in back off is we are not
2352
+
2353
+ 1:12:29.892 --> 1:12:32.656
2354
+ interpolating.
2355
+
2356
+ 1:12:32.892 --> 1:12:41.954
2357
+ If we have seen the trigram probability so
2358
+ if the trigram count is bigger than zero then we take
2359
+
2360
+ 1:12:41.954 --> 1:12:48.412
2361
+ the trigram probability, and only if we have not seen
2362
+ this one then we back off.
2363
+
2364
+ 1:12:48.868 --> 1:12:54.092
2365
+ So that is the difference.
2366
+
2367
+ 1:12:54.092 --> 1:13:06.279
2368
+ In interpolation we are always taking all the n-gram probabilities,
2369
+ and in back-off only when needed.
2370
+
2371
+ 1:13:07.147 --> 1:13:09.941
2372
+ Why do we need to do this just a minute?
2373
+
2374
+ 1:13:09.941 --> 1:13:13.621
2375
+ So why have we here just take the probability
2376
+ of the.
2377
+
2378
+ 1:13:15.595 --> 1:13:18.711
2379
+ Yes, because otherwise the probabilities don't
2380
+ sum up to one.
2381
+
2382
+ 1:13:19.059 --> 1:13:28.213
2383
+ In order to make them still sum to one, we
2384
+ have to take away a bit of a probability mass
2385
+
2386
+ 1:13:28.213 --> 1:13:29.773
2387
+ from the seen events.
2388
+
2389
+ 1:13:29.709 --> 1:13:38.919
2390
+ The difference is we are no longer distributing
2391
+ it equally as before to the unseen, but we
2392
+
2393
+ 1:13:38.919 --> 1:13:40.741
2394
+ are distributing.
2395
+
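A minimal sketch of the back-off idea in contrast to interpolation: the lower-order estimate is only used when the higher-order n-gram was never seen, and the weight alpha is a placeholder for the probability mass freed up by discounting the seen events:

def backoff_trigram_prob(w, u, v, uni, bi, tri, num_tokens, alpha=0.4):
    if tri.get((u, v, w), 0) > 0:
        return tri[(u, v, w)] / bi[(u, v)]                  # seen trigram: use it
    if bi.get((v, w), 0) > 0:
        return alpha * bi[(v, w)] / uni[v]                  # else back off to the bigram
    return alpha * alpha * uni.get(w, 0) / num_tokens       # else to the unigram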
2396
+ 1:13:44.864 --> 1:13:56.220
2397
+ For example, this can be done with Good-Turing,
2398
+ so the expected counts in Good-Turing, we saw,
2399
+
2400
+ 1:13:57.697 --> 1:13:59.804
2401
+ The adjusted counts.
2402
+
2403
+ 1:13:59.804 --> 1:14:04.719
2404
+ They are always lower than the ones we see
2405
+ here.
2406
+
2407
+ 1:14:04.719 --> 1:14:14.972
2408
+ These counts are always: See that so you can
2409
+ now take this difference and distribute this
2410
+
2411
+ 1:14:14.972 --> 1:14:18.852
2412
+ weight to the lower-order n-grams.
2413
+
2414
+ 1:14:23.323 --> 1:14:29.896
2415
+ Is how we can distribute things.
2416
+
2417
+ 1:14:29.896 --> 1:14:43.442
2418
+ Then there is one last thing people are doing,
2419
+ especially how much.
2420
+
2421
+ 1:14:43.563 --> 1:14:55.464
2422
+ And there's one thing which is called Witten-
2423
+ Bell smoothing.
2424
+
2425
+ 1:14:55.315 --> 1:15:01.335
2426
+ In the back-off, like in the back-off,
2427
+ it might make sense to look at the words and
2428
+
2429
+ 1:15:01.335 --> 1:15:04.893
2430
+ see how probable it is that you need to back off.
2431
+
2432
+ 1:15:05.425 --> 1:15:11.232
2433
+ So look at these two words, spite and constant.
2434
+
2435
+ 1:15:11.232 --> 1:15:15.934
2436
+ Those occur exactly the same number of times in the.
2437
+
2438
+ 1:15:16.316 --> 1:15:27.804
2439
+ They would be treated exactly the same because
2440
+ both occur the same number of times, and it would be
2441
+
2442
+ 1:15:27.804 --> 1:15:29.053
2443
+ the same.
2444
+
2445
+ 1:15:29.809 --> 1:15:48.401
2446
+ However, they shouldn't really be modeled the same.
2447
+
2448
+ 1:15:48.568 --> 1:15:57.447
2449
+ If you compare that for constant there are
2450
+ four hundred different continuations of this
2451
+
2452
+ 1:15:57.447 --> 1:16:01.282
2453
+ work, so there is nearly always this.
2454
+
2455
+ 1:16:02.902 --> 1:16:11.203
2456
+ So if you're now seeing a new bigram or a
2457
+ biogram with Isaac Constant or Spite starting
2458
+
2459
+ 1:16:11.203 --> 1:16:13.467
2460
+ and then another word,.
2461
+
2462
+ 1:16:15.215 --> 1:16:25.606
2463
+ In constant, it's very frequent that you see
2464
+ new angrups because there are many different
2465
+
2466
+ 1:16:25.606 --> 1:16:27.222
2467
+ combinations.
2468
+
2469
+ 1:16:27.587 --> 1:16:35.421
2470
+ Therefore, it might help not only to look
2471
+ at the counts of the n-grams, but also at how
2472
+
2473
+ 1:16:35.421 --> 1:16:37.449
2474
+ many extensions a word has.
2475
+
2476
+ 1:16:38.218 --> 1:16:43.222
2477
+ And this is done by Witten-Bell smoothing.
2478
+
2479
+ 1:16:43.222 --> 1:16:51.032
2480
+ The idea is we count how many possible extensions
2481
+ in this case.
2482
+
2483
+ 1:16:51.371 --> 1:17:01.966
2484
+ So we had for spite, we had its possible extensions,
2485
+ and for constant we had a lot more.
2486
+
2487
+ 1:17:02.382 --> 1:17:09.394
2488
+ And then how much we put into our back-off model,
2489
+ how much weight we put into the back-off is,
2490
+
2491
+ 1:17:09.394 --> 1:17:13.170
2492
+ depending on this number of possible extensions.
2493
+
2494
+ 1:17:14.374 --> 1:17:15.557
2495
+ Style.
2496
+
2497
+ 1:17:15.557 --> 1:17:29.583
2498
+ We have it here, so this is the weight you
2499
+ put on your lower-order n-gram probability.
2500
+
2501
+ 1:17:29.583 --> 1:17:46.596
2502
+ For example: And if you compare these two
2503
+ numbers, so for spite you do how many extensions
2504
+
2505
+ 1:17:46.596 --> 1:17:55.333
2506
+ does spite have divided by: While for constant
2507
+ you have zero point three, you know,.
2508
+
2509
+ 1:17:55.815 --> 1:18:05.780
2510
+ So you're putting a lot more weight to like
2511
+ it's not as bad to fall back to the back-off
2512
+
2513
+ 1:18:05.780 --> 1:18:06.581
2514
+ model.
2515
+
2516
+ 1:18:06.581 --> 1:18:10.705
2517
+ So for spite it's really unusual.
2518
+
2519
+ 1:18:10.730 --> 1:18:13.369
2520
+ For constant there's a lot of probability
2521
+ mass in there.
2522
+
2523
+ 1:18:13.369 --> 1:18:15.906
2524
+ The chances that you're doing that is quite
2525
+ high.
2526
+
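A minimal sketch of the Witten-Bell idea: the weight put on the back-off distribution depends on how many distinct continuations the history has, so a history like "constant" (many continuations) gives more weight to the back-off than "spite"; backing off to a uniform distribution here is a simplification of backing off to the lower-order n-gram model:

def witten_bell_bigram_prob(w, history, uni, bi, vocab_size):
    continuations = {x for (h, x) in bi if h == history}    # distinct words seen after history
    types, tokens = len(continuations), uni.get(history, 0)
    if types + tokens == 0:
        return 1.0 / vocab_size
    lam = types / (types + tokens)                          # mass reserved for unseen continuations
    p_ml = bi.get((history, w), 0) / tokens
    return (1 - lam) * p_ml + lam / vocab_size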
2527
+ 1:18:20.000 --> 1:18:26.209
2528
+ Similarly, but just from the other way around,
2529
+ it's now looking at this probability distribution.
2530
+
2531
+ 1:18:26.546 --> 1:18:37.103
2532
+ So now when we back off the probability distribution
2533
+ for the lower-order n-grams, we calculate it exactly
2534
+
2535
+ 1:18:37.103 --> 1:18:40.227
2536
+ the same as the probability.
2537
+
2538
+ 1:18:40.320 --> 1:18:48.254
2539
+ However, they are used in a different way,
2540
+ so the lower order n-grams are only used
2541
+
2542
+ 1:18:48.254 --> 1:18:49.361
2543
+ if we have.
2544
+
2545
+ 1:18:50.410 --> 1:18:54.264
2546
+ So it's like you're modeling something different.
2547
+
2548
+ 1:18:54.264 --> 1:19:01.278
2549
+ You're not modeling how probable this n-gram
2550
+ is if we haven't seen the larger n-gram, and that
2551
+
2552
+ 1:19:01.278 --> 1:19:04.361
2553
+ is captured by the diversity of histories.
2554
+
2555
+ 1:19:04.944 --> 1:19:14.714
2556
+ For example, if you look at York, that's a
2557
+ quite frequent word.
2558
+
2559
+ 1:19:14.714 --> 1:19:18.530
2560
+ It occurs as many times.
2561
+
2562
+ 1:19:19.559 --> 1:19:27.985
2563
+ However, four hundred seventy three times
2564
+ the word before it was New.
2565
+
2566
+ 1:19:29.449 --> 1:19:40.237
2567
+ So if you now think the unigram model is only
2568
+ used, the probability of York as a unigram
2569
+
2570
+ 1:19:40.237 --> 1:19:49.947
2571
+ model should be very, very low because: So
2572
+ you should have a lower probability for York
2573
+
2574
+ 1:19:49.947 --> 1:19:56.292
2575
+ than, for example, for foods, although you
2576
+ have seen both of them at the same time, and
2577
+
2578
+ 1:19:56.292 --> 1:20:02.853
2579
+ this is done by Kneser-Ney smoothing where
2580
+ you are not counting the words itself, but
2581
+
2582
+ 1:20:02.853 --> 1:20:05.377
2583
+ you count the number of histories.
2584
+
2585
+ 1:20:05.845 --> 1:20:15.233
2586
+ So, the other way around: by how many different
2587
+ words was it preceded?
2588
+
2589
+ 1:20:15.233 --> 1:20:28.232
2590
+ Then instead of the normal way you count the
2591
+ words: So you don't need to know all the formulas
2592
+
2593
+ 1:20:28.232 --> 1:20:28.864
2594
+ here.
2595
+
2596
+ 1:20:28.864 --> 1:20:33.498
2597
+ The more important thing is this intuition.
2598
+
2599
+ 1:20:34.874 --> 1:20:44.646
2600
+ More than it means already that I haven't
2601
+ seen the larger n-gram, and therefore
2602
+
2603
+ 1:20:44.646 --> 1:20:49.704
2604
+ it might be better to model it differently.
2605
+
2606
+ 1:20:49.929 --> 1:20:56.976
2607
+ So if there's a new engram with something
2608
+ in New York that's very unprofitable compared
2609
+
2610
+ 1:20:56.976 --> 1:20:57.297
2611
+ to.
2612
+
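A minimal sketch of the Kneser-Ney intuition just discussed: for the lower-order model you count how many different histories a word follows, not how often it occurs, so "York" (almost always preceded by "New") gets a low continuation probability despite being frequent:

from collections import Counter

def continuation_probs(bigram_counts):
    # For each word w: number of distinct left contexts v with count(v, w) > 0,
    # normalized by the total number of distinct bigram types.
    histories = Counter(w for (v, w) in bigram_counts)
    total_bigram_types = len(bigram_counts)
    return {w: h / total_bigram_types for w, h in histories.items()}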
2613
+ 1:21:00.180 --> 1:21:06.130
2614
+ And yeah, this modified Kneser-Ney smoothing
2615
+ is what people took into use.
2616
+
2617
+ 1:21:06.130 --> 1:21:08.249
2618
+ That's the fall approach.
2619
+
2620
+ 1:21:08.728 --> 1:21:20.481
2621
+ Has an absolute discounting for small and
2622
+ grams, and then bells smoothing, and for it
2623
+
2624
+ 1:21:20.481 --> 1:21:27.724
2625
+ uses the discounting of histories which we
2626
+ just had.
2627
+
2628
+ 1:21:28.028 --> 1:21:32.207
2629
+ And there's even two versions of it, like
2630
+ the back-off and the interpolated one.
2631
+
2632
+ 1:21:32.472 --> 1:21:34.264
2633
+ So that may be interesting.
2634
+
2635
+ 1:21:34.264 --> 1:21:40.216
2636
+ These are here even works well for interpolation,
2637
+ although your assumption is even no longer
2638
+
2639
+ 1:21:40.216 --> 1:21:45.592
2640
+ true because you're using the lower n-grams
2641
+ even if you've seen the higher n-grams.
2642
+
2643
+ 1:21:45.592 --> 1:21:49.113
2644
+ But since you're then focusing on the higher
2645
+ n-grams.
2646
+
2647
+ 1:21:49.929 --> 1:21:53.522
2648
+ So if you see that some beats on the perfectities,.
2649
+
2650
+ 1:21:54.754 --> 1:22:00.262
2651
+ So you see normally that interpolated modified
2652
+ Kneser-Ney gives you some of the best
2653
+
2654
+ 1:22:00.262 --> 1:22:00.980
2655
+ performing models.
2656
+
2657
+ 1:22:02.022 --> 1:22:08.032
2658
+ You see the larger your n-gram is, the better it is
2659
+ with interpolation.
2660
+
2661
+ 1:22:08.032 --> 1:22:15.168
2662
+ You also get significant better so you can
2663
+ not only look at the last words.
2664
+
2665
+ 1:22:18.638 --> 1:22:32.725
2666
+ Good so much for these types of things, and
2667
+ we will finish with some special things about
2668
+
2669
+ 1:22:32.725 --> 1:22:34.290
2670
+ language.
2671
+
2672
+ 1:22:38.678 --> 1:22:44.225
2673
+ One thing we talked about the unknown words,
2674
+ so there is different ways of doing it because
2675
+
2676
+ 1:22:44.225 --> 1:22:49.409
2677
+ in all the estimations we were still assuming
2678
+ mostly that we have a fixed vocabulary.
2679
+
2680
+ 1:22:50.270 --> 1:23:06.372
2681
+ So you can often, for example, create an unknown
2682
+ token and use that in the statistical language model.
2683
+
2684
+ 1:23:06.766 --> 1:23:16.292
2685
+ It was mainly useful language processing since
2686
+ newer models are coming, but maybe it's surprising.
2687
+
2688
+ 1:23:18.578 --> 1:23:30.573
2689
+ What is also nice is that if you're going
2690
+ to really large n-gram models, it's more
2691
+
2692
+ 1:23:30.573 --> 1:23:33.114
2693
+ about efficiency.
2694
+
2695
+ 1:23:33.093 --> 1:23:37.378
2696
+ And then you have to remember lock it in your
2697
+ model.
2698
+
2699
+ 1:23:37.378 --> 1:23:41.422
2700
+ In a lot of situations it's not really important.
2701
+
2702
+ 1:23:41.661 --> 1:23:46.964
2703
+ It's more about ranking so which one is better
2704
+ and if they don't sum up to one that's not
2705
+
2706
+ 1:23:46.964 --> 1:23:47.907
2707
+ that important.
2708
+
2709
+ 1:23:47.907 --> 1:23:53.563
2710
+ Of course then you cannot calculate any perplexity
2711
+ anymore because if this is not a probability
2712
+
2713
+ 1:23:53.563 --> 1:23:58.807
2714
+ mass then the thing we had about the negative
2715
+ example doesn't fit anymore and that's not
2716
+
2717
+ 1:23:58.807 --> 1:23:59.338
2718
+ working.
2719
+
2720
+ 1:23:59.619 --> 1:24:02.202
2721
+ However, anification is also very helpful.
2722
+
2723
+ 1:24:02.582 --> 1:24:13.750
2724
+ And that is why there is this stupid back-off,
2725
+ presented to remove all these complicated things
2726
+
2727
+ 1:24:13.750 --> 1:24:14.618
2728
+ which.
2729
+
2730
+ 1:24:15.055 --> 1:24:28.055
2731
+ And it just does once we directly take the
2732
+ absolute account, and otherwise we're doing.
2733
+
2734
+ 1:24:28.548 --> 1:24:41.867
2735
+ There is no longer any discounting anymore, so it's
2736
+ very, very simple and however they show you
2737
+
2738
+ 1:24:41.867 --> 1:24:47.935
2739
+ have to calculate a lot less statistics.
2740
+
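A minimal sketch of "stupid back-off" as just described: no discounting and no renormalization, just relative frequencies with a fixed back-off factor (0.4 in the original paper), so the result is a score rather than a true probability:

def stupid_backoff_score(words, counts, num_tokens, alpha=0.4):
    # words: n-gram as a tuple, e.g. (u, v, w); counts: dict over n-grams of all orders.
    if len(words) == 1:
        return counts.get(words, 0) / num_tokens
    if counts.get(words, 0) > 0:
        return counts[words] / counts[words[:-1]]
    return alpha * stupid_backoff_score(words[1:], counts, num_tokens, alpha)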
2741
+ 1:24:50.750 --> 1:24:57.525
2742
+ In addition you can have other type of language
2743
+ models.
2744
+
2745
+ 1:24:57.525 --> 1:25:08.412
2746
+ We had word based language models and they
2747
+ normally go up to four or five or six grams.
2748
+
2749
+ 1:25:08.412 --> 1:25:10.831
2750
+ They are too large.
2751
+
2752
+ 1:25:11.531 --> 1:25:20.570
2753
+ So what people have then looked also into
2754
+ is what is referred to as part of speech language
2755
+
2756
+ 1:25:20.570 --> 1:25:21.258
2757
+ model.
2758
+
2759
+ 1:25:21.258 --> 1:25:29.806
2760
+ So instead of looking at the word sequence
2761
+ you're modeling directly the part of speech
2762
+
2763
+ 1:25:29.806 --> 1:25:30.788
2764
+ sequence.
2765
+
2766
+ 1:25:31.171 --> 1:25:34.987
2767
+ Then of course now you're only modeling
2768
+ syntax.
2769
+
2770
+ 1:25:34.987 --> 1:25:41.134
2771
+ There's no semantic information anymore in
2772
+ the part-of-speech tags, but now you might go
2773
+
2774
+ 1:25:41.134 --> 1:25:47.423
2775
+ to a larger context length so you can do seven,
2776
+ eight or nine grams and then you can capture some
2777
+
2778
+ 1:25:47.423 --> 1:25:50.320
2779
+ of the long range dependencies in order.
2780
+
2781
+ 1:25:52.772 --> 1:25:59.833
2782
+ And there's other things people have done
2783
+ like cache language models, so the idea in a cache
2784
+
2785
+ 1:25:59.833 --> 1:26:07.052
2786
+ language model is that yes words that you have
2787
+ recently seen are more frequent, so they are
2788
+
2789
+ 1:26:07.052 --> 1:26:11.891
2790
+ more probable to reoccur if you want to model
2791
+ the dynamics.
2792
+
2793
+ 1:26:12.152 --> 1:26:20.734
2794
+ If you're just talking here, we talked about
2795
+ language models in my presentation.
2796
+
2797
+ 1:26:20.734 --> 1:26:23.489
2798
+ There will be a lot more.
2799
+
2800
+ 1:26:23.883 --> 1:26:37.213
2801
+ Can do that by having a dynamic and a static
2802
+ component, and then you have a dynamic component
2803
+
2804
+ 1:26:37.213 --> 1:26:41.042
2805
+ which looks at the bigram.
2806
+
2807
+ 1:26:41.261 --> 1:26:49.802
2808
+ And thereby, for example, if you once generate
2809
+ language model of probability, it's increased
2810
+
2811
+ 1:26:49.802 --> 1:26:52.924
2812
+ and you're modeling that problem.
2813
+
2814
+ 1:26:56.816 --> 1:27:03.114
2815
+ Said the dynamic component is trained on the
2816
+ text translated so far.
2817
+
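A minimal sketch of the cache language model idea described here: interpolate a static model with a dynamic component estimated from the recently produced (for instance, already translated) words; the mixing weight is a placeholder:

from collections import Counter

def cache_lm_prob(w, recent_words, static_prob, mix=0.9):
    cache = Counter(recent_words)
    p_cache = cache[w] / len(recent_words) if recent_words else 0.0
    return mix * static_prob(w) + (1 - mix) * p_cache       # static + dynamic component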
2818
+ 1:27:04.564 --> 1:27:12.488
2819
+ To train them what you just have done, there's
2820
+ no human feedback there.
2821
+
2822
+ 1:27:12.712 --> 1:27:25.466
2823
+ The speech model all the time and then it
2824
+ will repeat its errors and that is, of course,.
2825
+
2826
+ 1:27:25.966 --> 1:27:31.506
2827
+ A similar idea is people have looked into
2828
+ trigger language model whereas one word occurs
2829
+
2830
+ 1:27:31.506 --> 1:27:34.931
2831
+ then you increase the probability of some other
2832
+ words.
2833
+
2834
+ 1:27:34.931 --> 1:27:40.596
2835
+ So if you're talking about money that will
2836
+ increase the probability of bank saving account
2837
+
2838
+ 1:27:40.596 --> 1:27:41.343
2839
+ dollar and.
2840
+
2841
+ 1:27:41.801 --> 1:27:47.352
2842
+ Because then you have to somehow model this
2843
+ dependency, but it's somehow also an idea of
2844
+
2845
+ 1:27:47.352 --> 1:27:52.840
2846
+ modeling long range dependency, because if
2847
+ one word occurs very often in your document,
2848
+
2849
+ 1:27:52.840 --> 1:27:58.203
2850
+ you like somehow like learning which other
2851
+ words to occur because they are more often
2852
+
2853
+ 1:27:58.203 --> 1:27:59.201
2854
+ than by chance.
2855
+
2856
+ 1:28:02.822 --> 1:28:10.822
2857
+ Yes, then the last thing is, of course, especially
2858
+ for languages which are, which are morphologically
2859
+
2860
+ 1:28:10.822 --> 1:28:11.292
2861
+ rich.
2862
+
2863
+ 1:28:11.292 --> 1:28:18.115
2864
+ You can do something similar to BPE so you
2865
+ can now do morphemes or so, and then model
2866
+
2867
+ 1:28:18.115 --> 1:28:22.821
2868
+ the morpheme sequence because the morphemes
2869
+ are more frequent.
2870
+
2871
+ 1:28:23.023 --> 1:28:26.877
2872
+ However, the problem is of course that your
2873
+ sequence length also gets longer.
2874
+
2875
+ 1:28:27.127 --> 1:28:33.185
2876
+ And so if they have a four gram language model,
2877
+ it's not counting the last three words but
2878
+
2879
+ 1:28:33.185 --> 1:28:35.782
2880
+ only the last three morphemes, which.
2881
+
2882
+ 1:28:36.196 --> 1:28:39.833
2883
+ So of course then it's a bit challenging and
2884
+ know how to deal with.
2885
+
2886
+ 1:28:40.680 --> 1:28:51.350
2887
+ What about language is finished by the idea
2888
+ of a position at the end of the world?
2889
+
2890
+ 1:28:51.350 --> 1:28:58.807
2891
+ Yeah, but there you can typically do something
2892
+ like that.
2893
+
2894
+ 1:28:59.159 --> 1:29:02.157
2895
+ It is not the one perfect solution.
2896
+
2897
+ 1:29:02.157 --> 1:29:05.989
2898
+ You have to do a bit of testing what is best.
2899
+
2900
+ 1:29:06.246 --> 1:29:13.417
2901
+ One way of dealing with a large vocabulary
2902
+ that you haven't seen is to split these words
2903
+
2904
+ 1:29:13.417 --> 1:29:20.508
2905
+ into subparts that are either more
2906
+ linguistically motivated, like morphemes, or more
2907
+
2908
+ 1:29:20.508 --> 1:29:25.826
2909
+ statistically motivated like we have in the
2910
+ byte pair encoding.
2911
+
2912
+ 1:29:28.188 --> 1:29:33.216
2913
+ The representation of your text is different.
2914
+
2915
+ 1:29:33.216 --> 1:29:41.197
2916
+ How you are later doing all the counting and
2917
+ the statistics is the same.
2918
+
2919
+ 1:29:41.197 --> 1:29:44.914
2920
+ What you assume is your sequence.
2921
+
2922
+ 1:29:45.805 --> 1:29:49.998
2923
+ That's the same thing for the other things
2924
+ we had here.
2925
+
2926
+ 1:29:49.998 --> 1:29:55.390
2927
+ Here you don't have words, but everything
2928
+ you're doing is done exactly.
2929
+
2930
+ 1:29:57.857 --> 1:29:59.457
2931
+ Some practical issues.
2932
+
2933
+ 1:29:59.457 --> 1:30:05.646
2934
+ Typically you're doing things in log space
2935
+ and you're adding, because multiplying very
2936
+
2937
+ 1:30:05.646 --> 1:30:09.819
2938
+ small values gives you sometimes problems with
2939
+ calculation.
2940
+
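A small illustration of the practical point just made: multiplying many small probabilities underflows floating point, so one adds log probabilities instead:

import math

def sentence_logprob(token_probs):
    # log P(sentence) = sum of log P(token | history)
    return sum(math.log(p) for p in token_probs)

# e.g. 200 tokens with probability 1e-4 each: the raw product underflows to 0.0,
# while the sum of logs is simply 200 * log(1e-4).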
2941
+ 1:30:10.230 --> 1:30:16.687
2942
+ Good thing is you don't have to care with
2943
+ this mostly so there are very good toolkits
2944
+
2945
+ 1:30:16.687 --> 1:30:23.448
2946
+ like SRILM or KenLM where you can
2947
+ just give your data and they will train the
2948
+
2949
+ 1:30:23.448 --> 1:30:30.286
2950
+ language model, then do all the complicated maths
2951
+ behind that and you are able to run them.
2952
+
2953
+ 1:30:31.911 --> 1:30:39.894
2954
+ So what you should keep from today is what
2955
+ is a language model and how we can do maximum
2956
+
2957
+ 1:30:39.894 --> 1:30:44.199
2958
+ likelihood training on that and different language models.
2959
+
2960
+ 1:30:44.199 --> 1:30:49.939
2961
+ Similar ideas we use for a lot of different
2962
+ statistical models.
2963
+
2964
+ 1:30:50.350 --> 1:30:52.267
2965
+ Where You Always Have the Problem.
2966
+
2967
+ 1:30:53.233 --> 1:31:01.608
2968
+ Different way of looking at it and doing it
2969
+ will do it on Thursday when we will go to language.
2970
+
demo_data/lectures/Lecture-06-09.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fe56576cf62256b2c62b8fdcf6e502ce1931907278fc420d397cd360774f72
3
+ size 129548573
demo_data/lectures/Lecture-07-11.05.2023/English.vtt ADDED
@@ -0,0 +1,2596 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.301 --> 0:00:05.676
4
+ Introduction: Okay, so welcome to today's lecture.
5
+
6
+ 0:00:06.066 --> 0:00:12.592
7
+ I'm sorry for the inconvenience.
8
+
9
+ 0:00:12.394 --> 0:00:19.823
10
+ Sometimes they are project meetings.
11
+
12
+ 0:00:19.622 --> 0:00:25.853
13
+ There will be one other time.
14
+
15
+ 0:00:26.806 --> 0:00:40.863
16
+ So what we want to talk today about is want
17
+ to start with neural approaches to machine
18
+
19
+ 0:00:40.863 --> 0:00:42.964
20
+ translation.
21
+
22
+ 0:00:43.123 --> 0:00:51.285
23
+ I guess you have heard about other types of
24
+ neural models for other types of neural language
25
+
26
+ 0:00:51.285 --> 0:00:52.339
27
+ processing.
28
+
29
+ 0:00:52.251 --> 0:00:59.888
30
+ This was some of the first steps in introducing
31
+ neal networks to machine translation.
32
+
33
+ 0:01:00.600 --> 0:01:06.203
34
+ They are similar to what you know they see
35
+ in as large language models.
36
+
37
+ 0:01:06.666 --> 0:01:11.764
38
+ And today we look into what are these neural language
39
+ models?
40
+
41
+ 0:01:11.676 --> 0:01:13.831
42
+ What is the difference?
43
+
44
+ 0:01:13.741 --> 0:01:15.989
45
+ What is the motivation?
46
+
47
+ 0:01:16.316 --> 0:01:21.445
48
+ And first will use them in statistics and
49
+ machine translation.
50
+
51
+ 0:01:21.364 --> 0:01:28.918
52
+ So if you remember how fully like two or three
53
+ weeks ago we had this log-linear model where you
54
+
55
+ 0:01:28.918 --> 0:01:31.053
56
+ can integrate easily any.
57
+
58
+ 0:01:31.351 --> 0:01:40.967
59
+ We just have another model which evaluates
60
+ how good a system is or how good a fluent language
61
+
62
+ 0:01:40.967 --> 0:01:41.376
63
+ is.
64
+
65
+ 0:01:41.274 --> 0:01:55.291
66
+ The main advantage compared to the statistical
67
+ models we saw on Tuesday is: Next week we will
68
+
69
+ 0:01:55.291 --> 0:02:06.475
70
+ then go for a neural machine translation where
71
+ we replace the whole model.
72
+
73
+ 0:02:11.211 --> 0:02:21.078
74
+ Just as a remember from Tuesday, we've seen
75
+ the main challenge in language world was that
76
+
77
+ 0:02:21.078 --> 0:02:25.134
78
+ most of the n-grams we haven't seen.
79
+
80
+ 0:02:26.946 --> 0:02:33.967
81
+ So this was therefore difficult to estimate
82
+ any probability because you've seen that normally
83
+
84
+ 0:02:33.967 --> 0:02:39.494
85
+ if you have not seen the n-gram you will assign
86
+ the probability of zero.
87
+
88
+ 0:02:39.980 --> 0:02:49.420
89
+ However, this is not really very good because
90
+ we don't want to give zero probabilities to
91
+
92
+ 0:02:49.420 --> 0:02:54.979
93
+ sentences, which still might be a very good
94
+ English.
95
+
96
+ 0:02:55.415 --> 0:03:02.167
97
+ And then we learned a lot of techniques and
98
+ that is the main challenging statistical machine
99
+
100
+ 0:03:02.167 --> 0:03:04.490
101
+ translate statistical language.
102
+
103
+ 0:03:04.417 --> 0:03:10.630
104
+ What's how we can give a good estimate of
105
+ probability to events that we haven't seen
106
+
107
+ 0:03:10.630 --> 0:03:12.238
108
+ smoothing techniques?
109
+
110
+ 0:03:12.165 --> 0:03:15.310
111
+ We've seen this interpolation and back-off.
112
+
113
+ 0:03:15.435 --> 0:03:21.637
114
+ And they invent or develop very specific techniques.
115
+
116
+ 0:03:21.520 --> 0:03:26.906
117
+ To deal with that, however, it might not be.
118
+
119
+ 0:03:28.568 --> 0:03:43.190
120
+ And therefore maybe we can do things different,
121
+ so if we have not seen an n-gram before in statistical
122
+
123
+ 0:03:43.190 --> 0:03:44.348
124
+ models.
125
+
126
+ 0:03:45.225 --> 0:03:51.361
127
+ Before and we can only get information from
128
+ exactly the same words.
129
+
130
+ 0:03:51.411 --> 0:04:06.782
131
+ We don't have some on like approximate matching
132
+ like that, maybe in a sentence that cures similarly.
133
+
134
+ 0:04:06.629 --> 0:04:10.289
135
+ So if you have seen a.
136
+
137
+ 0:04:11.191 --> 0:04:17.748
138
+ And so you would like to have more something
139
+ like that where n-grams are represented, more
140
+
141
+ 0:04:17.748 --> 0:04:21.953
142
+ in a general space, and we can generalize similar
143
+ numbers.
144
+
145
+ 0:04:22.262 --> 0:04:29.874
146
+ So if you learn something about walk then
147
+ maybe we can use this knowledge and also apply.
148
+
149
+ 0:04:30.290 --> 0:04:42.596
150
+ The same as we have done before, but we can
151
+ really better model how similar they are and
152
+
153
+ 0:04:42.596 --> 0:04:45.223
154
+ transfer to other.
155
+
156
+ 0:04:47.047 --> 0:04:54.236
157
+ And we maybe want to do that in a more hierarchical
158
+ approach that we know okay.
159
+
160
+ 0:04:54.146 --> 0:05:02.743
161
+ Some words are similar but like go and walk
162
+ is somehow similar and I and P and G and therefore
163
+
164
+ 0:05:02.743 --> 0:05:06.997
165
+ like maybe if we then merge them in an n-gram.
166
+
167
+ 0:05:07.387 --> 0:05:15.861
168
+ If we learn something about our walk, then
169
+ it should tell us also something about Hugo.
170
+
171
+ 0:05:15.765 --> 0:05:17.121
172
+ He walks or.
173
+
174
+ 0:05:17.197 --> 0:05:27.327
175
+ You see that there is some relations which
176
+ we need to integrate for you.
177
+
178
+ 0:05:27.188 --> 0:05:35.516
179
+ We need to add the s, but maybe walks should
180
+ also be here.
181
+
182
+ 0:05:37.137 --> 0:05:45.319
183
+ And luckily there is one really convincing
184
+ method in doing that: And that is by using
185
+
186
+ 0:05:45.319 --> 0:05:47.222
187
+ a neural mechanism.
188
+
189
+ 0:05:47.387 --> 0:05:58.497
190
+ That's what we will introduce today so we
191
+ can use this type of neural networks to try
192
+
193
+ 0:05:58.497 --> 0:06:04.053
194
+ to learn this similarity and to learn how.
195
+
196
+ 0:06:04.324 --> 0:06:14.355
197
+ And that is one of the main advantages that
198
+ we have by switching from the standard statistical
199
+
200
+ 0:06:14.355 --> 0:06:15.200
201
+ models.
202
+
203
+ 0:06:15.115 --> 0:06:22.830
204
+ To learn similarities between words and generalized,
205
+ and learn what is called hidden representations
206
+
207
+ 0:06:22.830 --> 0:06:29.705
208
+ or representations of words, where we can measure
209
+ similarity in some dimensions of words.
210
+
211
+ 0:06:30.290 --> 0:06:42.384
212
+ So we can measure in which way words are similar.
213
+
214
+ 0:06:42.822 --> 0:06:48.902
215
+ We had it before and we've seen that words
216
+ were just easier.
217
+
218
+ 0:06:48.802 --> 0:06:51.994
219
+ The only thing we did is like.
220
+
221
+ 0:06:52.192 --> 0:07:02.272
222
+ But this energies don't have any meaning,
223
+ so it wasn't that word is more similar to words.
224
+
225
+ 0:07:02.582 --> 0:07:12.112
226
+ So we couldn't learn anything about words
227
+ in the statistical model and that's a big challenge.
228
+
229
+ 0:07:12.192 --> 0:07:23.063
230
+ About words even like in morphology, so going
231
+ goes is somehow more similar because the person
232
+
233
+ 0:07:23.063 --> 0:07:24.219
234
+ singular.
235
+
236
+ 0:07:24.264 --> 0:07:34.924
237
+ The basic models we had up to now have no idea
238
+ about that, and goes is as similar to go as it
239
+
240
+ 0:07:34.924 --> 0:07:37.175
241
+ might be to sleep.
242
+
243
+ 0:07:39.919 --> 0:07:44.073
244
+ So what we want to do today.
245
+
246
+ 0:07:43.930 --> 0:07:53.098
247
+ In order to go to this we will have a short
248
+ introduction into.
249
+
250
+ 0:07:53.954 --> 0:08:05.984
251
+ It very short just to see how we use them
252
+ here, but that's a good thing, so most of you
253
+
254
+ 0:08:05.984 --> 0:08:08.445
255
+ think it will be.
256
+
257
+ 0:08:08.928 --> 0:08:14.078
258
+ And then we will first look into a feed-forward
259
+ neural network language models.
260
+
261
+ 0:08:14.454 --> 0:08:23.706
262
+ And there we will still have this approximation.
263
+
264
+ 0:08:23.517 --> 0:08:33.906
265
+ We have before we are looking only at a fixed
266
+ window.
267
+
268
+ 0:08:34.154 --> 0:08:35.030
269
+ The case.
270
+
271
+ 0:08:34.942 --> 0:08:38.217
272
+ However, we have the umbellent here.
273
+
274
+ 0:08:38.129 --> 0:08:43.353
275
+ That's why they're already better in order
276
+ to generalize.
277
+
278
+ 0:08:44.024 --> 0:08:53.169
279
+ And then at the end we'll look at language
280
+ models where we then have the additional advantage.
281
+
282
+ 0:08:53.093 --> 0:09:04.317
283
+ Case that we need to have a fixed history,
284
+ but in theory we can model arbitrary long dependencies.
285
+
286
+ 0:09:04.304 --> 0:09:12.687
287
+ And we talked about on Tuesday where it is
288
+ not clear what type of information it is to.
289
+
290
+ 0:09:16.396 --> 0:09:24.981
291
+ So in general molecular networks I normally
292
+ learn to prove that they perform some tasks.
293
+
294
+ 0:09:25.325 --> 0:09:33.472
295
+ We have the structure and we are learning
296
+ them from samples so that is similar to what
297
+
298
+ 0:09:33.472 --> 0:09:34.971
299
+ we have before.
300
+
301
+ 0:09:34.877 --> 0:09:42.277
302
+ So now we have the same task here, a language
303
+ model giving input or forwards.
304
+
305
+ 0:09:42.642 --> 0:09:48.959
306
+ And is somewhat originally motivated by human
307
+ brain.
308
+
309
+ 0:09:48.840 --> 0:10:00.640
310
+ However, when you now need to know about artificial
311
+ neural networks, it's hard to get similarity.
312
+
313
+ 0:10:00.540 --> 0:10:02.889
314
+ There seemed to be not that point.
315
+
316
+ 0:10:03.123 --> 0:10:11.014
317
+ So what they are mainly doing is summoning
318
+ multiplication and then one non-linear activation.
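A minimal sketch of such a unit: a weighted sum of a fixed number of inputs (plus a bias), followed by a non-linear, differentiable activation; the sigmoid here is just one possible choice:

import math

def neuron(inputs, weights, bias):
    z = sum(x * w for x, w in zip(inputs, weights)) + bias   # weighted sum
    return 1.0 / (1.0 + math.exp(-z))                        # non-linear activation (sigmoid)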
319
+
320
+ 0:10:12.692 --> 0:10:16.085
321
+ So the basic units are these type of.
322
+
323
+ 0:10:17.937 --> 0:10:29.891
324
+ Perceptron basic blocks which we have and
325
+ this does processing so we have a fixed number
326
+
327
+ 0:10:29.891 --> 0:10:36.070
328
+ of input features and that will be important.
329
+
330
+ 0:10:36.096 --> 0:10:39.689
331
+ So we have here numbers to xn as input.
332
+
333
+ 0:10:40.060 --> 0:10:53.221
334
+ And this makes partly of course language processing
335
+ difficult.
336
+
337
+ 0:10:54.114 --> 0:10:57.609
338
+ So we have to model this time on and then
339
+ go stand home and model.
340
+
341
+ 0:10:58.198 --> 0:11:02.099
342
+ Then we are having weights, which are the
343
+ parameters and the number of weights exactly
344
+
345
+ 0:11:02.099 --> 0:11:03.668
346
+ the same as the number of weights.
347
+
348
+ 0:11:04.164 --> 0:11:06.322
349
+ Of input features.
350
+
351
+ 0:11:06.208 --> 0:11:15.070
352
+ Sometimes he has his fires in there, and then
353
+ it's not really an input from.
354
+
355
+ 0:11:15.195 --> 0:11:19.205
356
+ And what you then do is multiply.
357
+
358
+ 0:11:19.087 --> 0:11:26.166
359
+ Each input resists weight and then you sum
360
+ it up and then.
361
+
362
+ 0:11:26.606 --> 0:11:34.357
363
+ What is then additionally later important
364
+ is that we have an activation function and
365
+
366
+ 0:11:34.357 --> 0:11:42.473
367
+ it's important that this activation function
368
+ is non linear, so we come to just a linear.
369
+
370
+ 0:11:43.243 --> 0:11:54.088
371
+ And later it will be important that this is
372
+ differentiable because otherwise all the training.
373
+
374
+ 0:11:54.714 --> 0:12:01.907
375
+ This model by itself is not very powerful.
376
+
377
+ 0:12:01.739 --> 0:12:10.440
378
+ It was originally shown that this is not powerful.
379
+
380
+ 0:12:10.710 --> 0:12:19.463
381
+ However, there is a very easy extension, the
382
+ multi layer perceptual, and then things get
383
+
384
+ 0:12:19.463 --> 0:12:20.939
385
+ very powerful.
386
+
387
+ 0:12:21.081 --> 0:12:27.719
388
+ The thing is you just connect a lot of these
389
+ in this layer of structures and we have our
390
+
391
+ 0:12:27.719 --> 0:12:35.029
392
+ input layer where we have the inputs and our
393
+ hidden layer at least one where there is everywhere.
394
+
395
+ 0:12:35.395 --> 0:12:39.817
396
+ And then we can combine them all to do that.
397
+
398
+ 0:12:40.260 --> 0:12:48.320
399
+ The input layer is of course somewhat given
400
+ by a problem of dimension.
401
+
402
+ 0:12:48.206 --> 0:13:00.014
403
+ The outward layer is also given by your dimension,
404
+ but the hidden layer is of course a hyperparameter.
405
+
406
+ 0:13:01.621 --> 0:13:06.982
407
+ How we represent wordsSo let's start with
408
+ the first question, now more language related,
409
+
410
+ 0:13:06.982 --> 0:13:08.788
411
+ and that is how we represent.
412
+
413
+ 0:13:09.149 --> 0:13:23.460
414
+ So we've seen here we have the but the question
415
+ is now how can we put in a word into this?
416
+
417
+ 0:13:26.866 --> 0:13:34.117
418
+ Noise: The first thing we're able to be better
419
+ is by the fact that like you are said,.
420
+
421
+ 0:13:34.314 --> 0:13:43.028
422
+ That is not that easy because the continuous
423
+ vector will come to that.
424
+
425
+ 0:13:42.905 --> 0:13:50.395
426
+ So from the neo-network we can directly put
427
+ in the bedding.
428
+
429
+ 0:13:50.630 --> 0:13:57.277
430
+ But if we need to input a word into the needle
431
+ network, it has to be something which is easily
432
+
433
+ 0:13:57.277 --> 0:13:57.907
434
+ defined.
435
+
436
+ 0:13:59.079 --> 0:14:12.492
437
+ The one hood encoding, and then we have one
438
+ out of encoding, so one value is one, and all
439
+
440
+ 0:14:12.492 --> 0:14:15.324
441
+ the others is the.
442
+
443
+ 0:14:16.316 --> 0:14:25.936
444
+ That means we are always dealing with fixed
445
+ vocabulary because what said is we cannot.
446
+
447
+ 0:14:26.246 --> 0:14:38.017
448
+ So you cannot easily extend your vocabulary
449
+ because if you mean you would extend your vocabulary.
450
+
451
+ 0:14:39.980 --> 0:14:41.502
452
+ That's also motivating.
453
+
454
+ 0:14:41.438 --> 0:14:43.683
455
+ We're talked about biperriagoding.
456
+
457
+ 0:14:43.619 --> 0:14:45.383
458
+ That's a nice thing there.
459
+
460
+ 0:14:45.318 --> 0:14:47.214
461
+ We have a fixed vocabulary.
462
+
463
+ 0:14:48.048 --> 0:14:55.804
464
+ The big advantage of this one encoding is
465
+ that we don't implicitly sum our implement
466
+
467
+ 0:14:55.804 --> 0:15:04.291
468
+ similarity between words, but really re-learning
469
+ because if you first think about this, this
470
+
471
+ 0:15:04.291 --> 0:15:06.938
472
+ is a very, very inefficient.
473
+
474
+ 0:15:07.227 --> 0:15:15.889
475
+ So you need like to represent end words, you
476
+ need a dimension of an end dimensional vector.
477
+
478
+ 0:15:16.236 --> 0:15:24.846
479
+ Imagine you could do binary encoding so you
480
+ could represent words as binary vectors.
481
+
482
+ 0:15:24.745 --> 0:15:26.474
483
+ Then you would.
484
+
485
+ 0:15:26.806 --> 0:15:31.177
486
+ Will be significantly more efficient.
487
+
488
+ 0:15:31.062 --> 0:15:36.781
489
+ However, then you have some implicit similarity.
490
+
491
+ 0:15:36.664 --> 0:15:39.121
492
+ Some numbers share.
493
+
494
+ 0:15:39.559 --> 0:15:46.958
495
+ Would somehow be bad because you would force
496
+ someone to do this by hand or clear how to
497
+
498
+ 0:15:46.958 --> 0:15:47.631
499
+ define.
500
+
501
+ 0:15:48.108 --> 0:15:55.135
502
+ So therefore currently this is the most successful
503
+ approach to just do this one watch.
504
+
505
+ 0:15:55.095 --> 0:15:59.563
506
+ Representations, so we take a fixed vocabulary.
507
+
508
+ 0:15:59.470 --> 0:16:06.124
509
+ We map each word to the inise, and then we
510
+ represent a word like this.
511
+
512
+ 0:16:06.030 --> 0:16:13.248
513
+ So if home will be one, the representation
514
+ will be one zero zero zero, and.
515
+
516
+ 0:16:14.514 --> 0:16:30.639
517
+ But this dimension here is a vocabulary size
518
+ and that is quite high, so we are always trying
519
+
520
+ 0:16:30.639 --> 0:16:33.586
521
+ to be efficient.
522
+
523
+ 0:16:33.853 --> 0:16:43.792
524
+ We are doing then some type of efficiency
525
+ because typically we are having this next layer.
526
+
527
+ 0:16:44.104 --> 0:16:51.967
528
+ It can be still maybe two hundred or five
529
+ hundred or one thousand neurons, but this is
530
+
531
+ 0:16:51.967 --> 0:16:53.323
532
+ significantly.
533
+
534
+ 0:16:53.713 --> 0:17:03.792
535
+ You can learn that directly and there we then
536
+ have similarity between words.
537
+
538
+ 0:17:03.662 --> 0:17:07.462
539
+ Then it is that some words.
540
+
541
+ 0:17:07.807 --> 0:17:14.772
542
+ But the nice thing is that this is then learned
543
+ that we are not need to hand define that.
544
+
545
+ 0:17:17.117 --> 0:17:32.742
546
+ We'll come later to the explicit architecture
547
+ of the neural language one, and there we can
548
+
549
+ 0:17:32.742 --> 0:17:35.146
550
+ see how it's.
551
+
552
+ 0:17:38.418 --> 0:17:44.857
553
+ So we're seeing that the other one or our
554
+ representation always has the same similarity.
555
+
556
+ 0:17:45.105 --> 0:17:59.142
557
+ Then we're having this continuous factor which
558
+ is a lot smaller dimension and that's important
559
+
560
+ 0:17:59.142 --> 0:18:00.768
561
+ for later.
562
+
563
+ 0:18:01.121 --> 0:18:06.989
564
+ What we are doing then is learning these representations
565
+ so that they are best for language.
566
+
567
+ 0:18:07.487 --> 0:18:14.968
568
+ So the representations are implicitly training
569
+ the language for the cards.
570
+
571
+ 0:18:14.869 --> 0:18:19.061
572
+ This is the best way for doing language.
573
+
574
+ 0:18:19.479 --> 0:18:32.564
575
+ And the nice thing that was found out later
576
+ is these representations are really good.
577
+
578
+ 0:18:33.153 --> 0:18:39.253
579
+ And that is why they are now even called word
580
+ embeddings by themselves and used for other
581
+
582
+ 0:18:39.253 --> 0:18:39.727
583
+ tasks.
584
+
585
+ 0:18:40.360 --> 0:18:49.821
586
+ And they are somewhat describing very different
587
+ things so they can describe and semantic similarities.
588
+
589
+ 0:18:49.789 --> 0:18:58.650
590
+ Are looking at the very example of today mass
591
+ vector space by adding words and doing some
592
+
593
+ 0:18:58.650 --> 0:19:00.618
594
+ interesting things.
595
+
596
+ 0:19:00.940 --> 0:19:11.178
597
+ So they got really like the first big improvement
598
+ when switching to neurostaff.
599
+
600
+ 0:19:11.491 --> 0:19:20.456
601
+ Are like part of the model, but with more
602
+ complex representation, but they are the basic
603
+
604
+ 0:19:20.456 --> 0:19:21.261
605
+ models.
606
+
607
+ 0:19:23.683 --> 0:19:36.979
608
+ In the output layer we are also having one
609
+ output layer structure and a connection function.
610
+
611
+ 0:19:36.997 --> 0:19:46.525
612
+ That is, for language learning we want to
613
+ predict what is the most common word.
614
+
615
+ 0:19:47.247 --> 0:19:56.453
616
+ And that can be done very well with this so
617
+ called soft back layer, where again the dimension.
618
+
619
+ 0:19:56.376 --> 0:20:02.825
620
+ Vocabulary size, so this is a vocabulary size,
621
+ and again the case neural represents the case
622
+
623
+ 0:20:02.825 --> 0:20:03.310
624
+ class.
625
+
626
+ 0:20:03.241 --> 0:20:09.759
627
+ So in our case we have again one round representation,
628
+ someone saying this is a core report.
629
+
630
+ 0:20:10.090 --> 0:20:17.255
631
+ Our probability distribution is a probability
632
+ distribution over all works, so the case entry
633
+
634
+ 0:20:17.255 --> 0:20:21.338
635
+ tells us how probable is that the next word
636
+ is this.
637
+
638
+ 0:20:22.682 --> 0:20:33.885
639
+ So we need to have some probability distribution
640
+ at our output in order to achieve that this
641
+
642
+ 0:20:33.885 --> 0:20:37.017
643
+ activation function goes.
644
+
645
+ 0:20:37.197 --> 0:20:46.944
646
+ And we can achieve that with a soft max activation
647
+ we take the input to the form of the value,
648
+
649
+ 0:20:46.944 --> 0:20:47.970
650
+ and then.
651
+
652
+ 0:20:48.288 --> 0:20:58.021
653
+ So by having this type of activation function
654
+ we are really getting this type of probability.
655
+
656
+ 0:20:59.019 --> 0:21:15.200
657
+ At the beginning was also very challenging
658
+ because again we have this inefficient representation.
659
+
660
+ 0:21:15.235 --> 0:21:29.799
661
+ You can imagine that something over is maybe
662
+ a bit inefficient with cheap users, but definitely.
663
+
664
+ 0:21:36.316 --> 0:21:44.072
665
+ And then for training the models that will
666
+ be fine, so we have to use architecture now.
667
+
668
+ 0:21:44.264 --> 0:21:48.491
669
+ We need to minimize the arrow.
670
+
671
+ 0:21:48.355 --> 0:21:53.196
672
+ Are we doing it taking the output?
673
+
674
+ 0:21:53.058 --> 0:21:58.180
675
+ We are comparing it to our targets.
676
+
677
+ 0:21:58.298 --> 0:22:03.830
678
+ So one important thing is by training them.
679
+
680
+ 0:22:03.705 --> 0:22:07.531
681
+ How can we measure the error?
682
+
683
+ 0:22:07.403 --> 0:22:12.763
684
+ So what is if we are training the ideas?
685
+
686
+ 0:22:13.033 --> 0:22:15.163
687
+ And how well we are measuring.
688
+
689
+ 0:22:15.094 --> 0:22:19.769
690
+ It is in natural language processing, typically
691
+ the cross entropy.
692
+
693
+ 0:22:19.960 --> 0:22:35.575
694
+ And that means we are comparing the target
695
+ with the output.
696
+
697
+ 0:22:35.335 --> 0:22:44.430
698
+ It gets optimized and you're seeing that this,
699
+ of course, makes it again very nice and easy
700
+
701
+ 0:22:44.430 --> 0:22:49.868
702
+ because our target is again a one-hour representation.
703
+
704
+ 0:22:50.110 --> 0:23:00.116
705
+ So all of these are always zero, and what
706
+ we are then doing is we are taking the one.
707
+
708
+ 0:23:00.100 --> 0:23:04.615
709
+ And we only need to multiply the one with
710
+ the logarithm here, and that is all the feedback
711
+
712
+ 0:23:04.615 --> 0:23:05.955
713
+ signal we are taking here.
714
+
715
+ 0:23:06.946 --> 0:23:13.885
716
+ Of course, this is not always influenced by
717
+ all the others.
718
+
719
+ 0:23:13.770 --> 0:23:17.936
720
+ Why is this influenced by all the.
721
+
722
+ 0:23:24.304 --> 0:23:34.382
723
+ Have the activation function, which is the
724
+ current activation divided by some of the others.
725
+
726
+ 0:23:34.354 --> 0:23:45.924
727
+ Otherwise it could easily just increase this
728
+ volume and ignore the others, but if you increase
729
+
730
+ 0:23:45.924 --> 0:23:49.090
731
+ one value all the others.
732
+
733
+ 0:23:51.351 --> 0:23:59.912
734
+ Then we can do with neometrics one very nice
735
+ and easy type of training that is done in all
736
+
737
+ 0:23:59.912 --> 0:24:07.721
738
+ the neometrics where we are now calculating
739
+ our error and especially the gradient.
740
+
741
+ 0:24:07.707 --> 0:24:11.640
742
+ So in which direction does the error show?
743
+
744
+ 0:24:11.548 --> 0:24:18.632
745
+ And then if we want to go to a smaller arrow
746
+ that's what we want to achieve.
747
+
748
+ 0:24:18.540 --> 0:24:26.629
749
+ We are taking the inverse direction of the
750
+ gradient and thereby trying to minimize our
751
+
752
+ 0:24:26.629 --> 0:24:27.280
753
+ error.
754
+
755
+ 0:24:27.287 --> 0:24:31.041
756
+ And we have to do that, of course, for all
757
+ the weights.
758
+
759
+ 0:24:30.974 --> 0:24:36.630
760
+ And to calculate the error of all the weights,
761
+ we won't do the full derivation here.
762
+
763
+ 0:24:36.563 --> 0:24:41.376
764
+ But what you can do is you can propagate
765
+ the error which we measured.
766
+
767
+ 0:24:41.309 --> 0:24:46.394
768
+ At the end you can propagate it back its basic
769
+ math and basic derivation.
770
+
771
+ 0:24:46.706 --> 0:24:58.854
772
+ For each weight in your model you measure how much
773
+ you contribute to the error and then change
774
+
775
+ 0:24:58.854 --> 0:25:01.339
776
+ it in a way that the error gets smaller.
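A minimal sketch of the update described here, going against the gradient with a hypothetical learning rate (illustrative only):

def gradient_step(weight, gradient, learning_rate=0.1):
    # take the inverse direction of the gradient to reduce the error
    return weight - learning_rate * gradient

print(gradient_step(0.5, 0.2))   # 0.48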
777
+
778
+ 0:25:04.524 --> 0:25:11.265
779
+ Multilayer Language Model: So to summarize what
780
+ for at least machine translation on your machine
781
+
782
+ 0:25:11.265 --> 0:25:18.502
783
+ translation should remember, you know, to understand
784
+ on this problem is that this is how a multilayer
785
+
786
+ 0:25:18.502 --> 0:25:20.631
787
+ perceptron looks like.
788
+
789
+ 0:25:20.580 --> 0:25:28.251
790
+ There are fully connected layers and no connections
791
+
792
+ 0:25:28.108 --> 0:25:29.759
793
+ Across layers.
794
+
795
+ 0:25:29.829 --> 0:25:35.153
796
+ And what they're doing is always just a weighted
797
+ sum here and then an activation function.
798
+
799
+ 0:25:35.415 --> 0:25:38.792
800
+ And in order to train you have this forward
801
+ and backward pass.
802
+
803
+ 0:25:39.039 --> 0:25:41.384
804
+ So We Put in Here.
805
+
806
+ 0:25:41.281 --> 0:25:41.895
807
+ Inputs.
808
+
809
+ 0:25:41.818 --> 0:25:45.285
810
+ We have some random values at the beginning.
811
+
812
+ 0:25:45.208 --> 0:25:47.300
813
+ Then calculate the output.
814
+
815
+ 0:25:47.222 --> 0:25:54.160
816
+ We are measuring how our error is propagating
817
+ the error back and then changing our model
818
+
819
+ 0:25:54.160 --> 0:25:57.902
820
+ in a way that we hopefully get a smaller error.
821
+
822
+ 0:25:57.824 --> 0:25:59.621
823
+ And then that is how.
824
+
825
+ 0:26:01.962 --> 0:26:12.893
826
+ So before we're coming into our neural networks
827
+ language models, how can we use this type of
828
+
829
+ 0:26:12.893 --> 0:26:17.595
830
+ neural network to do language modeling?
831
+
832
+ 0:26:23.103 --> 0:26:33.157
833
+ So how can we use them in natural language
834
+ processing, especially machine translation?
835
+
836
+ 0:26:33.042 --> 0:26:41.398
837
+ The first idea of using them was to estimate:
838
+ So we have seen that the output can be modeled
839
+
840
+ 0:26:41.398 --> 0:26:42.630
841
+ here as well.
842
+
843
+ 0:26:43.603 --> 0:26:50.311
844
+ A probability distribution and if we have
845
+ a full vocabulary we could mainly hear estimating
846
+
847
+ 0:26:50.311 --> 0:26:56.727
848
+ how probable each next word is and then use
849
+ that in our language model fashion as we've
850
+
851
+ 0:26:56.727 --> 0:26:58.112
852
+ done it last time.
853
+
854
+ 0:26:58.039 --> 0:27:03.217
855
+ We got the probability of a full sentence
856
+ as a product of individual.
857
+
858
+ 0:27:04.544 --> 0:27:12.555
859
+ And: That was done in the ninety seven years
860
+ and it's very easy to integrate it into this
861
+
862
+ 0:27:12.555 --> 0:27:14.602
863
+ log-linear model.
864
+
865
+ 0:27:14.513 --> 0:27:19.553
866
+ So we have said that this is how the log-linear
867
+ model looks like.
868
+
869
+ 0:27:19.478 --> 0:27:25.121
870
+ So we are searching the best translation which
871
+ maximizes this weighted sum.
872
+
873
+ 0:27:25.125 --> 0:27:26.362
874
+ Of the feature values.
875
+
876
+ 0:27:26.646 --> 0:27:31.647
877
+ We have that with minimum error rate training
878
+ if you can remember where we search for the
879
+
880
+ 0:27:31.647 --> 0:27:32.147
881
+ optimal.
882
+
883
+ 0:27:32.512 --> 0:27:40.422
884
+ The language model and many others, and we
885
+ can just add here a neuromodel, have a knock
886
+
887
+ 0:27:40.422 --> 0:27:41.591
888
+ of features.
889
+
890
+ 0:27:41.861 --> 0:27:45.761
891
+ So that is quite easy as said.
892
+
893
+ 0:27:45.635 --> 0:27:53.140
894
+ That was how statistical machine translation
895
+ was improved.
896
+
897
+ 0:27:53.013 --> 0:27:57.088
898
+ You just add one more feature.
899
+
900
+ 0:27:58.798 --> 0:28:07.631
901
+ So how can we model the language modeling
902
+ with a network?
903
+
904
+ 0:28:07.479 --> 0:28:16.010
905
+ So what we have to do is model the probability
906
+ of the.
907
+
908
+ 0:28:16.656 --> 0:28:25.047
909
+ The problem in general in the head is that
910
+ mostly we haven't seen long sequences.
911
+
912
+ 0:28:25.085 --> 0:28:35.650
913
+ Mostly we have to back off to very short sequences
914
+ and we are working on this discrete space where
915
+
916
+ 0:28:35.650 --> 0:28:36.944
917
+ similarity.
918
+
919
+ 0:28:37.337 --> 0:28:50.163
920
+ So the idea is if we have now a real network,
921
+ we can make words into continuous representation.
922
+
923
+ 0:28:51.091 --> 0:29:00.480
924
+ And the structure then looks like this, so
925
+ this is a basic still feed forward neural network.
926
+
927
+ 0:29:01.361 --> 0:29:10.645
928
+ We are doing this approximation again, so
929
+ we are not putting in all previous words, but
930
+
931
+ 0:29:10.645 --> 0:29:11.375
932
+ it is.
933
+
934
+ 0:29:11.691 --> 0:29:25.856
935
+ This is done because we said that in the real
936
+ network we can have only a fixed type of input.
937
+
938
+ 0:29:25.945 --> 0:29:31.886
939
+ You can only do a fixed step and then we'll
940
+ be doing that with exactly n minus one words.
941
+
942
+ 0:29:33.593 --> 0:29:39.536
943
+ So here you are, for example, three words
944
+ and three different words.
945
+
946
+ 0:29:39.450 --> 0:29:50.934
947
+ One and all the others are: And then we're
948
+ having the first layer of the neural network,
949
+
950
+ 0:29:50.934 --> 0:29:56.225
951
+ which like you learns is word embedding.
952
+
953
+ 0:29:57.437 --> 0:30:04.976
954
+ There is one thing which is maybe special
955
+ compared to the standard neural network.
956
+
957
+ 0:30:05.345 --> 0:30:11.918
958
+ So the representation of this word we want
959
+ to learn first of all position independence.
960
+
961
+ 0:30:11.843 --> 0:30:19.014
962
+ So we just want to learn what is the general
963
+ meaning of the word independent of its neighbors.
964
+
965
+ 0:30:19.299 --> 0:30:26.239
966
+ And therefore the representation you get here
967
+ should be the same as if in the second position.
968
+
969
+ 0:30:27.247 --> 0:30:36.865
970
+ The nice thing you can achieve is that this
971
+ weights which you're using here you're reusing
972
+
973
+ 0:30:36.865 --> 0:30:41.727
974
+ here and reusing here so we are forcing them.
975
+
976
+ 0:30:42.322 --> 0:30:48.360
977
+ You then learn your word embedding, which
978
+ is contextual, independent, so it's the same
979
+
980
+ 0:30:48.360 --> 0:30:49.678
981
+ for each position.
982
+
983
+ 0:30:49.909 --> 0:31:03.482
984
+ So that's the idea that you want to learn
985
+ the representation first of and you don't want
986
+
987
+ 0:31:03.482 --> 0:31:07.599
988
+ to really use the context.
989
+
990
+ 0:31:08.348 --> 0:31:13.797
991
+ That of course might have a different meaning
992
+ depending on where it stands, but we'll learn
993
+
994
+ 0:31:13.797 --> 0:31:14.153
995
+ that.
996
+
997
+ 0:31:14.514 --> 0:31:20.386
998
+ So first we are learning here representational
999
+ words, which is just the representation.
1000
+
1001
+ 0:31:20.760 --> 0:31:32.498
1002
+ Normally we said in neurons all input neurons
1003
+ here are connected to all here, but we're reducing
1004
+
1005
+ 0:31:32.498 --> 0:31:37.338
1006
+ the complexity by saying these neurons.
1007
+
1008
+ 0:31:37.857 --> 0:31:47.912
1009
+ Then we have a lot denser representation that
1010
+ is our three word embedded in here, and now
1011
+
1012
+ 0:31:47.912 --> 0:31:57.408
1013
+ we are learning this interaction between words,
1014
+ a direction between words not based.
1015
+
1016
+ 0:31:57.677 --> 0:32:08.051
1017
+ So we have at least one connected layer here,
1018
+ which takes a three embedding input and then
1019
+
1020
+ 0:32:08.051 --> 0:32:14.208
1021
+ learns a new embedding which now represents
1022
+ the full.
1023
+
1024
+ 0:32:15.535 --> 0:32:16.551
1025
+ Layers.
1026
+
1027
+ 0:32:16.424 --> 0:32:27.856
1028
+ It is the output layer which now and then
1029
+ again the probability distribution of all the.
1030
+
1031
+ 0:32:28.168 --> 0:32:48.612
1032
+ So here is your target prediction.
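A compact sketch, assuming PyTorch (my own code, not the lecturer's), of the feed-forward language model described here: a shared embedding for the n-1 context words, one hidden layer, and a softmax output over the vocabulary; all sizes are made up.

import torch
import torch.nn as nn

class FeedForwardLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=256, context=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)        # shared for all positions
        self.hidden = nn.Linear(context * emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_ids):                         # shape (batch, context)
        e = self.emb(context_ids).flatten(1)                # concatenated word embeddings
        h = torch.tanh(self.hidden(e))
        return torch.log_softmax(self.out(h), dim=-1)       # log-probabilities of the next word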
1033
+
1034
+ 0:32:48.688 --> 0:32:56.361
1035
+ The nice thing is that you learn everything
1036
+ together, so you don't have to teach them what
1037
+
1038
+ 0:32:56.361 --> 0:32:58.722
1039
+ a good word representation.
1040
+
1041
+ 0:32:59.079 --> 0:33:08.306
1042
+ Training the whole network together, so it
1043
+ learns what a good representation for a word
1044
+
1045
+ 0:33:08.306 --> 0:33:13.079
1046
+ you get in order to perform your final task.
1047
+
1048
+ 0:33:15.956 --> 0:33:19.190
1049
+ Yeah, that is the main idea.
1050
+
1051
+ 0:33:20.660 --> 0:33:32.731
1052
+ This is nowadays often referred to as one
1053
+ way of self-supervised learning.
1054
+
1055
+ 0:33:33.053 --> 0:33:37.120
1056
+ The output is the next word and the input
1057
+ is the previous word.
1058
+
1059
+ 0:33:37.377 --> 0:33:46.783
1060
+ But it's not really that we created labels,
1061
+ but we artificially created a task out of unlabeled.
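A small sketch of how such a task can be created from unlabeled text, pairing each position's previous words with the next word (illustrative only):

def make_examples(tokens, context=2):
    # one (previous words, next word) pair per position
    return [(tokens[i - context:i], tokens[i]) for i in range(context, len(tokens))]

print(make_examples("i go home".split()))   # [(['i', 'go'], 'home')]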
1062
+
1063
+ 0:33:46.806 --> 0:34:02.452
1064
+ We just had pure text, and then we created
1065
+ the task ourselves by predicting the next word,
1066
+
1067
+ 0:34:02.452 --> 0:34:18.818
1068
+ which is: Say we have like two sentences like
1069
+ go home and the second one is go to prepare.
1070
+
1071
+ 0:34:18.858 --> 0:34:30.135
1072
+ And then we have to predict the next series
1073
+ and my questions in the labels for the album.
1074
+
1075
+ 0:34:31.411 --> 0:34:42.752
1076
+ We model this as one vector with like probability
1077
+ for possible weights starting again.
1078
+
1079
+ 0:34:44.044 --> 0:34:57.792
1080
+ Multiple examples, so then you would twice
1081
+ train one to predict KRT, one to predict home,
1082
+
1083
+ 0:34:57.792 --> 0:35:02.374
1084
+ and then of course the easel.
1085
+
1086
+ 0:35:04.564 --> 0:35:13.568
1087
+ Is a very good point, so you are not aggregating
1088
+ examples beforehand, but you are taking each.
1089
+
1090
+ 0:35:19.259 --> 0:35:37.204
1091
+ So when you do it simultaneously learn the
1092
+ projection layer and the n-gram probabilities
1093
+
1094
+ 0:35:37.204 --> 0:35:39.198
1095
+ and then.
1096
+
1097
+ 0:35:39.499 --> 0:35:47.684
1098
+ And later analyze it that these representations
1099
+ are very powerful.
1100
+
1101
+ 0:35:47.562 --> 0:35:56.360
1102
+ The task is just a very important task to
1103
+ model what is the next word.
1104
+
1105
+ 0:35:56.816 --> 0:35:59.842
1106
+ Is motivated by nowadays.
1107
+
1108
+ 0:35:59.726 --> 0:36:10.668
1109
+ In order to get the meaning of the word you
1110
+ have to look at the company it keeps, the context.
1111
+
1112
+ 0:36:10.790 --> 0:36:16.048
1113
+ If you read texts in days of word which you
1114
+ have never seen, you often can still estimate
1115
+
1116
+ 0:36:16.048 --> 0:36:21.130
1117
+ the meaning of this word because you do not
1118
+ know how it is used, and this is typically
1119
+
1120
+ 0:36:21.130 --> 0:36:22.240
1121
+ used as a city or.
1122
+
1123
+ 0:36:22.602 --> 0:36:25.865
1124
+ Just imagine you read a text about some city.
1125
+
1126
+ 0:36:25.794 --> 0:36:32.033
1127
+ Even if you've never seen the city before,
1128
+ you often know from the context of how it's
1129
+
1130
+ 0:36:32.033 --> 0:36:32.464
1131
+ used.
1132
+
1133
+ 0:36:34.094 --> 0:36:42.483
1134
+ So what is now the big advantage of using
1135
+ neural networks?
1136
+
1137
+ 0:36:42.343 --> 0:36:51.853
1138
+ So just imagine we have to estimate that I
1139
+ bought my first iPhone.
1140
+
1141
+ 0:36:52.052 --> 0:36:56.608
1142
+ So you have to model the probability of
1143
+ this word given the previous ones.
1144
+
1145
+ 0:36:56.530 --> 0:37:00.239
1146
+ Now imagine iPhone, which you have never seen.
1147
+
1148
+ 0:37:00.600 --> 0:37:11.588
1149
+ So all the techniques we had last time at
1150
+ the end, if you haven't seen iPhone you will
1151
+
1152
+ 0:37:11.588 --> 0:37:14.240
1153
+ always fall back to.
1154
+
1155
+ 0:37:15.055 --> 0:37:26.230
1156
+ You have no idea how to deal that you won't
1157
+ have seen the diagram, the trigram, and all
1158
+
1159
+ 0:37:26.230 --> 0:37:27.754
1160
+ the others.
1161
+
1162
+ 0:37:28.588 --> 0:37:43.441
1163
+ If you're having this type of model, what
1164
+ does it do if you have my first and then something?
1165
+
1166
+ 0:37:43.483 --> 0:37:50.270
1167
+ Maybe this representation is really messed
1168
+ up because it is an out-of-vocabulary word.
1169
+
1170
+ 0:37:50.730 --> 0:37:57.793
1171
+ However, you have still these two information
1172
+ that two words before was first and therefore.
1173
+
1174
+ 0:37:58.098 --> 0:38:06.954
1175
+ So you have a lot of information in order
1176
+ to estimate how good it is.
1177
+
1178
+ 0:38:06.827 --> 0:38:13.282
1179
+ There could be more information if you know
1180
+ that.
1181
+
1182
+ 0:38:13.593 --> 0:38:25.168
1183
+ So all this type of modeling we can do that
1184
+ we couldn't do beforehand because we always
1185
+
1186
+ 0:38:25.168 --> 0:38:25.957
1187
+ have.
1188
+
1189
+ 0:38:27.027 --> 0:38:37.524
1190
+ Good point, so typically you would have one
1191
+ token for a vocabulary so that you could, for
1192
+
1193
+ 0:38:37.524 --> 0:38:45.922
1194
+ example: Or you're doing byte-pair encoding
1195
+ where you have a fixed vocabulary.
1196
+
1197
+ 0:38:46.226 --> 0:38:49.437
1198
+ Oh yeah, you have to do something like that
1199
+ that that that's true.
1200
+
1201
+ 0:38:50.050 --> 0:38:55.420
1202
+ So yeah, out-of-vocabulary words are handled by byte-pair encoding where
1203
+ you don't have unknown words anymore.
1204
+
1205
+ 0:38:55.735 --> 0:39:06.295
1206
+ But then, of course, you might be getting
1207
+ very long previous things, and your sequence
1208
+
1209
+ 0:39:06.295 --> 0:39:11.272
1210
+ length gets very long for unknown words.
1211
+
1212
+ 0:39:17.357 --> 0:39:20.067
1213
+ Any more questions on the basic setup?
1214
+
1215
+ 0:39:23.783 --> 0:39:36.719
1216
+ For this model, what we then want to continue
1217
+ is looking a bit into how complex or how we
1218
+
1219
+ 0:39:36.719 --> 0:39:39.162
1220
+ can make things.
1221
+
1222
+ 0:39:40.580 --> 0:39:49.477
1223
+ Because at the beginning there was definitely
1224
+ a major challenge, it's still not that easy,
1225
+
1226
+ 0:39:49.477 --> 0:39:58.275
1227
+ and I mean our likeers followed the talk about
1228
+ their environmental footprint and so on.
1229
+
1230
+ 0:39:58.478 --> 0:40:05.700
1231
+ So this calculation is not really heavy, and
1232
+ if you build systems yourselves you have to
1233
+
1234
+ 0:40:05.700 --> 0:40:06.187
1235
+ wait.
1236
+
1237
+ 0:40:06.466 --> 0:40:14.683
1238
+ So it's good to know a bit about how complex
1239
+ things are in order to do a good or efficient
1240
+
1241
+ 0:40:14.683 --> 0:40:15.405
1242
+ affair.
1243
+
1244
+ 0:40:15.915 --> 0:40:24.211
1245
+ So one thing where most of the calculation
1246
+ really happens is if you're doing it in a bad
1247
+
1248
+ 0:40:24.211 --> 0:40:24.677
1249
+ way.
1250
+
1251
+ 0:40:25.185 --> 0:40:33.523
1252
+ So in generally all these layers we are talking
1253
+ about networks and zones fancy.
1254
+
1255
+ 0:40:33.419 --> 0:40:46.713
1256
+ In the end it is: So what you have to do in
1257
+ order to calculate here, for example, these
1258
+
1259
+ 0:40:46.713 --> 0:40:52.454
1260
+ activations: So make it simple a bit.
1261
+
1262
+ 0:40:52.303 --> 0:41:06.633
1263
+ Let's see where outputs and you just do metric
1264
+ multiplication between your weight matrix and
1265
+
1266
+ 0:41:06.633 --> 0:41:08.482
1267
+ your input.
1268
+
1269
+ 0:41:08.969 --> 0:41:20.992
1270
+ So that is why computers are so powerful for
1271
+ neural networks because they are very good
1272
+
1273
+ 0:41:20.992 --> 0:41:22.358
1274
+ in doing.
1275
+
1276
+ 0:41:22.782 --> 0:41:28.013
1277
+ However, for some type for the embedding layer
1278
+ this is really very inefficient.
1279
+
1280
+ 0:41:28.208 --> 0:41:39.652
1281
+ So because remember we're having this one
1282
+ hot encoding in this input, it's always like
1283
+
1284
+ 0:41:39.652 --> 0:41:42.940
1285
+ one and everything else.
1286
+
1287
+ 0:41:42.809 --> 0:41:47.022
1288
+ It's zero if we're doing this.
1289
+
1290
+ 0:41:47.387 --> 0:41:55.552
1291
+ So therefore you can do at least the forward
1292
+ pass a lot more efficient if you don't really
1293
+
1294
+ 0:41:55.552 --> 0:42:01.833
1295
+ do this calculation, but you can select the
1296
+ one color where there is.
1297
+
1298
+ 0:42:01.743 --> 0:42:07.218
1299
+ Therefore, you also see this is called your
1300
+ word embedding.
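A sketch of why the forward pass can skip the multiplication here: a one-hot vector times the weight matrix just selects one row (NumPy, made-up sizes):

import numpy as np

vocab_size, emb_dim = 10000, 100
E = np.random.randn(vocab_size, emb_dim)     # embedding matrix, one row per word

word_id = 42
one_hot = np.zeros(vocab_size)
one_hot[word_id] = 1.0

assert np.allclose(one_hot @ E, E[word_id])  # same result, no full multiplication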
1301
+
1302
+ 0:42:08.348 --> 0:42:19.542
1303
+ So the weight matrix of the embedding layer
1304
+ is just that in each column you have the embedding
1305
+
1306
+ 0:42:19.542 --> 0:42:20.018
1307
+ of.
1308
+
1309
+ 0:42:20.580 --> 0:42:30.983
1310
+ So this is like how your initial weights look
1311
+ like and how you can interpret or understand.
1312
+
1313
+ 0:42:32.692 --> 0:42:39.509
1314
+ And this is already relatively important because
1315
+ remember this is a huge dimensional thing.
1316
+
1317
+ 0:42:39.435 --> 0:42:46.071
1318
+ So typically here we have the number of words
1319
+ is ten thousand or so, so this is the word
1320
+
1321
+ 0:42:46.071 --> 0:42:51.365
1322
+ embeddings metrics, typically the most expensive
1323
+ to calculate metrics.
1324
+
1325
+ 0:42:51.451 --> 0:42:59.741
1326
+ Because it's the largest one there, we have
1327
+ ten thousand entries, while for the hours we
1328
+
1329
+ 0:42:59.741 --> 0:43:00.393
1330
+ maybe.
1331
+
1332
+ 0:43:00.660 --> 0:43:03.408
1333
+ So therefore the addition to a little bit
1334
+ more to make this.
1335
+
1336
+ 0:43:06.206 --> 0:43:10.538
1337
+ Then you can go where else the calculations
1338
+ are very difficult.
1339
+
1340
+ 0:43:10.830 --> 0:43:20.389
1341
+ So here we then have our network, so we have
1342
+ the word embeddings.
1343
+
1344
+ 0:43:20.244 --> 0:43:29.516
1345
+ We have one hidden there, and then you can
1346
+ look how difficult.
1347
+
1348
+ 0:43:30.270 --> 0:43:38.746
1349
+ Could save a lot of calculation by not really
1350
+ calculating the selection because that is always.
1351
+
1352
+ 0:43:40.600 --> 0:43:46.096
1353
+ The number of calculations you have to do
1354
+ here is so.
1355
+
1356
+ 0:43:45.994 --> 0:43:51.695
1357
+ The length of this layer is minus one type
1358
+ projection.
1359
+
1360
+ 0:43:52.993 --> 0:43:56.321
1361
+ That is a hint size.
1362
+
1363
+ 0:43:56.162 --> 0:44:10.270
1364
+ So the first step of calculation for this
1365
+ metrics modification is how much calculation.
1366
+
1367
+ 0:44:10.730 --> 0:44:18.806
1368
+ Then you have to do some activation function
1369
+ and then you have to do again the calculation.
1370
+
1371
+ 0:44:19.339 --> 0:44:27.994
1372
+ Here we need the vocabulary size because we
1373
+ need to calculate the probability for each
1374
+
1375
+ 0:44:27.994 --> 0:44:29.088
1376
+ next word.
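A rough back-of-the-envelope sketch with assumed sizes, showing why the output layer dominates the cost in this architecture:

context, emb_dim, hidden, vocab = 3, 100, 256, 100000

hidden_cost = context * emb_dim * hidden    # (n-1) * projection size * hidden size
output_cost = hidden * vocab                # hidden size * vocabulary size

print(hidden_cost, output_cost)             # 76800 versus 25600000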
1377
+
1378
+ 0:44:29.889 --> 0:44:42.016
1379
+ And if you look at these numbers, so if you
1380
+ have a projector size of and a vocabulary size
1381
+
1382
+ 0:44:42.016 --> 0:44:53.609
1383
+ of, you see: And that is why there has been
1384
+ especially at the beginning some ideas how
1385
+
1386
+ 0:44:53.609 --> 0:44:55.608
1387
+ we can reduce.
1388
+
1389
+ 0:44:55.956 --> 0:45:01.942
1390
+ And if we really need to calculate all of
1391
+ our capabilities, or if we can calculate only
1392
+
1393
+ 0:45:01.942 --> 0:45:02.350
1394
+ some.
1395
+
1396
+ 0:45:02.582 --> 0:45:10.871
1397
+ And there again the one important thing to
1398
+ think about is for what will use my language
1399
+
1400
+ 0:45:10.871 --> 0:45:11.342
1401
+ model.
1402
+
1403
+ 0:45:11.248 --> 0:45:19.607
1404
+ I can use it for generations and that's what
1405
+ we will see next week in an achiever which
1406
+
1407
+ 0:45:19.607 --> 0:45:22.457
1408
+ really is guiding the search.
1409
+
1410
+ 0:45:23.123 --> 0:45:30.899
1411
+ If it just uses a feature, we do not want
1412
+ to use it for generations, but we want to only
1413
+
1414
+ 0:45:30.899 --> 0:45:32.559
1415
+ know how probable.
1416
+
1417
+ 0:45:32.953 --> 0:45:39.325
1418
+ There we might not be really interested in
1419
+ all the probabilities, but we already know
1420
+
1421
+ 0:45:39.325 --> 0:45:46.217
1422
+ we just want to know the probability of this
1423
+ one word, and then it might be very inefficient
1424
+
1425
+ 0:45:46.217 --> 0:45:49.403
1426
+ to really calculate all the probabilities.
1427
+
1428
+ 0:45:51.231 --> 0:45:52.919
1429
+ And how can you do that so?
1430
+
1431
+ 0:45:52.859 --> 0:45:56.297
1432
+ Initially, for example, the people look into
1433
+ shortlists.
1434
+
1435
+ 0:45:56.756 --> 0:46:02.276
1436
+ So this calculation at the end is really very
1437
+ expensive.
1438
+
1439
+ 0:46:02.179 --> 0:46:05.765
1440
+ So can we make that more efficient.
1441
+
1442
+ 0:46:05.945 --> 0:46:17.375
1443
+ And most words occur very rarely, and maybe
1444
+ we don't need anger, and so there we may want
1445
+
1446
+ 0:46:17.375 --> 0:46:18.645
1447
+ to focus.
1448
+
1449
+ 0:46:19.019 --> 0:46:29.437
1450
+ And so they use the smaller vocabulary, which
1451
+ is maybe.
1452
+
1453
+ 0:46:29.251 --> 0:46:34.581
1454
+ This layer is used from to.
1455
+
1456
+ 0:46:34.391 --> 0:46:37.640
1457
+ Then you merge.
1458
+
1459
+ 0:46:37.937 --> 0:46:45.162
1460
+ So you're taking, if the word is in the shortlist,
1461
+ so in the two thousand most frequent words.
1462
+
1463
+ 0:46:45.825 --> 0:46:58.299
1464
+ Of this short word by some normalization here,
1465
+ and otherwise you take a back of probability
1466
+
1467
+ 0:46:58.299 --> 0:46:59.655
1468
+ from the.
1469
+
1470
+ 0:47:00.020 --> 0:47:04.933
1471
+ It will not be as good, but the idea is okay.
1472
+
1473
+ 0:47:04.826 --> 0:47:13.994
1474
+ Then we don't have to calculate all these
1475
+ probabilities here at the end, but we only
1476
+
1477
+ 0:47:13.994 --> 0:47:16.043
1478
+ have to calculate.
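A sketch of the shortlist idea under these assumptions: a softmax only over the frequent words and a back-off value for the rest; backoff_prob is a hypothetical function, e.g. from an n-gram model.

import math

def shortlist_prob(word, scores, shortlist, backoff_prob):
    if word in shortlist:
        exps = {w: math.exp(scores[w]) for w in shortlist}
        return exps[word] / sum(exps.values())   # normalize only over the shortlist
    return backoff_prob(word)                    # infrequent word: no neural softmax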
1479
+
1480
+ 0:47:19.599 --> 0:47:32.097
1481
+ With some type of cost because it means we
1482
+ don't model the probability of the infrequent
1483
+
1484
+ 0:47:32.097 --> 0:47:39.399
1485
+ words, and maybe it's even very important to
1486
+ model.
1487
+
1488
+ 0:47:39.299 --> 0:47:46.671
1489
+ And one idea is to do what is reported as
1490
+ so so structured out there.
1491
+
1492
+ 0:47:46.606 --> 0:47:49.571
1493
+ Network language models you see some years
1494
+ ago.
1495
+
1496
+ 0:47:49.510 --> 0:47:53.155
1497
+ People were very creative and giving names
1498
+ to new models.
1499
+
1500
+ 0:47:53.813 --> 0:48:00.341
1501
+ And there the idea is that we model the output
1502
+ vocabulary as a cluster tree.
1503
+
1504
+ 0:48:00.680 --> 0:48:06.919
1505
+ So you don't need to model all of our bodies
1506
+ directly, but you are putting words into a
1507
+
1508
+ 0:48:06.919 --> 0:48:08.479
1509
+ sequence of clusters.
1510
+
1511
+ 0:48:08.969 --> 0:48:15.019
1512
+ So maybe a very infrequent word is first
1513
+ in cluster three and then in cluster three.
1514
+
1515
+ 0:48:14.949 --> 0:48:21.212
1516
+ You have subclusters again and there is subclusters
1517
+ seven and subclusters and there is.
1518
+
1519
+ 0:48:21.541 --> 0:48:40.134
1520
+ And this is the path, so that is what was
1521
+ the man in the past.
1522
+
1523
+ 0:48:40.340 --> 0:48:52.080
1524
+ And then you can calculate the probability
1525
+ of the word again just by the product of the
1526
+
1527
+ 0:48:52.080 --> 0:48:55.548
1528
+ class probabilities along this path.
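A toy sketch of this factorization with a one-level class tree and made-up numbers: the word probability is the class probability times the in-class probability.

class_of = {"cat": "animals", "dog": "animals", "car": "things"}
p_class = {"animals": 0.6, "things": 0.4}                   # P(class | history)
p_in_class = {"animals": {"cat": 0.7, "dog": 0.3},          # P(word | class, history)
              "things": {"car": 1.0}}

def word_prob(word):
    c = class_of[word]
    return p_class[c] * p_in_class[c][word]

print(word_prob("dog"))   # 0.6 * 0.3 = 0.18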
1529
+
1530
+ 0:48:57.617 --> 0:49:07.789
1531
+ That it may be more clear where you have this
1532
+ architecture, so this is all the same.
1533
+
1534
+ 0:49:07.670 --> 0:49:13.775
1535
+ But then you first predict here which main
1536
+ class.
1537
+
1538
+ 0:49:14.154 --> 0:49:24.226
1539
+ Then you go to the appropriate subclass, then
1540
+ you calculate the probability of the subclass
1541
+
1542
+ 0:49:24.226 --> 0:49:26.415
1543
+ and maybe the cell.
1544
+
1545
+ 0:49:27.687 --> 0:49:35.419
1546
+ Anybody have an idea why this is more efficient
1547
+ or if you do it first, it looks a lot more.
1548
+
1549
+ 0:49:42.242 --> 0:49:51.788
1550
+ You have to do less calculations, so maybe
1551
+ if you do it here you have to calculate the
1552
+
1553
+ 0:49:51.788 --> 0:49:59.468
1554
+ element there, but you don't have to do all
1555
+ the one hundred thousand.
1556
+
1557
+ 0:49:59.980 --> 0:50:06.115
1558
+ The probabilities in the set classes that
1559
+ you're going through and not for all of them.
1560
+
1561
+ 0:50:06.386 --> 0:50:18.067
1562
+ Therefore, it's more efficient if you don't
1563
+ need all output proficient because you have
1564
+
1565
+ 0:50:18.067 --> 0:50:21.253
1566
+ to calculate the class.
1567
+
1568
+ 0:50:21.501 --> 0:50:28.936
1569
+ So it's only more efficient in scenarios
1570
+ where you really need to use a language model
1571
+
1572
+ 0:50:28.936 --> 0:50:30.034
1573
+ to evaluate.
1574
+
1575
+ 0:50:35.275 --> 0:50:52.456
1576
+ How this works was that you can train first
1577
+ in your language one on the short list.
1578
+
1579
+ 0:50:52.872 --> 0:51:03.547
1580
+ But on the input layer you have your full
1581
+ vocabulary because at the input we saw that
1582
+
1583
+ 0:51:03.547 --> 0:51:06.650
1584
+ this is not complicated.
1585
+
1586
+ 0:51:06.906 --> 0:51:26.638
1587
+ And then you can cluster down all your words
1588
+ here into classes and use that as your classes.
1589
+
1590
+ 0:51:29.249 --> 0:51:34.148
1591
+ That is one idea of doing it.
1592
+
1593
+ 0:51:33.985 --> 0:51:44.930
1594
+ There is also a second idea of doing it, and
1595
+ again we don't need.
1596
+
1597
+ 0:51:45.025 --> 0:51:53.401
1598
+ So sometimes it doesn't really need to be
1599
+ a probability to evaluate.
1600
+
1601
+ 0:51:53.280 --> 0:51:56.562
1602
+ It's only important that.
1603
+
1604
+ 0:51:58.298 --> 0:52:04.989
1605
+ And: Here it's called self normalization what
1606
+ people have done so.
1607
+
1608
+ 0:52:04.889 --> 0:52:11.552
1609
+ We have seen that the probability is in this
1610
+ soft mechanism always to the input divided
1611
+
1612
+ 0:52:11.552 --> 0:52:18.214
1613
+ by our normalization, and the normalization
1614
+ is a summary of the vocabulary to the power
1615
+
1616
+ 0:52:18.214 --> 0:52:19.274
1617
+ of the spell.
1618
+
1619
+ 0:52:19.759 --> 0:52:25.194
1620
+ So this is how we calculate the software.
1621
+
1622
+ 0:52:25.825 --> 0:52:41.179
1623
+ In self normalization of the idea, if this
1624
+ would be zero then we don't need to calculate
1625
+
1626
+ 0:52:41.179 --> 0:52:42.214
1627
+ that.
1628
+
1629
+ 0:52:42.102 --> 0:52:54.272
1630
+ Will be zero, and then you don't even have
1631
+ to calculate the normalization because it's.
1632
+
1633
+ 0:52:54.514 --> 0:53:08.653
1634
+ So how can we achieve that and then the nice
1635
+ thing in your networks?
1636
+
1637
+ 0:53:09.009 --> 0:53:23.928
1638
+ And now we're just adding a second note with
1639
+ some either permitted here.
1640
+
1641
+ 0:53:24.084 --> 0:53:29.551
1642
+ And the second lost just tells us he'll be
1643
+ strained away.
1644
+
1645
+ 0:53:29.457 --> 0:53:31.630
1646
+ The locks at is zero.
1647
+
1648
+ 0:53:32.352 --> 0:53:38.614
1649
+ So then if it's nearly zero at the end we
1650
+ don't need to calculate this and it's also
1651
+
1652
+ 0:53:38.614 --> 0:53:39.793
1653
+ very efficient.
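A sketch of a self-normalized loss under these assumptions: the usual cross-entropy term plus a penalty that pushes the log of the normalization towards zero, weighted by a hypothetical alpha:

import math

def self_normalized_loss(scores, target_index, alpha=0.1):
    log_z = math.log(sum(math.exp(s) for s in scores))
    nll = -(scores[target_index] - log_z)     # normal negative log-likelihood
    return nll + alpha * log_z ** 2           # extra term: keep log Z close to zero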
1654
+
1655
+ 0:53:40.540 --> 0:53:49.498
1656
+ One important thing is this, of course, is
1657
+ only in inference.
1658
+
1659
+ 0:53:49.354 --> 0:54:02.039
1660
+ During tests we don't need to calculate that
1661
+ because: You can do a bit of a hyperparameter
1662
+
1663
+ 0:54:02.039 --> 0:54:14.446
1664
+ here where you do the weighting, so how good
1665
+ should it be estimating the probabilities and
1666
+
1667
+ 0:54:14.446 --> 0:54:16.816
1668
+ how much effort?
1669
+
1670
+ 0:54:18.318 --> 0:54:28.577
1671
+ The only disadvantage is no speed up during
1672
+ training.
1673
+
1674
+ 0:54:28.387 --> 0:54:43.971
1675
+ There are other ways of doing that, for example:
1676
+ Englishman is in case you get it.
1677
+
1678
+ 0:54:44.344 --> 0:54:48.540
1679
+ Then we are coming very, very briefly like
1680
+ just one idea.
1681
+
1682
+ 0:54:48.828 --> 0:54:53.058
1683
+ That there is more things on different types
1684
+ of language models.
1685
+
1686
+ 0:54:52.992 --> 0:54:58.003
1687
+ We are having a very short view on restricted
1688
+ person-based language models.
1689
+
1690
+ 0:54:58.298 --> 0:55:08.931
1691
+ Talk about recurrent neural networks for language
1692
+ models because they have the advantage that
1693
+
1694
+ 0:55:08.931 --> 0:55:17.391
1695
+ we can even further improve by not having a
1696
+ continuous representation on.
1697
+
1698
+ 0:55:18.238 --> 0:55:23.845
1699
+ So there's different types of neural networks.
1700
+
1701
+ 0:55:23.726 --> 0:55:30.171
1702
+ These are these Boltzmann machines and the interesting.
1703
+
1704
+ 0:55:30.330 --> 0:55:38.519
1705
+ They have these: And they define like an energy
1706
+ function on the network, which can be in restricted
1707
+
1708
+ 0:55:38.519 --> 0:55:44.415
1709
+ balsam machines efficiently calculated in general
1710
+ and restricted needs.
1711
+
1712
+ 0:55:44.333 --> 0:55:51.138
1713
+ You only have connection between the input
1714
+ and the hidden layer, but you don't have connections
1715
+
1716
+ 0:55:51.138 --> 0:55:53.123
1717
+ in the input or within the.
1718
+
1719
+ 0:55:53.393 --> 0:56:00.194
1720
+ So you see here you don't have an input output,
1721
+ you just have an input, and you calculate.
1722
+
1723
+ 0:56:00.460 --> 0:56:15.612
1724
+ Which of course nicely fits with the idea
1725
+ we're having, so you can then use this for
1726
+
1727
+ 0:56:15.612 --> 0:56:19.177
1728
+ an N Gram language.
1729
+
1730
+ 0:56:19.259 --> 0:56:25.189
1731
+ Retaining the flexibility of the input by
1732
+ this type of neon networks.
1733
+
1734
+ 0:56:26.406 --> 0:56:30.589
1735
+ And the advantage of this type of model was
1736
+ there's.
1737
+
1738
+ 0:56:30.550 --> 0:56:37.520
1739
+ Very, very fast to integrate it, so that one
1740
+ was the first one which was used during the
1741
+
1742
+ 0:56:37.520 --> 0:56:38.616
1743
+ coding model.
1744
+
1745
+ 0:56:38.938 --> 0:56:45.454
1746
+ The engram language models were that they
1747
+ were very good and gave performance.
1748
+
1749
+ 0:56:45.371 --> 0:56:50.073
1750
+ However, calculation still with all these
1751
+ tricks takes.
1752
+
1753
+ 0:56:50.230 --> 0:56:58.214
1754
+ We have talked about n-best lists, so they
1755
+ generated an n-best list of the most probable
1756
+
1757
+ 0:56:58.214 --> 0:57:05.836
1758
+ outputs and then they took this n-best list and
1759
+ scored each entry with a neural network.
1760
+
1761
+ 0:57:06.146 --> 0:57:09.306
1762
+ A language model, and then only change the
1763
+ order again.
1764
+
1765
+ 0:57:09.250 --> 0:57:10.889
1766
+ Select based on that which.
1767
+
1768
+ 0:57:11.231 --> 0:57:17.187
1769
+ The n-best list is maybe only like hundred
1770
+ entries.
1771
+
1772
+ 0:57:17.083 --> 0:57:21.788
1773
+ When decoding you look at several thousand.
1774
+
1775
+ 0:57:26.186 --> 0:57:35.196
1776
+ Let's look at the context so we have now seen
1777
+ your language models.
1778
+
1779
+ 0:57:35.063 --> 0:57:43.678
1780
+ There is the big advantage we can use this
1781
+ word similarity and.
1782
+
1783
+ 0:57:44.084 --> 0:57:52.266
1784
+ Remember for engram language ones is not always
1785
+ minus one words because sometimes you have
1786
+
1787
+ 0:57:52.266 --> 0:57:59.909
1788
+ to back off or interpolation to lower engrams
1789
+ and you don't know the previous words.
1790
+
1791
+ 0:58:00.760 --> 0:58:04.742
1792
+ And however in neural models we always have
1793
+ all of this importance.
1794
+
1795
+ 0:58:04.684 --> 0:58:05.508
1796
+ Can some of.
1797
+
1798
+ 0:58:07.147 --> 0:58:20.288
1799
+ The disadvantage is that you are still limited
1800
+ in your context, and if you remember the sentence
1801
+
1802
+ 0:58:20.288 --> 0:58:22.998
1803
+ from last lecture,.
1804
+
1805
+ 0:58:22.882 --> 0:58:28.328
1806
+ Sometimes you need more context and there
1807
+ is unlimited context that you might need and
1808
+
1809
+ 0:58:28.328 --> 0:58:34.086
1810
+ you can always create sentences where you may
1811
+ need this five context in order to put a good
1812
+
1813
+ 0:58:34.086 --> 0:58:34.837
1814
+ estimation.
1815
+
1816
+ 0:58:35.315 --> 0:58:44.956
1817
+ Can also do it different in order to understand
1818
+ that it makes sense to view language.
1819
+
1820
+ 0:58:45.445 --> 0:58:58.559
1821
+ Sequence Labeling: So sequence labeling tasks are
1822
+ a very common type of task in language processing
1823
+
1824
+ 0:58:58.559 --> 0:59:03.442
1825
+ where you have the input sequence.
1826
+
1827
+ 0:59:03.323 --> 0:59:05.976
1828
+ So you have one output for each input.
1829
+
1830
+ 0:59:05.908 --> 0:59:12.337
1831
+ Machine translation is not a secret labeling
1832
+ cast because the number of inputs and the number
1833
+
1834
+ 0:59:12.337 --> 0:59:14.046
1835
+ of outputs is different.
1836
+
1837
+ 0:59:13.978 --> 0:59:19.940
1838
+ So you put in a string German which has five
1839
+ words and the output can be: See, for example,
1840
+
1841
+ 0:59:19.940 --> 0:59:24.088
1842
+ you always have the same number and the same
1843
+ number of offices.
1844
+
1845
+ 0:59:24.944 --> 0:59:39.779
1846
+ And you can more language waddling as that,
1847
+ and you just say the label for each word is
1848
+
1849
+ 0:59:39.779 --> 0:59:43.151
1850
+ always the next word.
1851
+
1852
+ 0:59:45.705 --> 0:59:50.312
1853
+ This is the more generous you can think of
1854
+ it.
1855
+
1856
+ 0:59:50.214 --> 0:59:56.195
1857
+ For example, Paddle Speech Taking named Entity
1858
+ Recognition.
1859
+
1860
+ 0:59:58.938 --> 1:00:12.703
1861
+ And if you look at now, this output token
1862
+ and generally sequenced labeling can depend
1863
+
1864
+ 1:00:12.703 --> 1:00:26.788
1865
+ on: The input tokens are the same so we can
1866
+ easily model it and they only depend on the
1867
+
1868
+ 1:00:26.788 --> 1:00:29.028
1869
+ input tokens.
1870
+
1871
+ 1:00:31.011 --> 1:00:42.306
1872
+ But we can always look at one specific type
1873
+ of sequence labeling, unidirectional sequence
1874
+
1875
+ 1:00:42.306 --> 1:00:44.189
1876
+ labeling type.
1877
+
1878
+ 1:00:44.584 --> 1:01:00.855
1879
+ The probability of the next word only depends
1880
+ on the previous words that we are having here.
1881
+
1882
+ 1:01:01.321 --> 1:01:05.998
1883
+ That's also not completely true in language.
1884
+
1885
+ 1:01:05.894 --> 1:01:14.419
1886
+ Well, the back context might also be helpful
1887
+ by direction of the model's Google.
1888
+
1889
+ 1:01:14.654 --> 1:01:23.039
1890
+ We will always model the probability of the
1891
+ word given on its history.
1892
+
1893
+ 1:01:23.623 --> 1:01:30.562
1894
+ And currently there is approximation and sequence
1895
+ labeling that we have this windowing approach.
1896
+
1897
+ 1:01:30.951 --> 1:01:43.016
1898
+ So in order to predict this type of word we
1899
+ always look at the previous three words.
1900
+
1901
+ 1:01:42.874 --> 1:01:48.414
1902
+ This is this type of windowing model.
1903
+
1904
+ 1:01:49.389 --> 1:01:54.780
1905
+ If you're into neural networks you recognize
1906
+ this type of structure.
1907
+
1908
+ 1:01:54.702 --> 1:01:57.517
1909
+ Also, the typical neural networks.
1910
+
1911
+ 1:01:58.938 --> 1:02:11.050
1912
+ Yes, yes, so like n-gram models you can, at
1913
+ least in some way, prepare for that type of
1914
+
1915
+ 1:02:11.050 --> 1:02:12.289
1916
+ context.
1917
+
1918
+ 1:02:14.334 --> 1:02:23.321
1919
+ Are also other types of neonamic structures
1920
+ which we can use for sequence labeling and which
1921
+
1922
+ 1:02:23.321 --> 1:02:30.710
1923
+ might help us where we don't have this type
1924
+ of fixed size representation.
1925
+
1926
+ 1:02:32.812 --> 1:02:34.678
1927
+ That we can do so.
1928
+
1929
+ 1:02:34.580 --> 1:02:39.348
1930
+ The idea in recurrent neural networks is:
1931
+
1932
+ 1:02:39.249 --> 1:02:43.225
1933
+ We are saving complete history in one.
1934
+
1935
+ 1:02:43.623 --> 1:02:56.946
1936
+ So again we have to do this fixed size representation
1937
+ because the neural networks always need a habit.
1938
+
1939
+ 1:02:57.157 --> 1:03:09.028
1940
+ And then the network should look like that,
1941
+ so we start with an initial value for our storage.
1942
+
1943
+ 1:03:08.903 --> 1:03:15.903
1944
+ We are giving our first input and calculating
1945
+ the new.
1946
+
1947
+ 1:03:16.196 --> 1:03:33.972
1948
+ So again in your network with two types of
1949
+ inputs: Then you can apply it to the next type
1950
+
1951
+ 1:03:33.972 --> 1:03:41.676
1952
+ of input and you're again having this.
1953
+
1954
+ 1:03:41.478 --> 1:03:46.395
1955
+ You're taking this hidden state.
1956
+
1957
+ 1:03:47.367 --> 1:03:53.306
1958
+ Nice thing is now that you can do now step
1959
+ by step by step, so all the way over.
1960
+
1961
+ 1:03:55.495 --> 1:04:06.131
1962
+ The nice thing we are having here now is that
1963
+ now we are having context information from
1964
+
1965
+ 1:04:06.131 --> 1:04:07.206
1966
+ all the.
1967
+
1968
+ 1:04:07.607 --> 1:04:14.181
1969
+ So if you're looking like based on which words
1970
+ do you, you calculate the probability of varying.
1971
+
1972
+ 1:04:14.554 --> 1:04:20.090
1973
+ It depends on this part.
1974
+
1975
+ 1:04:19.868 --> 1:04:33.158
1976
+ It depends on and this hidden state was influenced
1977
+ by two.
1978
+
1979
+ 1:04:33.473 --> 1:04:38.259
1980
+ So now we're having something new.
1981
+
1982
+ 1:04:38.122 --> 1:04:46.465
1983
+ We can model like the word probability not
1984
+ only on a fixed.
1985
+
1986
+ 1:04:46.906 --> 1:04:53.565
1987
+ Because the hidden states we are having here
1988
+ in our RNN are influenced by all the previous words.
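A minimal NumPy sketch of the recurrent step described here: the new hidden state mixes the previous hidden state with the current word representation (random weights, illustrative only):

import numpy as np

hidden_dim, emb_dim = 8, 4
W_h = np.random.randn(hidden_dim, hidden_dim) * 0.1
W_x = np.random.randn(hidden_dim, emb_dim) * 0.1

def rnn_step(h_prev, x):
    return np.tanh(W_h @ h_prev + W_x @ x)

h = np.zeros(hidden_dim)
for x in np.random.randn(5, emb_dim):   # five "word" vectors
    h = rnn_step(h, x)                  # h now summarizes the whole history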
1989
+
1990
+ 1:04:56.296 --> 1:05:02.578
1991
+ So how is there to be Singapore?
1992
+
1993
+ 1:05:02.388 --> 1:05:16.289
1994
+ But then we have the initial idea about this
1995
+ P of given on the history.
1996
+
1997
+ 1:05:16.736 --> 1:05:25.300
1998
+ So do not need to do any clustering here,
1999
+ and you also see how things are put together
2000
+
2001
+ 1:05:25.300 --> 1:05:26.284
2002
+ in order.
2003
+
2004
+ 1:05:29.489 --> 1:05:43.449
2005
+ The green box this night since we are starting
2006
+ from the left to the right.
2007
+
2008
+ 1:05:44.524 --> 1:05:51.483
2009
+ Voices: Yes, that's right, so there are clusters,
2010
+ and here is also sometimes clustering happens.
2011
+
2012
+ 1:05:51.871 --> 1:05:58.687
2013
+ The small difference does matter again, so
2014
+ if you have now a lot of different histories,
2015
+
2016
+ 1:05:58.687 --> 1:06:01.674
2017
+ the similarity which you have in here.
2018
+
2019
+ 1:06:01.597 --> 1:06:08.239
2020
+ If two of the histories are very similar,
2021
+ these representations will be the same, and
2022
+
2023
+ 1:06:08.239 --> 1:06:10.787
2024
+ then you're treating them again.
2025
+
2026
+ 1:06:11.071 --> 1:06:15.789
2027
+ Because in order to do the final restriction
2028
+ you only do a good base on the green box.
2029
+
2030
+ 1:06:16.156 --> 1:06:28.541
2031
+ So you are now still learning some type of
2032
+ clustering in there, but you are learning it
2033
+
2034
+ 1:06:28.541 --> 1:06:30.230
2035
+ implicitly.
2036
+
2037
+ 1:06:30.570 --> 1:06:38.200
2038
+ The only restriction you're giving is you
2039
+ have to stall everything that is important
2040
+
2041
+ 1:06:38.200 --> 1:06:39.008
2042
+ in this.
2043
+
2044
+ 1:06:39.359 --> 1:06:54.961
2045
+ So it's a different type of limitation, so
2046
+ you calculate the probability based on the
2047
+
2048
+ 1:06:54.961 --> 1:06:57.138
2049
+ last words.
2050
+
2051
+ 1:06:57.437 --> 1:07:04.430
2052
+ And that is how you still need to somehow
2053
+ cluster things together in order to do efficiently.
2054
+
2055
+ 1:07:04.356 --> 1:07:09.564
2056
+ Of course, you need to do some type of clustering
2057
+ because otherwise.
2058
+
2059
+ 1:07:09.970 --> 1:07:18.865
2060
+ But this is where things get merged together
2061
+ in this type of hidden representation.
2062
+
2063
+ 1:07:18.760 --> 1:07:27.975
2064
+ So here the probability of the word first
2065
+ only depends on this hidden representation.
2066
+
2067
+ 1:07:28.288 --> 1:07:33.104
2068
+ On the previous words, but they are some other
2069
+ bottleneck in order to make a good estimation.
2070
+
2071
+ 1:07:34.474 --> 1:07:41.231
2072
+ So the idea is that we can store all our history
2073
+ into or into one lecture.
2074
+
2075
+ 1:07:41.581 --> 1:07:44.812
2076
+ Which is the one that makes it more strong.
2077
+
2078
+ 1:07:44.739 --> 1:07:51.238
2079
+ Next we come to problems that of course at
2080
+ some point it might be difficult if you have
2081
+
2082
+ 1:07:51.238 --> 1:07:57.811
2083
+ very long sequences and you always write all
2084
+ the information you have on this one block.
2085
+
2086
+ 1:07:58.398 --> 1:08:02.233
2087
+ Then maybe things get overwritten or you cannot
2088
+ store everything in there.
2089
+
2090
+ 1:08:02.662 --> 1:08:04.514
2091
+ So,.
2092
+
2093
+ 1:08:04.184 --> 1:08:09.569
2094
+ Therefore, yet for short things like single
2095
+ sentences that works well, but especially if
2096
+
2097
+ 1:08:09.569 --> 1:08:15.197
2098
+ you think of other tasks and like symbolizations
2099
+ with our document based on T where you need
2100
+
2101
+ 1:08:15.197 --> 1:08:20.582
2102
+ to consider the full document, these things
2103
+ got got a bit more more more complicated and
2104
+
2105
+ 1:08:20.582 --> 1:08:23.063
2106
+ will learn another type of architecture.
2107
+
2108
+ 1:08:24.464 --> 1:08:30.462
2109
+ In order to understand these neighbors, it
2110
+ is good to have all the bus use always.
2111
+
2112
+ 1:08:30.710 --> 1:08:33.998
2113
+ So this is the unrolled view.
2114
+
2115
+ 1:08:33.889 --> 1:08:43.754
2116
+ Somewhere you're over the type or in language
2117
+ over the words you're unrolling a network.
2118
+
2119
+ 1:08:44.024 --> 1:08:52.096
2120
+ Here is the article and here is the network
2121
+ which is connected by itself and that is recurrent.
2122
+
2123
+ 1:08:56.176 --> 1:09:04.982
2124
+ There is one challenge in this networks and
2125
+ training.
2126
+
2127
+ 1:09:04.819 --> 1:09:11.998
2128
+ We can train them first of all as forward.
2129
+
2130
+ 1:09:12.272 --> 1:09:19.397
2131
+ So we don't really know how to train them,
2132
+ but if you unroll them like this is a feet
2133
+
2134
+ 1:09:19.397 --> 1:09:20.142
2135
+ forward.
2136
+
2137
+ 1:09:20.540 --> 1:09:38.063
2138
+ Is exactly the same, so you can measure your
2139
+ arrows here and be back to your arrows.
2140
+
2141
+ 1:09:38.378 --> 1:09:45.646
2142
+ If you unroll something, it's a feature in
2143
+ your laptop and you can train it the same way.
2144
+
2145
+ 1:09:46.106 --> 1:09:57.606
2146
+ The only important thing is again, of course,
2147
+ for different inputs.
2148
+
2149
+ 1:09:57.837 --> 1:10:05.145
2150
+ But since parameters are shared, it's somehow
2151
+ a similar point you can train it.
2152
+
2153
+ 1:10:05.054 --> 1:10:08.802
2154
+ The training algorithm is very similar.
2155
+
2156
+ 1:10:10.310 --> 1:10:29.568
2157
+ One thing which makes things difficult is
2158
+ what is referred to as the vanishing gradient.
2159
+
2160
+ 1:10:29.809 --> 1:10:32.799
2161
+ That's a very strong thing in the motivation
2162
+ of using hardness.
2163
+
2164
+ 1:10:33.593 --> 1:10:44.604
2165
+ The influence here gets smaller and smaller,
2166
+ and the modems are not really able to monitor.
2167
+
2168
+ 1:10:44.804 --> 1:10:51.939
2169
+ Because the gradient gets smaller and smaller,
2170
+ and so the error here propagated to this one
2171
+
2172
+ 1:10:51.939 --> 1:10:58.919
2173
+ that contributes to the error is very small,
2174
+ and therefore you don't do any changes there
2175
+
2176
+ 1:10:58.919 --> 1:10:59.617
2177
+ anymore.
2178
+
2179
+ 1:11:00.020 --> 1:11:06.703
2180
+ And yeah, that's why standard art men are
2181
+ undifficult or have to pick them at custard.
2182
+
2183
+ 1:11:07.247 --> 1:11:11.442
2184
+ Long Short-Term Memory (LSTM): So when everybody is talking
2185
+ about RNNs nowadays,
2186
+
2187
+ 1:11:11.791 --> 1:11:23.333
2188
+ What we are typically meaning are LSTMs or
2189
+ long short-term memories.
2190
+
2191
+ 1:11:23.155 --> 1:11:30.972
2192
+ You see they are by now quite old already.
2193
+
2194
+ 1:11:31.171 --> 1:11:39.019
2195
+ So there was a model in the language model
2196
+ task.
2197
+
2198
+ 1:11:38.858 --> 1:11:44.789
2199
+ It's some more storing information.
2200
+
2201
+ 1:11:44.684 --> 1:11:51.556
2202
+ Because if you only look at the last words,
2203
+ it's often no longer clear this is a question
2204
+
2205
+ 1:11:51.556 --> 1:11:52.548
2206
+ or a normal.
2207
+
2208
+ 1:11:53.013 --> 1:12:05.318
2209
+ So there you have these mechanisms with ripgate
2210
+ in order to store things for a longer time
2211
+
2212
+ 1:12:05.318 --> 1:12:08.563
2213
+ into your hidden state.
2214
+
2215
+ 1:12:10.730 --> 1:12:20.162
2216
+ Here they are used in in in selling quite
2217
+ a lot of works.
2218
+
2219
+ 1:12:21.541 --> 1:12:29.349
2220
+ For especially machine translation now, the
2221
+ standard is to do transformer-based models which
2222
+
2223
+ 1:12:29.349 --> 1:12:30.477
2224
+ we'll learn.
2225
+
2226
+ 1:12:30.690 --> 1:12:38.962
2227
+ But for example, in architecture we have later
2228
+ one lecture about efficiency.
2229
+
2230
+ 1:12:38.855 --> 1:12:42.833
2231
+ So how can we build very efficient?
2232
+
2233
+ 1:12:42.882 --> 1:12:53.074
2234
+ And there in the decoder in parts of the networks
2235
+ they are still using.
2236
+
2237
+ 1:12:53.473 --> 1:12:57.518
2238
+ So it's not that yeah our hands are of no
2239
+ importance in the body.
2240
+
2241
+ 1:12:59.239 --> 1:13:09.810
2242
+ In order to make them strong, there are some
2243
+ more things which are helpful and should be:
2244
+
2245
+ 1:13:09.810 --> 1:13:19.677
2246
+ So one thing is there is a nice trick to make
2247
+ this new network stronger and better.
2248
+
2249
+ 1:13:19.739 --> 1:13:21.523
2250
+ So of course it doesn't work always.
2251
+
2252
+ 1:13:21.475 --> 1:13:23.452
2253
+ They have to have enough training data.
2254
+
2255
+ 1:13:23.763 --> 1:13:28.959
2256
+ But in general there's the easiest way of
2257
+ making your models bigger and stronger just
2258
+
2259
+ 1:13:28.959 --> 1:13:30.590
2260
+ to increase your parameters.
2261
+
2262
+ 1:13:30.630 --> 1:13:43.236
2263
+ And you've seen that with a large language
2264
+ models they are always bragging about.
2265
+
2266
+ 1:13:43.903 --> 1:13:56.463
2267
+ This is one way, so the question is how do
2268
+ you get more parameters?
2269
+
2270
+ 1:13:56.278 --> 1:14:01.273
2271
+ There's ways of doing it.
2272
+
2273
+ 1:14:01.521 --> 1:14:10.029
2274
+ And the other thing is to make your networks
2275
+ deeper so to have more legs in between.
2276
+
2277
+ 1:14:11.471 --> 1:14:13.827
2278
+ And then you can also get to get more calm.
2279
+
2280
+ 1:14:14.614 --> 1:14:23.340
2281
+ There's more traveling with this and it's
2282
+ very similar to what we just saw with our hand.
2283
+
2284
+ 1:14:23.603 --> 1:14:34.253
2285
+ We have this problem of gradient flow that
2286
+ if it flows so fast like a radiant gets very
2287
+
2288
+ 1:14:34.253 --> 1:14:35.477
2289
+ swollen,.
2290
+
2291
+ 1:14:35.795 --> 1:14:42.704
2292
+ Exactly the same thing happens in deep LSTM
2293
+ networks.
2294
+
2295
+ 1:14:42.563 --> 1:14:52.295
2296
+ If you take here the gradient, tell you what
2297
+ is the right or wrong.
2298
+
2299
+ 1:14:52.612 --> 1:14:56.439
2300
+ With three layers it's no problem, but if
2301
+ you're going to ten, twenty or hundred layers.
2302
+
2303
+ 1:14:57.797 --> 1:14:59.698
2304
+ That's Getting Typically Young.
2305
+
2306
+ 1:15:00.060 --> 1:15:07.000
2307
+ What people are doing is using what are called residual
2308
+ connections.
2309
+
2310
+ 1:15:06.880 --> 1:15:15.857
2311
+ That's a very helpful idea, which is maybe
2312
+ very surprising that it works.
2313
+
2314
+ 1:15:15.956 --> 1:15:20.309
2315
+ And so the idea is that these networks.
2316
+
2317
+ 1:15:20.320 --> 1:15:29.982
2318
+ In between should no longer calculate what
2319
+ is a new good representation, but they're more
2320
+
2321
+ 1:15:29.982 --> 1:15:31.378
2322
+ calculating.
2323
+
2324
+ 1:15:31.731 --> 1:15:37.588
2325
+ Therefore, in the end the output
2326
+ of a layer is always added to the input.
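A small PyTorch-style sketch of such a residual connection: the layer only learns the change, and the input is added back to its output.

import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())

    def forward(self, x):
        return x + self.layer(x)   # output = input + learned change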
2327
+
2328
+ 1:15:38.318 --> 1:15:48.824
2329
+ The nice thing is later, if you are doing back propagation,
2330
+ the gradient flows back very fast through this.
2331
+
2332
+ 1:15:49.209 --> 1:16:02.540
2333
+ Nowadays in very deep architectures, not only
2334
+ on other but always has this residual or highway
2335
+
2336
+ 1:16:02.540 --> 1:16:04.224
2337
+ connection.
2338
+
2339
+ 1:16:04.704 --> 1:16:06.616
2340
+ Has two advantages.
2341
+
2342
+ 1:16:06.520 --> 1:16:15.383
2343
+ On the one hand, these layers don't need to
2344
+ learn a representation, they only need to learn
2345
+
2346
+ 1:16:15.383 --> 1:16:18.755
2347
+ how to change the representation.
2348
+
2349
+ 1:16:22.082 --> 1:16:24.172
2350
+ Good.
2351
+
2352
+ 1:16:23.843 --> 1:16:31.768
2353
+ That much for the new map before, so the last
2354
+ thing now means this.
2355
+
2356
+ 1:16:31.671 --> 1:16:33.750
2357
+ Language was are yeah.
2358
+
2359
+ 1:16:33.660 --> 1:16:44.081
2360
+ I were used in the molds itself and now were
2361
+ seeing them again, but one thing which at the
2362
+
2363
+ 1:16:44.081 --> 1:16:55.076
2364
+ beginning they were reading was very essential
2365
+ was: So people really train part of the language
2366
+
2367
+ 1:16:55.076 --> 1:17:00.000
2368
+ models only to get this type of embedding.
2369
+
2370
+ 1:16:59.886 --> 1:17:04.198
2371
+ Therefore, we want to look.
2372
+
2373
+ 1:17:09.229 --> 1:17:15.678
2374
+ So now some last words to the word embeddings.
2375
+
2376
+ 1:17:15.541 --> 1:17:27.205
2377
+ The interesting thing is that word embeddings
2378
+ can be used for very different tasks.
2379
+
2380
+ 1:17:27.347 --> 1:17:31.329
2381
+ The nice thing is you can train that on just
2382
+ large amounts of data.
2383
+
2384
+ 1:17:31.931 --> 1:17:41.569
2385
+ And then if you have these word embeddings
2386
+ we have seen that they reduce the parameters.
2387
+
2388
+ 1:17:41.982 --> 1:17:52.217
2389
+ So then you can train your small model to do
2390
+ any other task and therefore you are more efficient.
2391
+
2392
+ 1:17:52.532 --> 1:17:55.218
2393
+ These initial word embeddings is important.
2394
+
2395
+ 1:17:55.157 --> 1:18:00.492
2396
+ They really depend only on the word itself,
2397
+ so if you look at the two meanings of can,
2398
+
2399
+ 1:18:00.492 --> 1:18:06.318
2400
+ the can of beans or I can do that, they will
2401
+ have the same embedding, so some of the embedding
2402
+
2403
+ 1:18:06.318 --> 1:18:08.709
2404
+ has to save the ambiguity inside that.
2405
+
2406
+ 1:18:09.189 --> 1:18:12.486
2407
+ That cannot be resolved.
2408
+
2409
+ 1:18:12.354 --> 1:18:24.727
2410
+ Therefore, if you look at the higher levels
2411
+ in the context, but in the word embedding layers
2412
+
2413
+ 1:18:24.727 --> 1:18:27.920
2414
+ that really depends on.
2415
+
2416
+ 1:18:29.489 --> 1:18:33.757
2417
+ However, even this one has quite very interesting.
2418
+
2419
+ 1:18:34.034 --> 1:18:39.558
2420
+ So that people like to visualize them.
2421
+
2422
+ 1:18:39.417 --> 1:18:47.211
2423
+ They're always difficult because if you look
2424
+ at this.
2425
+
2426
+ 1:18:47.767 --> 1:18:52.879
2427
+ And drawing your five hundred damage, the
2428
+ vector is still a bit challenging.
2429
+
2430
+ 1:18:53.113 --> 1:19:12.472
2431
+ So you cannot directly do that, so people
2432
+ have to do it like they look at some type of.
2433
+
2434
+ 1:19:13.073 --> 1:19:17.209
2435
+ And of course then yes some information is
2436
+ getting lost by a bunch of control.
2437
+
2438
+ 1:19:18.238 --> 1:19:24.802
2439
+ And you see, for example, this is the most
2440
+ famous and common example, so what you can
2441
+
2442
+ 1:19:24.802 --> 1:19:31.289
2443
+ look is you can look at the difference between
2444
+ the male and the female word in English.
2445
+
2446
+ 1:19:31.213 --> 1:19:37.854
2447
+ This is here in your embedding of king, and
2448
+ this is the embedding of queen, and this.
2449
+
2450
+ 1:19:38.058 --> 1:19:40.394
2451
+ You can do that for very different words.
2452
+
2453
+ 1:19:40.780 --> 1:19:45.407
2454
+ And that is where the masks come into, that
2455
+ is what people then look into.
2456
+
2457
+ 1:19:45.725 --> 1:19:50.995
2458
+ So what you can now, for example, do is you
2459
+ can calculate the difference between man and
2460
+
2461
+ 1:19:50.995 --> 1:19:51.410
2462
+ woman?
2463
+
2464
+ 1:19:52.232 --> 1:19:55.511
2465
+ Then you can take the embedding of tea.
2466
+
2467
+ 1:19:55.429 --> 1:20:02.764
2468
+ You can add on it the difference between man
2469
+ and woman, and then you can notice what are
2470
+
2471
+ 1:20:02.764 --> 1:20:04.330
2472
+ the similar words.
2473
+
2474
+ 1:20:04.248 --> 1:20:08.926
2475
+ So you won't, of course, directly hit the
2476
+ correct word.
2477
+
2478
+ 1:20:08.843 --> 1:20:10.518
2479
+ It's a continuous.
2480
+
2481
+ 1:20:10.790 --> 1:20:23.127
2482
+ But you can look what are the nearest neighbors
2483
+ to this sum, and often these words are near
2484
+
2485
+ 1:20:23.127 --> 1:20:24.056
2486
+ there.
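A toy sketch of this vector arithmetic with made-up two-dimensional embeddings; real embeddings are learned and much higher-dimensional:

import numpy as np

emb = {"king": np.array([0.9, 0.1]), "man": np.array([0.5, 0.0]),
       "woman": np.array([0.5, 0.8]), "queen": np.array([0.9, 0.9])}

query = emb["king"] - emb["man"] + emb["woman"]

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

print(max(emb, key=lambda w: cosine(query, emb[w])))   # queen, with these toy vectors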
2487
+
2488
+ 1:20:24.224 --> 1:20:33.913
2489
+ So it somehow learns that the difference between
2490
+ these words is always the same.
2491
+
2492
+ 1:20:34.374 --> 1:20:37.746
2493
+ You can do that for different things.
2494
+
2495
+ 1:20:37.658 --> 1:20:41.236
2496
+ He also imagines that it's not perfect.
2497
+
2498
+ 1:20:41.146 --> 1:20:49.019
2499
+ He says the world tends to be swimming and
2500
+ swimming, and with walking and walking you.
2501
+
2502
+ 1:20:49.469 --> 1:20:51.639
2503
+ So you can try to use them.
2504
+
2505
+ 1:20:51.561 --> 1:20:58.970
2506
+ It's no longer like saying yeah, but the interesting
2507
+ thing is this is completely unsupervised.
2508
+
2509
+ 1:20:58.892 --> 1:21:03.963
2510
+ So nobody taught it the principle of
2511
+ gender in language.
2512
+
2513
+ 1:21:04.284 --> 1:21:09.910
2514
+ So it's purely trained on the task of doing
2515
+ the next word prediction.
2516
+
2517
+ 1:21:10.230 --> 1:21:20.658
2518
+ And even for really cementing information
2519
+ like the capital, this is the difference between
2520
+
2521
+ 1:21:20.658 --> 1:21:23.638
2522
+ the city and the capital.
2523
+
2524
+ 1:21:23.823 --> 1:21:25.518
2525
+ Visualization.
2526
+
2527
+ 1:21:25.405 --> 1:21:33.768
2528
+ Here we have done the same things of the difference
2529
+ between country and.
2530
+
2531
+ 1:21:33.853 --> 1:21:41.991
2532
+ You see it's not perfect, but it's building
2533
+ something in the right direction, so you can
2534
+
2535
+ 1:21:41.991 --> 1:21:43.347
2536
+ even use them.
2537
+
2538
+ 1:21:43.257 --> 1:21:51.286
2539
+ For example, for question answering, if you
2540
+ have the difference between them, you apply
2541
+
2542
+ 1:21:51.286 --> 1:21:53.384
2543
+ that to a new country.
2544
+
2545
+ 1:21:54.834 --> 1:22:02.741
2546
+ So it seems these ones are able to really
2547
+ learn a lot of information and collapse all
2548
+
2549
+ 1:22:02.741 --> 1:22:04.396
2550
+ this information.
2551
+
2552
+ 1:22:05.325 --> 1:22:12.301
2553
+ At just to do the next word prediction: And
2554
+ that also explains a bit maybe or not explains
2555
+
2556
+ 1:22:12.301 --> 1:22:19.276
2557
+ but rather motivates, what is the main
2558
+ advantage of this type of neural models that
2559
+
2560
+ 1:22:19.276 --> 1:22:26.022
2561
+ we can use this type of hidden representation,
2562
+ transfer them and use them in different tasks.
2563
+
2564
+ 1:22:28.568 --> 1:22:41.948
2565
+ SummarySo to summarize what we did today,
2566
+ what you should hopefully take away is how language models are used
2567
+
2568
+ 1:22:41.948 --> 1:22:45.883
2569
+ for machine translation.
2570
+
2571
+ 1:22:45.805 --> 1:22:49.149
2572
+ Then how we can do language modeling with
2573
+ neural networks.
2574
+
2575
+ 1:22:49.449 --> 1:22:56.046
2576
+ We looked at three different architectures:
2577
+ We looked into the feed-forward language model
2578
+
2579
+ 1:22:56.046 --> 1:22:59.052
2580
+ and the one based on restricted Boltzmann machines.
2581
+
2582
+ 1:22:59.039 --> 1:23:05.366
2583
+ And finally there are different architectures
2584
+ to do neural networks.
2585
+
2586
+ 1:23:05.275 --> 1:23:14.405
2587
+ We have seen feed-forward neural networks and we'll
2588
+ see the next lectures, the last type of architecture.
2589
+
2590
+ 1:23:15.915 --> 1:23:17.412
2591
+ Have Any Questions.
2592
+
2593
+ 1:23:20.680 --> 1:23:27.341
2594
+ Then thanks a lot, and next on Tuesday we
2595
+ will be again in our order to know how to play.
2596
+
demo_data/lectures/Lecture-07-11.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418
3
+ size 150440033
demo_data/lectures/Lecture-07-16.05.2023/English.vtt ADDED
@@ -0,0 +1,2523 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.301 --> 0:00:05.664
4
+ IntroductionOkay, so we're welcome to today's
5
+ lecture.
6
+
7
+ 0:00:06.066 --> 0:00:18.128
8
+ A bit desperate in a small room and I'm sorry
9
+ for the inconvenience.
10
+
11
+ 0:00:17.953 --> 0:00:25.824
12
+ Sometimes there are project meetings where.
13
+
14
+ 0:00:26.806 --> 0:00:40.863
15
+ So what we want to talk today about is want
16
+ to start with neural approaches to machine
17
+
18
+ 0:00:40.863 --> 0:00:42.964
19
+ translation.
20
+
21
+ 0:00:43.123 --> 0:00:55.779
22
+ Guess I've heard about other types of neural
23
+ models for natural language processing.
24
+
25
+ 0:00:55.630 --> 0:00:59.954
26
+ This was some of the first.
27
+
28
+ 0:01:00.600 --> 0:01:06.203
29
+ They are similar to what you know they see
30
+ in as large language models.
31
+
32
+ 0:01:06.666 --> 0:01:14.810
33
+ And we want today look into what are these
34
+ neural language models, how we can build them,
35
+
36
+ 0:01:14.810 --> 0:01:15.986
37
+ what is the.
38
+
39
+ 0:01:16.316 --> 0:01:23.002
40
+ And first we'll show how to use them in statistical
41
+ machine translation.
42
+
43
+ 0:01:22.910 --> 0:01:31.058
44
+ RecapIf you remember weeks ago, we had this
45
+ log-linear model where you can integrate easily.
46
+
47
+ 0:01:31.351 --> 0:01:42.756
48
+ And that was how they first were used, so
49
+ we just had another model that evaluates how
50
+
51
+ 0:01:42.756 --> 0:01:49.180
52
+ good a system is or how good a lot of languages.
53
+
54
+ 0:01:50.690 --> 0:02:04.468
55
+ And next week we will go for a neuromachine
56
+ translation where we replace the whole model
57
+
58
+ 0:02:04.468 --> 0:02:06.481
59
+ by one huge.
60
+
61
+ 0:02:11.211 --> 0:02:18.079
62
+ The main challenge in statistical language
63
+ modelingSo just as a reminder from Tuesday we've
64
+
65
+ 0:02:18.079 --> 0:02:25.101
66
+ seen, the main challenge in language modeling
67
+ was that most of the n-grams we haven't seen.
68
+
69
+ 0:02:26.946 --> 0:02:34.167
70
+ So this was therefore difficult to estimate
71
+ any probability because we've seen that yet
72
+
73
+ 0:02:34.167 --> 0:02:39.501
74
+ normally if you have not seen the N
75
+ gram you will assign a zero probability.
76
+
77
+ 0:02:39.980 --> 0:02:53.385
78
+ However, this is not really very good because
79
+ we don't want to give zero probabilities to
80
+
81
+ 0:02:53.385 --> 0:02:55.023
82
+ sentences.
83
+
84
+ 0:02:55.415 --> 0:03:10.397
85
+ And then we learned a lot of techniques and
86
+ that is the main challenge in statistical language.
87
+
88
+ 0:03:10.241 --> 0:03:15.396
89
+ How we can give somehow a good.
90
+
91
+ 0:03:15.435 --> 0:03:23.835
92
+ And they developed very specific, very good
93
+ techniques to deal with that.
94
+
95
+ 0:03:23.721 --> 0:03:26.904
96
+ However, this is the best.
97
+
98
+ 0:03:28.568 --> 0:03:33.907
99
+ And therefore we can do things different.
100
+
101
+ 0:03:33.780 --> 0:03:44.332
102
+ If we have not seen an N gram before in statistical
103
+ models, we have to have seen.
104
+
105
+ 0:03:45.225 --> 0:03:51.361
106
+ Before, and we can only get information from
107
+ exactly the same word.
108
+
109
+ 0:03:51.411 --> 0:03:57.567
110
+ We don't have an approximate matching like
111
+ that.
112
+
113
+ 0:03:57.441 --> 0:04:10.256
114
+ Maybe it stood together in some way or similar,
115
+ and in a sentence we might generalize the knowledge.
116
+
117
+ 0:04:11.191 --> 0:04:21.227
118
+ Would like to have more something like that
119
+ where n-grams are represented more in a general
120
+
121
+ 0:04:21.227 --> 0:04:21.990
122
+ space.
123
+
124
+ 0:04:22.262 --> 0:04:29.877
125
+ So if you learn something about eyewalk then
126
+ maybe we can use this knowledge and also.
127
+
128
+ 0:04:30.290 --> 0:04:43.034
129
+ And thereby no longer treat all or at least
130
+ a lot of the n-grams as we've done before.
131
+
132
+ 0:04:42.887 --> 0:04:45.242
133
+ We can really.
134
+
135
+ 0:04:47.047 --> 0:04:56.157
136
+ And we maybe want to even do that in a more
137
+ hierarchical approach, but we know okay some
138
+
139
+ 0:04:56.157 --> 0:05:05.268
140
+ words are similar like go and walk is somehow
141
+ similar and and therefore like maybe if we
142
+
143
+ 0:05:05.268 --> 0:05:07.009
144
+ then merge them.
145
+
146
+ 0:05:07.387 --> 0:05:16.104
147
+ If we learn something about walk, then it
148
+ should tell us also something about go or
149
+
150
+ 0:05:16.104 --> 0:05:17.118
151
+ he walks.
152
+
153
+ 0:05:17.197 --> 0:05:18.970
154
+ We see already.
155
+
156
+ 0:05:18.859 --> 0:05:22.207
157
+ It's, of course, not so easy.
158
+
159
+ 0:05:22.095 --> 0:05:31.774
160
+ We see that there is some relations which
161
+ we need to integrate, for example, for you.
162
+
163
+ 0:05:31.661 --> 0:05:35.491
164
+ We need to add the S, but maybe.
165
+
166
+ 0:05:37.137 --> 0:05:42.984
167
+ And luckily there is one really yeah, convincing
168
+ methods in doing that.
169
+
170
+ 0:05:42.963 --> 0:05:47.239
171
+ And that is by using a neural network.
172
+
173
+ 0:05:47.387 --> 0:05:57.618
174
+ That's what we will introduce today so we
175
+ can use this type of neural networks to try
176
+
177
+ 0:05:57.618 --> 0:06:04.042
178
+ to learn this similarity and to learn how some
179
+ words.
180
+
181
+ 0:06:04.324 --> 0:06:13.711
182
+ And that is one of the main advantages that
183
+ we have by switching from the standard statistical
184
+
185
+ 0:06:13.711 --> 0:06:15.193
186
+ models to the.
187
+
188
+ 0:06:15.115 --> 0:06:22.840
189
+ To learn similarities between words and generalized
190
+ and learn what we call hidden representations.
191
+
192
+ 0:06:22.762 --> 0:06:29.708
193
+ So somehow representations of words where
194
+ we can measure similarity in some dimensions.
195
+
196
+ 0:06:30.290 --> 0:06:42.275
197
+ So in representations where as a tubically
198
+ continuous vector or a vector of a fixed size.
199
+
200
+ 0:06:42.822 --> 0:06:52.002
201
+ We had it before and we've seen that the only
202
+ thing we did is we don't want to do.
203
+
204
+ 0:06:52.192 --> 0:06:59.648
205
+ But these indices don't have any meaning,
206
+ so it wasn't that word five is more similar
207
+
208
+ 0:06:59.648 --> 0:07:02.248
209
+ to words twenty than to word.
210
+
211
+ 0:07:02.582 --> 0:07:09.059
212
+ So we couldn't learn anything about words
213
+ in the statistical model.
214
+
215
+ 0:07:08.964 --> 0:07:12.110
216
+ That's a big challenge because.
217
+
218
+ 0:07:12.192 --> 0:07:24.232
219
+ If you think about words even in morphology,
220
+ so go and go is more similar because the person.
221
+
222
+ 0:07:24.264 --> 0:07:36.265
223
+ While the basic models we have up to now,
224
+ they have no idea about that and goes as similar
225
+
226
+ 0:07:36.265 --> 0:07:37.188
227
+ to go.
228
+
229
+ 0:07:39.919 --> 0:07:49.062
230
+ A short introduction to network language modelsSo
231
+ what we want to do today, in order to go to
232
+
233
+ 0:07:49.062 --> 0:07:53.050
234
+ this, we will have a short introduction.
235
+
236
+ 0:07:53.954 --> 0:08:06.667
237
+ It very short just to see how we use them
238
+ here, but that's the good thing that are important
239
+
240
+ 0:08:06.667 --> 0:08:08.445
241
+ for dealing.
242
+
243
+ 0:08:08.928 --> 0:08:14.083
244
+ And then we'll first look into feed-forward
245
+ neural network language models.
246
+
247
+ 0:08:14.454 --> 0:08:21.221
248
+ And there we will still have this approximation
249
+ we had before, then we are looking only at
250
+
251
+ 0:08:21.221 --> 0:08:22.336
252
+ fixed windows.
253
+
254
+ 0:08:22.262 --> 0:08:28.773
255
+ So if you remember we have this classroom
256
+ of language models, and to determine what is
257
+
258
+ 0:08:28.773 --> 0:08:33.788
259
+ the probability of a word, we only look at
260
+ the past and minus one.
261
+
262
+ 0:08:34.154 --> 0:08:36.878
263
+ This is the theory of the case.
264
+
265
+ 0:08:36.793 --> 0:08:43.349
266
+ However, we have the ability and that's why
267
+ they're really better in order.
268
+
269
+ 0:08:44.024 --> 0:08:51.953
270
+ And then at the end we'll look at recurrent
271
+ neural network language models where we then have
272
+
273
+ 0:08:51.953 --> 0:08:53.166
274
+ a different.
275
+
276
+ 0:08:53.093 --> 0:09:01.922
277
+ And thereby it is no longer the case that
278
+ we need to have a fixed history, but in theory
279
+
280
+ 0:09:01.922 --> 0:09:04.303
281
+ we can model arbitrary.
282
+
283
+ 0:09:04.304 --> 0:09:06.854
284
+ And we can log this phenomenon.
285
+
286
+ 0:09:06.774 --> 0:09:12.673
287
+ We talked about a Tuesday where it's not clear
288
+ what type of information.
289
+
290
+ 0:09:16.396 --> 0:09:24.982
291
+ So yeah, generally neural networks are normally
292
+ learned to improve and perform some tasks.
293
+
294
+ 0:09:25.325 --> 0:09:38.934
295
+ We have this structure and we are learning
296
+ them from samples so that is similar to what
297
+
298
+ 0:09:38.934 --> 0:09:42.336
299
+ we had before so now.
300
+
301
+ 0:09:42.642 --> 0:09:49.361
302
+ And is somehow originally motivated by the
303
+ human brain.
304
+
305
+ 0:09:49.241 --> 0:10:00.641
306
+ However, when you now need to know artificial
307
+ neural networks, it's hard to get a similarity.
308
+
309
+ 0:10:00.540 --> 0:10:02.884
310
+ There seems to be not that important.
311
+
312
+ 0:10:03.123 --> 0:10:11.013
313
+ So what they are mainly doing is summation and
314
+ multiplication and then one non-linear activation.
315
+
316
+ 0:10:12.692 --> 0:10:16.078
317
+ So so the basic units are these type of.
318
+
319
+ 0:10:17.937 --> 0:10:29.837
320
+ Perceptron is a basic block which we have
321
+ and this does exactly the processing.
322
+
323
+ 0:10:29.688 --> 0:10:36.088
324
+ We have a fixed number of input features.
325
+
326
+ 0:10:36.096 --> 0:10:39.668
327
+ So we have here numbers six zero to x and
328
+ as input.
329
+
330
+ 0:10:40.060 --> 0:10:48.096
331
+ And this makes language processing difficult
332
+ because we know that it's not the case.
333
+
334
+ 0:10:48.002 --> 0:10:53.109
335
+ If we're dealing with language, it doesn't
336
+ have any.
337
+
338
+ 0:10:54.114 --> 0:10:57.609
339
+ So we have to model this somehow and understand
340
+ how we model this.
341
+
342
+ 0:10:58.198 --> 0:11:03.681
343
+ Then we have the weights, which are the parameters
344
+ and the number of weights exactly the same.
345
+
346
+ 0:11:04.164 --> 0:11:15.069
347
+ Of input features sometimes you have the spires
348
+ in there that always and then it's not really.
349
+
350
+ 0:11:15.195 --> 0:11:19.656
351
+ And what you then do is very simple.
352
+
353
+ 0:11:19.535 --> 0:11:26.168
354
+ It's just the weighted sum, so you
355
+ multiply.
356
+
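As a rough sketch of that weighted sum followed by an activation (not code from the lecture; the weights below are made up):

```python
import math

def perceptron(inputs, weights, bias):
    # Multiply each input with its weight, sum up, then apply a
    # differentiable activation function (here a sigmoid).
    z = sum(x * w for x, w in zip(inputs, weights)) + bias
    return 1.0 / (1.0 + math.exp(-z))

print(perceptron([1.0, 0.0, 2.0], [0.5, -0.3, 0.1], bias=0.1))
```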
357
+ 0:11:26.606 --> 0:11:38.405
358
+ What is then additionally important is we
359
+ have an activation function and it's important
360
+
361
+ 0:11:38.405 --> 0:11:42.514
362
+ that this activation function.
363
+
364
+ 0:11:43.243 --> 0:11:54.088
365
+ And later it will be important that this is
366
+ differentiable because otherwise all the training.
367
+
368
+ 0:11:54.714 --> 0:12:01.471
369
+ This model by itself is not very powerful.
370
+
371
+ 0:12:01.314 --> 0:12:10.430
372
+ We have the XOR problem and with this simple
373
+ perceptron you can't solve it.
374
+
375
+ 0:12:10.710 --> 0:12:15.489
376
+ However, there is a very easy and nice extension.
377
+
378
+ 0:12:15.393 --> 0:12:20.938
379
+ The multi-layer perceptron and things get
380
+ very powerful.
381
+
382
+ 0:12:21.081 --> 0:12:32.953
383
+ The thing is you just connect a lot of these
384
+ in these layers of structures where we have
385
+
386
+ 0:12:32.953 --> 0:12:35.088
387
+ the inputs and.
388
+
389
+ 0:12:35.395 --> 0:12:47.936
390
+ And then we can combine them, or to do them:
391
+ The input layer is of course given by your
392
+
393
+ 0:12:47.936 --> 0:12:51.926
394
+ problem with the dimension.
395
+
396
+ 0:12:51.784 --> 0:13:00.065
397
+ The output layer is also given by your dimension.
398
+
399
+ 0:13:01.621 --> 0:13:08.802
400
+ So let's start with the first question, now
401
+ more language related, and that is how we represent.
402
+
403
+ 0:13:09.149 --> 0:13:19.282
404
+ So we have seen here input to x, but the question
405
+ is now okay.
406
+
407
+ 0:13:19.121 --> 0:13:23.470
408
+ How can we put into this?
409
+
410
+ 0:13:26.866 --> 0:13:34.123
411
+ The first thing that we're able to do is we're
412
+ going to set it in the inspector.
413
+
414
+ 0:13:34.314 --> 0:13:45.651
415
+ Yeah, and that is not that easy because the
416
+ continuous vector will come to that.
417
+
418
+ 0:13:45.511 --> 0:13:46.953
419
+ We can't.
420
+
421
+ 0:13:46.809 --> 0:13:50.420
422
+ We don't want to do it.
423
+
424
+ 0:13:50.630 --> 0:13:57.237
425
+ But if we need to input the word into the
426
+ neural network, it has to be something easily
427
+
428
+ 0:13:57.237 --> 0:13:57.912
429
+ defined.
430
+
431
+ 0:13:59.079 --> 0:14:11.511
432
+ One is the typical thing, the one-hot encoded
433
+ vector, so we have a vector where the dimension
434
+
435
+ 0:14:11.511 --> 0:14:15.306
436
+ is the vocabulary, and then.
437
+
438
+ 0:14:16.316 --> 0:14:25.938
439
+ So the first thing you are ready to see that
440
+ means we are always dealing with fixed.
441
+
442
+ 0:14:26.246 --> 0:14:34.961
443
+ So you cannot easily extend your vocabulary,
444
+ but if you mean your vocabulary would increase
445
+
446
+ 0:14:34.961 --> 0:14:37.992
447
+ the size of this input vector,.
448
+
449
+ 0:14:39.980 --> 0:14:42.423
450
+ That's maybe also motivating.
451
+
452
+ 0:14:42.341 --> 0:14:45.324
453
+ We'll talk about byte pair encoding.
454
+
455
+ 0:14:45.241 --> 0:14:47.233
456
+ That's the nice thing.
457
+
458
+ 0:14:48.048 --> 0:15:01.803
459
+ The big advantage of this one-hot encoding
460
+ is that we don't implement similarity between
461
+
462
+ 0:15:01.803 --> 0:15:06.999
463
+ words, but we're really learning.
464
+
465
+ 0:15:07.227 --> 0:15:11.219
466
+ So you need like to represent any words.
467
+
468
+ 0:15:11.121 --> 0:15:15.895
469
+ You need a dimension of and dimensional vector.
470
+
471
+ 0:15:16.236 --> 0:15:26.480
472
+ Imagine you could eat no binary encoding,
473
+ so you could represent words as binary vectors.
474
+
475
+ 0:15:26.806 --> 0:15:32.348
476
+ So you will be significantly more efficient.
477
+
478
+ 0:15:32.225 --> 0:15:39.124
479
+ However, you have some more digits than other
480
+ numbers.
481
+
482
+ 0:15:39.559 --> 0:15:46.482
483
+ Would somehow be bad because you would force
484
+ the one to do this and it's by hand not clear
485
+
486
+ 0:15:46.482 --> 0:15:47.623
487
+ how to define.
488
+
489
+ 0:15:48.108 --> 0:15:55.135
490
+ So therefore currently this is the most successful
491
+ approach to just do this one-hot encoding.
492
+
493
+ 0:15:55.095 --> 0:15:59.344
494
+ We take a fixed vocabulary.
495
+
496
+ 0:15:59.192 --> 0:16:10.238
497
+ We map each word to the initial and then we
498
+ represent a word like this.
499
+
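A small sketch of this one-hot representation over a fixed vocabulary (the vocabulary below is invented for illustration):

```python
vocab = ["<unk>", "i", "go", "home", "walk"]        # fixed vocabulary
word2idx = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = [0] * len(vocab)                          # dimension = vocabulary size
    vec[word2idx.get(word, word2idx["<unk>"])] = 1  # exactly one entry is one
    return vec

print(one_hot("go"))   # [0, 0, 1, 0, 0]
```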
500
+ 0:16:10.084 --> 0:16:13.314
501
+ The representation.
502
+
503
+ 0:16:14.514 --> 0:16:27.019
504
+ But this dimension here is a secondary size,
505
+ and if you think ten thousand that's quite
506
+
507
+ 0:16:27.019 --> 0:16:33.555
508
+ high, so we're always trying to be efficient.
509
+
510
+ 0:16:33.853 --> 0:16:42.515
511
+ And we are doing the same type of efficiency
512
+ because then we are having a very small one
513
+
514
+ 0:16:42.515 --> 0:16:43.781
515
+ compared to.
516
+
517
+ 0:16:44.104 --> 0:16:53.332
518
+ It can be still a maybe or neurons, but this
519
+ is significantly smaller, of course, as before.
520
+
521
+ 0:16:53.713 --> 0:17:04.751
522
+ So you are learning there this word as you
523
+ said, but you can learn it directly, and there
524
+
525
+ 0:17:04.751 --> 0:17:07.449
526
+ we have similarities.
527
+
528
+ 0:17:07.807 --> 0:17:14.772
529
+ But the nice thing is that this is then learned,
530
+ and we do not need to like hand define.
531
+
532
+ 0:17:17.117 --> 0:17:32.377
533
+ So yes, so that is how we're typically adding
534
+ at least a single word into the language world.
535
+
536
+ 0:17:32.215 --> 0:17:42.390
537
+ Then we can see: So we're seeing that you
538
+ have the one-hot representation always of
539
+
540
+ 0:17:42.390 --> 0:17:44.904
541
+ the same similarity.
542
+
543
+ 0:17:45.105 --> 0:18:00.803
544
+ Then we're having this continuous vector which
545
+ is a lot smaller dimension and that's.
546
+
547
+ 0:18:01.121 --> 0:18:06.984
548
+ What we are doing then is learning these representations
549
+ so that they are best for language modeling.
550
+
551
+ 0:18:07.487 --> 0:18:19.107
552
+ So the representations are implicitly because
553
+ we're training on the language.
554
+
555
+ 0:18:19.479 --> 0:18:30.115
556
+ And the nice thing was found out later is
557
+ these representations are really, really good
558
+
559
+ 0:18:30.115 --> 0:18:32.533
560
+ for a lot of other.
561
+
562
+ 0:18:33.153 --> 0:18:39.729
563
+ And that is why they are now called word embedded
564
+ space themselves, and used for other tasks.
565
+
566
+ 0:18:40.360 --> 0:18:49.827
567
+ And they are somehow describing different
568
+ things so they can describe and semantic similarities.
569
+
570
+ 0:18:49.789 --> 0:18:58.281
571
+ We are looking at the very example of today
572
+ that you can do in this vector space by adding
573
+
574
+ 0:18:58.281 --> 0:19:00.613
575
+ some interesting things.
576
+
577
+ 0:19:00.940 --> 0:19:11.174
578
+ And so they got really was a first big improvement
579
+ when switching to neural staff.
580
+
581
+ 0:19:11.491 --> 0:19:20.736
582
+ They are like part of the model still with
583
+ more complex representation alert, but they
584
+
585
+ 0:19:20.736 --> 0:19:21.267
586
+ are.
587
+
588
+ 0:19:23.683 --> 0:19:34.975
589
+ Then we are having the output layer, and in
590
+ the output layer we also have output structure
591
+
592
+ 0:19:34.975 --> 0:19:36.960
593
+ and activation.
594
+
595
+ 0:19:36.997 --> 0:19:44.784
596
+ That is the language we want to predict, which
597
+ word should be the next.
598
+
599
+ 0:19:44.675 --> 0:19:46.521
600
+ We always have.
601
+
602
+ 0:19:47.247 --> 0:19:56.454
603
+ And that can be done very well with the softmax
604
+ layer, where again the dimension.
605
+
606
+ 0:19:56.376 --> 0:20:03.971
607
+ Is the vocabulary, so this is a vocabulary
608
+ size, and again the case neuro represents the
609
+
610
+ 0:20:03.971 --> 0:20:09.775
611
+ case class, so in our case we have again a
612
+ one-hot representation.
613
+
614
+ 0:20:10.090 --> 0:20:18.929
615
+ Ours is a probability distribution and the
616
+ end is a probability distribution of all works.
617
+
618
+ 0:20:18.832 --> 0:20:27.112
619
+ The case entry tells us: So we need to have
620
+ some of our probability distribution at our
621
+
622
+ 0:20:27.112 --> 0:20:36.144
623
+ output, and in order to achieve that this activation
624
+ function goes, it needs to be that all the
625
+
626
+ 0:20:36.144 --> 0:20:36.990
627
+ outputs.
628
+
629
+ 0:20:37.197 --> 0:20:47.993
630
+ And we can achieve that with a softmax activation
631
+ we take each of the value and then.
632
+
633
+ 0:20:48.288 --> 0:20:58.020
634
+ So by having this type of activation function
635
+ we are really getting that at the end we always.
636
+
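A sketch of that softmax activation, which turns arbitrary scores into positive values that sum to one:

```python
import math

def softmax(scores):
    # Subtracting the maximum is only for numerical stability.
    m = max(scores)
    exps = [math.exp(s - m) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

probs = softmax([2.0, 1.0, 0.1])
print(probs, sum(probs))   # a probability distribution over the output classes
```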
637
+ 0:20:59.019 --> 0:21:12.340
638
+ The beginning was very challenging because
639
+ again we have this inefficient representation
640
+
641
+ 0:21:12.340 --> 0:21:15.184
642
+ of our vocabulary.
643
+
644
+ 0:21:15.235 --> 0:21:27.500
645
+ And then you can imagine escalating over to
646
+ something over a thousand is maybe a bit inefficient
647
+
648
+ 0:21:27.500 --> 0:21:29.776
649
+ with cheap users.
650
+
651
+ 0:21:36.316 --> 0:21:43.664
652
+ And then yeah, for training the models, that
653
+ is how we refine, so we have this architecture
654
+
655
+ 0:21:43.664 --> 0:21:44.063
656
+ now.
657
+
658
+ 0:21:44.264 --> 0:21:52.496
659
+ We need to minimize the error by taking the
660
+ output.
661
+
662
+ 0:21:52.338 --> 0:21:58.200
663
+ We are comparing it to our targets.
664
+
665
+ 0:21:58.298 --> 0:22:07.670
666
+ So one important thing is, of course, how
667
+ can we measure the error?
668
+
669
+ 0:22:07.532 --> 0:22:12.774
670
+ So what if we're training the ideas?
671
+
672
+ 0:22:13.033 --> 0:22:19.770
673
+ And how well when measuring it is in natural
674
+ language processing, typically the cross entropy.
675
+
676
+ 0:22:19.960 --> 0:22:32.847
677
+ That means we are comparing the target with
678
+ the output, so we're taking the value multiplying
679
+
680
+ 0:22:32.847 --> 0:22:35.452
681
+ with the horizons.
682
+
683
+ 0:22:35.335 --> 0:22:43.454
684
+ Which gets optimized and you're seeing that
685
+ this, of course, makes it again very nice and
686
+
687
+ 0:22:43.454 --> 0:22:49.859
688
+ easy because our target, we said, is again
689
+ a one-hot representation.
690
+
691
+ 0:22:50.110 --> 0:23:00.111
692
+ So except for one, all of these are always
693
+ zero, and what we are doing is taking the one.
694
+
695
+ 0:23:00.100 --> 0:23:05.970
696
+ And we only need to multiply the one with
697
+ the logarithm here, and that is all the feedback.
698
+
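Because the target is one-hot, the cross entropy collapses to the negative log probability of the single correct word; a sketch:

```python
import math

def cross_entropy(predicted_probs, target_index):
    # Only the entry of the correct word contributes; all other targets are zero.
    return -math.log(predicted_probs[target_index])

print(cross_entropy([0.1, 0.7, 0.2], target_index=1))   # -log(0.7)
```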
699
+ 0:23:06.946 --> 0:23:14.194
700
+ Of course, this is not always influenced by
701
+ all the others.
702
+
703
+ 0:23:14.073 --> 0:23:17.942
704
+ Why is this influenced by all?
705
+
706
+ 0:23:24.304 --> 0:23:33.554
707
+ Through the softmax activation function, which is
708
+ the current activation divided by the sum of the
709
+
710
+ 0:23:33.554 --> 0:23:34.377
711
+ others.
712
+
713
+ 0:23:34.354 --> 0:23:44.027
714
+ Because otherwise it could of course easily
715
+ just increase this value and ignore the others,
716
+
717
+ 0:23:44.027 --> 0:23:49.074
718
+ but if you increase one value or the other,
719
+ so.
720
+
721
+ 0:23:51.351 --> 0:24:04.433
722
+ And then we can do with neural networks one
723
+ very nice and easy type of training that is
724
+
725
+ 0:24:04.433 --> 0:24:07.779
726
+ done in all the neon.
727
+
728
+ 0:24:07.707 --> 0:24:12.664
729
+ So in which direction does the arrow show?
730
+
731
+ 0:24:12.548 --> 0:24:23.120
732
+ And then if we want to go to a smaller like
733
+ smaller error, that's what we want to achieve.
734
+
735
+ 0:24:23.004 --> 0:24:27.306
736
+ We're trying to minimize our error.
737
+
738
+ 0:24:27.287 --> 0:24:32.875
739
+ And we have to do that, of course, for all
740
+ the weights, and to calculate the error of
741
+
742
+ 0:24:32.875 --> 0:24:36.709
743
+ all the weights we use backpropagation
744
+ here.
745
+
746
+ 0:24:36.644 --> 0:24:41.289
747
+ But what you can do is you can propagate the
748
+ error which you measured.
749
+
750
+ 0:24:41.223 --> 0:24:43.741
751
+ At the end you can propagate it back.
752
+
753
+ 0:24:43.675 --> 0:24:46.394
754
+ That's basic math and basic derivation.
755
+
756
+ 0:24:46.706 --> 0:24:59.557
757
+ Then you can do each weight in your model
758
+ and measure how much it contributes to this
759
+
760
+ 0:24:59.557 --> 0:25:01.350
761
+ individual.
762
+
763
+ 0:25:04.524 --> 0:25:17.712
764
+ To summarize what your machine translation
765
+ should be, to understand all this problem is
766
+
767
+ 0:25:17.712 --> 0:25:20.710
768
+ that this is how a.
769
+
770
+ 0:25:20.580 --> 0:25:23.056
771
+ The nodes are perceptrons.
772
+
773
+ 0:25:22.976 --> 0:25:28.169
774
+ They are fully connected between two layers
775
+ and no connections.
776
+
777
+ 0:25:28.108 --> 0:25:29.759
778
+ Across layers.
779
+
780
+ 0:25:29.829 --> 0:25:35.152
781
+ And what they're doing is always just a weighted
782
+ sum here and then an activation function.
783
+
784
+ 0:25:35.415 --> 0:25:38.794
785
+ And in order to train you have this forward
786
+ and backward pass.
787
+
788
+ 0:25:39.039 --> 0:25:41.384
789
+ So we put in here.
790
+
791
+ 0:25:41.281 --> 0:25:46.540
792
+ Our inputs have some random values at the
793
+ beginning.
794
+
795
+ 0:25:46.441 --> 0:25:49.140
796
+ They calculate the output.
797
+
798
+ 0:25:49.040 --> 0:25:58.631
799
+ We are measuring how big our error is, propagating
800
+ the error back, and then changing our model
801
+
802
+ 0:25:58.631 --> 0:25:59.640
803
+ in a way.
804
+
805
+ 0:26:01.962 --> 0:26:10.408
806
+ How can we use neural networks for language
807
+ modeling?So before we're coming into the neural
808
+
809
+ 0:26:10.408 --> 0:26:17.569
810
+ networks, how can we use this type of neural
811
+ network to do language modeling?
812
+
813
+ 0:26:23.103 --> 0:26:25.520
814
+ So the question is now okay.
815
+
816
+ 0:26:25.437 --> 0:26:32.988
817
+ How can we use them in natural language processing
818
+ and especially in machine translation?
819
+
820
+ 0:26:32.904 --> 0:26:38.443
821
+ The first idea of using them was to estimate
822
+ the language model.
823
+
824
+ 0:26:38.999 --> 0:26:42.599
825
+ So we have seen that the output can be monitored
826
+ here as well.
827
+
828
+ 0:26:43.603 --> 0:26:49.308
829
+ Has a probability distribution, and if we
830
+ have a full vocabulary, we could mainly hear
831
+
832
+ 0:26:49.308 --> 0:26:55.209
833
+ estimate how probable each next word is, and
834
+ then use that in our language model fashion,
835
+
836
+ 0:26:55.209 --> 0:27:02.225
837
+ as we've done it last time, we've got the probability
838
+ of a full sentence as a product of all probabilities
839
+
840
+ 0:27:02.225 --> 0:27:03.208
841
+ of individual.
842
+
843
+ 0:27:04.544 --> 0:27:06.695
844
+ And UM.
845
+
846
+ 0:27:06.446 --> 0:27:09.776
847
+ That was done and in ninety seven years.
848
+
849
+ 0:27:09.695 --> 0:27:17.370
850
+ It's very easy to integrate it into this log-linear
851
+ model, so we have said that this is how the
852
+
853
+ 0:27:17.370 --> 0:27:24.636
854
+ log-linear model looks like, so we're searching
855
+ the best translation, which minimizes each
856
+
857
+ 0:27:24.636 --> 0:27:25.126
858
+ wage.
859
+
860
+ 0:27:25.125 --> 0:27:26.371
861
+ The feature value.
862
+
863
+ 0:27:26.646 --> 0:27:31.642
864
+ We have that with the minimum error training,
865
+ if you can remember when we search for the
866
+
867
+ 0:27:31.642 --> 0:27:32.148
868
+ optimal.
869
+
870
+ 0:27:32.512 --> 0:27:40.927
871
+ We have the phrasetable probabilities, the
872
+ language model, and we can just add here and
873
+
874
+ 0:27:40.927 --> 0:27:41.597
875
+ there.
876
+
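Roughly, integrating the neural language model just means one more weighted feature in the log-linear score; a sketch with invented feature names, values and weights:

```python
def loglinear_score(features, weights):
    # Translation score = weighted sum of the feature values (log probabilities).
    return sum(weights[name] * value for name, value in features.items())

features = {                  # made-up values for one candidate translation
    "phrase_table": -4.2,
    "ngram_lm":     -6.3,
    "neural_lm":    -5.1,     # the additional feature discussed here
}
weights = {"phrase_table": 1.0, "ngram_lm": 0.5, "neural_lm": 0.5}
print(loglinear_score(features, weights))
```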
877
+ 0:27:41.861 --> 0:27:46.077
878
+ So that is quite easy as said.
879
+
880
+ 0:27:45.941 --> 0:27:54.065
881
+ That was how statistical machine translation
882
+ was improved.
883
+
884
+ 0:27:53.927 --> 0:27:57.101
885
+ Add one more feature.
886
+
887
+ 0:27:58.798 --> 0:28:11.220
888
+ So how can we model the language model probability
889
+ with a neural network?
890
+
891
+ 0:28:11.035 --> 0:28:22.438
892
+ So what we have to do is: And the problem
893
+ in generally in the head is that most we haven't
894
+
895
+ 0:28:22.438 --> 0:28:25.070
896
+ seen long sequences.
897
+
898
+ 0:28:25.085 --> 0:28:36.956
899
+ Mostly we have to back off to very short sequences
900
+ and we are working on this discrete space where.
901
+
902
+ 0:28:37.337 --> 0:28:48.199
903
+ So the idea is if we have a neural network we
904
+ can map words into continuous representation
905
+
906
+ 0:28:48.199 --> 0:28:50.152
907
+ and that helps.
908
+
909
+ 0:28:51.091 --> 0:28:59.598
910
+ And the structure then looks like this, so
911
+ this is the basic still feed forward neural
912
+
913
+ 0:28:59.598 --> 0:29:00.478
914
+ network.
915
+
916
+ 0:29:01.361 --> 0:29:10.744
917
+ We are doing this approximation again, so
918
+ we are not putting in all previous words, but
919
+
920
+ 0:29:10.744 --> 0:29:11.376
921
+ it's.
922
+
923
+ 0:29:11.691 --> 0:29:21.525
924
+ And this is done because in a neural network we
925
+ can have only a fixed size of input, so we
926
+
927
+ 0:29:21.525 --> 0:29:31.359
928
+ can: Can only do a fixed set, and they are
929
+ going to be doing exactly the same in minus
930
+
931
+ 0:29:31.359 --> 0:29:31.924
932
+ one.
933
+
934
+ 0:29:33.593 --> 0:29:44.134
935
+ And then we have, for example, three words
936
+ and three different words, which are in these
937
+
938
+ 0:29:44.134 --> 0:29:54.911
939
+ positions: And then we're having the first
940
+ layer of the neural network, which learns words
941
+
942
+ 0:29:54.911 --> 0:29:56.214
943
+ and words.
944
+
945
+ 0:29:57.437 --> 0:30:04.976
946
+ There is one thing which is maybe special
947
+ compared to the standard neural memory.
948
+
949
+ 0:30:05.345 --> 0:30:13.163
950
+ So the representation of this word we want
951
+ to learn first of all position independence,
952
+
953
+ 0:30:13.163 --> 0:30:19.027
954
+ so we just want to learn what is the general
955
+ meaning of the word.
956
+
957
+ 0:30:19.299 --> 0:30:26.244
958
+ Therefore, the representation you get here
959
+ should be the same as if you put it in there.
960
+
961
+ 0:30:27.247 --> 0:30:35.069
962
+ The nice thing is you can achieve that in
963
+ networks the same way you achieve it.
964
+
965
+ 0:30:34.972 --> 0:30:41.720
966
+ This way you're reusing ears so we are forcing
967
+ them to always stay.
968
+
969
+ 0:30:42.322 --> 0:30:49.689
970
+ And that's why you then learn your word embedding,
971
+ which is contextual and independent, so.
972
+
973
+ 0:30:49.909 --> 0:31:05.561
974
+ So the idea is you have the bigram go home
975
+ and you don't want to use the context.
976
+
977
+ 0:31:05.373 --> 0:31:07.654
978
+ First you.
979
+
980
+ 0:31:08.348 --> 0:31:14.155
981
+ That of course it might have a different meaning
982
+ depending on where it stands, but learn that.
983
+
984
+ 0:31:14.514 --> 0:31:19.623
985
+ First, we're learning key representation of
986
+ the words, which is just the representation
987
+
988
+ 0:31:19.623 --> 0:31:20.378
989
+ of the word.
990
+
991
+ 0:31:20.760 --> 0:31:37.428
992
+ So it's also not like normally all input neurons
993
+ are connected to all neurons.
994
+
995
+ 0:31:37.857 --> 0:31:47.209
996
+ This is the first layer of representation,
997
+ and then we have a lot denser representation,
998
+
999
+ 0:31:47.209 --> 0:31:56.666
1000
+ that is, our three word embeddings here, and
1001
+ now we are learning this interaction between
1002
+
1003
+ 0:31:56.666 --> 0:31:57.402
1004
+ words.
1005
+
1006
+ 0:31:57.677 --> 0:32:08.265
1007
+ So now we have at least one connected, fully
1008
+ connected layer here, which takes the three
1009
+
1010
+ 0:32:08.265 --> 0:32:14.213
1011
+ embedded input and then learns the new embedding.
1012
+
1013
+ 0:32:15.535 --> 0:32:27.871
1014
+ And then if you had one of several layers
1015
+ of lining which is your output layer, then.
1016
+
1017
+ 0:32:28.168 --> 0:32:46.222
1018
+ So here the size is a vocabulary size, and
1019
+ then you put as target what is the probability
1020
+
1021
+ 0:32:46.222 --> 0:32:48.228
1022
+ for each.
1023
+
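A compact sketch of such a feed-forward n-gram language model, assuming PyTorch is available; the sizes are arbitrary and the embedding layer is shared across all input positions, as described above:

```python
import torch
import torch.nn as nn

class FeedForwardLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hidden_dim=256, context=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)      # shared for all positions
        self.hidden = nn.Linear(context * emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)      # one score per word

    def forward(self, context_ids):                       # shape: (batch, context)
        e = self.emb(context_ids)                         # (batch, context, emb_dim)
        h = torch.tanh(self.hidden(e.flatten(1)))         # concatenated embeddings
        return torch.log_softmax(self.out(h), dim=-1)     # log P(next word | context)

model = FeedForwardLM(vocab_size=10000)
log_probs = model(torch.randint(0, 10000, (2, 3)))        # two example contexts
print(log_probs.shape)                                    # torch.Size([2, 10000])
```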
1024
+ 0:32:48.688 --> 0:32:56.778
1025
+ The nice thing is that you learn everything
1026
+ together, so you're not learning what is a
1027
+
1028
+ 0:32:56.778 --> 0:32:58.731
1029
+ good representation.
1030
+
1031
+ 0:32:59.079 --> 0:33:12.019
1032
+ When you are training the whole network together,
1033
+ it learns what representation for a word you
1034
+
1035
+ 0:33:12.019 --> 0:33:13.109
1036
+ get in.
1037
+
1038
+ 0:33:15.956 --> 0:33:19.176
1039
+ It's Yeah That Is the Main Idea.
1040
+
1041
+ 0:33:20.660 --> 0:33:32.695
1042
+ Nowadays often referred to as one way of self-supervised
1043
+ learning, why self-supervisory learning?
1044
+
1045
+ 0:33:33.053 --> 0:33:37.120
1046
+ The output is the next word and the input
1047
+ is the previous word.
1048
+
1049
+ 0:33:37.377 --> 0:33:46.778
1050
+ But somehow it's self-supervised because it's
1051
+ not really that we created labels, but we artificially.
1052
+
1053
+ 0:33:46.806 --> 0:34:01.003
1054
+ We just have pure text, and then we created
1055
+ the task.
1056
+
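A sketch of how such training examples can be created from plain text alone, which is why this is called self-supervised; the sentence is a toy example:

```python
def make_examples(tokens, context_size=3):
    # Each example: the previous N-1 words as input, the following word as label.
    return [(tokens[i - context_size:i], tokens[i])
            for i in range(context_size, len(tokens))]

print(make_examples("i go home and he goes home".split()))
# [(['i', 'go', 'home'], 'and'), (['go', 'home', 'and'], 'he'), ...]
```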
1057
+ 0:34:05.905 --> 0:34:12.413
1058
+ Say we have two sentences like go home again.
1059
+
1060
+ 0:34:12.272 --> 0:34:18.783
1061
+ Second one is go to creative again, so both.
1062
+
1063
+ 0:34:18.858 --> 0:34:30.737
1064
+ The starboard bygo and then we have to predict
1065
+ the next four years and my question is: Be
1066
+
1067
+ 0:34:30.737 --> 0:34:40.769
1068
+ modeled this ability as one vector with like
1069
+ probability or possible works.
1070
+
1071
+ 0:34:40.637 --> 0:34:42.746
1072
+ We have musical.
1073
+
1074
+ 0:34:44.044 --> 0:34:56.438
1075
+ You have multiple examples, so you would twice
1076
+ train, once you predict, once you predict,
1077
+
1078
+ 0:34:56.438 --> 0:35:02.359
1079
+ and then, of course, the best performance.
1080
+
1081
+ 0:35:04.564 --> 0:35:11.772
1082
+ A very good point, so you're not aggregating
1083
+ examples beforehand, but you're taking each
1084
+
1085
+ 0:35:11.772 --> 0:35:13.554
1086
+ example individually.
1087
+
1088
+ 0:35:19.259 --> 0:35:33.406
1089
+ So what you do is you simultaneously learn
1090
+ the projection layer which represents this
1091
+
1092
+ 0:35:33.406 --> 0:35:39.163
1093
+ word and the N gram probabilities.
1094
+
1095
+ 0:35:39.499 --> 0:35:48.390
1096
+ And what people then later analyzed is that
1097
+ these representations are very powerful.
1098
+
1099
+ 0:35:48.286 --> 0:35:56.342
1100
+ The task is just a very important task to
1101
+ model like what is the next word.
1102
+
1103
+ 0:35:56.816 --> 0:36:09.429
1104
+ It's a bit motivated by people saying in order
1105
+ to get the meaning of the word you have to
1106
+
1107
+ 0:36:09.429 --> 0:36:10.690
1108
+ look at.
1109
+
1110
+ 0:36:10.790 --> 0:36:18.467
1111
+ If you read the text in there, which you have
1112
+ never seen, you can still estimate the meaning
1113
+
1114
+ 0:36:18.467 --> 0:36:22.264
1115
+ of this word because you know how it is used.
1116
+
1117
+ 0:36:22.602 --> 0:36:26.667
1118
+ Just imagine you read this text about some
1119
+ city.
1120
+
1121
+ 0:36:26.584 --> 0:36:32.476
1122
+ Even if you've never seen the city before
1123
+ heard, you often know from.
1124
+
1125
+ 0:36:34.094 --> 0:36:44.809
1126
+ So what is now the big advantage of using
1127
+ neural networks?
1128
+
1129
+ 0:36:44.628 --> 0:36:56.941
1130
+ Just imagine we have to estimate this: So
1131
+ you have to monitor the probability of ad hip
1132
+
1133
+ 0:36:56.941 --> 0:37:00.300
1134
+ and now imagine iPhone.
1135
+
1136
+ 0:37:00.600 --> 0:37:06.837
1137
+ So all the techniques we have at the last
1138
+ time.
1139
+
1140
+ 0:37:06.707 --> 0:37:14.246
1141
+ At the end, if you haven't seen iPhone, you
1142
+ will always.
1143
+
1144
+ 0:37:15.055 --> 0:37:19.502
1145
+ Because you haven't seen the previous words,
1146
+ so you have no idea how to do that.
1147
+
1148
+ 0:37:19.447 --> 0:37:24.366
1149
+ You won't have seen the bigram, the trigram
1150
+ and all the others, so the probability here
1151
+
1152
+ 0:37:24.366 --> 0:37:27.682
1153
+ will just be based on the probability of ad,
1154
+ so it uses no.
1155
+
1156
+ 0:37:28.588 --> 0:37:38.328
1157
+ If you're having this type of model, what
1158
+ does it do so?
1159
+
1160
+ 0:37:38.157 --> 0:37:43.460
1161
+ This is the last three words.
1162
+
1163
+ 0:37:43.483 --> 0:37:49.837
1164
+ Maybe this representation is messed up because
1165
+ it's mainly on a particular word or source
1166
+
1167
+ 0:37:49.837 --> 0:37:50.260
1168
+ that.
1169
+
1170
+ 0:37:50.730 --> 0:38:00.426
1171
+ Now anyway you have these two information
1172
+ that were two words before was first and therefore:
1173
+
1174
+ 0:38:00.426 --> 0:38:07.234
1175
+ So you have a lot of information here to estimate
1176
+ how good it is.
1177
+
1178
+ 0:38:07.131 --> 0:38:13.293
1179
+ Of course, there could be more information.
1180
+
1181
+ 0:38:13.593 --> 0:38:25.958
1182
+ So all this type of modeling we can do and
1183
+ that we couldn't do beforehand because we always.
1184
+
1185
+ 0:38:27.027 --> 0:38:31.905
1186
+ Don't guess how we do it now.
1187
+
1188
+ 0:38:31.742 --> 0:38:41.826
1189
+ Typically you would have one token for out-of-vocabulary
1190
+ words.
1191
+
1192
+ 0:38:42.602 --> 0:38:45.855
1193
+ All you're doing by carrying coding when it
1194
+ has a fixed dancing.
1195
+
1196
+ 0:38:46.226 --> 0:38:49.439
1197
+ Yeah, you have to do something like that that
1198
+ the opposite way.
1199
+
1200
+ 0:38:50.050 --> 0:38:55.413
1201
+ So yeah, all the vocabulary are by thankcoding
1202
+ where you don't have have all the vocabulary.
1203
+
1204
+ 0:38:55.735 --> 0:39:07.665
1205
+ But then, of course, the byte pair encoding
1206
+ is better with arbitrary context because a
1207
+
1208
+ 0:39:07.665 --> 0:39:11.285
1209
+ problem with byte pair encoding.
1210
+
1211
+ 0:39:17.357 --> 0:39:20.052
1212
+ Anymore questions to the basic same little
1213
+ things.
1214
+
1215
+ 0:39:23.783 --> 0:39:36.162
1216
+ This model we then want to continue is to
1217
+ look into how complex that is or can make things
1218
+
1219
+ 0:39:36.162 --> 0:39:39.155
1220
+ maybe more efficient.
1221
+
1222
+ 0:39:40.580 --> 0:39:47.404
1223
+ At the beginning there was definitely a major
1224
+ challenge.
1225
+
1226
+ 0:39:47.284 --> 0:39:50.431
1227
+ It's still not that easy.
1228
+
1229
+ 0:39:50.310 --> 0:39:58.301
1230
+ All guess follow the talk about their environmental
1231
+ fingerprint.
1232
+
1233
+ 0:39:58.478 --> 0:40:05.686
1234
+ So this calculation is normally heavy, and
1235
+ if you build systems yourself, you have to
1236
+
1237
+ 0:40:05.686 --> 0:40:06.189
1238
+ wait.
1239
+
1240
+ 0:40:06.466 --> 0:40:15.412
1241
+ So it's good to know a bit about how complex
1242
+ things are in order to do a good or efficient.
1243
+
1244
+ 0:40:15.915 --> 0:40:24.706
1245
+ So one thing where most of the calculation
1246
+ really happens is if you're.
1247
+
1248
+ 0:40:25.185 --> 0:40:34.649
1249
+ So in generally all these layers, of course,
1250
+ we're talking about networks and the zones
1251
+
1252
+ 0:40:34.649 --> 0:40:35.402
1253
+ fancy.
1254
+
1255
+ 0:40:35.835 --> 0:40:48.305
1256
+ So what you have to do in order to calculate
1257
+ here these activations, you have this weight.
1258
+
1259
+ 0:40:48.488 --> 0:41:05.021
1260
+ So to make it simple, let's see we have three
1261
+ outputs, and then you just do a matrix multiplication
1262
+
1263
+ 0:41:05.021 --> 0:41:08.493
1264
+ between your weight.
1265
+
1266
+ 0:41:08.969 --> 0:41:19.641
1267
+ That is why the use is so powerful for neural
1268
+ networks because they are very good in doing
1269
+
1270
+ 0:41:19.641 --> 0:41:22.339
1271
+ matrix multiplication.
1272
+
1273
+ 0:41:22.782 --> 0:41:28.017
1274
+ However, for some type of embedding layer
1275
+ this is really very inefficient.
1276
+
1277
+ 0:41:28.208 --> 0:41:37.547
1278
+ So in this input we are doing this calculation.
1279
+
1280
+ 0:41:37.352 --> 0:41:47.085
1281
+ What we are mainly doing is selecting one
1282
+ column.
1283
+
1284
+ 0:41:47.387 --> 0:42:03.570
1285
+ So therefore you can do at least the forward
1286
+ pass a lot more efficient if you don't really
1287
+
1288
+ 0:42:03.570 --> 0:42:07.304
1289
+ do this calculation.
1290
+
1291
+ 0:42:08.348 --> 0:42:20.032
1292
+ So the weight matrix of the first embedding
1293
+ layer is just that in each column you have.
1294
+
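A sketch of why the first layer does not need a full matrix multiplication: with a one-hot input, multiplying by the weight matrix is the same as reading off a single column (the numbers are invented):

```python
W = [                        # hypothetical weight matrix, one column per word
    [0.1, 0.7, 0.3],
    [0.4, 0.2, 0.9],
]
one_hot = [0, 1, 0]          # word number 1

# Full matrix-vector multiplication ...
full = [sum(row[i] * one_hot[i] for i in range(len(one_hot))) for row in W]
# ... gives exactly the same result as simply selecting column 1.
column = [row[1] for row in W]
print(full, column)          # [0.7, 0.2] [0.7, 0.2]
```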
1295
+ 0:42:20.580 --> 0:42:30.990
1296
+ So this is how your initial weights look like
1297
+ and how you can interpret or understand.
1298
+
1299
+ 0:42:32.692 --> 0:42:42.042
1300
+ And this is already relatively important because
1301
+ remember this is a huge dimensional thing,
1302
+
1303
+ 0:42:42.042 --> 0:42:51.392
1304
+ so typically here we have the number of words
1305
+ ten thousand, so this is the word embeddings.
1306
+
1307
+ 0:42:51.451 --> 0:43:00.400
1308
+ Because it's the largest one there, we have
1309
+ entries, while for the others we maybe have.
1310
+
1311
+ 0:43:00.660 --> 0:43:03.402
1312
+ So they are a little bit efficient and are
1313
+ important to make this in.
1314
+
1315
+ 0:43:06.206 --> 0:43:10.529
1316
+ And then you can look at where else the calculations
1317
+ are very difficult.
1318
+
1319
+ 0:43:10.830 --> 0:43:20.294
1320
+ So here we have our individual network, so
1321
+ here are the word embeddings.
1322
+
1323
+ 0:43:20.164 --> 0:43:29.500
1324
+ Then we have one hidden layer, and then you
1325
+ can look at how difficult.
1326
+
1327
+ 0:43:30.270 --> 0:43:42.863
1328
+ We could save a lot of calculations by calculating
1329
+ that by just doing like do the selection because:
1330
+
1331
+ 0:43:42.863 --> 0:43:51.716
1332
+ And then the number of calculations you have
1333
+ to do here is the length.
1334
+
1335
+ 0:43:52.993 --> 0:44:06.206
1336
+ Then we have here the hidden size, that is the
1337
+ hidden size, so the first step of calculation
1338
+
1339
+ 0:44:06.206 --> 0:44:10.260
1340
+ for this metric is an age.
1341
+
1342
+ 0:44:10.730 --> 0:44:20.639
1343
+ Then you have to do some activation function
1344
+ which is this: This is the hidden size hymn
1345
+
1346
+ 0:44:20.639 --> 0:44:29.100
1347
+ because we need the vocabulary socks to calculate
1348
+ the probability for each.
1349
+
1350
+ 0:44:29.889 --> 0:44:40.474
1351
+ And if you look at this number, so if you
1352
+ have a projection sign of one hundred and a
1353
+
1354
+ 0:44:40.474 --> 0:44:45.027
1355
+ vocabulary sign of one hundred, you.
1356
+
1357
+ 0:44:45.425 --> 0:44:53.958
1358
+ And that's why there has been especially at
1359
+ the beginning some ideas on how we can reduce
1360
+
1361
+ 0:44:53.958 --> 0:44:55.570
1362
+ the calculation.
1363
+
1364
+ 0:44:55.956 --> 0:45:02.352
1365
+ And if we really need to calculate all our
1366
+ probabilities, or if we can calculate only some.
1367
+
1368
+ 0:45:02.582 --> 0:45:13.061
1369
+ And there again one important thing to think
1370
+ about is for what you will use my language.
1371
+
1372
+ 0:45:12.943 --> 0:45:21.885
1373
+ One can use it for generations and that's
1374
+ where we will see the next week.
1375
+
1376
+ 0:45:21.766 --> 0:45:22.511
1377
+ And.
1378
+
1379
+ 0:45:23.123 --> 0:45:32.164
1380
+ Initially, if it's just used as a feature,
1381
+ we do not want to use it for generation, but
1382
+
1383
+ 0:45:32.164 --> 0:45:32.575
1384
+ we.
1385
+
1386
+ 0:45:32.953 --> 0:45:41.913
1387
+ And there we might not be interested in all
1388
+ the probabilities, but we already know all
1389
+
1390
+ 0:45:41.913 --> 0:45:49.432
1391
+ the probability of this one word, and then
1392
+ it might be very inefficient.
1393
+
1394
+ 0:45:51.231 --> 0:45:53.638
1395
+ And how can you do that so initially?
1396
+
1397
+ 0:45:53.575 --> 0:45:56.301
1398
+ For example, people look into shortlists.
1399
+
1400
+ 0:45:56.756 --> 0:46:03.321
1401
+ So the idea was this calculation at the end
1402
+ is really very expensive.
1403
+
1404
+ 0:46:03.227 --> 0:46:05.763
1405
+ So can we make that more.
1406
+
1407
+ 0:46:05.945 --> 0:46:17.135
1408
+ And the idea was okay, most words occur
1409
+ very rarely, and some few words occur very,
1410
+
1411
+ 0:46:17.135 --> 0:46:18.644
1412
+ very often.
1413
+
1414
+ 0:46:19.019 --> 0:46:37.644
1415
+ And so they use the smaller imagery, which
1416
+ is maybe very small, and then you merge a new.
1417
+
1418
+ 0:46:37.937 --> 0:46:45.174
1419
+ So you're taking, if the word is in the shortlist,
1420
+ so in the most frequent words.
1421
+
1422
+ 0:46:45.825 --> 0:46:58.287
1423
+ You're taking the probability of this short
1424
+ word by some normalization here, and otherwise
1425
+
1426
+ 0:46:58.287 --> 0:46:59.656
1427
+ you take.
1428
+
1429
+ 0:47:00.020 --> 0:47:00.836
1430
+ Course.
1431
+
1432
+ 0:47:00.734 --> 0:47:09.773
1433
+ It will not be as good, but then we don't
1434
+ have to calculate all the probabilities at the
1435
+
1436
+ 0:47:09.773 --> 0:47:16.038
1437
+ end, but we only have to calculate it for the
1438
+ most frequent.
1439
+
1440
+ 0:47:19.599 --> 0:47:39.477
1441
+ Machines about that, but of course we don't
1442
+ model the probability of the infrequent words.
1443
+
1444
+ 0:47:39.299 --> 0:47:46.658
1445
+ And one idea is to do what is reported as
1446
+ soles for the structure of the layer.
1447
+
1448
+ 0:47:46.606 --> 0:47:53.169
1449
+ You see how some years ago people were very
1450
+ creative in giving names to newer models.
1451
+
1452
+ 0:47:53.813 --> 0:48:00.338
1453
+ And there the idea is that we model the output
1454
+ vocabulary as a cluster tree.
1455
+
1456
+ 0:48:00.680 --> 0:48:08.498
1457
+ So you don't need to mold all of your bodies
1458
+ directly, but you are putting words into.
1459
+
1460
+ 0:48:08.969 --> 0:48:20.623
1461
+ A very intricate word is first in and then
1462
+ in and then in and that is in sub-sub-clusters
1463
+
1464
+ 0:48:20.623 --> 0:48:21.270
1465
+ and.
1466
+
1467
+ 0:48:21.541 --> 0:48:29.936
1468
+ And this is what was mentioned in the past
1469
+ of the work, so these are the subclasses that
1470
+
1471
+ 0:48:29.936 --> 0:48:30.973
1472
+ always go.
1473
+
1474
+ 0:48:30.879 --> 0:48:40.756
1475
+ So if it's in cluster one at the first position
1476
+ then you only look at all the words which are:
1477
+
1478
+ 0:48:40.756 --> 0:48:50.217
1479
+ And then you can calculate the probability
1480
+ of a word again just by the product over these,
1481
+
1482
+ 0:48:50.217 --> 0:48:55.519
1483
+ so the probability of the word is the probability
1484
+ of its class times the probability within the class.
1485
+
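As a sketch of that factorization (all numbers invented): the probability of a word is the probability of its class times the probability of the word within the class, so only small distributions have to be evaluated:

```python
# P(word | history) = P(class | history) * P(word | class, history)
p_class = {"frequent": 0.6, "rare": 0.4}
p_word_given_class = {
    "frequent": {"the": 0.5, "house": 0.3, "is": 0.2},
    "rare":     {"aardvark": 0.7, "zygote": 0.3},
}

def word_prob(word, cls):
    return p_class[cls] * p_word_given_class[cls][word]

print(word_prob("house", "frequent"))   # 0.6 * 0.3 = 0.18
```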
1486
+ 0:48:57.617 --> 0:49:12.331
1487
+ It's maybe more clear where you have the sole
1488
+ architecture, so what you will do is first
1489
+
1490
+ 0:49:12.331 --> 0:49:13.818
1491
+ predict.
1492
+
1493
+ 0:49:14.154 --> 0:49:26.435
1494
+ Then you go to the appropriate sub-class,
1495
+ then you calculate the probability of the sub-class.
1496
+
1497
+ 0:49:27.687 --> 0:49:34.932
1498
+ Anybody have an idea why this is more, more
1499
+ efficient, or if people do it first, it looks
1500
+
1501
+ 0:49:34.932 --> 0:49:35.415
1502
+ more.
1503
+
1504
+ 0:49:42.242 --> 0:49:56.913
1505
+ Yes, so you have to do less calculations,
1506
+ or maybe here you have to calculate the element
1507
+
1508
+ 0:49:56.913 --> 0:49:59.522
1509
+ there, but you.
1510
+
1511
+ 0:49:59.980 --> 0:50:06.116
1512
+ The capabilities in the set classes that you're
1513
+ going through and not for all of them.
1514
+
1515
+ 0:50:06.386 --> 0:50:16.688
1516
+ Therefore, it's only more efficient if you
1517
+ don't need all awkward preferences because
1518
+
1519
+ 0:50:16.688 --> 0:50:21.240
1520
+ you have to even calculate the class.
1521
+
1522
+ 0:50:21.501 --> 0:50:30.040
1523
+ So it's only more efficient in scenarios where
1524
+ you really need to use a language to evaluate.
1525
+
1526
+ 0:50:35.275 --> 0:50:50.164
1527
+ How this works is that on the output layer
1528
+ you only have a vocabulary of: But on the input
1529
+
1530
+ 0:50:50.164 --> 0:51:04.563
1531
+ layer you have always your full vocabulary
1532
+ because at the input we saw that this is not
1533
+
1534
+ 0:51:04.563 --> 0:51:06.690
1535
+ complicated.
1536
+
1537
+ 0:51:06.906 --> 0:51:19.778
1538
+ And then you can cluster down all your words,
1539
+ embedding series of classes, and use that as
1540
+
1541
+ 0:51:19.778 --> 0:51:23.031
1542
+ your classes for that.
1543
+
1544
+ 0:51:22.890 --> 0:51:26.573
1545
+ So yeah, you have words.
1546
+
1547
+ 0:51:29.249 --> 0:51:32.593
1548
+ Is one idea of doing it.
1549
+
1550
+ 0:51:32.459 --> 0:51:44.899
1551
+ There is also a second idea of doing it again,
1552
+ the idea that we don't need the probability.
1553
+
1554
+ 0:51:45.025 --> 0:51:53.401
1555
+ So sometimes it doesn't really need to be
1556
+ a probability to evaluate.
1557
+
1558
+ 0:51:53.280 --> 0:52:05.637
1559
+ It's only important that: And: Here is called
1560
+ self-normalization.
1561
+
1562
+ 0:52:05.450 --> 0:52:19.350
1563
+ What people have done so: in the softmax it
1564
+ is always e to the input divided by the normalization.
1565
+
1566
+ 0:52:19.759 --> 0:52:25.194
1567
+ So this is how we calculate the softmax.
1568
+
1569
+ 0:52:25.825 --> 0:52:42.224
1570
+ And in self-normalization now, the idea is
1571
+ that we don't need to calculate the logarithm.
1572
+
1573
+ 0:52:42.102 --> 0:52:54.284
1574
+ That would be zero, and then you don't even
1575
+ have to calculate the normalization.
1576
+
1577
+ 0:52:54.514 --> 0:53:01.016
1578
+ So how can we achieve that?
1579
+
1580
+ 0:53:00.784 --> 0:53:08.687
1581
+ And then there's the nice thing.
1582
+
1583
+ 0:53:09.009 --> 0:53:14.743
1584
+ And our normal loss aims to maximize the probability.
1585
+
1586
+ 0:53:14.635 --> 0:53:23.833
1587
+ We have this cross entropy loss so that the probability
1588
+ is higher, and now we're just adding.
1589
+
1590
+ 0:53:24.084 --> 0:53:31.617
1591
+ And the second loss just tells us: please
1592
+ train the weights so that the log of Z is zero.
1593
+
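A sketch of that combined training objective: the usual cross entropy plus a penalty that pushes the log of the softmax normalizer towards zero (alpha is the weighting hyperparameter mentioned below; all values are invented):

```python
import math

def self_normalized_loss(scores, target_index, alpha=0.1):
    log_z = math.log(sum(math.exp(s) for s in scores))   # log of the normalizer
    cross_entropy = -(scores[target_index] - log_z)      # the usual loss
    return cross_entropy + alpha * log_z ** 2            # push log Z towards zero

print(self_normalized_loss([2.0, -1.0, 0.5], target_index=0))
```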
1594
+ 0:53:32.352 --> 0:53:38.625
1595
+ So then if it's nearly zero at the end you
1596
+ don't need to calculate this and it's also
1597
+
1598
+ 0:53:38.625 --> 0:53:39.792
1599
+ very efficient.
1600
+
1601
+ 0:53:40.540 --> 0:53:57.335
1602
+ One important thing is this only helps at inference,
1603
+ so during testing we don't need to calculate it.
1604
+
1605
+ 0:54:00.480 --> 0:54:15.006
1606
+ You can do a bit of a hyperparameter here
1607
+ where you do the waiting and how much effort
1608
+
1609
+ 0:54:15.006 --> 0:54:16.843
1610
+ should be.
1611
+
1612
+ 0:54:18.318 --> 0:54:35.037
1613
+ The only disadvantage is that it's no speed
1614
+ up during training and there are other ways
1615
+
1616
+ 0:54:35.037 --> 0:54:37.887
1617
+ of doing that.
1618
+
1619
+ 0:54:41.801 --> 0:54:43.900
1620
+ I'm with you all.
1621
+
1622
+ 0:54:44.344 --> 0:54:48.540
1623
+ Then we are coming very, very briefly like
1624
+ this one here.
1625
+
1626
+ 0:54:48.828 --> 0:54:53.692
1627
+ There are more things on different types of
1628
+ language models.
1629
+
1630
+ 0:54:53.604 --> 0:54:58.028
1631
+ We are having a very short view of restricted Boltzmann machines.
1632
+
1633
+ 0:54:58.298 --> 0:55:09.737
1634
+ And then we'll talk about recurrent neural
1635
+ networks for our language models because they
1636
+
1637
+ 0:55:09.737 --> 0:55:17.407
1638
+ have the advantage that we can even further
1639
+ improve.
1640
+
1641
+ 0:55:18.238 --> 0:55:24.343
1642
+ Different types of neural networks: There's
1643
+ also different types of neural networks.
1644
+
1645
+ 0:55:24.269 --> 0:55:30.178
1646
+ These Boltzmann machines are not having input and output.
1647
+
1648
+ 0:55:30.330 --> 0:55:39.180
1649
+ They have these binary units: And they define
1650
+ an energy function on the network, which can
1651
+
1652
+ 0:55:39.180 --> 0:55:46.864
1653
+ be in respect of bottom machines efficiently
1654
+ calculated, and restricted means:
1655
+
1656
+ 0:55:46.767 --> 0:55:53.149
1657
+ You only have connections between the input
1658
+ and the hidden layer.
1659
+
1660
+ 0:55:53.393 --> 0:56:00.190
1661
+ So you see here you don't have input and output,
1662
+ you just have an input and you calculate what.
1663
+
1664
+ 0:56:00.460 --> 0:56:16.429
1665
+ Which of course nicely fits with the idea
1666
+ we're having, so you can use this for N gram
1667
+
1668
+ 0:56:16.429 --> 0:56:19.182
1669
+ language models.
1670
+
1671
+ 0:56:19.259 --> 0:56:25.187
1672
+ Calculating the probability of the input by
1673
+ this type of neural networks.
1674
+
1675
+ 0:56:26.406 --> 0:56:30.582
1676
+ And the advantage of this type of model, the
1677
+ Boltzmann machine, is that it is:
1678
+
1679
+ 0:56:30.550 --> 0:56:38.629
1680
+ Very fast to integrate it, so that one was
1681
+ the first one which was used during decoding.
1682
+
1683
+ 0:56:38.938 --> 0:56:50.103
1684
+ The problem with the other neural language
1685
+ models is that they were too slow at performing the calculation.
1686
+
1687
+ 0:56:50.230 --> 0:57:00.114
1688
+ So what people typically did is we talked
1689
+ about a best list, so they generated a most
1690
+
1691
+ 0:57:00.114 --> 0:57:05.860
1692
+ probable output, and then they scored each
1693
+ entry.
1694
+
1695
+ 0:57:06.146 --> 0:57:10.884
1696
+ With the language model, and then they only changed
1697
+ the order of the entries based on that score.
1698
+
1699
+ 0:57:11.231 --> 0:57:20.731
1700
+ The n-best list is maybe only a hundred entries,
1701
+ while during decoding you will look at several
1702
+
1703
+ 0:57:20.731 --> 0:57:21.787
1704
+ thousand.
1705
+
1706
+ 0:57:26.186 --> 0:57:40.437
1707
+ This but let's look at the context, so we
1708
+ have now seen neural language models.
1709
+
1710
+ 0:57:40.254 --> 0:57:43.737
1711
+ There is the big question of the context.
1712
+
1713
+ 0:57:44.084 --> 0:57:57.552
1714
+ Remember in n-gram language models the context is not always N minus one words
1715
+ because sometimes you have to back off or interpolate
1716
+
1717
+ 0:57:57.552 --> 0:57:59.953
1718
+ to lower order n-grams.
1719
+
1720
+ 0:58:00.760 --> 0:58:05.504
1721
+ However, in neural models we always have all
1722
+ of these inputs and some of these.
1723
+
1724
+ 0:58:07.147 --> 0:58:21.262
1725
+ The disadvantage is that you are still limited
1726
+ in your context, and if you remember the sentence
1727
+
1728
+ 0:58:21.262 --> 0:58:23.008
1729
+ from last,.
1730
+
1731
+ 0:58:22.882 --> 0:58:28.445
1732
+ Sometimes you need more context and there's
1733
+ unlimited contexts that you might need and
1734
+
1735
+ 0:58:28.445 --> 0:58:34.838
1736
+ you can always create sentences where you need
1737
+ this file context in order to put a good estimation.
1738
+
1739
+ 0:58:35.315 --> 0:58:44.955
1740
+ Can we also do it different in order to better
1741
+ understand that it makes sense to view?
1742
+
1743
+ 0:58:45.445 --> 0:58:56.160
1744
+ Sequence labeling tasks: So sequence labeling
1745
+ tasks are a very common type of task in natural
1746
+
1747
+ 0:58:56.160 --> 0:59:03.418
1748
+ language processing where you have an input
1749
+ sequence and then.
1750
+
1751
+ 0:59:03.323 --> 0:59:08.663
1752
+ An output token, so you have one output for each
1753
+ input, so machine translation is not a sequence
1754
+
1755
+ 0:59:08.663 --> 0:59:14.063
1756
+ labeling task because the number of inputs
1757
+ and the number of outputs is different so you
1758
+
1759
+ 0:59:14.063 --> 0:59:19.099
1760
+ put in a string German which has five words
1761
+ and the output can be six or seven or.
1762
+
1763
+ 0:59:19.619 --> 0:59:20.155
1764
+ In sequence
1765
+
1766
+ 0:59:20.095 --> 0:59:24.084
1767
+ labeling you always have the same number of inputs
1768
+ and the same number of outputs.
1769
+
1770
+ 0:59:24.944 --> 0:59:40.940
1771
+ And you can model language modeling as that,
1772
+ and you just say a label for each word is always
1773
+
1774
+ 0:59:40.940 --> 0:59:43.153
1775
+ a next word.
1776
+
1777
+ 0:59:45.705 --> 0:59:54.823
1778
+ This is the more general you can think of
1779
+ it, for example part-of-speech tagging or named entity
1780
+
1781
+ 0:59:54.823 --> 0:59:56.202
1782
+ recognition.
1783
+
1784
+ 0:59:58.938 --> 1:00:08.081
1785
+ And if you look now at the output tokens, in
1786
+ general sequence labeling they can depend on all input
1787
+
1788
+ 1:00:08.081 --> 1:00:08.912
1789
+ tokens.
1790
+
1791
+ 1:00:09.869 --> 1:00:11.260
1792
+ Nice thing.
1793
+
1794
+ 1:00:11.144 --> 1:00:21.872
1795
+ In our case, the output tokens are the same
1796
+ so we can easily model it that they only depend
1797
+
1798
+ 1:00:21.872 --> 1:00:24.787
1799
+ on all the input tokens.
1800
+
1801
+ 1:00:24.670 --> 1:00:28.988
1802
+ So we have this whether it's or so.
1803
+
1804
+ 1:00:31.011 --> 1:00:42.945
1805
+ But we can also look at a specific
1806
+ type of sequence labeling, unidirectional sequence
1807
+
1808
+ 1:00:42.945 --> 1:00:44.188
1809
+ labeling.
1810
+
1811
+ 1:00:44.584 --> 1:00:58.215
1812
+ And that's exactly what we want for language
1813
+ modeling: the next word only depends on all the previous
1814
+
1815
+ 1:00:58.215 --> 1:01:00.825
1816
+ words that we're.
1817
+
1818
+ 1:01:01.321 --> 1:01:12.899
1819
+ Mean, of course, that's not completely true
1820
+ in language, since the following context might also
1821
+
1822
+ 1:01:12.899 --> 1:01:14.442
1823
+ be helpful.
1824
+
1825
+ 1:01:14.654 --> 1:01:22.468
1826
+ We will model always the probability of a
1827
+ word given on its history, and therefore we
1828
+
1829
+ 1:01:22.468 --> 1:01:23.013
1830
+ need.
1831
+
1832
+ 1:01:23.623 --> 1:01:29.896
1833
+ And currently we did there this approximation
1834
+ in sequence labeling that we have this windowing
1835
+
1836
+ 1:01:29.896 --> 1:01:30.556
1837
+ approach.
1838
+
1839
+ 1:01:30.951 --> 1:01:43.975
1840
+ So in order to predict this type of word we
1841
+ always look at the previous three words and
1842
+
1843
+ 1:01:43.975 --> 1:01:48.416
1844
+ then to do this one we again.
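A minimal sketch of such a windowed feed-forward language model, assuming PyTorch; the vocabulary size, dimensions and window length are illustrative, not taken from the lecture:

```python
# Windowed (n-gram style) feed-forward LM: predict the next word from a fixed context window.
import torch
import torch.nn as nn

class WindowLM(nn.Module):
    def __init__(self, vocab_size=10000, emb_dim=128, window=3, hidden=256):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)        # shared word embeddings
        self.hidden = nn.Linear(window * emb_dim, hidden)   # concatenated context window
        self.out = nn.Linear(hidden, vocab_size)            # scores over the full vocabulary

    def forward(self, context_ids):                         # (batch, window)
        e = self.emb(context_ids).flatten(1)                # (batch, window * emb_dim)
        h = torch.tanh(self.hidden(e))
        return self.out(h)                                  # logits for the next word

logits = WindowLM()(torch.randint(0, 10000, (2, 3)))        # predict word 4 from words 1-3
```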
1845
+
1846
+ 1:01:49.389 --> 1:01:55.137
1847
+ If you are into neural networks you recognize
1848
+ this type of structure.
1849
+
1850
+ 1:01:55.055 --> 1:01:57.522
1851
+ Also are the typical neural.
1852
+
1853
+ 1:01:58.938 --> 1:02:09.688
1854
+ Yes, so this is like the n-gram language model,
1855
+ and at least in some way compared to the original,
1856
+
1857
+ 1:02:09.688 --> 1:02:12.264
1858
+ you're always looking.
1859
+
1860
+ 1:02:14.334 --> 1:02:30.781
1861
+ However, there are also other types of neural
1862
+ network structures which we can use for sequence.
1863
+
1864
+ 1:02:32.812 --> 1:02:34.678
1865
+ That we can do so.
1866
+
1867
+ 1:02:34.580 --> 1:02:39.646
1868
+ The idea is in recurrent neural network structure.
1869
+
1870
+ 1:02:39.547 --> 1:02:43.225
1871
+ We are saving the complete history.
1872
+
1873
+ 1:02:43.623 --> 1:02:55.118
1874
+ So again we have to do like this fix size
1875
+ representation because neural networks always
1876
+
1877
+ 1:02:55.118 --> 1:02:56.947
1878
+ need to have.
1879
+
1880
+ 1:02:57.157 --> 1:03:05.258
1881
+ And then we start with an initial value for
1882
+ our storage.
1883
+
1884
+ 1:03:05.116 --> 1:03:15.919
1885
+ We are giving our first input and then calculating
1886
+ the new representation.
1887
+
1888
+ 1:03:16.196 --> 1:03:26.328
1889
+ If you look at this, it's just again your
1890
+ network with two types of inputs: your word and
1891
+
1892
+ 1:03:26.328 --> 1:03:29.743
1893
+ in your initial hidden state.
1894
+
1895
+ 1:03:30.210 --> 1:03:46.468
1896
+ Then you can apply it to the next type of
1897
+ input and you're again having.
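A minimal sketch of this recurrent update, assuming PyTorch; names and sizes are illustrative:

```python
# The new hidden state is computed from the current input and the previous hidden
# state, and the same parameters are reused at every position of the sequence.
import torch
import torch.nn as nn

emb_dim, hid_dim = 64, 128
W_x = nn.Linear(emb_dim, hid_dim)
W_h = nn.Linear(hid_dim, hid_dim)

def step(x_t, h_prev):
    return torch.tanh(W_x(x_t) + W_h(h_prev))

h = torch.zeros(1, hid_dim)                 # initial value for the "storage"
for x_t in torch.randn(5, 1, emb_dim):      # run over a sequence of 5 word embeddings
    h = step(x_t, h)                        # h now summarizes the whole history so far
```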
1898
+
1899
+ 1:03:47.367 --> 1:03:53.306
1900
+ Nice thing is now that you can do now step
1901
+ by step by step, so all the way over.
1902
+
1903
+ 1:03:55.495 --> 1:04:05.245
1904
+ The nice thing that we are having here now
1905
+ is that we are having context information from
1906
+
1907
+ 1:04:05.245 --> 1:04:07.195
1908
+ all the previous.
1909
+
1910
+ 1:04:07.607 --> 1:04:13.582
1911
+ So if you're looking like based on which words
1912
+ do you use here, calculate your ability of
1913
+
1914
+ 1:04:13.582 --> 1:04:14.180
1915
+ varying.
1916
+
1917
+ 1:04:14.554 --> 1:04:20.128
1918
+ It depends on is based on this path.
1919
+
1920
+ 1:04:19.977 --> 1:04:33.085
1921
+ It depends on and this hidden state was influenced
1922
+ by this one and this hidden state.
1923
+
1924
+ 1:04:33.473 --> 1:04:37.741
1925
+ A new way to model probabilitySo now we're
1926
+ having something new.
1927
+
1928
+ 1:04:37.675 --> 1:04:46.451
1929
+ We can really model the word probability not
1930
+ only on a fixed context.
1931
+
1932
+ 1:04:46.906 --> 1:04:53.570
1933
+ Because the in-states we're having here in
1934
+ our area are influenced by all the trivia.
1935
+
1936
+ 1:04:56.296 --> 1:05:00.909
1937
+ So how is that to mean?
1938
+
1939
+ 1:05:00.717 --> 1:05:16.290
1940
+ If you're not thinking about the history of
1941
+ clustering, we said the clustering.
1942
+
1943
+ 1:05:16.736 --> 1:05:24.261
1944
+ So do not need to do any clustering here,
1945
+ and we also see how things are put together
1946
+
1947
+ 1:05:24.261 --> 1:05:26.273
1948
+ in order to really do.
1949
+
1950
+ 1:05:29.489 --> 1:05:43.433
1951
+ In the green box this way since we are starting
1952
+ from the left point to the right.
1953
+
1954
+ 1:05:44.524 --> 1:05:48.398
1955
+ And that's right, so they're clustered in
1956
+ some parts.
1957
+
1958
+ 1:05:48.326 --> 1:05:58.827
1959
+ Here is some type of clustering happening:
1960
+ It's continuous representations, but a smaller
1961
+
1962
+ 1:05:58.827 --> 1:06:02.677
1963
+ difference doesn't matter again.
1964
+
1965
+ 1:06:02.560 --> 1:06:10.846
1966
+ So if you have a lot of different histories,
1967
+ the similarity.
1968
+
1969
+ 1:06:11.071 --> 1:06:15.791
1970
+ Because in order to do the final restriction
1971
+ you only do it based on the green box.
1972
+
1973
+ 1:06:16.156 --> 1:06:24.284
1974
+ So you are now again still learning some type
1975
+ of clasp.
1976
+
1977
+ 1:06:24.139 --> 1:06:30.238
1978
+ You don't have to do this hard decision.
1979
+
1980
+ 1:06:30.570 --> 1:06:39.013
1981
+ The only restriction you are giving is you
1982
+ have to install everything that is important.
1983
+
1984
+ 1:06:39.359 --> 1:06:54.961
1985
+ So it's a different type of limitation, so
1986
+ you calculate the probability based on the
1987
+
1988
+ 1:06:54.961 --> 1:06:57.138
1989
+ last words.
1990
+
1991
+ 1:06:57.437 --> 1:07:09.645
1992
+ That is how you still need some cluster things
1993
+ in order to do it efficiently.
1994
+
1995
+ 1:07:09.970 --> 1:07:25.311
1996
+ But this is where things get merged together
1997
+ in this type of hidden representation, which
1998
+
1999
+ 1:07:25.311 --> 1:07:28.038
2000
+ is then merged.
2001
+
2002
+ 1:07:28.288 --> 1:07:33.104
2003
+ On the previous words, but they are some other
2004
+ bottleneck in order to make a good estimation.
2005
+
2006
+ 1:07:34.474 --> 1:07:41.242
2007
+ So the idea is that we can store all our history
2008
+ into one lecture.
2009
+
2010
+ 1:07:41.581 --> 1:07:47.351
2011
+ Which is very good and makes it more strong.
2012
+
2013
+ 1:07:47.223 --> 1:07:51.636
2014
+ Next we come to problems of that.
2015
+
2016
+ 1:07:51.507 --> 1:07:57.870
2017
+ Of course, at some point it might be difficult.
2018
+
2019
+ 1:07:58.398 --> 1:08:02.230
2020
+ Then maybe things get all overwritten, or
2021
+ you cannot store everything in there.
2022
+
2023
+ 1:08:02.662 --> 1:08:04.514
2024
+ So,.
2025
+
2026
+ 1:08:04.184 --> 1:08:10.252
2027
+ Therefore, yet for short things like signal
2028
+ sentences that works well, but especially if
2029
+
2030
+ 1:08:10.252 --> 1:08:16.184
2031
+ you think of other tasks like harmonisation
2032
+ where a document based on T where you need
2033
+
2034
+ 1:08:16.184 --> 1:08:22.457
2035
+ to consider a full document, these things got
2036
+ a bit more complicated and we learned another
2037
+
2038
+ 1:08:22.457 --> 1:08:23.071
2039
+ type of.
2040
+
2041
+ 1:08:24.464 --> 1:08:30.455
2042
+ For the further in order to understand these
2043
+ networks, it's good to have both views always.
2044
+
2045
+ 1:08:30.710 --> 1:08:39.426
2046
+ So this is the unroll view, so you have this
2047
+ type of network.
2048
+
2049
+ 1:08:39.285 --> 1:08:47.769
2050
+ Therefore, it can be shown as: We have here
2051
+ the output and here's your network which is
2052
+
2053
+ 1:08:47.769 --> 1:08:52.108
2054
+ connected by itself and that is a recurrent.
2055
+
2056
+ 1:08:56.176 --> 1:09:11.033
2057
+ There is one challenge in these networks and
2058
+ that is the training so the nice thing is train
2059
+
2060
+ 1:09:11.033 --> 1:09:11.991
2061
+ them.
2062
+
2063
+ 1:09:12.272 --> 1:09:20.147
2064
+ So the idea is we don't really know how to
2065
+ train them, but if you unroll them like this,.
2066
+
2067
+ 1:09:20.540 --> 1:09:38.054
2068
+ It's exactly the same so you can measure your
2069
+ arrows and then you propagate your arrows.
2070
+
2071
+ 1:09:38.378 --> 1:09:45.647
2072
+ Now the nice thing is if you unroll something,
2073
+ it's a feet forward and you can train it.
2074
+
2075
+ 1:09:46.106 --> 1:09:56.493
2076
+ The only important thing is, of course, for
2077
+ different inputs you have to take that into
2078
+
2079
+ 1:09:56.493 --> 1:09:57.555
2080
+ account.
2081
+
2082
+ 1:09:57.837 --> 1:10:07.621
2083
+ But since parameters are shared, it's somehow
2084
+ similar and you can train that the training
2085
+
2086
+ 1:10:07.621 --> 1:10:08.817
2087
+ algorithm.
2088
+
2089
+ 1:10:10.310 --> 1:10:16.113
2090
+ One thing which makes things difficult is
2091
+ what is referred to as the vanishing gradient.
2092
+
2093
+ 1:10:16.048 --> 1:10:21.683
2094
+ So we are saying there is a big advantage
2095
+ of these models and that's why we are using
2096
+
2097
+ 1:10:21.683 --> 1:10:22.076
2098
+ that.
2099
+
2100
+ 1:10:22.010 --> 1:10:27.960
2101
+ The output here does not only depend on the
2102
+ current input of a last three but on anything
2103
+
2104
+ 1:10:27.960 --> 1:10:29.415
2105
+ that was said before.
2106
+
2107
+ 1:10:29.809 --> 1:10:32.803
2108
+ That's a very strong thing is the motivation
2109
+ of using RNNs.
2110
+
2111
+ 1:10:33.593 --> 1:10:44.599
2112
+ However, if you're using standard RNNs, the influence
2113
+ here gets smaller and smaller, and the models.
2114
+
2115
+ 1:10:44.804 --> 1:10:55.945
2116
+ Because the gradients get smaller and smaller,
2117
+ and so the error here propagated to this one,
2118
+
2119
+ 1:10:55.945 --> 1:10:59.659
2120
+ this contributes to the error.
2121
+
2122
+ 1:11:00.020 --> 1:11:06.710
2123
+ And yeah, that's why standard R&amp;S are
2124
+ difficult or have to become boosters.
2125
+
2126
+ 1:11:07.247 --> 1:11:11.439
2127
+ How to make neural networks more complex: So
2128
+ if we are talking about RNNs nowadays,
2129
+
2130
+ 1:11:11.791 --> 1:11:19.532
2131
+ What we are typically meaning are long short
2132
+ memories.
2133
+
2134
+ 1:11:19.391 --> 1:11:30.933
2135
+ You see there by now quite old already, but
2136
+ they have special gating mechanisms.
2137
+
2138
+ 1:11:31.171 --> 1:11:41.911
2139
+ So in the language model tasks, for example
2140
+ in some other story information, all this sentence
2141
+
2142
+ 1:11:41.911 --> 1:11:44.737
2143
+ started with a question.
2144
+
2145
+ 1:11:44.684 --> 1:11:51.886
2146
+ Because if you only look at the five last
2147
+ five words, it's often no longer clear as a
2148
+
2149
+ 1:11:51.886 --> 1:11:52.556
2150
+ normal.
2151
+
2152
+ 1:11:53.013 --> 1:12:06.287
2153
+ So there you have these mechanisms with the
2154
+ right gate in order to store things for a longer
2155
+
2156
+ 1:12:06.287 --> 1:12:08.571
2157
+ time into your.
2158
+
2159
+ 1:12:10.730 --> 1:12:20.147
2160
+ Here they are used in, in, in, in selling
2161
+ quite a lot of works.
2162
+
2163
+ 1:12:21.541 --> 1:12:30.487
2164
+ For especially text machine translation now,
2165
+ the standard is to do transformer base models.
2166
+
2167
+ 1:12:30.690 --> 1:12:42.857
2168
+ But for example, this type of in architecture
2169
+ we have later one lecture about efficiency.
2170
+
2171
+ 1:12:42.882 --> 1:12:53.044
2172
+ And there in the decoder and partial networks
2173
+ they are still using our edges because then.
2174
+
2175
+ 1:12:53.473 --> 1:12:57.542
2176
+ So it's not that our ends are of no importance.
2177
+
2178
+ 1:12:59.239 --> 1:13:09.178
2179
+ In order to make them strong, there are some
2180
+ more things which are helpful and should be:
2181
+
2182
+ 1:13:09.178 --> 1:13:19.669
2183
+ So one thing is it's a very easy and nice trick
2184
+ to make this neon network stronger and better.
2185
+
2186
+ 1:13:19.739 --> 1:13:21.619
2187
+ So, of course, it doesn't work always.
2188
+
2189
+ 1:13:21.571 --> 1:13:23.452
2190
+ They have to have enough training to.
2191
+
2192
+ 1:13:23.763 --> 1:13:29.583
2193
+ But in general that is the easiest way of
2194
+ making your model bigger and stronger is to
2195
+
2196
+ 1:13:29.583 --> 1:13:30.598
2197
+ increase your.
2198
+
2199
+ 1:13:30.630 --> 1:13:43.244
2200
+ And you've seen that with a large size model
2201
+ they are always braggling about.
2202
+
2203
+ 1:13:43.903 --> 1:13:53.657
2204
+ This is one way so the question is how do
2205
+ you get more parameters?
2206
+
2207
+ 1:13:53.511 --> 1:14:04.947
2208
+ There's two ways you can make your representations:
2209
+ And the other thing is its octave deep learning,
2210
+
2211
+ 1:14:04.947 --> 1:14:10.043
2212
+ so the other thing is to make your networks.
2213
+
2214
+ 1:14:11.471 --> 1:14:13.831
2215
+ And then you can also get more work off.
2216
+
2217
+ 1:14:14.614 --> 1:14:19.931
2218
+ There's one problem with this and with more
2219
+ deeper networks.
2220
+
2221
+ 1:14:19.844 --> 1:14:23.332
2222
+ It's very similar to what we saw with.
2223
+
2224
+ 1:14:23.603 --> 1:14:34.755
2225
+ With RNNs we have this problem of gradient flow:
2226
+ as it flows back, the gradient gets
2227
+
2228
+ 1:14:34.755 --> 1:14:35.475
2229
+ very.
2230
+
2231
+ 1:14:35.795 --> 1:14:41.114
2232
+ Exactly the same thing happens in deep.
2233
+
2234
+ 1:14:40.981 --> 1:14:52.286
2235
+ If you take the gradient and tell it's the
2236
+ right or wrong, then you're propagating.
2237
+
2238
+ 1:14:52.612 --> 1:14:53.228
2239
+ Three layers.
2240
+
2241
+ 1:14:53.184 --> 1:14:56.440
2242
+ It's no problem, but if you're going to ten,
2243
+ twenty or a hundred layers.
2244
+
2245
+ 1:14:57.797 --> 1:14:59.690
2246
+ That is getting typically a problem.
2247
+
2248
+ 1:15:00.060 --> 1:15:10.659
2249
+ People are doing and they are using what is
2250
+ called residual connections.
2251
+
2252
+ 1:15:10.510 --> 1:15:15.889
2253
+ That's a very helpful idea, which.
2254
+
2255
+ 1:15:15.956 --> 1:15:20.309
2256
+ And so the idea is that these networks.
2257
+
2258
+ 1:15:20.320 --> 1:15:30.694
2259
+ In between should calculate really what is
2260
+ a new representation, but they are calculating
2261
+
2262
+ 1:15:30.694 --> 1:15:31.386
2263
+ what.
2264
+
2265
+ 1:15:31.731 --> 1:15:37.585
2266
+ And therefore in the end the output of a layer
2267
+ is always added to the input.
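A minimal sketch of such a residual connection, assuming PyTorch; the block inside is illustrative:

```python
# The layer only learns what to change; its output is added to its input,
# so the error signal can also flow back directly through the skip path.
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim=256):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, x):
        return x + self.f(x)   # output = input + learned change

y = ResidualBlock()(torch.randn(8, 256))
```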
2268
+
2269
+ 1:15:38.318 --> 1:15:48.824
2270
+ The nice thing is that later, if you are doing
2271
+ back propagation, this gives a very fast path back.
2272
+
2273
+ 1:15:49.209 --> 1:16:01.896
2274
+ So that is what you're seeing nowadays in
2275
+ very deep architectures, not only as others,
2276
+
2277
+ 1:16:01.896 --> 1:16:04.229
2278
+ but you always.
2279
+
2280
+ 1:16:04.704 --> 1:16:07.388
2281
+ Has two advantages.
2282
+
2283
+ 1:16:07.253 --> 1:16:15.264
2284
+ On the one hand, it's more easy to learn a
2285
+ representation.
2286
+
2287
+ 1:16:15.128 --> 1:16:18.799
2288
+ On the other hand, these.
2289
+
2290
+ 1:16:22.082 --> 1:16:24.114
2291
+ Goods.
2292
+
2293
+ 1:16:23.843 --> 1:16:31.763
2294
+ That much for the new record before, so the
2295
+ last thing now means this.
2296
+
2297
+ 1:16:31.671 --> 1:16:36.400
2298
+ Language was used in the molds itself.
2299
+
2300
+ 1:16:36.279 --> 1:16:46.709
2301
+ Now we're seeing them again, but one thing
2302
+ that at the beginning was very essential.
2303
+
2304
+ 1:16:46.967 --> 1:16:57.655
2305
+ So people really trained, in part, the language
2306
+ models only to get this type of embeddings
2307
+
2308
+ 1:16:57.655 --> 1:17:04.166
2309
+ and therefore we want to look a bit more into
2310
+ these.
2311
+
2312
+ 1:17:09.229 --> 1:17:13.456
2313
+ Some last words on the word embeddings.
2314
+
2315
+ 1:17:13.353 --> 1:17:22.080
2316
+ The interesting thing is that word embeddings
2317
+ can be used for very different tasks.
2318
+
2319
+ 1:17:21.976 --> 1:17:27.173
2320
+ The advantage is we can train the word embedded.
2321
+
2322
+ 1:17:27.347 --> 1:17:31.334
2323
+ The nice thing is you can train that on just large
2324
+ amounts of data.
2325
+
2326
+ 1:17:31.931 --> 1:17:40.937
2327
+ And then if you have these word embeddings
2328
+ you don't have a layer of ten thousand any
2329
+
2330
+ 1:17:40.937 --> 1:17:41.566
2331
+ more.
2332
+
2333
+ 1:17:41.982 --> 1:17:52.231
2334
+ So then you can train a small model to do
2335
+ any other tasks and therefore you're more.
2336
+
2337
+ 1:17:52.532 --> 1:17:58.761
2338
+ Initial word embeddings really depend only
2339
+ on the word itself.
2340
+
2341
+ 1:17:58.662 --> 1:18:07.350
2342
+ If you look at the two meanings of can, the
2343
+ can of beans, or can they do that, some of
2344
+
2345
+ 1:18:07.350 --> 1:18:08.748
2346
+ the embedded.
2347
+
2348
+ 1:18:09.189 --> 1:18:12.395
2349
+ That cannot be resolved.
2350
+
2351
+ 1:18:12.267 --> 1:18:23.907
2352
+ Therefore, you need to know the context, and
2353
+ if you look at the higher levels that people
2354
+
2355
+ 1:18:23.907 --> 1:18:27.917
2356
+ are doing in the context, but.
2357
+
2358
+ 1:18:29.489 --> 1:18:33.757
2359
+ However, even this one has quite very interesting.
2360
+
2361
+ 1:18:34.034 --> 1:18:44.644
2362
+ So people like to visualize that they're always
2363
+ a bit difficult because if you look at this
2364
+
2365
+ 1:18:44.644 --> 1:18:47.182
2366
+ word, vector or word.
2367
+
2368
+ 1:18:47.767 --> 1:18:52.879
2369
+ And drawing your five hundred dimensional
2370
+ vector is still a bit challenging.
2371
+
2372
+ 1:18:53.113 --> 1:19:12.464
2373
+ So you cannot directly do that, so what people
2374
+ have to do is learn some type of dimensionality reduction.
2375
+
2376
+ 1:19:13.073 --> 1:19:17.216
2377
+ And of course then yes some information gets
2378
+ lost but you can try it.
2379
+
2380
+ 1:19:18.238 --> 1:19:28.122
2381
+ And you see, for example, this is the most
2382
+ famous and common example, so what you can
2383
+
2384
+ 1:19:28.122 --> 1:19:37.892
2385
+ look is you can look at the difference between
2386
+ the male and the female word English.
2387
+
2388
+ 1:19:38.058 --> 1:19:40.389
2389
+ And you can do that for very different words.
2390
+
2391
+ 1:19:40.780 --> 1:19:45.403
2392
+ And that is where, where the masks come into
2393
+ that, what people then look into.
2394
+
2395
+ 1:19:45.725 --> 1:19:50.995
2396
+ So what you can now, for example, do is you
2397
+ can calculate the difference between man and
2398
+
2399
+ 1:19:50.995 --> 1:19:51.410
2400
+ woman.
2401
+
2402
+ 1:19:52.232 --> 1:19:56.356
2403
+ And what you can do then you can take the
2404
+ embedding of king.
2405
+
2406
+ 1:19:56.290 --> 1:20:02.341
2407
+ You can add on it the difference between men
2408
+ and women and where people get really excited.
2409
+
2410
+ 1:20:02.275 --> 1:20:05.524
2411
+ Then you can look at what are the similar
2412
+ words.
2413
+
2414
+ 1:20:05.457 --> 1:20:09.220
2415
+ So you won't, of course, directly hit the
2416
+ correct word.
2417
+
2418
+ 1:20:09.153 --> 1:20:10.501
2419
+ It's a continuous.
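A small sketch of this embedding arithmetic with cosine nearest neighbours; the random vectors below are placeholders for real trained embeddings:

```python
# king - man + woman, then look at the nearest neighbours of the resulting vector.
import numpy as np

def nearest(query, emb, topk=3):
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
    return sorted(emb, key=lambda w: -cos(query, emb[w]))[:topk]

rng = np.random.default_rng(0)
emb = {w: rng.normal(size=50) for w in ["king", "man", "woman", "queen", "berlin"]}
query = emb["king"] - emb["man"] + emb["woman"]
print(nearest(query, emb))   # with real embeddings, "queen" is typically among the neighbours
```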
2420
+
2421
+ 1:20:10.790 --> 1:20:24.062
2422
+ But you can look at what are the nearest neighbors
2423
+ to the same, and often these words are near.
2424
+
2425
+ 1:20:24.224 --> 1:20:33.911
2426
+ So it's somehow weird that the difference
2427
+ between these words is always the same.
2428
+
2429
+ 1:20:34.374 --> 1:20:37.308
2430
+ Can do different things.
2431
+
2432
+ 1:20:37.191 --> 1:20:47.506
2433
+ You can also imagine that for word tenses,
2434
+ swimming relates to swim, and with walking and
2435
+
2436
+ 1:20:47.506 --> 1:20:49.047
2437
+ walking you.
2438
+
2439
+ 1:20:49.469 --> 1:20:53.040
2440
+ So you can try to use him.
2441
+
2442
+ 1:20:52.907 --> 1:20:56.254
2443
+ It's no longer like say.
2444
+
2445
+ 1:20:56.120 --> 1:21:04.020
2446
+ The interesting thing is nobody taught the model
2447
+ the principle.
2448
+
2449
+ 1:21:04.284 --> 1:21:09.910
2450
+ So it's purely trained on the task of doing
2451
+ the next word prediction.
2452
+
2453
+ 1:21:10.230 --> 1:21:23.669
2454
+ And even for some information like the capital,
2455
+ this is the difference between the capital.
2456
+
2457
+ 1:21:23.823 --> 1:21:33.760
2458
+ Is another visualization here where you have
2459
+ done the same things on the difference between.
2460
+
2461
+ 1:21:33.853 --> 1:21:41.342
2462
+ And you see it's not perfect, but it's pointing
2463
+ in the right direction, so you can even use that for
2464
+
2465
+ 1:21:41.342 --> 1:21:42.936
2466
+ question answering.
2467
+
2468
+ 1:21:42.856 --> 1:21:50.322
2469
+ If you know three countries and their capitals,
2470
+ you can do what is the difference between them.
2471
+
2472
+ 1:21:50.242 --> 1:21:53.375
2473
+ You apply that to a new country, and.
2474
+
2475
+ 1:21:54.834 --> 1:22:02.280
2476
+ So these models are able to really learn a
2477
+ lot of information and collapse this information
2478
+
2479
+ 1:22:02.280 --> 1:22:04.385
2480
+ into this representation.
2481
+
2482
+ 1:22:05.325 --> 1:22:07.679
2483
+ And this just by doing next word prediction.
2484
+
2485
+ 1:22:07.707 --> 1:22:22.358
2486
+ And that also explains a bit maybe or explains
2487
+ strongly, but motivates what is the main advantage
2488
+
2489
+ 1:22:22.358 --> 1:22:26.095
2490
+ of this type of neural models.
2491
+
2492
+ 1:22:28.568 --> 1:22:41.599
2493
+ So to summarize what we did today, so what
2494
+ you should hopefully have with you is: Then
2495
+
2496
+ 1:22:41.599 --> 1:22:49.238
2497
+ how we can do language modeling with neural networks.
2498
+
2499
+ 1:22:49.449 --> 1:22:55.849
2500
+ We looked at three different architectures:
2501
+ We looked into the feed-forward language model,
2502
+
2503
+ 1:22:55.849 --> 1:22:59.050
2504
+ the R&amp;N, and the one based the balsamic.
2505
+
2506
+ 1:22:59.039 --> 1:23:04.559
2507
+ And finally, there are different architectures
2508
+ to do in neural networks.
2509
+
2510
+ 1:23:04.483 --> 1:23:10.961
2511
+ We have seen feed-forward neural networks and
2512
+ recurrence-based neural networks, and we'll see in the
2513
+
2514
+ 1:23:10.961 --> 1:23:14.390
2515
+ next lectures the last type of architecture.
2516
+
2517
+ 1:23:15.915 --> 1:23:17.438
2518
+ Any questions.
2519
+
2520
+ 1:23:20.680 --> 1:23:27.360
2521
+ Then thanks a lot, and next I'm just there,
2522
+ we'll be again on order to.
2523
+
demo_data/lectures/Lecture-07-16.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1fc2af8bf4d95a18dacaa3d5d9aad8c6c207e0f5f63090a9adefcfcf29f418
3
+ size 150440033
demo_data/lectures/Lecture-09-25.05.2023/English.vtt ADDED
@@ -0,0 +1,3039 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.721 --> 0:00:05.046
4
+ Introduction: Hey, and then welcome to today's
5
+ lecture.
6
+
7
+ 0:00:06.126 --> 0:00:13.861
8
+ What we want to do today is we will finish
9
+ with what we have done last time, so we started
10
+
11
+ 0:00:13.861 --> 0:00:22.192
12
+ looking at the neural machine translation system,
13
+ but we have had all the components of the sequence
14
+
15
+ 0:00:22.192 --> 0:00:22.787
16
+ model.
17
+
18
+ 0:00:22.722 --> 0:00:29.361
19
+ What we're still missing is the transformer-based
20
+ architecture, so mainly the self-attention.
21
+
22
+ 0:00:29.849 --> 0:00:31.958
23
+ Then we want to look at the beginning today.
24
+
25
+ 0:00:32.572 --> 0:00:39.315
26
+ And then the main part of the day's lecture
27
+ will be decoding.
28
+
29
+ 0:00:39.207 --> 0:00:43.995
30
+ That means we know how to train the model.
31
+
32
+ 0:00:44.624 --> 0:00:47.507
33
+ So decoding sewage all they can be.
34
+
35
+ 0:00:47.667 --> 0:00:53.359
36
+ Be useful that and the idea is how we find
37
+ that and what challenges are there.
38
+
39
+ 0:00:53.287 --> 0:00:59.051
40
+ Since it's autoregressive, we will see that
41
+ it's not as easy as for other tasks.
42
+
43
+ 0:00:59.359 --> 0:01:08.206
44
+ While generating the translation step by step,
45
+ we might make additional errors that lead.
46
+
47
+ 0:01:09.069 --> 0:01:15.547
48
+ Self-Attention: But let's start with self-
49
+ attention, so what we looked into was an RNN-
50
+
51
+ 0:01:15.547 --> 0:01:16.451
52
+ base model.
53
+
54
+ 0:01:16.816 --> 0:01:27.931
55
+ And then in RNN-based models you always take
56
+ the last hidden state, you take your input, you
57
+
58
+ 0:01:27.931 --> 0:01:31.513
59
+ generate a new hidden state.
60
+
61
+ 0:01:31.390 --> 0:01:35.222
62
+ This is more like a standard.
63
+
64
+ 0:01:35.675 --> 0:01:41.088
65
+ And one challenge in this is that we always
66
+ store all our history in one single hidden
67
+
68
+ 0:01:41.088 --> 0:01:41.523
69
+ stick.
70
+
71
+ 0:01:41.781 --> 0:01:50.235
72
+ We saw that this is a problem when going from
73
+ encoder to decoder, and that is why we then
74
+
75
+ 0:01:50.235 --> 0:01:58.031
76
+ introduced the attention mechanism so that
77
+ we can look back and see all the parts.
78
+
79
+ 0:01:59.579 --> 0:02:06.059
80
+ However, in the decoder we still have this
81
+ issue so we are still storing all information
82
+
83
+ 0:02:06.059 --> 0:02:12.394
84
+ in one hidden state and we might do things
85
+ like here that we start to overwrite things
86
+
87
+ 0:02:12.394 --> 0:02:13.486
88
+ and we forgot.
89
+
90
+ 0:02:14.254 --> 0:02:23.575
91
+ So the idea is, can we do something similar
92
+ which we do between encoder and decoder within
93
+
94
+ 0:02:23.575 --> 0:02:24.907
95
+ the decoder?
96
+
97
+ 0:02:26.526 --> 0:02:33.732
98
+ And the idea is each time we're generating
99
+ here a new hidden state, it will not only depend
100
+
101
+ 0:02:33.732 --> 0:02:40.780
102
+ on the previous one, but we will focus on the
103
+ whole sequence and look at different parts
104
+
105
+ 0:02:40.780 --> 0:02:46.165
106
+ as we did in attention in order to generate
107
+ our new representation.
108
+
109
+ 0:02:46.206 --> 0:02:53.903
110
+ So each time we generate a new representation
111
+ we will look into what is important now to
112
+
113
+ 0:02:53.903 --> 0:02:54.941
114
+ understand.
115
+
116
+ 0:02:55.135 --> 0:03:00.558
117
+ You may want to understand what much is important.
118
+
119
+ 0:03:00.452 --> 0:03:08.536
120
+ You might want to look to vary and to like
121
+ so that it's much about liking.
122
+
123
+ 0:03:08.808 --> 0:03:24.076
124
+ So the idea is that we are not storing everything
125
+ in one state; each time we are looking at the full sequence.
126
+
127
+ 0:03:25.125 --> 0:03:35.160
128
+ And that is achieved by no longer going really
129
+ sequential, and the hidden states here aren't dependent
130
+
131
+ 0:03:35.160 --> 0:03:37.086
132
+ on the same layer.
133
+
134
+ 0:03:36.984 --> 0:03:42.865
135
+ But instead we are always looking at the previous
136
+ layer.
137
+
138
+ 0:03:42.942 --> 0:03:45.510
139
+ We will always have more information that
140
+ we are coming.
141
+
142
+ 0:03:47.147 --> 0:03:51.572
143
+ So how does this self-attention work in detail?
144
+
145
+ 0:03:51.461 --> 0:03:56.076
146
+ So we start with our initial hidden states.
147
+
148
+ 0:03:55.964 --> 0:04:08.148
149
+ So, for example: Now where we had the three
150
+ terms already, the query, the key and the value,
151
+
152
+ 0:04:08.148 --> 0:04:12.603
153
+ it was motivated by our database.
154
+
155
+ 0:04:12.772 --> 0:04:20.746
156
+ We are comparing it to the keys to all the
157
+ other values, and then we are merging the values.
158
+
159
+ 0:04:21.321 --> 0:04:35.735
160
+ There was a difference between the decoder
161
+ and the encoder.
162
+
163
+ 0:04:35.775 --> 0:04:41.981
164
+ You can assume all the same because we are
165
+ curving ourselves.
166
+
167
+ 0:04:41.881 --> 0:04:49.490
168
+ However, we can make them different but just
169
+ learning a linear projection.
170
+
171
+ 0:04:49.529 --> 0:05:01.836
172
+ So you learn here some projection based on
173
+ what need to do in order to ask which question.
174
+
175
+ 0:05:02.062 --> 0:05:11.800
176
+ That is, the query and the key is to what
177
+ do want to compare and provide others, and
178
+
179
+ 0:05:11.800 --> 0:05:13.748
180
+ which values do.
181
+
182
+ 0:05:14.014 --> 0:05:23.017
183
+ This is not like hand defined, but learn,
184
+ so it's like three linear projections that
185
+
186
+ 0:05:23.017 --> 0:05:26.618
187
+ you apply on all of these hidden.
188
+
189
+ 0:05:26.512 --> 0:05:32.340
190
+ That is the first thing based on your initial
191
+ hidden.
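A minimal sketch of this query/key/value self-attention step, assuming PyTorch; dimensions are illustrative:

```python
# Three learned linear projections give queries, keys and values, and each position
# attends to all positions of the previous layer.
import torch
import torch.nn as nn

d = 64
W_q, W_k, W_v = nn.Linear(d, d), nn.Linear(d, d), nn.Linear(d, d)

def self_attention(X):                       # X: (seq_len, d), one layer's hidden states
    Q, K, V = W_q(X), W_k(X), W_v(X)
    scores = Q @ K.T / d ** 0.5              # compare every query with every key
    weights = torch.softmax(scores, dim=-1)  # normalized importance of each position
    return weights @ V                       # weighted sum of the values

H = self_attention(torch.randn(5, d))        # all positions are computed in parallel
```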
192
+
193
+ 0:05:32.612 --> 0:05:37.249
194
+ And now you can do exactly as before, you
195
+ can do the attention.
196
+
197
+ 0:05:37.637 --> 0:05:40.023
198
+ How did the attention work?
199
+
200
+ 0:05:39.937 --> 0:05:45.391
201
+ The first thing is we are comparing our query
202
+ to all the keys.
203
+
204
+ 0:05:45.445 --> 0:05:52.713
205
+ And that is now the difference before the
206
+ query was from the decoder, the keys were
207
+
208
+ 0:05:52.713 --> 0:05:54.253
209
+ from the encoder.
210
+
211
+ 0:05:54.167 --> 0:06:02.548
212
+ Now it's like all from the same, so we started
213
+ the first in state to the keys of all the others.
214
+
215
+ 0:06:02.582 --> 0:06:06.217
216
+ We're learning some value here.
217
+
218
+ 0:06:06.104 --> 0:06:12.808
219
+ How important are these information to better
220
+ understand?
221
+
222
+ 0:06:13.974 --> 0:06:19.103
223
+ And these are just like floating point numbers.
224
+
225
+ 0:06:18.996 --> 0:06:21.673
226
+ They are normalized so.
227
+
228
+ 0:06:22.762 --> 0:06:30.160
229
+ And that is the first step, so let's go first
230
+ for the first curve.
231
+
232
+ 0:06:30.470 --> 0:06:41.937
233
+ What we can then do is multiply each value
234
+ as we have done before with the importance
235
+
236
+ 0:06:41.937 --> 0:06:43.937
237
+ of each state.
238
+
239
+ 0:06:45.145 --> 0:06:47.686
240
+ And then we have in here the new hidden state.
241
+
242
+ 0:06:48.308 --> 0:06:57.862
243
+ See now this new hidden status is depending
244
+ on all the hidden state of all the sequences
245
+
246
+ 0:06:57.862 --> 0:06:59.686
247
+ of the previous.
248
+
249
+ 0:06:59.879 --> 0:07:01.739
250
+ One important thing.
251
+
252
+ 0:07:01.651 --> 0:07:08.738
253
+ This one doesn't really depend, so the hidden
254
+ states here don't depend on the.
255
+
256
+ 0:07:09.029 --> 0:07:15.000
257
+ So it only depends on the hidden state of
258
+ the previous layer, but it depends on all the
259
+
260
+ 0:07:15.000 --> 0:07:18.664
261
+ hidden states, and that is of course a big
262
+ advantage.
263
+
264
+ 0:07:18.596 --> 0:07:25.095
265
+ So on the one hand information can directly
266
+ flow from each hidden state before the information
267
+
268
+ 0:07:25.095 --> 0:07:27.215
269
+ flow was always a bit limited.
270
+
271
+ 0:07:28.828 --> 0:07:35.100
272
+ And the independence is important so we can
273
+ calculate all these in the states in parallel.
274
+
275
+ 0:07:35.031 --> 0:07:41.339
276
+ That's another big advantage of self attention
277
+ that we can calculate all the hidden states
278
+
279
+ 0:07:41.339 --> 0:07:46.816
280
+ in one layer in parallel and therefore it's
281
+ the ad designed for GPUs and fast.
282
+
283
+ 0:07:47.587 --> 0:07:50.235
284
+ Then we can do the same thing for the second
285
+ in the state.
286
+
287
+ 0:07:50.530 --> 0:08:06.866
288
+ And the only difference here is how we calculate
289
+ what is occurring.
290
+
291
+ 0:08:07.227 --> 0:08:15.733
292
+ Getting these values is different because
293
+ we use the different query and then getting
294
+
295
+ 0:08:15.733 --> 0:08:17.316
296
+ our new hidden.
297
+
298
+ 0:08:18.258 --> 0:08:26.036
299
+ Yes, this is the word of words that underneath
300
+ this case might, but this is simple.
301
+
302
+ 0:08:25.943 --> 0:08:26.522
303
+ Not.
304
+
305
+ 0:08:27.127 --> 0:08:33.359
306
+ That's a very good question that is like on
307
+ the initial thing.
308
+
309
+ 0:08:33.260 --> 0:08:38.452
310
+ That is exactly not one of you in the architecture.
311
+
312
+ 0:08:38.352 --> 0:08:44.045
313
+ Maybe first you would think of a very big
314
+ disadvantage.
315
+
316
+ 0:08:44.384 --> 0:08:49.804
317
+ So this hidden state would be the same if
318
+ the movie would be different.
319
+
320
+ 0:08:50.650 --> 0:08:59.983
321
+ And of course this estate is a site someone
322
+ should like, so if the estate would be here
323
+
324
+ 0:08:59.983 --> 0:09:06.452
325
+ except for this correspondence the word order
326
+ is completely.
327
+
328
+ 0:09:06.706 --> 0:09:17.133
329
+ Therefore, just doing self attention wouldn't
330
+ work at all because we know word order is important
331
+
332
+ 0:09:17.133 --> 0:09:21.707
333
+ and there is a complete different meaning.
334
+
335
+ 0:09:22.262 --> 0:09:26.277
336
+ We introduce the word position again.
337
+
338
+ 0:09:26.171 --> 0:09:33.040
339
+ The main idea is if the position is already
340
+ in your embeddings.
341
+
342
+ 0:09:33.533 --> 0:09:39.296
343
+ Then of course the position is there and you
344
+ don't lose it anymore.
345
+
346
+ 0:09:39.211 --> 0:09:46.908
347
+ So mainly if your life representation here
348
+ encodes at the second position and your output
349
+
350
+ 0:09:46.908 --> 0:09:48.533
351
+ will be different.
352
+
353
+ 0:09:49.049 --> 0:09:54.585
354
+ And that's how you encode it, but that's essential
355
+ in order to get this work.
356
+
357
+ 0:09:57.137 --> 0:10:06.015
358
+ Multi-head Attention: But before we are coming
359
+ to the next slide, one other thing that is
360
+
361
+ 0:10:06.015 --> 0:10:10.050
362
+ typically done is multi-head attention.
363
+
364
+ 0:10:10.430 --> 0:10:15.662
365
+ And it might be that in order to understand
366
+ much, it might be good that in some way we
367
+
368
+ 0:10:15.662 --> 0:10:19.872
369
+ focus on life, and in some way we can focus
370
+ on vary, but not equally.
371
+
372
+ 0:10:19.812 --> 0:10:25.346
373
+ But maybe it's like to understand again on
374
+ different dimensions we should look into these.
375
+
376
+ 0:10:25.905 --> 0:10:31.393
377
+ And therefore what we're doing is we're just
378
+ doing the self attention at once, but we're
379
+
380
+ 0:10:31.393 --> 0:10:35.031
381
+ doing it end times or based on your multi head
382
+ attentions.
383
+
384
+ 0:10:34.970 --> 0:10:43.517
385
+ So in typical examples, the number of heads
386
+ people are talking about is like: So you're
387
+
388
+ 0:10:43.517 --> 0:10:50.607
389
+ doing this process and have different queries
390
+ and keys so you can focus.
391
+
392
+ 0:10:50.790 --> 0:10:52.887
393
+ How can you generate eight different?
394
+
395
+ 0:10:53.593 --> 0:11:07.595
396
+ Things it's quite easy here, so instead of
397
+ having one linear projection you can have eight
398
+
399
+ 0:11:07.595 --> 0:11:09.326
400
+ different.
401
+
402
+ 0:11:09.569 --> 0:11:13.844
403
+ And it might be that sometimes you're looking
404
+ more into one thing, and sometimes you're Looking
405
+
406
+ 0:11:13.844 --> 0:11:14.779
407
+ more into the other.
408
+
409
+ 0:11:15.055 --> 0:11:24.751
410
+ So that's of course nice with this type of
411
+ learned approach because we can automatically
412
+
413
+ 0:11:24.751 --> 0:11:25.514
414
+ learn.
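The multi-head version can be sketched with PyTorch's built-in module; eight heads means eight sets of learned query/key/value projections (sizes are illustrative):

```python
import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
X = torch.randn(1, 5, 64)            # (batch, seq_len, dim)
out, attn_weights = mha(X, X, X)     # queries, keys and values all come from X
```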
415
+
416
+ 0:11:29.529 --> 0:11:36.629
417
+ And what you correctly said is its positional
418
+ independence, so it doesn't really matter the
419
+
420
+ 0:11:36.629 --> 0:11:39.176
421
+ order which should be important.
422
+
423
+ 0:11:39.379 --> 0:11:47.686
424
+ So how can we do that and the idea is we are
425
+ just encoding it directly into the embedding
426
+
427
+ 0:11:47.686 --> 0:11:52.024
428
+ so into the starting so that a representation.
429
+
430
+ 0:11:52.512 --> 0:11:55.873
431
+ How do we get that so we started with our
432
+ embeddings?
433
+
434
+ 0:11:55.810 --> 0:11:58.302
435
+ Just imagine this is embedding of eye.
436
+
437
+ 0:11:59.259 --> 0:12:06.169
438
+ And then we are having additionally this positional
439
+ encoding.
440
+
441
+ 0:12:06.057 --> 0:12:10.184
442
+ In this position, encoding is just.
443
+
444
+ 0:12:10.670 --> 0:12:19.564
445
+ With different wavelength, so with different
446
+ lengths of your signal as you see here.
447
+
448
+ 0:12:20.160 --> 0:12:37.531
449
+ And the number of functions you have is exactly
450
+ the number of dimensions you have in your embedding.
451
+
452
+ 0:12:38.118 --> 0:12:51.091
453
+ And what will then do is take the first one,
454
+ and based on your position you multiply your
455
+
456
+ 0:12:51.091 --> 0:12:51.955
457
+ word.
458
+
459
+ 0:12:52.212 --> 0:13:02.518
460
+ And you see now if you put it in this position,
461
+ of course it will get a different value.
462
+
463
+ 0:13:03.003 --> 0:13:12.347
464
+ And thereby in each position a different function
465
+ is multiplied.
466
+
467
+ 0:13:12.203 --> 0:13:19.826
468
+ This is a representation for at the first
469
+ position.
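A sketch of such sinusoidal position functions, following the standard transformer formulation (an assumption; the lecture slide may differ in detail):

```python
# One sine/cosine function per embedding dimension, each with a different wavelength;
# the value at a given position is combined with the word embedding.
import numpy as np

def positional_encoding(max_len, d_model):
    pos = np.arange(max_len)[:, None]                       # positions 0 .. max_len-1
    i = np.arange(d_model)[None, :]                         # embedding dimensions
    angle = pos / np.power(10000, (2 * (i // 2)) / d_model)
    return np.where(i % 2 == 0, np.sin(angle), np.cos(angle))  # (max_len, d_model)

print(positional_encoding(4, 8).shape)
```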
470
+
471
+ 0:13:20.020 --> 0:13:34.922
472
+ If you have it in the input already encoded
473
+ then of course the model is able to keep the
474
+
475
+ 0:13:34.922 --> 0:13:38.605
476
+ position information.
477
+
478
+ 0:13:38.758 --> 0:13:48.045
479
+ But your embeddings can also learn your embeddings
480
+ in a way that they are optimal collaborating
481
+
482
+ 0:13:48.045 --> 0:13:49.786
483
+ with these types.
484
+
485
+ 0:13:51.451 --> 0:13:59.351
486
+ Is that somehow clear where he is there?
487
+
488
+ 0:14:06.006 --> 0:14:13.630
489
+ Am the first position and second position?
490
+
491
+ 0:14:16.576 --> 0:14:17.697
492
+ Have a long wait period.
493
+
494
+ 0:14:17.652 --> 0:14:19.625
495
+ I'm not going to tell you how to turn the.
496
+
497
+ 0:14:21.441 --> 0:14:26.927
498
+ Be completely issued because if you have a
499
+ very short wavelength there might be quite
500
+
501
+ 0:14:26.927 --> 0:14:28.011
502
+ big differences.
503
+
504
+ 0:14:28.308 --> 0:14:33.577
505
+ And it might also be that then it depends,
506
+ of course, like what type of world embedding
507
+
508
+ 0:14:33.577 --> 0:14:34.834
509
+ you've learned like.
510
+
511
+ 0:14:34.774 --> 0:14:37.541
512
+ Is the dimension where you have long changes?
513
+
514
+ 0:14:37.481 --> 0:14:43.048
515
+ Is the report for your embedding or not so
516
+ that's what I mean so that the model can somehow
517
+
518
+ 0:14:43.048 --> 0:14:47.708
519
+ learn that by putting more information into
520
+ one of the embedding dimensions?
521
+
522
+ 0:14:48.128 --> 0:14:54.560
523
+ So incorporated and would assume it's learning
524
+ it a bit haven't seen.
525
+
526
+ 0:14:54.468 --> 0:14:57.412
527
+ Details studied how different.
528
+
529
+ 0:14:58.078 --> 0:15:07.863
530
+ It's also a bit difficult because really measuring
531
+ how similar or different a world isn't that
532
+
533
+ 0:15:07.863 --> 0:15:08.480
534
+ easy.
535
+
536
+ 0:15:08.377 --> 0:15:13.118
537
+ You can do, of course, the average distance.
538
+
539
+ 0:15:14.114 --> 0:15:21.393
540
+ Them, so are the weight tags not at model
541
+ two, or is there fixed weight tags that the
542
+
543
+ 0:15:21.393 --> 0:15:21.986
544
+ model.
545
+
546
+ 0:15:24.164 --> 0:15:30.165
547
+ To believe they are fixed and the mono learns
548
+ there's a different way of doing it.
549
+
550
+ 0:15:30.093 --> 0:15:32.987
551
+ The other thing you can do is you can.
552
+
553
+ 0:15:33.213 --> 0:15:36.945
554
+ So you can learn the second embedding which
555
+ says this is position one.
556
+
557
+ 0:15:36.893 --> 0:15:38.581
558
+ This is position two and so on.
559
+
560
+ 0:15:38.529 --> 0:15:42.502
561
+ Like for words you could learn fixed embeddings
562
+ and then add them upwards.
563
+
564
+ 0:15:42.449 --> 0:15:45.008
565
+ So then it would have the same thing it's
566
+ done.
567
+
568
+ 0:15:44.955 --> 0:15:46.836
569
+ There is one disadvantage of this.
570
+
571
+ 0:15:46.782 --> 0:15:51.405
572
+ There is anybody an idea what could be the
573
+ disadvantage of a more learned embedding.
574
+
575
+ 0:15:54.955 --> 0:16:00.000
576
+ Here maybe extra play this finger and ethnic
577
+ stuff that will be an art.
578
+
579
+ 0:15:59.929 --> 0:16:01.754
580
+ This will be an art for.
581
+
582
+ 0:16:02.502 --> 0:16:08.323
583
+ You would only be good at positions you have
584
+ seen often and especially for long sequences.
585
+
586
+ 0:16:08.259 --> 0:16:13.990
587
+ You might have seen the positions very rarely
588
+ and then normally not performing that well
589
+
590
+ 0:16:13.990 --> 0:16:17.982
591
+ while here it can better learn a more general
592
+ representation.
593
+
594
+ 0:16:18.298 --> 0:16:22.522
595
+ So that is another thing which we won't discuss
596
+ here.
597
+
598
+ 0:16:22.444 --> 0:16:25.965
599
+ Guess is what is called relative attention.
600
+
601
+ 0:16:25.945 --> 0:16:32.570
602
+ And in this case you don't learn absolute
603
+ positions, but in your calculation of the similarity
604
+
605
+ 0:16:32.570 --> 0:16:39.194
606
+ you take again the relative distance into account
607
+ and have a different similarity depending on
608
+
609
+ 0:16:39.194 --> 0:16:40.449
610
+ how far they are.
611
+
612
+ 0:16:40.660 --> 0:16:45.898
613
+ And then you don't need to encode it beforehand,
614
+ but you would more happen within your comparison.
615
+
616
+ 0:16:46.186 --> 0:16:53.471
617
+ So when you compare how similar things you
618
+ print, of course also take the relative position.
619
+
620
+ 0:16:55.715 --> 0:17:03.187
621
+ Because there are multiple ways to use the
622
+ one, to multiply all the embedding, or to use
623
+
624
+ 0:17:03.187 --> 0:17:03.607
625
+ all.
626
+
627
+ 0:17:17.557 --> 0:17:21.931
628
+ The encoder can be bidirectional.
629
+
630
+ 0:17:21.802 --> 0:17:30.681
631
+ We have everything from the beginning so we
632
+ can have a model where.
633
+
634
+ 0:17:31.111 --> 0:17:36.455
635
+ Decoder training of course has also everything
636
+ available but during inference you always have
637
+
638
+ 0:17:36.455 --> 0:17:41.628
639
+ only the past available so you can only look
640
+ into the previous one and not into the future
641
+
642
+ 0:17:41.628 --> 0:17:46.062
643
+ because if you generate word by word you don't
644
+ know what it will be there in.
645
+
646
+ 0:17:46.866 --> 0:17:53.180
647
+ And so we also have to consider this somehow
648
+ in the attention, and until now we look more
649
+
650
+ 0:17:53.180 --> 0:17:54.653
651
+ at the ecoder style.
652
+
653
+ 0:17:54.583 --> 0:17:58.613
654
+ So if you look at this type of model, it's
655
+ by direction.
656
+
657
+ 0:17:58.542 --> 0:18:03.775
658
+ So for this hill state we are looking into
659
+ the past and into the future.
660
+
661
+ 0:18:04.404 --> 0:18:14.436
662
+ So the question is, can we have to do this
663
+ like unidirectional so that you only look into
664
+
665
+ 0:18:14.436 --> 0:18:15.551
666
+ the past?
667
+
668
+ 0:18:15.439 --> 0:18:22.575
669
+ And the nice thing is, this is even easier
670
+ than for RNNs.
671
+
672
+ 0:18:23.123 --> 0:18:29.738
673
+ So we would have different types of parameters
674
+ and models because you have a forward direction.
675
+
676
+ 0:18:31.211 --> 0:18:35.679
677
+ For attention, that is very simple.
678
+
679
+ 0:18:35.555 --> 0:18:39.326
680
+ We are doing what is masking.
681
+
682
+ 0:18:39.200 --> 0:18:45.613
683
+ If you want to have a backward model, these
684
+ ones.
685
+
686
+ 0:18:45.845 --> 0:18:54.355
687
+ So on the first hit stage it's been over,
688
+ so it's maybe only looking at itself.
689
+
690
+ 0:18:54.894 --> 0:19:05.310
691
+ By the second it looks on the second and the
692
+ third, so you're always masking out all values
693
+
694
+ 0:19:05.310 --> 0:19:07.085
695
+ in the future.
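A minimal sketch of this masking, assuming PyTorch: future positions get a score of minus infinity before the softmax, so each state only attends to itself and the past:

```python
import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)                       # query-key similarities
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
weights = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(weights[0])                                            # first position attends only to itself
```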
696
+
697
+ 0:19:07.507 --> 0:19:13.318
698
+ And thereby you can have with the same parameters
699
+ the same model.
700
+
701
+ 0:19:13.230 --> 0:19:15.786
702
+ You can have then a unique.
703
+
704
+ 0:19:16.156 --> 0:19:29.895
705
+ In the decoder you do the masked self attention
706
+ where you only look into the past and you don't
707
+
708
+ 0:19:29.895 --> 0:19:30.753
709
+ look.
710
+
711
+ 0:19:32.212 --> 0:19:36.400
712
+ Then we only have, of course, looked onto
713
+ itself.
714
+
715
+ 0:19:36.616 --> 0:19:50.903
716
+ So the question: How can we combine forward
717
+ and decoder and then we can do a decoder and
718
+
719
+ 0:19:50.903 --> 0:19:54.114
720
+ just have a second?
721
+
722
+ 0:19:54.374 --> 0:20:00.286
723
+ And then we're doing the cross attention which
724
+ attends from the decoder to the encoder.
725
+
726
+ 0:20:00.540 --> 0:20:11.264
727
+ So in this time it's again that the queries
728
+ are the current state of the decoder, while the keys
729
+
730
+ 0:20:11.264 --> 0:20:22.821
731
+ are from the encoder: You can attend to yourself to get the
732
+ meaning on the target side and to get the meaning.
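A sketch of this cross-attention call, assuming PyTorch's multi-head attention module; shapes are illustrative:

```python
# Queries come from the decoder states, keys and values from the encoder output.
import torch
import torch.nn as nn

cross_attn = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
encoder_out = torch.randn(1, 7, 64)       # source-side hidden states
decoder_state = torch.randn(1, 3, 64)     # target-side hidden states so far
ctx, _ = cross_attn(query=decoder_state, key=encoder_out, value=encoder_out)
```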
733
+
734
+ 0:20:23.423 --> 0:20:25.928
735
+ So see then the full picture.
736
+
737
+ 0:20:25.844 --> 0:20:32.997
738
+ This is now the typical picture of the transformer
739
+ and where you use self attention.
740
+
741
+ 0:20:32.913 --> 0:20:36.702
742
+ So what you have first is your word embeddings.
743
+
744
+ 0:20:37.217 --> 0:20:43.408
745
+ What you then apply is here the positional
746
+ encoding. We are then doing the self attention
747
+
748
+ 0:20:43.408 --> 0:20:46.731
749
+ to all the others, and this can be bi-directional.
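One common choice for this positional encoding is the sinusoidal variant from the original Transformer paper; a small illustrative sketch (assumes an even model dimension):

```python
import numpy as np

def sinusoidal_positions(seq_len, d_model):
    """Positional encodings that are simply added to the word embeddings."""
    pos = np.arange(seq_len)[:, None]                  # (seq_len, 1)
    i = np.arange(d_model // 2)[None, :]               # (1, d_model/2)
    angles = pos / np.power(10000.0, 2 * i / d_model)  # a different frequency per dimension
    enc = np.zeros((seq_len, d_model))
    enc[:, 0::2] = np.sin(angles)                      # even dimensions
    enc[:, 1::2] = np.cos(angles)                      # odd dimensions
    return enc
```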
750
+
751
+ 0:20:47.707 --> 0:20:54.918
752
+ You normally do another feed forward layer
753
+ just like to make things to learn additional
754
+
755
+ 0:20:54.918 --> 0:20:55.574
756
+ things.
757
+
758
+ 0:20:55.492 --> 0:21:02.755
759
+ You're just having also a feed forward layer
760
+ which takes your hidden state and generates
761
+
762
+ 0:21:02.755 --> 0:21:07.129
763
+ a new hidden state because we are making things
764
+ deeper.
765
+
766
+ 0:21:07.747 --> 0:21:15.648
767
+ Then this blue part you can stack over several
768
+ times so you can have layers so that.
769
+
770
+ 0:21:16.336 --> 0:21:30.256
771
+ In addition, there are these residual connections, the blue arrows; we talked
772
+ about this for RNNs: if you are now back
773
+
774
+ 0:21:30.256 --> 0:21:35.883
775
+ propagating your error from the top.
776
+
777
+ 0:21:36.436 --> 0:21:48.578
778
+ In order to prevent that, each layer is not really
779
+ learning how to fully transform the input; instead
780
+
781
+ 0:21:48.578 --> 0:21:51.230
782
+ it only learns what has to change.
783
+
784
+ 0:21:51.671 --> 0:22:00.597
785
+ You're calculating what should be changed
786
+ with this one.
787
+
788
+ 0:22:00.440 --> 0:22:09.368
789
+ The backward pass can skip each layer and the learning
790
+ is easier.
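A tiny sketch of such a residual (skip) connection around one sublayer; layer normalization is left out to keep it minimal:

```python
def residual_sublayer(x, sublayer):
    """Residual connection: the sublayer only has to learn what should change.

    x: the incoming hidden states; sublayer: e.g. self-attention or the feed-forward block.
    The identity path also lets gradients flow directly to lower layers.
    """
    return x + sublayer(x)
```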
791
+
792
+ 0:22:10.750 --> 0:22:21.632
793
+ That is the encoder; before we go to the decoder,
794
+
795
+ 0:22:21.366 --> 0:22:30.663
796
+ do we have any additional questions?
797
+
798
+ 0:22:31.471 --> 0:22:33.220
799
+ That's a Very Good Point.
800
+
801
+ 0:22:33.553 --> 0:22:38.709
802
+ Yeah, you normally take always that at least
803
+ the default architecture to only look at the
804
+
805
+ 0:22:38.709 --> 0:22:38.996
806
+ top.
807
+
808
+ 0:22:40.000 --> 0:22:40.388
809
+ Of the encoder.
810
+
811
+ 0:22:40.332 --> 0:22:42.340
812
+ Of course, you can do other things.
813
+
814
+ 0:22:42.285 --> 0:22:45.040
815
+ We investigated, for example, using the lowest layer.
816
+
817
+ 0:22:44.983 --> 0:22:49.426
818
+ The decoder is looking at the lowest level
819
+ of the encoder and not of the top.
820
+
821
+ 0:22:49.749 --> 0:23:05.342
822
+ You can average or you can even learn theoretically
823
+ that what you can also do is attending to all.
824
+
825
+ 0:23:05.785 --> 0:23:11.180
826
+ Can attend to all possible layers and states.
827
+
828
+ 0:23:11.063 --> 0:23:18.337
829
+ But the default thing is that you
830
+ only have the top.
831
+
832
+ 0:23:20.580 --> 0:23:31.999
833
+ In the decoder we are firstly doing
834
+ the same positional encoding, then we're doing
835
+
836
+ 0:23:31.999 --> 0:23:36.419
837
+ self attention in the decoder side.
838
+
839
+ 0:23:37.837 --> 0:23:43.396
840
+ Of course here it's not important we're doing
841
+ the mask self attention so that we're only
842
+
843
+ 0:23:43.396 --> 0:23:45.708
844
+ attending to the past and we're not.
845
+
846
+ 0:23:47.287 --> 0:24:02.698
847
+ Here you see the difference, so in this case
848
+ the keys and values are from the encoder and
849
+
850
+ 0:24:02.698 --> 0:24:03.554
851
+ the.
852
+
853
+ 0:24:03.843 --> 0:24:12.103
854
+ You're comparing it to all the encoder hidden
855
+ states calculating the similarity and then
856
+
857
+ 0:24:12.103 --> 0:24:13.866
858
+ you do the weight.
859
+
860
+ 0:24:14.294 --> 0:24:17.236
861
+ And that is an edit to what is here.
862
+
863
+ 0:24:18.418 --> 0:24:29.778
864
+ Then you have a linear layer and again this
865
+ green one is stacked several times and then.
866
+
867
+ 0:24:32.232 --> 0:24:36.987
868
+ Question, so each code is off.
869
+
870
+ 0:24:36.834 --> 0:24:46.041
871
+ Every one of those has the last layer of thing,
872
+ so in the.
873
+
874
+ 0:24:46.246 --> 0:24:51.007
875
+ All of them attend only to the last or the top layer
876
+ of the encoder.
877
+
878
+ 0:24:57.197 --> 0:25:00.053
879
+ Designing a translation system: Good, so that
880
+ would be.
881
+
882
+ 0:25:01.501 --> 0:25:12.513
883
+ To sequence models we have looked at attention
884
+ and before we are decoding do you have any
885
+
886
+ 0:25:12.513 --> 0:25:18.020
887
+ more questions to this type of architecture.
888
+
889
+ 0:25:20.480 --> 0:25:30.049
890
+ Transformer was first used in machine translation,
891
+ but now it's a standard thing for doing nearly
892
+
893
+ 0:25:30.049 --> 0:25:32.490
894
+ any type of sequence model.
895
+
896
+ 0:25:33.013 --> 0:25:35.984
897
+ Even large language models.
898
+
899
+ 0:25:35.878 --> 0:25:38.455
900
+ They are a bit similar.
901
+
902
+ 0:25:38.347 --> 0:25:45.114
903
+ They are just throwing away the encoder and the
904
+ cross attention.
905
+
906
+ 0:25:45.505 --> 0:25:59.329
907
+ And that is maybe interesting that it's important
908
+ to have this attention because you cannot store
909
+
910
+ 0:25:59.329 --> 0:26:01.021
911
+ everything.
912
+
913
+ 0:26:01.361 --> 0:26:05.357
914
+ The interesting thing with the attention is
915
+ now we can attend to everything.
916
+
917
+ 0:26:05.745 --> 0:26:13.403
918
+ So you can again go back to your initial model
919
+ and have just a simple sequence model and then
920
+
921
+ 0:26:13.403 --> 0:26:14.055
922
+ target.
923
+
924
+ 0:26:14.694 --> 0:26:24.277
925
+ There would be a more language model style
926
+ or people call it Decoder Only model where
927
+
928
+ 0:26:24.277 --> 0:26:26.617
929
+ you throw this away.
930
+
931
+ 0:26:27.247 --> 0:26:30.327
932
+ The nice thing is because of your self attention.
933
+
934
+ 0:26:30.265 --> 0:26:34.163
935
+ You have the original problem why you introduce
936
+ the attention.
937
+
938
+ 0:26:34.101 --> 0:26:39.639
939
+ You don't have that anymore because it's not
940
+ everything is summarized, but each time you
941
+
942
+ 0:26:39.639 --> 0:26:44.866
943
+ generate, you're looking back at all the previous
944
+ words, the source and the target.
945
+
946
+ 0:26:45.805 --> 0:26:51.734
947
+ And there is a lot of work on is a really
948
+ important to have encoded a decoded model or
949
+
950
+ 0:26:51.734 --> 0:26:54.800
951
+ is a decoded only model as good if you have.
952
+
953
+ 0:26:54.732 --> 0:27:00.049
954
+ But the comparison is not that easy because
955
+ how many parameters do you have?
956
+
957
+ 0:27:00.360 --> 0:27:08.832
958
+ So think the general idea at the moment is,
959
+ at least for machine translation, it's normally
960
+
961
+ 0:27:08.832 --> 0:27:17.765
962
+ a bit better to have an encoded decoder model
963
+ and not a decoder model where you just concatenate
964
+
965
+ 0:27:17.765 --> 0:27:20.252
966
+ the source and the target.
967
+
968
+ 0:27:21.581 --> 0:27:24.073
969
+ But there is not really a big difference anymore.
970
+
971
+ 0:27:24.244 --> 0:27:29.891
972
+ Because this big issue, which we had initially
973
+ with it that everything is stored in the working
974
+
975
+ 0:27:29.891 --> 0:27:31.009
976
+ state, is nothing.
977
+
978
+ 0:27:31.211 --> 0:27:45.046
979
+ Of course, the advantage maybe here is that
980
+ you give it a bias at your same language information.
981
+
982
+ 0:27:45.285 --> 0:27:53.702
983
+ While in an encoder only model this all is
984
+ merged into one thing and sometimes it is good
985
+
986
+ 0:27:53.702 --> 0:28:02.120
987
+ to give models a bit of bias okay you should
988
+ maybe treat things separately and you should
989
+
990
+ 0:28:02.120 --> 0:28:03.617
991
+ look different.
992
+
993
+ 0:28:04.144 --> 0:28:11.612
994
+ And of course one other difference, one other
995
+ disadvantage, maybe of an encoder owning one.
996
+
997
+ 0:28:16.396 --> 0:28:19.634
998
+ Think about the source sentence and how
999
+ it's treated.
1000
+
1001
+ 0:28:21.061 --> 0:28:33.787
1002
+ In this architecture the encoder can look at the whole
1003
+ sentence for every state, and that causes a little
1004
+
1005
+ 0:28:33.787 --> 0:28:35.563
1006
+ difference.
1007
+
1008
+ 0:28:35.475 --> 0:28:43.178
1009
+ If you only have a decoder that has to be
1010
+ unidirectional because for the decoder side
1011
+
1012
+ 0:28:43.178 --> 0:28:51.239
1013
+ for the generation you need it and so your
1014
+ input is read state by state so you don't have
1015
+
1016
+ 0:28:51.239 --> 0:28:54.463
1017
+ this bidirectional information.
1018
+
1019
+ 0:28:56.596 --> 0:29:05.551
1020
+ Again, it receives a sequence of embeddings
1021
+ with position encoding.
1022
+
1023
+ 0:29:05.419 --> 0:29:11.085
1024
+ The piece is like long vector has output.
1025
+
1026
+ 0:29:11.031 --> 0:29:17.148
1027
+ Don't understand how you can set footworks
1028
+ to this part of each other through inputs.
1029
+
1030
+ 0:29:17.097 --> 0:29:20.060
1031
+ Other than cola is the same as the food consume.
1032
+
1033
+ 0:29:21.681 --> 0:29:27.438
1034
+ Okay, it's a very good point: so this encoding
1035
+ is only done on the top layer.
1036
+
1037
+ 0:29:27.727 --> 0:29:32.012
1038
+ So this green one is only repeated.
1039
+
1040
+ 0:29:31.893 --> 0:29:38.511
1041
+ You have the word embedding or the position
1042
+ embedding.
1043
+
1044
+ 0:29:38.390 --> 0:29:42.966
1045
+ You have one layer of decoder which.
1046
+
1047
+ 0:29:43.283 --> 0:29:48.245
1048
+ Then you stack the second one, the third
1049
+ one, the fourth one, and then on the top.
1050
+
1051
+ 0:29:48.208 --> 0:29:55.188
1052
+ Layer: You put this projection layer which
1053
+ takes a one-thousand-dimensional vector and
1054
+
1055
+ 0:29:55.188 --> 0:30:02.089
1056
+ generates, based on your vocabulary of maybe
1057
+ ten thousand words, a softmax layer which gives you
1058
+
1059
+ 0:30:02.089 --> 0:30:04.442
1060
+ the probability of all words.
1061
+
1062
+ 0:30:06.066 --> 0:30:22.369
1063
+ It's a very good part part of the mass tape
1064
+ ladies, but it wouldn't be for the X-rays.
1065
+
1066
+ 0:30:22.262 --> 0:30:27.015
1067
+ Aquarium filters to be like monsoon roding
1068
+ as they get by the river.
1069
+
1070
+ 0:30:27.647 --> 0:30:33.140
1071
+ Yes, there is work on that think we will discuss
1072
+ that in the pre-trained models.
1073
+
1074
+ 0:30:33.493 --> 0:30:39.756
1075
+ It's called where you exactly do that.
1076
+
1077
+ 0:30:39.595 --> 0:30:48.591
1078
+ If you look at the attention matrix, it's like diagonal
1079
+ here.
1080
+
1081
+ 0:30:48.708 --> 0:30:53.018
1082
+ And it's a full matrix, so here everybody's
1083
+ attending to each position.
1084
+
1085
+ 0:30:52.958 --> 0:30:54.696
1086
+ Here you're only attending.
1087
+
1088
+ 0:30:54.975 --> 0:31:05.744
1089
+ Then you can do something in between, where this
1090
+ part is bidirectional but the rest is not.
1091
+
1092
+ 0:31:06.166 --> 0:31:13.961
1093
+ So you have a bit more that is possible, and
1094
+ we'll have that in the lecture on pre-train
1095
+
1096
+ 0:31:13.961 --> 0:31:14.662
1097
+ models.
1098
+
1099
+ 0:31:18.478 --> 0:31:27.440
1100
+ So we now know how to build a translation
1101
+ system, but of course we don't want to have
1102
+
1103
+ 0:31:27.440 --> 0:31:30.774
1104
+ a translation system by itself.
1105
+
1106
+ 0:31:31.251 --> 0:31:40.037
1107
+ Now given this model an input sentence, how
1108
+ can we generate an output?
1109
+
1110
+ 0:31:39.921 --> 0:31:49.455
1111
+ The general idea is still: So what we really
1112
+ want to do is we start with the model.
1113
+
1114
+ 0:31:49.342 --> 0:31:53.894
1115
+ We generate different possible translations.
1116
+
1117
+ 0:31:54.014 --> 0:31:59.754
1118
+ We score them with the log probability that we're
1119
+ getting, so for each input and output pair
1120
+
1121
+ 0:31:59.754 --> 0:32:05.430
1122
+ we can calculate the log probability, which
1123
+ is a product of all probabilities for each
1124
+
1125
+ 0:32:05.430 --> 0:32:09.493
1126
+ word in there, and then we can find what is
1127
+ the most probable.
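In code, scoring one candidate is just summing per-word log-probabilities (a product of probabilities in log space); a small illustrative sketch:

```python
import math

def sequence_log_prob(step_distributions, output_ids):
    """step_distributions[t] is the model's distribution over the vocabulary at step t,
    output_ids[t] is the word chosen at that step; summing logs avoids underflow."""
    return sum(math.log(step_distributions[t][w]) for t, w in enumerate(output_ids))
```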
1128
+
1129
+ 0:32:09.949 --> 0:32:15.410
1130
+ However, that's a bit complicated we will
1131
+ see because we can't look at all possible translations.
1132
+
1133
+ 0:32:15.795 --> 0:32:28.842
1134
+ So there is infinite or a number of possible
1135
+ translations, so we have to do it somehow in
1136
+
1137
+ 0:32:28.842 --> 0:32:31.596
1138
+ a more intelligent way.
1139
+
1140
+ 0:32:32.872 --> 0:32:37.821
1141
+ So what we want to do today in the rest of
1142
+ the lecture?
1143
+
1144
+ 0:32:37.732 --> 0:32:40.238
1145
+ What is the search problem?
1146
+
1147
+ 0:32:40.149 --> 0:32:44.716
1148
+ Then we will look at different search algorithms.
1149
+
1150
+ 0:32:45.825 --> 0:32:56.636
1151
+ Will compare model and search errors, so there
1152
+ can be errors on the model where the model
1153
+
1154
+ 0:32:56.636 --> 0:33:03.483
1155
+ is not giving the highest score to the best
1156
+ translation.
1157
+
1158
+ 0:33:03.903 --> 0:33:21.069
1159
+ This is always like searching the best translation
1160
+ out of one model, which is often also interesting.
1161
+
1162
+ 0:33:24.004 --> 0:33:29.570
1163
+ And how do we do the search?
1164
+
1165
+ 0:33:29.378 --> 0:33:41.856
1166
+ We want to find the translation where the
1167
+ error compared to the reference is minimal.
1168
+
1169
+ 0:33:42.042 --> 0:33:44.041
1170
+ The nice thing is: in SMT
1171
+
1172
+ 0:33:43.964 --> 0:33:51.310
1173
+ it wasn't the case, but in neural machine translation
1174
+ we can generate any possible translation, so
1175
+
1176
+ 0:33:51.310 --> 0:33:53.785
1177
+ at least within our vocabulary.
1178
+
1179
+ 0:33:53.707 --> 0:33:58.116
1180
+ But if we have BPE we can really generate
1181
+ any possible.
1182
+
1183
+ 0:33:58.078 --> 0:34:04.604
1184
+ Translation, and in theory we could always minimize
1185
+ that, but yeah, we can't do it that easy because
1186
+
1187
+ 0:34:04.604 --> 0:34:07.734
1188
+ of course we don't have the reference at hand.
1189
+
1190
+ 0:34:07.747 --> 0:34:10.384
1191
+ If it has a reference, it's not a problem.
1192
+
1193
+ 0:34:10.322 --> 0:34:13.696
1194
+ We know what we are searching for, but we
1195
+ don't know.
1196
+
1197
+ 0:34:14.054 --> 0:34:23.886
1198
+ So how can we then model this by just finding
1199
+ the translation with the highest probability?
1200
+
1201
+ 0:34:23.779 --> 0:34:29.018
1202
+ Looking at it, we want to find the translation.
1203
+
1204
+ 0:34:29.169 --> 0:34:32.525
1205
+ Idea is our model is a good approximation.
1206
+
1207
+ 0:34:32.447 --> 0:34:34.333
1208
+ That's how we train it.
1209
+
1210
+ 0:34:34.254 --> 0:34:36.471
1211
+ What is a good translation?
1212
+
1213
+ 0:34:36.391 --> 0:34:43.665
1214
+ And if we find translation with the highest
1215
+ probability, this should also give us the best
1216
+
1217
+ 0:34:43.665 --> 0:34:44.704
1218
+ translation.
1219
+
1220
+ 0:34:45.265 --> 0:34:56.965
1221
+ And that is then, of course, the difference
1222
+ between the search error is that the model
1223
+
1224
+ 0:34:56.965 --> 0:35:02.076
1225
+ doesn't predict the best translation.
1226
+
1227
+ 0:35:02.622 --> 0:35:08.777
1228
+ How can we do the basic search first of all
1229
+ in basic search that seems to be very easy
1230
+
1231
+ 0:35:08.777 --> 0:35:15.003
1232
+ so what we can do is we can do the forward
1233
+ pass for the whole encoder and that's how it
1234
+
1235
+ 0:35:15.003 --> 0:35:21.724
1236
+ starts the input sentences known you can put
1237
+ the input sentence and calculate all your estates
1238
+
1239
+ 0:35:21.724 --> 0:35:22.573
1240
+ and hidden?
1241
+
1242
+ 0:35:23.083 --> 0:35:35.508
1243
+ Then you can put in your sentence start and
1244
+ you can generate.
1245
+
1246
+ 0:35:35.308 --> 0:35:41.728
1247
+ Here you have the probability.
1248
+
1249
+ 0:35:41.801 --> 0:35:52.624
1250
+ A good idea we would see later that as a typical
1251
+ algorithm is guess what you all would do, you
1252
+
1253
+ 0:35:52.624 --> 0:35:54.788
1254
+ would then select.
1255
+
1256
+ 0:35:55.235 --> 0:36:06.265
1257
+ So if you generate here a probability distribution
1258
+ over all the words in your vocabulary then
1259
+
1260
+ 0:36:06.265 --> 0:36:08.025
1261
+ you can solve.
1262
+
1263
+ 0:36:08.688 --> 0:36:13.147
1264
+ Yeah, this is how autocompletion is done
1265
+ in our system.
1266
+
1267
+ 0:36:14.794 --> 0:36:19.463
1268
+ Yeah, this is also why there you have to have
1269
+ a model of possible extending.
1270
+
1271
+ 0:36:19.403 --> 0:36:24.274
1272
+ It's more of a language model, but then this
1273
+ is one algorithm to do the search.
1274
+
1275
+ 0:36:24.213 --> 0:36:26.726
1276
+ They maybe have also more advanced ones.
1277
+
1278
+ 0:36:26.665 --> 0:36:32.044
1279
+ We will see that, so this search in autocompletion
1280
+ could be exactly the same as the
1281
+
1282
+ 0:36:32.044 --> 0:36:33.775
1283
+ search in machine translation.
1284
+
1285
+ 0:36:34.914 --> 0:36:40.480
1286
+ So we'll see that this is not optimal, so
1287
+ hopefully it's not that this way, but for this
1288
+
1289
+ 0:36:40.480 --> 0:36:41.043
1290
+ problem.
1291
+
1292
+ 0:36:41.941 --> 0:36:47.437
1293
+ And what you can do then you can select this
1294
+ word.
1295
+
1296
+ 0:36:47.329 --> 0:36:50.781
1297
+ This was the best translation.
1298
+
1299
+ 0:36:51.111 --> 0:36:57.675
1300
+ Because the decoder, of course, in the next
1301
+ step needs to know what is the best word
1302
+
1303
+ 0:36:57.675 --> 0:37:02.396
1304
+ here; it inputs it and generates the next probability
1305
+ distribution.
1306
+
1307
+ 0:37:03.423 --> 0:37:14.608
1308
+ And then your new distribution, and you can
1309
+ do the same thing, there's the best word there,
1310
+
1311
+ 0:37:14.608 --> 0:37:15.216
1312
+ and.
1313
+
1314
+ 0:37:15.435 --> 0:37:22.647
1315
+ So you can continue doing that and always
1316
+ get the hopefully the best translation in.
1317
+
1318
+ 0:37:23.483 --> 0:37:30.839
1319
+ The first question is, of course, how long
1320
+ are you doing it?
1321
+
1322
+ 0:37:30.718 --> 0:37:33.859
1323
+ Now we could go forever.
1324
+
1325
+ 0:37:36.476 --> 0:37:52.596
1326
+ We had this token at the input and we put
1327
+ the stop token at the output.
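Putting the greedy procedure together, a minimal sketch; `decoder_step` is a hypothetical stand-in for the model's forward pass and must return a probability distribution over the vocabulary for the next word:

```python
def greedy_decode(decoder_step, encoder_states, bos_id, eos_id, max_len=100):
    """Greedy search: always feed back the single most probable word."""
    prefix = [bos_id]
    while len(prefix) < max_len:
        probs = decoder_step(encoder_states, prefix)           # next-word distribution
        next_id = max(range(len(probs)), key=probs.__getitem__)
        prefix.append(next_id)
        if next_id == eos_id:                                  # stop once </s> is generated
            break
    return prefix[1:]
```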
1328
+
1329
+ 0:37:53.974 --> 0:38:07.217
1330
+ And this is important because if we wouldn't
1331
+ do that then we wouldn't have a good idea.
1332
+
1333
+ 0:38:10.930 --> 0:38:16.193
1334
+ So that seems to be a good idea, but is it
1335
+ really?
1336
+
1337
+ 0:38:16.090 --> 0:38:21.046
1338
+ Do we find the most probable sentence in this?
1339
+
1340
+ 0:38:23.763 --> 0:38:25.154
1341
+ Or my dear healed proverb,.
1342
+
1343
+ 0:38:27.547 --> 0:38:41.823
1344
+ We are always selecting the highest probability
1345
+ one, so it seems to be that this is a very
1346
+
1347
+ 0:38:41.823 --> 0:38:45.902
1348
+ good solution to anybody.
1349
+
1350
+ 0:38:46.406 --> 0:38:49.909
1351
+ Yes, that is actually the problem.
1352
+
1353
+ 0:38:49.809 --> 0:38:56.417
1354
+ You might do early decisions and you don't
1355
+ have the global view.
1356
+
1357
+ 0:38:56.796 --> 0:39:02.813
1358
+ And this problem happens because it is an
1359
+ outer regressive model.
1360
+
1361
+ 0:39:03.223 --> 0:39:13.275
1362
+ So it happens because yeah, the output we
1363
+ generate is the input in the next step.
1364
+
1365
+ 0:39:13.793 --> 0:39:19.493
1366
+ And this, of course, is leading to problems.
1367
+
1368
+ 0:39:19.367 --> 0:39:27.476
1369
+ If we always take the best solution, it doesn't
1370
+ mean you have the best overall sequence.
1371
+
1372
+ 0:39:27.727 --> 0:39:33.941
1373
+ It would be different if you have a problem
1374
+ where the output is not influencing your input.
1375
+
1376
+ 0:39:34.294 --> 0:39:44.079
1377
+ Then this solution will give you the best
1378
+ model, but since the output is influencing
1379
+
1380
+ 0:39:44.079 --> 0:39:47.762
1381
+ your next input and the model,.
1382
+
1383
+ 0:39:48.268 --> 0:39:51.599
1384
+ Because one question might not be why do we
1385
+ have this type of model?
1386
+
1387
+ 0:39:51.771 --> 0:39:58.946
1388
+ So why do we really need to put here in the
1389
+ last source word?
1390
+
1391
+ 0:39:58.831 --> 0:40:05.351
1392
+ You can also put in: And then always predict
1393
+ the word and the nice thing is then you wouldn't
1394
+
1395
+ 0:40:05.351 --> 0:40:11.733
1396
+ need to do beams or a difficult search because
1397
+ then the output here wouldn't influence what
1398
+
1399
+ 0:40:11.733 --> 0:40:12.982
1400
+ is inputted here.
1401
+
1402
+ 0:40:15.435 --> 0:40:20.219
1403
+ Idea whether that might not be the best idea.
1404
+
1405
+ 0:40:20.115 --> 0:40:24.590
1406
+ You'll just be translating each word and.
1407
+
1408
+ 0:40:26.626 --> 0:40:37.815
1409
+ The second one is right, yes, you're not generating
1410
+ a coherent sentence.
1411
+
1412
+ 0:40:38.058 --> 0:40:48.197
1413
+ We'll also see that later it's called non
1414
+ autoregressive translation, so there is work
1415
+
1416
+ 0:40:48.197 --> 0:40:49.223
1417
+ on that.
1418
+
1419
+ 0:40:49.529 --> 0:41:02.142
1420
+ So you might know it roughly because you know
1421
+ it's based on this hidden state, but it can
1422
+
1423
+ 0:41:02.142 --> 0:41:08.588
1424
+ be that in the end you have your probability.
1425
+
1426
+ 0:41:09.189 --> 0:41:14.633
1427
+ And then you're not modeling the dependencies
1428
+ within a work within the target sentence.
1429
+
1430
+ 0:41:14.571 --> 0:41:27.579
1431
+ For example: You can express things in German,
1432
+ then you don't know which one you really select.
1433
+
1434
+ 0:41:27.443 --> 0:41:32.159
1435
+ That influences what you later.
1436
+
1437
+ 0:41:33.393 --> 0:41:46.411
1438
+ Then you try to find a better way not only
1439
+ based on the English sentence and the words
1440
+
1441
+ 0:41:46.411 --> 0:41:48.057
1442
+ that come.
1443
+
1444
+ 0:41:49.709 --> 0:42:00.954
1445
+ Yes, that is more like a two-step decoding,
1446
+ but that is, of course, a lot more like computational.
1447
+
1448
+ 0:42:01.181 --> 0:42:15.978
1449
+ The first thing you can do, which is typically
1450
+ done, is doing not really search.
1451
+
1452
+ 0:42:16.176 --> 0:42:32.968
1453
+ So first look at what the problem of research
1454
+ is to make it a bit more clear.
1455
+
1456
+ 0:42:34.254 --> 0:42:53.163
1457
+ And now you can extend them and you can extend
1458
+ these and the joint probabilities.
1459
+
1460
+ 0:42:54.334 --> 0:42:59.063
1461
+ The other thing is the second word.
1462
+
1463
+ 0:42:58.931 --> 0:43:03.336
1464
+ You can do the second word "das".
1465
+
1466
+ 0:43:03.202 --> 0:43:07.345
1467
+ Now you see the problem here.
1468
+
1469
+ 0:43:07.707 --> 0:43:17.507
1470
+ It is true that these have the highest probability,
1471
+ but for these you have an extension.
1472
+
1473
+ 0:43:18.078 --> 0:43:31.585
1474
+ So the problem is just because in one position
1475
+ one hypothesis, so you can always call this
1476
+
1477
+ 0:43:31.585 --> 0:43:34.702
1478
+ partial translation.
1479
+
1480
+ 0:43:34.874 --> 0:43:41.269
1481
+ The blue one begin is higher, but the green
1482
+ one can be better extended and it will overtake.
1483
+
1484
+ 0:43:45.525 --> 0:43:54.672
1485
+ So the problem is if we are doing this greedy
1486
+ search is that we might not end up in really
1487
+
1488
+ 0:43:54.672 --> 0:43:55.275
1489
+ good.
1490
+
1491
+ 0:43:55.956 --> 0:44:00.916
1492
+ So the first thing we could not do is like
1493
+ yeah, we can just try.
1494
+
1495
+ 0:44:00.880 --> 0:44:06.049
1496
+ All combinations that are there, so there
1497
+ is the other direction.
1498
+
1499
+ 0:44:05.971 --> 0:44:12.988
1500
+ So if the solution to to check the first one
1501
+ is to just try all and it doesn't give us a
1502
+
1503
+ 0:44:12.988 --> 0:44:17.876
1504
+ good result, maybe what we have to do is just
1505
+ try everything.
1506
+
1507
+ 0:44:18.318 --> 0:44:23.120
1508
+ The nice thing is if we try everything, we'll
1509
+ definitely find the best translation.
1510
+
1511
+ 0:44:23.463 --> 0:44:26.094
1512
+ So we won't have a search error.
1513
+
1514
+ 0:44:26.014 --> 0:44:28.113
1515
+ We'll come to that later.
1516
+
1517
+ 0:44:28.032 --> 0:44:32.474
1518
+ The interesting thing is our translation performance.
1519
+
1520
+ 0:44:33.353 --> 0:44:37.039
1521
+ But we will definitely find the most probable
1522
+ translation.
1523
+
1524
+ 0:44:38.598 --> 0:44:44.552
1525
+ However, it's not really possible because
1526
+ the number of combinations is just too high.
1527
+
1528
+ 0:44:44.764 --> 0:44:57.127
1529
+ So the number of congregations is your vocabulary
1530
+ science times the lengths of your sentences.
1531
+
1532
+ 0:44:57.157 --> 0:45:03.665
1533
+ Ten thousand or so you can imagine that very
1534
+ soon you will have so many possibilities here
1535
+
1536
+ 0:45:03.665 --> 0:45:05.597
1537
+ that you cannot check all.
1538
+
1539
+ 0:45:06.226 --> 0:45:13.460
1540
+ So this is not really an implication or an
1541
+ algorithm that you can use for applying machine
1542
+
1543
+ 0:45:13.460 --> 0:45:14.493
1544
+ translation.
1545
+
1546
+ 0:45:15.135 --> 0:45:24.657
1547
+ So maybe we have to do something in between
1548
+ and yeah, not look at all but only look at
1549
+
1550
+ 0:45:24.657 --> 0:45:25.314
1551
+ some.
1552
+
1553
+ 0:45:26.826 --> 0:45:29.342
1554
+ And the easiest thing for that is okay.
1555
+
1556
+ 0:45:29.279 --> 0:45:34.840
1557
+ Just do sampling, so if we don't know what
1558
+ to look at, maybe it's good to randomly pick
1559
+
1560
+ 0:45:34.840 --> 0:45:35.219
1561
+ some.
1562
+
1563
+ 0:45:35.156 --> 0:45:40.572
1564
+ That's not only a very good algorithm, so
1565
+ the basic idea will always randomly select
1566
+
1567
+ 0:45:40.572 --> 0:45:42.866
1568
+ the word, of course, based on bits.
1569
+
1570
+ 0:45:43.223 --> 0:45:52.434
1571
+ We are doing that or times, and then we are
1572
+ looking which one at the end has the highest.
1573
+
1574
+ 0:45:52.672 --> 0:45:59.060
1575
+ So we are not doing anymore really searching
1576
+ for the best one, but we are more randomly
1577
+
1578
+ 0:45:59.060 --> 0:46:05.158
1579
+ doing selections with the idea that we always
1580
+ select the best one at the beginning.
1581
+
1582
+ 0:46:05.085 --> 0:46:11.758
1583
+ So maybe it's better to do random, but of
1584
+ course one important thing is how do we randomly
1585
+
1586
+ 0:46:11.758 --> 0:46:12.345
1587
+ select?
1588
+
1589
+ 0:46:12.452 --> 0:46:15.756
1590
+ If we just do uniform distribution, it would
1591
+ be very bad.
1592
+
1593
+ 0:46:15.699 --> 0:46:18.036
1594
+ You'll only have very bad translations.
1595
+
1596
+ 0:46:18.398 --> 0:46:23.261
1597
+ Because in each position if you think about
1598
+ it you have ten thousand possibilities.
1599
+
1600
+ 0:46:23.903 --> 0:46:28.729
1601
+ Most of them are really bad decisions and
1602
+ you shouldn't do that.
1603
+
1604
+ 0:46:28.655 --> 0:46:35.190
1605
+ There is always only a very small number,
1606
+ at least compared to the 10 000 translation.
1607
+
1608
+ 0:46:35.395 --> 0:46:43.826
1609
+ So if you have the sentence here, this is
1610
+ an English sentence.
1611
+
1612
+ 0:46:43.692 --> 0:46:47.846
1613
+ You can start with these and.
1614
+
1615
+ 0:46:48.408 --> 0:46:58.345
1616
+ You're thinking about setting legal documents
1617
+ in a legal document.
1618
+
1619
+ 0:46:58.197 --> 0:47:02.356
1620
+ You should not change the.
1621
+
1622
+ 0:47:03.603 --> 0:47:11.032
1623
+ The problem is we have a neural network, we
1624
+ have a black box, so it's anyway a bit random.
1625
+
1626
+ 0:47:12.092 --> 0:47:24.341
1627
+ It is considered, but you will see that if
1628
+ you make it intelligent for clear sentences,
1629
+
1630
+ 0:47:24.341 --> 0:47:26.986
1631
+ there is not that.
1632
+
1633
+ 0:47:27.787 --> 0:47:35.600
1634
+ Is an issue we should consider that this one
1635
+ might lead to more randomness, but it might
1636
+
1637
+ 0:47:35.600 --> 0:47:39.286
1638
+ also be positive for machine translation.
1639
+
1640
+ 0:47:40.080 --> 0:47:46.395
1641
+ Least can't directly think of a good implication
1642
+ where it's positive, but if you most think
1643
+
1644
+ 0:47:46.395 --> 0:47:52.778
1645
+ about dialogue systems, for example, whereas
1646
+ the similar architecture is nowadays also used,
1647
+
1648
+ 0:47:52.778 --> 0:47:55.524
1649
+ you predict what the system should say.
1650
+
1651
+ 0:47:55.695 --> 0:48:00.885
1652
+ Then you want to have randomness because it's
1653
+ not always saying the same thing.
1654
+
1655
+ 0:48:01.341 --> 0:48:08.370
1656
+ Machine translation is typically not you want
1657
+ to have consistency, so if you have the same
1658
+
1659
+ 0:48:08.370 --> 0:48:09.606
1660
+ input normally.
1661
+
1662
+ 0:48:09.889 --> 0:48:14.528
1663
+ Therefore, sampling is not the method of choice here.
1664
+
1665
+ 0:48:14.406 --> 0:48:22.565
1666
+ Preprocessing: There are some things you will
1667
+ later see as a preprocessing step.
1668
+
1669
+ 0:48:23.003 --> 0:48:27.832
1670
+ But of course it's important how you can make
1671
+ this process not too random.
1672
+
1673
+ 0:48:29.269 --> 0:48:41.619
1674
+ Therefore, the first thing is don't take a
1675
+ uniform distribution, but we have a very nice
1676
+
1677
+ 0:48:41.619 --> 0:48:43.562
1678
+ distribution.
1679
+
1680
+ 0:48:43.843 --> 0:48:46.621
1681
+ So I'm like randomly taking a word.
1682
+
1683
+ 0:48:46.544 --> 0:48:51.329
1684
+ We are looking at output distribution and
1685
+ now taking a word.
1686
+
1687
+ 0:48:51.731 --> 0:49:03.901
1688
+ So that means we are taking the word these,
1689
+ we are taking the word does, and all these.
1690
+
1691
+ 0:49:04.444 --> 0:49:06.095
1692
+ How can you do that?
1693
+
1694
+ 0:49:06.016 --> 0:49:09.950
1695
+ You randomly draw a number between zero and
1696
+ one.
1697
+
1698
+ 0:49:10.390 --> 0:49:23.686
1699
+ And then you have ordered your words in some
1700
+ way, and then you take the words before the
1701
+
1702
+ 0:49:23.686 --> 0:49:26.375
1703
+ sum of the words.
1704
+
1705
+ 0:49:26.806 --> 0:49:34.981
1706
+ So the easiest thing is you have zero point
1707
+ five, zero point two five, and zero point two
1708
+
1709
+ 0:49:34.981 --> 0:49:35.526
1710
+ five.
1711
+
1712
+ 0:49:35.435 --> 0:49:43.411
1713
+ If you have a number smaller than you take
1714
+ the first word, it takes a second word, and
1715
+
1716
+ 0:49:43.411 --> 0:49:45.336
1717
+ if it's higher than.
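A minimal sketch of this way of drawing a word: one uniform random number and the cumulative probabilities decide which word is taken (illustrative only):

```python
import random

def sample_word(probs):
    """Inverse-CDF sampling: probs = [0.5, 0.25, 0.25] means a draw below 0.5 picks
    word 0, between 0.5 and 0.75 picks word 1, and above 0.75 picks word 2."""
    r = random.random()              # uniform number between 0 and 1
    cumulative = 0.0
    for idx, p in enumerate(probs):
        cumulative += p
        if r < cumulative:
            return idx
    return len(probs) - 1            # guard against floating-point rounding
```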
1718
+
1719
+ 0:49:45.845 --> 0:49:57.707
1720
+ Therefore, you can very easily get a distribution
1721
+ distributed according to this probability mass
1722
+
1723
+ 0:49:57.707 --> 0:49:59.541
1724
+ and no longer.
1725
+
1726
+ 0:49:59.799 --> 0:50:12.479
1727
+ You can't even do that a bit more and more
1728
+ focus on the important part if we are not randomly
1729
+
1730
+ 0:50:12.479 --> 0:50:19.494
1731
+ drawing from all words, but we are looking
1732
+ only at.
1733
+
1734
+ 0:50:21.361 --> 0:50:24.278
1735
+ You have an idea why this is an important
1736
+ step.
1737
+
1738
+ 0:50:24.219 --> 0:50:29.427
1739
+ Although we say I'm only throwing away the
1740
+ words which have a very low probability, so
1741
+
1742
+ 0:50:29.427 --> 0:50:32.541
1743
+ anyway the probability of taking them is quite
1744
+ low.
1745
+
1746
+ 0:50:32.481 --> 0:50:35.236
1747
+ So normally that shouldn't matter that much.
1748
+
1749
+ 0:50:36.256 --> 0:50:38.830
1750
+ There's ten thousand words.
1751
+
1752
+ 0:50:40.300 --> 0:50:42.074
1753
+ Of course, they admire thousand nine hundred.
1754
+
1755
+ 0:50:42.035 --> 0:50:44.003
1756
+ They're going to build a good people steal
1757
+ it up.
1758
+
1759
+ 0:50:45.085 --> 0:50:47.425
1760
+ Hi, I'm Sarah Hauer and I'm Sig Hauer and
1761
+ We're Professional.
1762
+
1763
+ 0:50:47.867 --> 0:50:55.299
1764
+ Yes, that's exactly why you do this most sampling
1765
+ or so that you don't take the lowest.
1766
+
1767
+ 0:50:55.415 --> 0:50:59.694
1768
+ Probability words, but you only look at the
1769
+ most probable ones and then like.
1770
+
1771
+ 0:50:59.639 --> 0:51:04.594
1772
+ Of course you have to rescale your probability
1773
+ mass then so that it's still a probability
1774
+
1775
+ 0:51:04.594 --> 0:51:08.393
1776
+ because now it's a probability distribution
1777
+ over ten thousand words.
1778
+
1779
+ 0:51:08.338 --> 0:51:13.332
1780
+ If you only take ten of them or so it's no
1781
+ longer a probability distribution, you rescale
1782
+
1783
+ 0:51:13.332 --> 0:51:15.330
1784
+ them and you can still do that and.
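A hedged sketch of such top-k sampling: keep only the k most probable words, rescale their mass so it sums to one again, then draw (k and the plain-list representation are illustrative):

```python
import random

def top_k_sample(probs, k=10):
    """Sample only among the k most probable words, with renormalization."""
    top = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:k]
    kept_mass = sum(probs[i] for i in top)   # rescale so the kept probabilities sum to 1
    r, cumulative = random.random(), 0.0
    for i in top:
        cumulative += probs[i] / kept_mass
        if r < cumulative:
            return i
    return top[-1]
```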
1785
+
1786
+ 0:51:16.756 --> 0:51:20.095
1787
+ That is what is done assembling.
1788
+
1789
+ 0:51:19.994 --> 0:51:26.269
1790
+ It's not the most common thing, but it's done
1791
+ several times.
1792
+
1793
+ 0:51:28.088 --> 0:51:40.625
1794
+ Then there is beam search, which is somehow the standard
1795
+ if you're doing some type of machine translation.
1796
+
1797
+ 0:51:41.181 --> 0:51:50.162
1798
+ And the basic idea is that in beam search we
1799
+ select for the most probable and only continue
1800
+
1801
+ 0:51:50.162 --> 0:51:51.171
1802
+ with the.
1803
+
1804
+ 0:51:51.691 --> 0:51:53.970
1805
+ You can easily generalize this.
1806
+
1807
+ 0:51:53.899 --> 0:52:00.452
1808
+ We are not only continuing the most probable
1809
+ one, but we are continuing the most probable.
1810
+
1811
+ 0:52:00.880 --> 0:52:21.376
1812
+ The.
1813
+
1814
+ 0:52:17.697 --> 0:52:26.920
1815
+ You should say we are sampling how many examples
1816
+ it makes sense to take the one with the highest.
1817
+
1818
+ 0:52:27.127 --> 0:52:33.947
1819
+ But that is important that once you do a mistake
1820
+ you might want to not influence that much.
1821
+
1822
+ 0:52:39.899 --> 0:52:45.815
1823
+ So the idea is if we're keeping the end best
1824
+ hypotheses and not only the first fact.
1825
+
1826
+ 0:52:46.586 --> 0:52:51.558
1827
+ And the nice thing is in statistical machine
1828
+ translation.
1829
+
1830
+ 0:52:51.473 --> 0:52:54.408
1831
+ We have exactly the same problem.
1832
+
1833
+ 0:52:54.322 --> 0:52:57.635
1834
+ You would do the same thing, however.
1835
+
1836
+ 0:52:57.548 --> 0:53:03.391
1837
+ Since the model wasn't that strong you needed
1838
+ a quite large beam.
1839
+
1840
+ 0:53:03.984 --> 0:53:18.944
1841
+ Machine translation models are really strong
1842
+ and you get already a very good performance.
1843
+
1844
+ 0:53:19.899 --> 0:53:22.835
1845
+ So how does it work?
1846
+
1847
+ 0:53:22.695 --> 0:53:35.136
1848
+ It is very similar to what we had before, but now
1849
+ we are not storing only the single most probable one.
1850
+
1851
+ 0:53:36.156 --> 0:53:45.163
1852
+ Done that we extend all these hypothesis and
1853
+ of course there is now a bit difficult because
1854
+
1855
+ 0:53:45.163 --> 0:53:54.073
1856
+ now we always have to switch what is the input
1857
+ so the search gets more complicated and the
1858
+
1859
+ 0:53:54.073 --> 0:53:55.933
1860
+ first one is easy.
1861
+
1862
+ 0:53:56.276 --> 0:54:09.816
1863
+ In this case we have to once put in here these
1864
+ and then somehow delete this one and instead
1865
+
1866
+ 0:54:09.816 --> 0:54:12.759
1867
+ put that into that.
1868
+
1869
+ 0:54:13.093 --> 0:54:24.318
1870
+ Otherwise you could only store your current
1871
+ network states here and just continue by going
1872
+
1873
+ 0:54:24.318 --> 0:54:25.428
1874
+ forward.
1875
+
1876
+ 0:54:26.766 --> 0:54:34.357
1877
+ So now you have done the first two, and then
1878
+ you have known the best.
1879
+
1880
+ 0:54:34.249 --> 0:54:37.289
1881
+ Can you now just continue?
1882
+
1883
+ 0:54:39.239 --> 0:54:53.511
1884
+ Yes, that's very important, otherwise all
1885
+ your beam search doesn't really help because
1886
+
1887
+ 0:54:53.511 --> 0:54:57.120
1888
+ you would still have.
1889
+
1890
+ 0:54:57.317 --> 0:55:06.472
1891
+ So now you have to do one important step and
1892
+ then reduce again to end.
1893
+
1894
+ 0:55:06.343 --> 0:55:13.824
1895
+ So in our case to make things easier we have
1896
+ the inputs.
1897
+
1898
+ 0:55:14.014 --> 0:55:19.072
1899
+ Otherwise you will have two to the power of
1900
+ length possibilities, so it is still exponential.
1901
+
1902
+ 0:55:19.559 --> 0:55:26.637
1903
+ But by always throwing them away you keep
1904
+ your beam size fixed.
1905
+
1906
+ 0:55:26.519 --> 0:55:31.712
1907
+ The items now differ in the last position.
1908
+
1909
+ 0:55:32.492 --> 0:55:42.078
1910
+ They are completely different, but you are
1911
+ always searching what is the best one.
1912
+
1913
+ 0:55:44.564 --> 0:55:50.791
1914
+ So another way of hearing it is like this,
1915
+ so just imagine you start with the empty sentence.
1916
+
1917
+ 0:55:50.725 --> 0:55:55.266
1918
+ Then you have three possible extensions: A,
1919
+ B, and end of sentence.
1920
+
1921
+ 0:55:55.199 --> 0:55:59.207
1922
+ It's throwing away the worst one, continuing
1923
+ with the two.
1924
+
1925
+ 0:55:59.699 --> 0:56:13.136
1926
+ Then you want to stay too, so in this state
1927
+ it's either or and then you continue.
1928
+
1929
+ 0:56:13.293 --> 0:56:24.924
1930
+ So you always have this exponential growing
1931
+ tree by destroying most of them away and only
1932
+
1933
+ 0:56:24.924 --> 0:56:26.475
1934
+ continuing.
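A compact sketch of beam search as described: extend every kept hypothesis, prune back to the beam size, and set finished hypotheses aside. `decoder_step` is again a hypothetical stand-in for the model; scores are raw sums of log-probabilities, so the length bias discussed further below is still present.

```python
import math

def beam_search(decoder_step, encoder_states, bos_id, eos_id, beam_size=4, max_len=100):
    beams = [([bos_id], 0.0)]                  # (prefix, summed log-probability)
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            probs = decoder_step(encoder_states, prefix)
            best = sorted(range(len(probs)), key=probs.__getitem__, reverse=True)[:beam_size]
            for w in best:                     # extend each hypothesis with its best next words
                candidates.append((prefix + [w], score + math.log(probs[w])))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_size]:   # prune back to the beam size
            (finished if prefix[-1] == eos_id else beams).append((prefix, score))
        if not beams:                          # every surviving hypothesis has ended
            break
    finished.extend(beams)                     # fall back to unfinished ones at max_len
    return max(finished, key=lambda c: c[1])[0][1:]
```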
1935
+
1936
+ 0:56:26.806 --> 0:56:42.455
1937
+ And thereby you can hopefully do less errors
1938
+ because in these examples you always see this
1939
+
1940
+ 0:56:42.455 --> 0:56:43.315
1941
+ one.
1942
+
1943
+ 0:56:43.503 --> 0:56:47.406
1944
+ So you're preventing some errors, but of course
1945
+ it's not perfect.
1946
+
1947
+ 0:56:47.447 --> 0:56:56.829
1948
+ You can still do errors because it could be
1949
+ not the second one but the fourth one.
1950
+
1951
+ 0:56:57.017 --> 0:57:03.272
1952
+ Now just the idea is that you make yeah less
1953
+ errors and prevent that.
1954
+
1955
+ 0:57:07.667 --> 0:57:11.191
1956
+ Then the question is how much does it help?
1957
+
1958
+ 0:57:11.111 --> 0:57:14.012
1959
+ And here is some examples for that.
1960
+
1961
+ 0:57:13.932 --> 0:57:16.614
1962
+ So for SMT it was really like:
1963
+
1964
+ 0:57:16.533 --> 0:57:23.525
1965
+ Typically the larger beam you have a larger
1966
+ third space and you have a better score.
1967
+
1968
+ 0:57:23.763 --> 0:57:27.370
1969
+ So the larger you get, the bigger your beams,
1970
+ the better you will.
1971
+
1972
+ 0:57:27.317 --> 0:57:30.024
1973
+ Typically maybe use something like three hundred.
1974
+
1975
+ 0:57:30.250 --> 0:57:38.777
1976
+ And it's mainly a trade-off between quality
1977
+ and speed because the larger your beams, the
1978
+
1979
+ 0:57:38.777 --> 0:57:43.184
1980
+ more time it takes and you want to finish it.
1981
+
1982
+ 0:57:43.088 --> 0:57:49.126
1983
+ So your quality improvements are getting smaller
1984
+ and smaller.
1985
+
1986
+ 0:57:49.349 --> 0:57:57.164
1987
+ So the difference between a beam of one and
1988
+ ten is bigger than the difference between a.
1989
+
1990
+ 0:57:58.098 --> 0:58:14.203
1991
+ And the interesting thing is we're seeing
1992
+ a bit of a different view, and we're seeing
1993
+
1994
+ 0:58:14.203 --> 0:58:16.263
1995
+ typically.
1996
+
1997
+ 0:58:16.776 --> 0:58:24.376
1998
+ And then especially if you look at the green
1999
+ ones, this is unnormalized.
2000
+
2001
+ 0:58:24.272 --> 0:58:26.775
2002
+ You're seeing a sharp.
2003
+
2004
+ 0:58:27.207 --> 0:58:32.284
2005
+ So your translation quality here measured
2006
+ in blue will go down again.
2007
+
2008
+ 0:58:33.373 --> 0:58:35.663
2009
+ That is now a question.
2010
+
2011
+ 0:58:35.568 --> 0:58:37.692
2012
+ Why is that the case?
2013
+
2014
+ 0:58:37.596 --> 0:58:43.681
2015
+ Why should we are seeing more and more possible
2016
+ translations?
2017
+
2018
+ 0:58:46.226 --> 0:58:48.743
2019
+ If we have a bigger stretch and we are going.
2020
+
2021
+ 0:58:52.612 --> 0:58:56.312
2022
+ I'm going to be using my examples before we
2023
+ also look at the bar.
2024
+
2025
+ 0:58:56.656 --> 0:58:59.194
2026
+ A good idea.
2027
+
2028
+ 0:59:00.000 --> 0:59:18.521
2029
+ But it's not everything because we in the
2030
+ end always in this list we're selecting.
2031
+
2032
+ 0:59:18.538 --> 0:59:19.382
2033
+ So this is here.
2034
+
2035
+ 0:59:19.333 --> 0:59:21.172
2036
+ We don't do any regions to do that.
2037
+
2038
+ 0:59:21.601 --> 0:59:29.287
2039
+ So the probabilities at the end we always
2040
+ give out the hypothesis with the highest probabilities.
2041
+
2042
+ 0:59:30.250 --> 0:59:33.623
2043
+ That is always the case.
2044
+
2045
+ 0:59:33.488 --> 0:59:43.340
2046
+ If you have a beam of this should be a subset
2047
+ of the items you look at.
2048
+
2049
+ 0:59:44.224 --> 0:59:52.571
2050
+ So if you increase your beam size you're just
2051
+ looking at more and you're always taking the
2052
+
2053
+ 0:59:52.571 --> 0:59:54.728
2054
+ one with the highest.
2055
+
2056
+ 0:59:57.737 --> 1:00:07.014
2057
+ Maybe they are all the probability that they
2058
+ will be comparable to don't really have.
2059
+
2060
+ 1:00:08.388 --> 1:00:14.010
2061
+ But the probabilities are the same, not that
2062
+ easy.
2063
+
2064
+ 1:00:13.900 --> 1:00:23.910
2065
+ One morning maybe you will have more examples
2066
+ where we look at some stuff that's not seen
2067
+
2068
+ 1:00:23.910 --> 1:00:26.357
2069
+ in the trading space.
2070
+
2071
+ 1:00:28.428 --> 1:00:36.478
2072
+ That's mainly the answer why we give a hyperability
2073
+ math we will see, but that is first of all
2074
+
2075
+ 1:00:36.478 --> 1:00:43.087
2076
+ the biggest issues, so here is a blue score,
2077
+ so that is somewhat translation.
2078
+
2079
+ 1:00:43.883 --> 1:00:48.673
2080
+ This will go down by the probability of the
2081
+ highest one that only goes out where stays
2082
+
2083
+ 1:00:48.673 --> 1:00:49.224
2084
+ at least.
2085
+
2086
+ 1:00:49.609 --> 1:00:57.971
2087
+ The problem is if we are searching more, we
2088
+ are finding high processes which have a high
2089
+
2090
+ 1:00:57.971 --> 1:00:59.193
2091
+ translation.
2092
+
2093
+ 1:00:59.579 --> 1:01:10.375
2094
+ So we are finding these things which we wouldn't
2095
+ find and we'll see why this is happening.
2096
+
2097
+ 1:01:10.256 --> 1:01:15.716
2098
+ So somehow we are reducing our search error.
2099
+
2100
+ 1:01:16.336 --> 1:01:25.300
2101
+ However, we also have a model error and we
2102
+ don't assign the highest probability to translation
2103
+
2104
+ 1:01:25.300 --> 1:01:27.942
2105
+ quality to the really best.
2106
+
2107
+ 1:01:28.548 --> 1:01:31.460
2108
+ They don't always add up.
2109
+
2110
+ 1:01:31.348 --> 1:01:34.859
2111
+ Of course somehow they add up.
2112
+
2113
+ 1:01:34.746 --> 1:01:41.656
2114
+ If your bottle is worse then your performance
2115
+ will even go.
2116
+
2117
+ 1:01:42.202 --> 1:01:49.718
2118
+ But sometimes it's happening that by increasing
2119
+ search errors we are missing out the really
2120
+
2121
+ 1:01:49.718 --> 1:01:57.969
2122
+ bad translations which have a high probability
2123
+ and we are only finding the decently good probability
2124
+
2125
+ 1:01:57.969 --> 1:01:58.460
2126
+ mass.
2127
+
2128
+ 1:01:59.159 --> 1:02:03.859
2129
+ So they are a bit independent of each other
2130
+ and you can make those types of arrows.
2131
+
2132
+ 1:02:04.224 --> 1:02:09.858
2133
+ That's why, for example, doing exact search
2134
+ will give you the translation with the highest
2135
+
2136
+ 1:02:09.858 --> 1:02:15.245
2137
+ probability, but there has been work on it
2138
+ that you then even have a lower translation
2139
+
2140
+ 1:02:15.245 --> 1:02:21.436
2141
+ quality because then you find some random translation
2142
+ which has a very high translation probability
2143
+
2144
+ 1:02:21.436 --> 1:02:22.984
2145
+ by which I'm really bad.
2146
+
2147
+ 1:02:23.063 --> 1:02:29.036
2148
+ Because our model is not perfect and giving
2149
+ a perfect translation probability everywhere.
2150
+
2151
+ 1:02:31.431 --> 1:02:34.537
2152
+ So why is this happening?
2153
+
2154
+ 1:02:34.417 --> 1:02:42.303
2155
+ And one issue with this is the so called label
2156
+ or length bias.
2157
+
2158
+ 1:02:42.782 --> 1:02:47.115
2159
+ And we are in each step of decoding.
2160
+
2161
+ 1:02:46.998 --> 1:02:55.313
2162
+ We are modeling the probability of the next
2163
+ word given the input and.
2164
+
2165
+ 1:02:55.895 --> 1:03:06.037
2166
+ So if you have this picture, so you always
2167
+ hear you have the probability of the next word.
2168
+
2169
+ 1:03:06.446 --> 1:03:16.147
2170
+ That's that's what your modeling, and of course
2171
+ the model is not perfect.
2172
+
2173
+ 1:03:16.576 --> 1:03:22.765
2174
+ So it can be that if we at one time do a bitter
2175
+ wrong prediction not for the first one but
2176
+
2177
+ 1:03:22.765 --> 1:03:28.749
2178
+ maybe for the 5th or 6th thing, then we're
2179
+ giving it an exceptional high probability we
2180
+
2181
+ 1:03:28.749 --> 1:03:30.178
2182
+ cannot recover from.
2183
+
2184
+ 1:03:30.230 --> 1:03:34.891
2185
+ Because this high probability will stay there
2186
+ forever and we just multiply other things to
2187
+
2188
+ 1:03:34.891 --> 1:03:39.910
2189
+ it, but we cannot like later say all this probability
2190
+ was a bit too high, we shouldn't have done.
2191
+
2192
+ 1:03:41.541 --> 1:03:48.984
2193
+ And this leads to that the more the longer
2194
+ your translation is, the more often you use
2195
+
2196
+ 1:03:48.984 --> 1:03:51.637
2197
+ this probability distribution.
2198
+
2199
+ 1:03:52.112 --> 1:04:03.321
2200
+ The typical example is this one, so you have
2201
+ the probability of the translation.
2202
+
2203
+ 1:04:04.104 --> 1:04:12.608
2204
+ And this probability is quite low as you see,
2205
+ and maybe there are a lot of other things.
2206
+
2207
+ 1:04:13.053 --> 1:04:25.658
2208
+ However, it might still be overestimated that
2209
+ it's still a bit too high.
2210
+
2211
+ 1:04:26.066 --> 1:04:33.042
2212
+ The problem is if you know the project translation
2213
+ is a very long one, but probability mask gets
2214
+
2215
+ 1:04:33.042 --> 1:04:33.545
2216
+ lower.
2217
+
2218
+ 1:04:34.314 --> 1:04:45.399
2219
+ Because each time you multiply your probability
2220
+ to it, so your sequence probability gets lower
2221
+
2222
+ 1:04:45.399 --> 1:04:46.683
2223
+ and lower.
2224
+
2225
+ 1:04:48.588 --> 1:04:59.776
2226
+ And this means that at some point you might
2227
+ get over this, and it might be a lower probability.
2228
+
2229
+ 1:05:00.180 --> 1:05:09.651
2230
+ And if you then have this probability at the
2231
+ beginning away, but it wasn't your beam, then
2232
+
2233
+ 1:05:09.651 --> 1:05:14.958
2234
+ at this point you would select the empty sentence.
2235
+
2236
+ 1:05:15.535 --> 1:05:25.379
2237
+ So this has happened because this short translation
2238
+ is seen and it's not thrown away.
2239
+
2240
+ 1:05:28.268 --> 1:05:31.121
2241
+ So,.
2242
+
2243
+ 1:05:31.151 --> 1:05:41.256
2244
+ If you have a very small beam that can be prevented,
2245
+ but if you have a large beam, this one is in
2246
+
2247
+ 1:05:41.256 --> 1:05:41.986
2248
+ there.
2249
+
2250
+ 1:05:42.302 --> 1:05:52.029
2251
+ This in general seems reasonable that shorter
2252
+ translations are preferred over longer sentences
2253
+
2254
+ 1:05:52.029 --> 1:05:54.543
2255
+ because non-religious.
2256
+
2257
+ 1:05:56.376 --> 1:06:01.561
2258
+ It's a bit depending on whether the translation
2259
+ should be a bit related to your input.
2260
+
2261
+ 1:06:02.402 --> 1:06:18.053
2262
+ And since we are always multiplying things,
2263
+ the longer the sequences we are getting smaller,
2264
+
2265
+ 1:06:18.053 --> 1:06:18.726
2266
+ it.
2267
+
2268
+ 1:06:19.359 --> 1:06:29.340
2269
+ It's somewhat right for humans too, but
2270
+ the models tend to overestimate because of
2271
+
2272
+ 1:06:29.340 --> 1:06:34.388
2273
+ this, preferring short translations over long translations.
2274
+
2275
+ 1:06:35.375 --> 1:06:46.474
2276
+ Then, of course, that means that it's not
2277
+ easy to stay on a computer because eventually
2278
+
2279
+ 1:06:46.474 --> 1:06:48.114
2280
+ it suggests.
2281
+
2282
+ 1:06:51.571 --> 1:06:59.247
2283
+ First of all there is another way and that's
2284
+ typically used but you don't have to do really
2285
+
2286
+ 1:06:59.247 --> 1:07:07.089
2287
+ because this is normally not a second position
2288
+ and if it's like on the 20th position you only
2289
+
2290
+ 1:07:07.089 --> 1:07:09.592
2291
+ have to have some bean lower.
2292
+
2293
+ 1:07:10.030 --> 1:07:17.729
2294
+ But you are right because these issues get
2295
+ larger, the larger your input is, and then
2296
+
2297
+ 1:07:17.729 --> 1:07:20.235
2298
+ you might make more errors.
2299
+
2300
+ 1:07:20.146 --> 1:07:27.578
2301
+ So therefore this is true, but it's not as
2302
+ simple that this one is always in the.
2303
+
2304
+ 1:07:28.408 --> 1:07:45.430
2305
+ That the translation for it goes down with
2306
+ higher insert sizes has there been more control.
2307
+
2308
+ 1:07:47.507 --> 1:07:51.435
2309
+ In this work you see a dozen knocks.
2310
+
2311
+ 1:07:51.329 --> 1:07:52.940
2312
+ Knots go down.
2313
+
2314
+ 1:07:52.833 --> 1:08:00.249
2315
+ That's light green here, but at least you
2316
+ don't see the sharp drop.
2317
+
2318
+ 1:08:00.820 --> 1:08:07.897
2319
+ So if you do some type of normalization, at
2320
+ least you can assess this probability and limit
2321
+
2322
+ 1:08:07.897 --> 1:08:08.204
2323
+ it.
2324
+
2325
+ 1:08:15.675 --> 1:08:24.828
2326
+ There is other reasons why, like initial,
2327
+ it's not only the length, but there can be
2328
+
2329
+ 1:08:24.828 --> 1:08:26.874
2330
+ other reasons why.
2331
+
2332
+ 1:08:27.067 --> 1:08:37.316
2333
+ And if you just take it too large, you're
2334
+ looking too often at ways in between, but it's
2335
+
2336
+ 1:08:37.316 --> 1:08:40.195
2337
+ better to ignore things.
2338
+
2339
+ 1:08:41.101 --> 1:08:44.487
2340
+ But that's more a hand gravy argument.
2341
+
2342
+ 1:08:44.401 --> 1:08:47.876
2343
+ Agree so don't know if the exact word.
2344
+
2345
+ 1:08:48.648 --> 1:08:53.223
2346
+ You need to do the normalization and there
2347
+ are different ways of doing it.
2348
+
2349
+ 1:08:53.162 --> 1:08:54.142
2350
+ It's mainly OK.
2351
+
2352
+ 1:08:54.142 --> 1:08:59.410
2353
+ We're just now not taking the translation
2354
+ with the highest probability, but we during
2355
+
2356
+ 1:08:59.410 --> 1:09:04.922
2357
+ the coding have another feature saying not
2358
+ only take the one with the highest probability
2359
+
2360
+ 1:09:04.922 --> 1:09:08.169
2361
+ but also prefer translations which are a bit
2362
+ longer.
2363
+
2364
+ 1:09:08.488 --> 1:09:16.933
2365
+ You can do that different in a way to divide
2366
+ by the center length.
2367
+
2368
+ 1:09:16.807 --> 1:09:23.111
2369
+ We take not the highest but the highest average.
2370
+
2371
+ 1:09:23.563 --> 1:09:28.841
2372
+ Of course, if both are the same lengths, it
2373
+ doesn't matter if M is the same lengths in
2374
+
2375
+ 1:09:28.841 --> 1:09:34.483
2376
+ all cases, but if you compare a translation
2377
+ with seven or eight words, there is a difference
2378
+
2379
+ 1:09:34.483 --> 1:09:39.700
2380
+ if you want to have the one with the highest
2381
+ probability or with the highest average.
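A small sketch of such length normalization: compare the average log-probability per word rather than the raw sum (the exponent alpha is an illustrative knob; alpha = 1 is the plain average):

```python
def length_normalized_score(sum_log_prob, length, alpha=1.0):
    """Counteracts the bias towards short translations."""
    return sum_log_prob / (length ** alpha)

# A 7-word hypothesis with log-prob -7.7 scores -1.10 per word; an 8-word one with
# log-prob -8.4 scores -1.05 per word, so the longer one is now preferred.
short_score = length_normalized_score(-7.7, 7)
long_score = length_normalized_score(-8.4, 8)
```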
2382
+
2383
+ 1:09:41.021 --> 1:09:50.993
2384
+ So that is the first one can have some reward
2385
+ model for each word, add a bit of the score,
2386
+
2387
+ 1:09:50.993 --> 1:09:51.540
2388
+ and.
2389
+
2390
+ 1:09:51.711 --> 1:10:03.258
2391
+ And then, of course, you have to find you
2392
+ that there is also more complex ones here.
2393
+
2394
+ 1:10:03.903 --> 1:10:08.226
2395
+ So there is different ways of doing that,
2396
+ and of course that's important.
2397
+
2398
+ 1:10:08.428 --> 1:10:11.493
2399
+ But in all of that, the main idea is OK.
2400
+
2401
+ 1:10:11.493 --> 1:10:18.520
2402
+ We are like knowing of the arrow that the
2403
+ model seems to prevent or prefer short translation.
2404
+
2405
+ 1:10:18.445 --> 1:10:24.800
2406
+ We circumvent that by OK we are adding we
2407
+ are no longer searching for the best one.
2408
+
2409
+ 1:10:24.764 --> 1:10:30.071
2410
+ But we're searching for the one best one and
2411
+ some additional constraints, so mainly you
2412
+
2413
+ 1:10:30.071 --> 1:10:32.122
2414
+ are doing here during the coding.
2415
+
2416
+ 1:10:32.061 --> 1:10:37.411
2417
+ You're not completely trusting your model,
2418
+ but you're adding some buyers or constraints
2419
+
2420
+ 1:10:37.411 --> 1:10:39.600
2421
+ into what should also be fulfilled.
2422
+
2423
+ 1:10:40.000 --> 1:10:42.543
2424
+ That can be, for example, that the length
2425
+ should be recently.
2426
+
2427
+ 1:10:49.369 --> 1:10:51.071
2428
+ Any More Questions to That.
2429
+
2430
+ 1:10:56.736 --> 1:11:04.001
2431
+ Last idea which gets recently quite a bit
2432
+ more interest also is what is called minimum
2433
+
2434
+ 1:11:04.001 --> 1:11:11.682
2435
+ base risk decoding and there is maybe not the
2436
+ one correct translation but there are several
2437
+
2438
+ 1:11:11.682 --> 1:11:13.937
2439
+ good correct translations.
2440
+
2441
+ 1:11:14.294 --> 1:11:21.731
2442
+ And the idea is now we don't want to find
2443
+ the one translation, which is maybe the highest
2444
+
2445
+ 1:11:21.731 --> 1:11:22.805
2446
+ probability.
2447
+
2448
+ 1:11:23.203 --> 1:11:31.707
2449
+ Instead we are looking at all the high translation,
2450
+ all translation with high probability and then
2451
+
2452
+ 1:11:31.707 --> 1:11:39.524
2453
+ we want to take one representative out of this
2454
+ so we're just most similar to all the other
2455
+
2456
+ 1:11:39.524 --> 1:11:42.187
2457
+ high-probability translations again.
2458
+
2459
+ 1:11:43.643 --> 1:11:46.642
2460
+ So how does it work?
2461
+
2462
+ 1:11:46.499 --> 1:11:55.640
2463
+ First you could have imagined you have reference
2464
+ translations.
2465
+
2466
+ 1:11:55.996 --> 1:12:13.017
2467
+ You have a set of reference translations and
2468
+ then what you want to get is you want to have.
2469
+
2470
+ 1:12:13.073 --> 1:12:28.641
2471
+ As a probability distribution you measure
2472
+ the similarity of reference and the hypothesis.
2473
+
2474
+ 1:12:28.748 --> 1:12:31.408
2475
+ So you have two sets of translation.
2476
+
2477
+ 1:12:31.336 --> 1:12:34.788
2478
+ You have the human translations of a sentence.
2479
+
2480
+ 1:12:35.675 --> 1:12:39.251
2481
+ That's of course not realistic, but first
2482
+ from the idea.
2483
+
2484
+ 1:12:39.188 --> 1:12:42.326
2485
+ Then you have your set of possible translations.
2486
+
2487
+ 1:12:42.622 --> 1:12:52.994
2488
+ And now you're not saying okay, we have only
2489
+ one human, but we have several humans with
2490
+
2491
+ 1:12:52.994 --> 1:12:56.294
2492
+ different types of quality.
2493
+
2494
+ 1:12:56.796 --> 1:13:07.798
2495
+ You have to have two metrics here, the similarity
2496
+ between the automatic translation and the quality
2497
+
2498
+ 1:13:07.798 --> 1:13:09.339
2499
+ of the human.
2500
+
2501
+ 1:13:10.951 --> 1:13:17.451
2502
+ Of course, we have the same problem that we
2503
+ don't have the human reference, so we have.
2504
+
2505
+ 1:13:18.058 --> 1:13:29.751
2506
+ So when we are doing it, instead of estimating
2507
+ the quality based on the human, we use our
2508
+
2509
+ 1:13:29.751 --> 1:13:30.660
2510
+ model.
2511
+
2512
+ 1:13:31.271 --> 1:13:37.612
2513
+ So we can't be like humans, so we take the
2514
+ model probability.
2515
+
2516
+ 1:13:37.510 --> 1:13:40.786
2517
+ We take the set here first of.
2518
+
2519
+ 1:13:41.681 --> 1:13:48.755
2520
+ Then we are comparing each hypothesis to this
2521
+ one, so you have two sets.
2522
+
2523
+ 1:13:48.658 --> 1:13:53.942
2524
+ Just imagine here you take all possible translations.
2525
+
2526
+ 1:13:53.844 --> 1:13:58.738
2527
+ Here you take your hypothesis in comparing
2528
+ them.
2529
+
2530
+ 1:13:58.678 --> 1:14:03.798
2531
+ And then you're taking estimating the quality
2532
+ based on the outcome.
2533
+
2534
+ 1:14:04.304 --> 1:14:06.874
2535
+ So the overall idea is okay.
2536
+
2537
+ 1:14:06.785 --> 1:14:14.652
2538
+ We are not finding the best hypothesis but
2539
+ finding the hypothesis which is most similar
2540
+
2541
+ 1:14:14.652 --> 1:14:17.066
2542
+ to many good translations.
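A minimal sketch of minimum Bayes risk decoding over a sampled candidate list: the candidates double as equally weighted pseudo references, and `similarity` is any sentence-level metric you trust (chrF, a neural metric, and so on):

```python
def mbr_decode(candidates, similarity):
    """Return the candidate with the highest average similarity to all the others."""
    def expected_utility(hyp):
        others = [ref for ref in candidates if ref is not hyp]
        return sum(similarity(hyp, ref) for ref in others) / max(len(others), 1)
    return max(candidates, key=expected_utility)
```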
2543
+
2544
+ 1:14:19.599 --> 1:14:21.826
2545
+ Why would you do that?
2546
+
2547
+ 1:14:21.730 --> 1:14:25.070
2548
+ It's a bit like a smoothing idea.
2549
+
2550
+ 1:14:24.971 --> 1:14:28.609
2551
+ Imagine this is the probability of.
2552
+
2553
+ 1:14:29.529 --> 1:14:36.634
2554
+ So if you would do beam search or mini search
2555
+ or anything, if you just take the highest probability
2556
+
2557
+ 1:14:36.634 --> 1:14:39.049
2558
+ one, you would take this red one.
2559
+
2560
+ 1:14:39.799 --> 1:14:45.686
2561
+ Has this type of probability distribution.
2562
+
2563
+ 1:14:45.549 --> 1:14:58.556
2564
+ Then it might be better to take some of these
2565
+ models because it's a bit lower in probability.
2566
+
2567
+ 1:14:58.618 --> 1:15:12.501
2568
+ So what you're mainly doing is you're doing
2569
+ some smoothing of your probability distribution.
2570
+
2571
+ 1:15:15.935 --> 1:15:17.010
2572
+ How can you do that?
2573
+
2574
+ 1:15:16.959 --> 1:15:20.132
2575
+ Of course, we cannot do this again compared
2576
+ to all the hype.
2577
+
2578
+ 1:15:21.141 --> 1:15:29.472
2579
+ But what we can do is we have just two sets
2580
+ and we're just taking them the same.
2581
+
2582
+ 1:15:29.369 --> 1:15:38.422
2583
+ So we're having our set of hypotheses
2584
+ and the set of pseudo-references.
2585
+
2586
+ 1:15:39.179 --> 1:15:55.707
2587
+ And we can just do the same, so we can
2588
+ just compare the utility of the hypothesis.
2589
+
2590
+ 1:15:56.656 --> 1:16:16.182
2591
+ And then, of course, the question is how do
2592
+ we measure the quality of the hypothesis?
2593
+
2594
+ 1:16:16.396 --> 1:16:28.148
2595
+ Course: You could also take here the probability
2596
+ of this p(y given x), but you can also say
2597
+
2598
+ 1:16:28.148 --> 1:16:30.958
2599
+ we only take the top.
2600
+
2601
+ 1:16:31.211 --> 1:16:39.665
2602
+ And where we don't want to really rely on
2603
+ how good they are, we filtered out all the
2604
+
2605
+ 1:16:39.665 --> 1:16:40.659
2606
+ bad ones.
2607
+
2608
+ 1:16:40.940 --> 1:16:50.109
2609
+ Q&A: How do you set the quality of the pseudo-references? So
2610
+ that is the first question for the minimum
2611
+
2612
+ 1:16:50.109 --> 1:16:54.604
2613
+ Bayes risk algorithm, and what are your pseudo-references?
2614
+
2615
+ 1:16:55.255 --> 1:17:06.968
2616
+ So how do you set the quality of all these
2617
+ references here in the independent sampling?
2618
+
2619
+ 1:17:06.835 --> 1:17:10.168
2620
+ They all have the same.
2621
+
2622
+ 1:17:10.750 --> 1:17:12.308
2623
+ There's Also Work Where You Can Take That.
2624
+
2625
+ 1:17:13.453 --> 1:17:17.952
2626
+ And then the second question you have to do
2627
+ is, of course,.
2628
+
2629
+ 1:17:17.917 --> 1:17:26.190
2630
+ How do you compare now two hypotheses, so
2631
+ you have now Y and H which are both generated
2632
+
2633
+ 1:17:26.190 --> 1:17:34.927
2634
+ by the system and you want to find the H which
2635
+ is most similar to all the other translations.
2636
+
2637
+ 1:17:35.335 --> 1:17:41.812
2638
+ So it's mainly like this model here, which
2639
+ says how similar is H to all the other Y's.
2640
+
2641
+ 1:17:42.942 --> 1:17:50.127
2642
+ So you have to again use some type of similarity
2643
+ metric, which says how similar to possible.
2644
+
2645
+ 1:17:52.172 --> 1:17:53.775
2646
+ How can you do that?
2647
+
2648
+ 1:17:53.699 --> 1:17:58.315
2649
+ We luckily knew how to compare a reference
2650
+ to a hypothesis.
2651
+
2652
+ 1:17:58.238 --> 1:18:00.423
2653
+ We have evaluation metrics.
2654
+
2655
+ 1:18:00.345 --> 1:18:03.703
2656
+ You can do something like sentence level.
2657
+
2658
+ 1:18:04.044 --> 1:18:13.501
2659
+ But especially if you're looking into neural models
2660
+ you should have a strong metric, so you can use
2661
+
2662
+ 1:18:13.501 --> 1:18:17.836
2663
+ a neural metric which directly compares to.
2664
+
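To make the procedure concrete, here is a minimal Python sketch of minimum Bayes risk decoding. The unigram-overlap utility is only a stand-in for a real sentence-level metric such as BLEU or a neural metric, and the uniform weighting over the pseudo-references is just one of the options discussed above; treat it as an illustration rather than the lecture's actual setup.

# Minimal sketch of minimum Bayes risk (MBR) decoding, assuming the candidate
# hypotheses and the pseudo-references have already been sampled from the model.
# The utility below is a crude unigram-overlap score standing in for a real
# metric such as sentence-BLEU or a neural metric.

def utility(hyp: str, ref: str) -> float:
    """Crude similarity between two sentences (placeholder for a real metric)."""
    h, r = set(hyp.split()), set(ref.split())
    if not h or not r:
        return 0.0
    return len(h & r) / len(h | r)

def mbr_decode(hypotheses, pseudo_references):
    """Return the hypothesis with the highest average utility against the
    pseudo-references, i.e. the candidate most similar to all likely translations."""
    best, best_score = None, float("-inf")
    for h in hypotheses:
        # uniform weights over pseudo-references; one could also weight by model probability
        score = sum(utility(h, r) for r in pseudo_references) / len(pseudo_references)
        if score > best_score:
            best, best_score = h, score
    return best, best_score

# toy usage: in practice the same sampled set often serves as both candidates
# and pseudo-references
samples = ["the cat sat on the mat",
           "the cat sits on the mat",
           "a dog ran in the park"]
print(mbr_decode(samples, samples))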
2665
+ 1:18:22.842 --> 1:18:29.292
2666
+ Yes, so that is, is the main idea of minimum
2667
+ base risk to, so the important idea you should
2668
+
2669
+ 1:18:29.292 --> 1:18:35.743
2670
+ keep in mind is that it's doing somehow the
2671
+ smoothing by not taking the highest probability
2672
+
2673
+ 1:18:35.743 --> 1:18:40.510
2674
+ one, but by comparing like by taking a set
2675
+ of high probability one.
2676
+
2677
+ 1:18:40.640 --> 1:18:45.042
2678
+ And then looking for the translation, which
2679
+ is most similar to all of that.
2680
+
2681
+ 1:18:45.445 --> 1:18:49.888
2682
+ And thereby doing a bit more smoothing because
2683
+ you look at this one.
2684
+
2685
+ 1:18:49.824 --> 1:18:55.135
2686
+ If you have this one, for example, it would
2687
+ be more similar to all of these ones.
2688
+
2689
+ 1:18:55.071 --> 1:19:00.966
2690
+ But if you take this one, it's higher probability,
2691
+ but it's very dissimilar to all these.
2692
+
2693
+ 1:19:05.445 --> 1:19:17.609
2694
+ Hey, that is all for decoding before we finish
2695
+ with your combination of models.
2696
+
2697
+ 1:19:18.678 --> 1:19:20.877
2698
+ Sort of set of pseudo-references.
2699
+
2700
+ 1:19:20.812 --> 1:19:24.370
2701
+ Thomas Brown writes a little bit of type research
2702
+ or.
2703
+
2704
+ 1:19:24.944 --> 1:19:27.087
2705
+ For example, you can do beam search.
2706
+
2707
+ 1:19:27.029 --> 1:19:28.774
2708
+ You can do sampling for that.
2709
+
2710
+ 1:19:28.716 --> 1:19:31.172
2711
+ Oh yeah, we had mentioned sampling there.
2712
+
2713
+ 1:19:31.113 --> 1:19:34.409
2714
+ I don't know somebody asking for what sampling
2715
+ is good.
2716
+
2717
+ 1:19:34.351 --> 1:19:37.205
2718
+ So there's, of course, another important issue.
2719
+
2720
+ 1:19:37.146 --> 1:19:40.120
2721
+ How do you get a good representative set of
2722
+ H?
2723
+
2724
+ 1:19:40.620 --> 1:19:47.147
2725
+ If you do beam search, it might be that you
2726
+ end up with two similar ones, and maybe it's
2727
+
2728
+ 1:19:47.147 --> 1:19:49.274
2729
+ prevented by doing sampling.
2730
+
2731
+ 1:19:49.201 --> 1:19:55.289
2732
+ But maybe in sampling you find worse ones,
2733
+ but yet some type of model is helpful.
2734
+
2735
+ 1:19:56.416 --> 1:20:04.863
2736
+ Search method use more transformed based translation
2737
+ points.
2738
+
2739
+ 1:20:04.724 --> 1:20:09.852
2740
+ Nowadays beam search is definitely.
2741
+
2742
+ 1:20:10.130 --> 1:20:13.749
2743
+ There is work on this.
2744
+
2745
+ 1:20:13.592 --> 1:20:27.262
2746
+ The problem is that the MBR is often a lot
2747
+ more compute-heavy because you have to sample
2748
+
2749
+ 1:20:27.262 --> 1:20:29.488
2750
+ translations.
2751
+
2752
+ 1:20:31.871 --> 1:20:40.946
2753
+ If you are sampling, then do we take an equal weight
2754
+ or a higher weight for the most probable one?
2755
+
2756
+ 1:20:40.825 --> 1:20:43.011
2757
+ Now we put them.
2758
+
2759
+ 1:20:43.623 --> 1:20:46.262
2760
+ Bit and then we say okay, you don't have to
2761
+ be fine.
2762
+
2763
+ 1:20:46.213 --> 1:20:47.659
2764
+ I'm going to put it to you.
2765
+
2766
+ 1:20:48.428 --> 1:20:52.690
2767
+ Yes, so that is what you can also do.
2768
+
2769
+ 1:20:52.577 --> 1:21:00.093
2770
+ Instead of taking uniform probability, you
2771
+ could take the model's.
2772
+
2773
+ 1:21:01.041 --> 1:21:14.303
2774
+ The uniform is a bit more robust because if
2775
+ you had this one it might be that there is
2776
+
2777
+ 1:21:14.303 --> 1:21:17.810
2778
+ some crazy exceptions.
2779
+
2780
+ 1:21:17.897 --> 1:21:21.088
2781
+ And then it would still relax.
2782
+
2783
+ 1:21:20.986 --> 1:21:28.261
2784
+ So if you look at this picture, the probability
2785
+ here would be higher.
2786
+
2787
+ 1:21:28.157 --> 1:21:31.798
2788
+ But yeah, that's a bit of tuning.
2789
+
2790
+ 1:21:33.073 --> 1:21:42.980
2791
+ In this case, and yes, it is like modeling
2792
+ also the ants that.
2793
+
2794
+ 1:21:49.169 --> 1:21:56.265
2795
+ The last thing is now we always have considered
2796
+ one model.
2797
+
2798
+ 1:21:56.145 --> 1:22:04.086
2799
+ It's also some prints helpful to not only
2800
+ look at one model but.
2801
+
2802
+ 1:22:04.384 --> 1:22:10.453
2803
+ So in general there's many ways of how you
2804
+ can make several models and with it's even
2805
+
2806
+ 1:22:10.453 --> 1:22:17.370
2807
+ easier you can just start three different random
2808
+ initializations you get three different models
2809
+
2810
+ 1:22:17.370 --> 1:22:18.428
2811
+ and typically.
2812
+
2813
+ 1:22:19.019 --> 1:22:27.299
2814
+ And then the question is, can we combine their
2815
+ strength into one model and use that then?
2816
+
2817
+ 1:22:29.669 --> 1:22:39.281
2818
+ And that can be done and it can be either
2819
+ online or ensemble, and the more offline thing
2820
+
2821
+ 1:22:39.281 --> 1:22:41.549
2822
+ is called reranking.
2823
+
2824
+ 1:22:42.462 --> 1:22:52.800
2825
+ So the idea is, for example, an ensemble that
2826
+ you combine different initializations.
2827
+
2828
+ 1:22:52.678 --> 1:23:02.045
2829
+ Of course, you can also do other things like
2830
+ having different architecture.
2831
+
2832
+ 1:23:02.222 --> 1:23:08.922
2833
+ But the easiest thing you can change always
2834
+ in generating two motors is to have different.
2835
+
2836
+ 1:23:09.209 --> 1:23:24.054
2837
+ And then the question is how can you combine
2838
+ that?
2839
+
2840
+ 1:23:26.006 --> 1:23:34.245
2841
+ And the easiest thing, as said, is the model
2842
+ ensemble.
2843
+
2844
+ 1:23:34.095 --> 1:23:39.422
2845
+ What you mainly do is in parallel.
2846
+
2847
+ 1:23:39.270 --> 1:23:43.841
2848
+ You decode with all of the models.
2849
+
2850
+ 1:23:44.444 --> 1:23:59.084
2851
+ So the probability of the output and you can
2852
+ join this one to a joint one by just summing
2853
+
2854
+ 1:23:59.084 --> 1:24:04.126
2855
+ up over your key models again.
2856
+
2857
+ 1:24:04.084 --> 1:24:10.374
2858
+ So you still have a probability distribution,
2859
+ but you are not taking only one output here,
2860
+
2861
+ 1:24:10.374 --> 1:24:10.719
2862
+ but.
2863
+
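A minimal sketch of what this ensembling step looks like, assuming each model has already produced its next-token distribution over a shared output vocabulary; in a real decoder this average would be computed at every step of the search.

# Minimal sketch of ensembling at decoding time: every model proposes a
# probability distribution over the next token, and the ensemble prediction is
# the (uniform) average of those distributions. In a real system each "model"
# would be a forward pass of a trained network over the same prefix.

def ensemble_step(distributions):
    """Average K next-token distributions (lists of equal length) into one."""
    k = len(distributions)
    vocab_size = len(distributions[0])
    return [sum(d[i] for d in distributions) / k for i in range(vocab_size)]

# toy usage with a vocabulary of 4 tokens and 3 models
model_outputs = [
    [0.70, 0.10, 0.10, 0.10],
    [0.60, 0.20, 0.10, 0.10],
    [0.40, 0.40, 0.10, 0.10],
]
combined = ensemble_step(model_outputs)
next_token = max(range(len(combined)), key=combined.__getitem__)
print(combined, next_token)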
2864
+ 1:24:11.491 --> 1:24:20.049
2865
+ So that's one you can easily combine different
2866
+ models, and the nice thing is it typically
2867
+
2868
+ 1:24:20.049 --> 1:24:20.715
2869
+ works.
2870
+
2871
+ 1:24:21.141 --> 1:24:27.487
2872
+ You additional improvement with only more
2873
+ calculation but not more human work.
2874
+
2875
+ 1:24:27.407 --> 1:24:33.754
2876
+ You just do the same thing for times and you're
2877
+ getting a better performance.
2878
+
2879
+ 1:24:33.793 --> 1:24:41.623
2880
+ Like having more layers and so on, the advantage
2881
+ of bigger models is of course you have to have
2882
+
2883
+ 1:24:41.623 --> 1:24:46.272
2884
+ the big models only joint and decoding during
2885
+ inference.
2886
+
2887
+ 1:24:46.190 --> 1:24:52.635
2888
+ There you have to load models in parallel
2889
+ because you have to do your search.
2890
+
2891
+ 1:24:52.672 --> 1:24:57.557
2892
+ Normally there is more memory resources for
2893
+ training than you need for inference.
2894
+
2895
+ 1:25:00.000 --> 1:25:12.637
2896
+ You have to train four models and the decoding
2897
+ speed is also slower because you need to decode
2898
+
2899
+ 1:25:12.637 --> 1:25:14.367
2900
+ four models.
2901
+
2902
+ 1:25:14.874 --> 1:25:25.670
2903
+ There is one other very important thing and
2904
+ the models have to be very similar, at least
2905
+
2906
+ 1:25:25.670 --> 1:25:27.368
2907
+ in some ways.
2908
+
2909
+ 1:25:27.887 --> 1:25:28.506
2910
+ Course.
2911
+
2912
+ 1:25:28.428 --> 1:25:34.612
2913
+ You can only combine this one if you have
2914
+ the same vocabulary because you are just.
2915
+
2916
+ 1:25:34.874 --> 1:25:43.110
2917
+ So just imagine you have two different vocabulary sizes
2918
+ because you want to compare them, or a character-
2919
+ based model.
2920
+ 1:25:43.110 --> 1:25:44.273
2921
+ based model.
2922
+
2923
+ 1:25:44.724 --> 1:25:53.327
2924
+ That's at least not easily possible here because
2925
+ once your output would be here a word and the
2926
+
2927
+ 1:25:53.327 --> 1:25:56.406
2928
+ other one would have to sum over.
2929
+
2930
+ 1:25:56.636 --> 1:26:07.324
2931
+ So this ensemble typically only works if you
2932
+ have the same output vocabulary.
2933
+
2934
+ 1:26:07.707 --> 1:26:16.636
2935
+ Your input can be different because that is
2936
+ only done once and then.
2937
+
2938
+ 1:26:16.506 --> 1:26:23.755
2939
+ Your output vocabulary has to be the same
2940
+ otherwise.
2941
+
2942
+ 1:26:27.507 --> 1:26:41.522
2943
+ There's even a surprising effect of improving
2944
+ your performance and it's again some kind of
2945
+
2946
+ 1:26:41.522 --> 1:26:43.217
2947
+ smoothing.
2948
+
2949
+ 1:26:43.483 --> 1:26:52.122
2950
+ So normally during training what we are doing
2951
+ is we can save the checkpoints after each epoch.
2952
+
2953
+ 1:26:52.412 --> 1:27:01.774
2954
+ And you have this type of curve where your
2955
+ error normally should go down, and
2956
+
2957
+ 1:27:01.774 --> 1:27:09.874
2958
+ if you do early stopping it means that at the
2959
+ end you select not the lowest.
2960
+
2961
+ 1:27:11.571 --> 1:27:21.467
2962
+ However, some type of smoothing is there again.
2963
+
2964
+ 1:27:21.261 --> 1:27:31.161
2965
+ Sometimes what you can do is take an ensemble.
2966
+
2967
+ 1:27:31.491 --> 1:27:38.798
2968
+ That is not as good, but you still have four
2969
+ different models, and they give you a little.
2970
+
2971
+ 1:27:39.259 --> 1:27:42.212
2972
+ So,.
2973
+
2974
+ 1:27:43.723 --> 1:27:48.340
2975
+ It's some are helping you, so now they're
2976
+ supposed to be something different, you know.
2977
+
2978
+ 1:27:49.489 --> 1:27:53.812
2979
+ Oh, we didn't do that, so that is a checkpoint ensemble.
2980
+
2981
+ 1:27:53.713 --> 1:27:59.119
2982
+ There is one thing interesting, which is even
2983
+ faster.
2984
+
2985
+ 1:27:59.419 --> 1:28:12.255
2986
+ Normally let's give you better performance
2987
+ because this one might be again like a smooth
2988
+
2989
+ 1:28:12.255 --> 1:28:13.697
2990
+ ensemble.
2991
+
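A minimal sketch of the checkpoint-averaging idea mentioned here, assuming the last few checkpoints share identical parameter names and shapes; parameters are shown as plain Python lists standing in for the tensors of a real state dict.

# Minimal sketch of checkpoint averaging: instead of keeping the last K
# checkpoints as an ensemble, average their parameters element-wise into a
# single model, which is as cheap to run as one checkpoint.

def average_checkpoints(checkpoints):
    """Element-wise average of K parameter dictionaries with identical keys/shapes."""
    k = len(checkpoints)
    averaged = {}
    for name in checkpoints[0]:
        values = [ckpt[name] for ckpt in checkpoints]
        averaged[name] = [sum(v) / k for v in zip(*values)]
    return averaged

# toy usage: three checkpoints of a "model" with one weight vector
ckpts = [{"w": [1.0, 2.0]}, {"w": [1.2, 1.8]}, {"w": [0.8, 2.2]}]
print(average_checkpoints(ckpts))   # {'w': [1.0, 2.0]}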
2992
+ 1:28:16.736 --> 1:28:22.364
2993
+ Of course, there is also some problems with
2994
+ this, so I said.
2995
+
2996
+ 1:28:22.272 --> 1:28:30.023
2997
+ For example, maybe you want to do different
2998
+ word representations, with characters and.
2999
+
3000
+ 1:28:30.590 --> 1:28:37.189
3001
+ You want to do right to left decoding so you
3002
+ normally do like I go home but then your translation
3003
+
3004
+ 1:28:37.189 --> 1:28:39.613
3005
+ depends only on the previous words.
3006
+
3007
+ 1:28:39.545 --> 1:28:45.926
3008
+ If you want to model on the future you could
3009
+ do the inverse direction and generate the target
3010
+
3011
+ 1:28:45.926 --> 1:28:47.895
3012
+ sentence from right to left.
3013
+
3014
+ 1:28:48.728 --> 1:28:50.839
3015
+ But it's not easy to combine these things.
3016
+
3017
+ 1:28:51.571 --> 1:28:56.976
3018
+ In order to do this, or what is also sometimes
3019
+ interesting is doing in verse translation.
3020
+
3021
+ 1:28:57.637 --> 1:29:07.841
3022
+ You can combine these types of models in the
3023
+ next election.
3024
+
3025
+ 1:29:07.671 --> 1:29:13.968
3026
+ That is only a bit which we can do.
3027
+
3028
+ 1:29:14.494 --> 1:29:29.593
3029
+ Next time what you should remember is how
3030
+ search works and do you have any final questions.
3031
+
3032
+ 1:29:33.773 --> 1:29:43.393
3033
+ Then I wish you a happy holiday for next week
3034
+ and then Monday there is another practical
3035
+
3036
+ 1:29:43.393 --> 1:29:50.958
3037
+ and then Thursday in two weeks so we'll have
3038
+ the next lecture Monday.
3039
+
demo_data/lectures/Lecture-09-25.05.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb17280ddd03304eacdea7239b8a65b617c0c5bc9a4ab92e07100370c09187af
3
+ size 119262060
demo_data/lectures/Lecture-10-13.06.2023/English.vtt ADDED
@@ -0,0 +1,2458 @@
 
 
1
+ WEBVTT
2
+
3
+ 0:00:00.860 --> 0:00:04.146
4
+ Introduction: Okay, again, welcome.
5
+
6
+ 0:00:04.524 --> 0:00:09.256
7
+ So today I'll be doing the lecture.
8
+
9
+ 0:00:09.124 --> 0:00:12.201
10
+ My name is Danny Liro.
11
+
12
+ 0:00:12.067 --> 0:00:16.754
13
+ I'm one of the PhD students with.
14
+
15
+ 0:00:17.137 --> 0:00:25.942
16
+ And specifically how to learn representations
17
+ that are common across languages and use that
18
+
19
+ 0:00:25.942 --> 0:00:29.004
20
+ to help low resource languages.
21
+
22
+ 0:00:29.689 --> 0:00:39.445
23
+ So hope today we can explore a little bit
24
+ about multilingual machine translation and hopefully.
25
+
26
+ 0:00:40.100 --> 0:00:50.940
27
+ So today what we are going to do first we
28
+ are going to look at.
29
+
30
+ 0:00:52.152 --> 0:01:02.491
31
+ Second, we will be looking into more details
32
+ as in how we achieve multilingual machine translation
33
+
34
+ 0:01:02.491 --> 0:01:06.183
35
+ and what are the techniques there.
36
+
37
+ 0:01:06.078 --> 0:01:12.199
38
+ At last, we are going to look at the current
39
+ challenges.
40
+
41
+ 0:01:13.573 --> 0:01:15.976
42
+ Alright, so some definitions.
43
+
44
+ 0:01:15.895 --> 0:01:19.821
45
+ First, what is multilingual machine translation?
46
+
47
+ 0:01:21.201 --> 0:01:28.637
48
+ So for a multilingual machine translation
49
+ system, it's basically a system that is able
50
+
51
+ 0:01:28.637 --> 0:01:34.279
52
+ to handle multiple source languages or multiple
53
+ target languages.
54
+
55
+ 0:01:34.254 --> 0:01:44.798
56
+ You see here you've got source on the source
57
+ side, some German Chinese, Spanish and English.
58
+
59
+ 0:01:45.485 --> 0:01:50.615
60
+ Physically, it's also a quite interesting
61
+ machine learning challenge actually.
62
+
63
+ 0:01:51.031 --> 0:02:05.528
64
+ So if you consider each translation pair as
65
+ a different task in machine learning, then
66
+
67
+ 0:02:05.528 --> 0:02:08.194
68
+ a multilingual.
69
+
70
+ 0:02:08.628 --> 0:02:17.290
71
+ Where it has to specialize in all these different
72
+ translation directions and try to be good.
73
+
74
+ 0:02:17.917 --> 0:02:26.890
75
+ So this is basically about multi-task learning,
76
+ and here when translation direction being one
77
+
78
+ 0:02:26.890 --> 0:02:27.462
79
+ task.
80
+
81
+ 0:02:28.428 --> 0:02:35.096
82
+ Interesting question to ask here is like do
83
+ we get synergy like different tasks helping
84
+
85
+ 0:02:35.096 --> 0:02:39.415
86
+ each other, the knowledge of one task helping
87
+ the other?
88
+
89
+ 0:02:39.539 --> 0:02:48.156
90
+ Or do we get more interference in English
91
+ to German, and now I get worse at English to
92
+
93
+ 0:02:48.156 --> 0:02:49.047
94
+ Chinese.
95
+
96
+ 0:02:49.629 --> 0:02:55.070
97
+ So this is also a very interesting question
98
+ that we'll look into later.
99
+
100
+ 0:02:56.096 --> 0:02:58.605
101
+ Now a little bit of context.
102
+
103
+ 0:02:59.519 --> 0:03:04.733
104
+ We care about multilingual machine translation.
105
+
106
+ 0:03:04.624 --> 0:03:10.601
107
+ Part of the thing is that machine translation
108
+ models.
109
+
110
+ 0:03:11.291 --> 0:03:22.659
111
+ If you consider all the languages in the world,
112
+ there are a read it here roughly seven thousand
113
+
114
+ 0:03:22.659 --> 0:03:23.962
115
+ languages.
116
+
117
+ 0:03:24.684 --> 0:03:37.764
118
+ So consider this number, and if you think
119
+ about this many languages out there, how many
120
+
121
+ 0:03:37.764 --> 0:03:39.548
122
+ directions.
123
+
124
+ 0:03:40.220 --> 0:03:46.897
125
+ So this means to cover end languages.
126
+
127
+ 0:03:46.722 --> 0:03:59.377
128
+ We're going to end up with a quadratic, an n-
129
+ square number of directions.
130
+
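A quick back-of-the-envelope check of this quadratic growth; the numbers below are illustrative only.

# Number of translation directions for n languages grows quadratically:
# n * (n - 1) ordered pairs.
def num_directions(n: int) -> int:
    return n * (n - 1)

print(num_directions(10))      # 90 directions for 10 languages
print(num_directions(7000))    # 48,993,000 directions for ~7000 languages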
131
+ 0:03:59.779 --> 0:04:02.290
132
+ This is very bad; quadratic is very bad.
133
+
134
+ 0:04:03.203 --> 0:04:14.078
135
+ The quadratic situation going on means that
136
+ for a lot of translation directions, if you
137
+
138
+ 0:04:14.078 --> 0:04:16.278
139
+ consider all the.
140
+
141
+ 0:04:17.177 --> 0:04:34.950
142
+ For many of them we aren't going to have any
143
+ parallel data as in existing translated data.
144
+
145
+ 0:04:35.675 --> 0:04:40.001
146
+ So this is a very data scarce situation.
147
+
148
+ 0:04:39.896 --> 0:04:49.685
149
+ We're not going to get parallel data everywhere,
150
+ especially likely when you have a system
151
+
152
+ 0:04:49.685 --> 0:04:52.558
153
+ that covers ten languages.
154
+
155
+ 0:04:52.912 --> 0:05:04.437
156
+ If this axis actually goes towards thousands
157
+ that are realistic, we are going to end up
158
+
159
+ 0:05:04.437 --> 0:05:06.614
160
+ with some holes.
161
+
162
+ 0:05:07.667 --> 0:05:15.400
163
+ So now we are going to ask: Can we use multi-
164
+ linguality to help this kind of low resource?
165
+
166
+ 0:05:15.875 --> 0:05:22.858
167
+ So when useful concept there is mutual intelligibility,
168
+ don't know if you've heard of this.
169
+
170
+ 0:05:23.203 --> 0:05:30.264
171
+ Basically isn't linguistic when you say somebody
172
+ who's speaking one language can directly without
173
+
174
+ 0:05:30.264 --> 0:05:33.218
175
+ learning understands the other language.
176
+
177
+ 0:05:33.146 --> 0:05:39.340
178
+ So if you're a German speaker maybe Dutch
179
+ or Danish and all that kind of stuff would
180
+
181
+ 0:05:39.340 --> 0:05:39.632
182
+ be.
183
+
184
+ 0:05:40.000 --> 0:05:45.990
185
+ Useful or like directly understandable partially
186
+ to you.
187
+
188
+ 0:05:46.586 --> 0:05:52.082
189
+ That is, thanks to this kind of mutual intelligibility
190
+ that is basically based on language
191
+
192
+ 0:05:52.082 --> 0:05:52.791
193
+ similarity.
194
+
195
+ 0:05:53.893 --> 0:05:57.105
196
+ And then there's knowledge sharing this concept.
197
+
198
+ 0:05:57.039 --> 0:06:01.188
199
+ I mean, it's quite intuitive, basically a
200
+ very German speaker.
201
+
202
+ 0:06:01.122 --> 0:06:06.756
203
+ If you start learning Dutch or Danish and
204
+ all these Nordic languages, I think you're
205
+
206
+ 0:06:06.756 --> 0:06:11.197
207
+ going to be faster than just a native English
208
+ speaker or anything.
209
+
210
+ 0:06:11.952 --> 0:06:18.751
211
+ So hopefully our model is also able to do
212
+ this, but we'll see later what the real situation.
213
+
214
+ 0:06:19.799 --> 0:06:27.221
215
+ So we said multilingual is good multilingual
216
+ translation, it's nice and there's a lot of
217
+
218
+ 0:06:27.221 --> 0:06:28.210
219
+ potentials.
220
+
221
+ 0:06:28.969 --> 0:06:32.205
222
+ So it's a long path towards there.
223
+
224
+ 0:06:32.113 --> 0:06:37.571
225
+ Think all the efforts started in so quite
226
+ some years ago.
227
+
228
+ 0:06:37.958 --> 0:06:54.639
229
+ At first people started with models with language
230
+ specific modules.
231
+
232
+ 0:06:54.454 --> 0:06:58.747
233
+ So we talked about the input of the decoder
234
+ architecture in the previous lecturer area.
235
+
236
+ 0:07:00.100 --> 0:07:06.749
237
+ And with this separation of the inputter and
238
+ the decoder, it gives it a natural way to split
239
+
240
+ 0:07:06.749 --> 0:07:07.679
241
+ the modules.
242
+
243
+ 0:07:09.069 --> 0:07:20.805
244
+ So basically what's happening going on here
245
+ is dedicated to each toes language and dedicated.
246
+
247
+ 0:07:21.281 --> 0:07:34.252
248
+ Now given parallel data of body good data
249
+ English German data we just activate this German
250
+
251
+ 0:07:34.252 --> 0:07:39.241
252
+ inputter and activate this and an.
253
+
254
+ 0:07:40.680 --> 0:07:48.236
255
+ So now we are training basically like corresponding
256
+ parts of the encoder decoders.
257
+
258
+ 0:07:48.145 --> 0:07:55.369
259
+ It has some advantages: First, we have a multilingual
260
+ system.
261
+
262
+ 0:07:55.252 --> 0:08:03.887
263
+ Of course, second modularity is also an advantage
264
+ in software engineering.
265
+
266
+ 0:08:03.772 --> 0:08:10.567
267
+ We want to decouple things if the German input
268
+ is broken.
269
+
270
+ 0:08:11.011 --> 0:08:19.313
271
+ So modularity is advantage in this case, but
272
+ again if we think about scalability, if we
273
+
274
+ 0:08:19.313 --> 0:08:27.521
275
+ think about languages out there that we talked
276
+ about, scalability isn't a great thing.
277
+
278
+ 0:08:27.947 --> 0:08:37.016
279
+ We also talked about sharing knowledge or
280
+ sharing representations for different languages.
281
+
282
+ 0:08:37.317 --> 0:08:41.968
283
+ We have a separate thing for each language.
284
+
285
+ 0:08:41.862 --> 0:08:46.453
286
+ How likely is it that we are sharing much?
287
+
288
+ 0:08:46.346 --> 0:08:52.541
289
+ So these are potential disadvantages with
290
+ this approach.
291
+
292
+ 0:08:53.073 --> 0:09:01.181
293
+ So yeah we talked about, we want to have knowledge
294
+ transfer, we want to have similar languages
295
+
296
+ 0:09:01.181 --> 0:09:02.888
297
+ helping each other.
298
+
299
+ 0:09:02.822 --> 0:09:06.095
300
+ This is somehow a more reachable goal.
301
+
302
+ 0:09:06.011 --> 0:09:13.521
303
+ If you have a shared in corner and a shared
304
+ in physically, a full perimeter shared model
305
+
306
+ 0:09:13.521 --> 0:09:21.284
307
+ for all the translation pairs out there, and
308
+ there's also another game, so if you just have
309
+
310
+ 0:09:21.284 --> 0:09:21.705
311
+ one.
312
+
313
+ 0:09:22.582 --> 0:09:26.084
314
+ Lock of model for all the translation directions
315
+ out there.
316
+
317
+ 0:09:26.606 --> 0:09:38.966
318
+ It's easier to deploy in the sense that if
319
+ you are serving a model you don't have a thousand
320
+
321
+ 0:09:38.966 --> 0:09:42.555
322
+ small modules to maintain.
323
+
324
+ 0:09:42.762 --> 0:09:52.262
325
+ So in terms of engineering somehow these kind
326
+ of fully primitive shared models have: So this
327
+
328
+ 0:09:52.262 --> 0:09:59.821
329
+ is also where the parent research has been
330
+ going towards in recent years.
331
+
332
+ 0:10:00.460 --> 0:10:16.614
333
+ So the rest of the electro are also going
334
+ to focus on this kind of model.
335
+
336
+ 0:10:17.037 --> 0:10:30.901
337
+ So the first type of multilinguali is this
338
+ kind of many to one abbreviated kind of situation.
339
+
340
+ 0:10:30.754 --> 0:10:34.448
341
+ Basically what's going.
342
+
343
+ 0:10:35.355 --> 0:10:49.804
344
+ So one news case that you can think of here
345
+ is if you're subtitled for international movies
346
+
347
+ 0:10:49.804 --> 0:10:51.688
348
+ in Germany.
349
+
350
+ 0:10:53.073 --> 0:11:02.863
351
+ Then flipping the situation there is also
352
+ many configurations where we only have when
353
+
354
+ 0:11:02.863 --> 0:11:04.798
355
+ source language.
356
+
357
+ 0:11:06.046 --> 0:11:13.716
358
+ There's also many use cases like if you think
359
+ about the lecture translator here you've seen.
360
+
361
+ 0:11:14.914 --> 0:11:21.842
362
+ So here most of the lecturers are in German
363
+ and now we want to translate it into.
364
+
365
+ 0:11:21.758 --> 0:11:28.433
366
+ I think on the user end we only support English
367
+ but they're also supportable.
368
+
369
+ 0:11:28.608 --> 0:11:38.988
370
+ So in this kind of used case, if you have
371
+ one speaker and you want to serve or expand
372
+
373
+ 0:11:38.988 --> 0:11:41.281
374
+ to many audience,.
375
+
376
+ 0:11:42.802 --> 0:11:50.542
377
+ But of course, combining everything, there's
378
+ the many to many situation here.
379
+
380
+ 0:11:50.443 --> 0:11:53.958
381
+ You can think of Google Translate.
382
+
383
+ 0:11:53.857 --> 0:11:58.781
384
+ They are doing basically any selected language.
385
+
386
+ 0:11:59.159 --> 0:12:03.760
387
+ And this is also more difficult.
388
+
389
+ 0:12:03.620 --> 0:12:14.775
390
+ If you consider the data you need to get and
391
+ concerns, we'll cover this later.
392
+
393
+ 0:12:15.135 --> 0:12:21.008
394
+ Many to One TranslationsBut first we are going
395
+ to start with many to one translations.
396
+
397
+ 0:12:21.741 --> 0:12:30.436
398
+ Say this is the most similar to the bilingual
399
+ translation situation you saw earlier, but
400
+
401
+ 0:12:30.436 --> 0:12:39.423
402
+ now one difference is we need a vocabulary
403
+ or tokens that can represent all these different
404
+
405
+ 0:12:39.423 --> 0:12:40.498
406
+ languages.
407
+
408
+ 0:12:41.301 --> 0:12:44.200
409
+ So we need a joint more telecom global vocabulary.
410
+
411
+ 0:12:44.924 --> 0:12:48.794
412
+ So let's just quickly recall what word embedding
413
+ is to do.
414
+
415
+ 0:12:49.189 --> 0:12:54.561
416
+ Basically we need to represent it.
417
+
418
+ 0:12:54.407 --> 0:13:04.079
419
+ We have to get some vector representation
420
+ for discrete words.
421
+
422
+ 0:13:04.784 --> 0:13:16.911
423
+ And when we embed a token, we are retrieving
424
+ the corresponding vector out of this little.
425
+
426
+ 0:13:17.697 --> 0:13:19.625
427
+ And then we put it.
428
+
429
+ 0:13:19.528 --> 0:13:26.083
430
+ We feed a sequence of vectors into the inputter
431
+ as the next steps.
432
+
433
+ 0:13:26.987 --> 0:13:34.973
434
+ Now if it's motelingual you can imagine that
435
+ vocabulary suddenly gets very, very big because
436
+
437
+ 0:13:34.973 --> 0:13:36.262
438
+ the languages.
439
+
440
+ 0:13:37.877 --> 0:13:46.141
441
+ So what is quite useful here is the by pair
442
+ like subwords you talked about by pairing.
443
+
444
+ 0:13:46.406 --> 0:13:55.992
445
+ So in this case we are still limiting ourselves
446
+ to a finite number of vocabularies that we
447
+
448
+ 0:13:55.992 --> 0:13:59.785
449
+ are exploding the vocabulary table.
450
+
451
+ 0:14:01.181 --> 0:14:11.631
452
+ So when we learn these kinds of subwords,
453
+ what happens basically?
454
+
455
+ 0:14:11.473 --> 0:14:17.020
456
+ We look at all the training data.
457
+
458
+ 0:14:18.558 --> 0:14:20.856
459
+ So think about this.
460
+
461
+ 0:14:20.746 --> 0:14:28.079
462
+ If we do this now on a bunch of Mozilla data,
463
+ are there concerns?
464
+
465
+ 0:14:30.050 --> 0:14:36.811
466
+ Maybe we have an underground status head,
467
+ so we get over English mergers and nocularities.
468
+
469
+ 0:14:37.337 --> 0:14:39.271
470
+ Yeah Exactly Thanks.
471
+
472
+ 0:14:39.539 --> 0:14:46.602
473
+ So what we have to pay attention to here is
474
+ learn this multilingual vocabulary.
475
+
476
+ 0:14:46.513 --> 0:14:52.550
477
+ We should pay attention: All the languages
478
+ are more or less balanced, not that you only
479
+
480
+ 0:14:52.550 --> 0:14:58.862
481
+ learning words is for for English or some bigger
482
+ languages, and then neglecting other other
483
+
484
+ 0:14:58.862 --> 0:15:00.028
485
+ languages, yeah.
486
+
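One common way to keep the languages roughly balanced when learning the joint subword vocabulary is temperature-based sampling of the training sentences; the sketch below uses made-up corpus sizes and is an assumption about the recipe, not taken from the lecture's own setup.

# Minimal sketch of balancing languages before learning a joint subword
# vocabulary. Raw corpus sizes are rescaled with a temperature T so that
# high-resource languages are down-weighted and low-resource languages are
# up-weighted; the resulting proportions decide how many sentences of each
# language go into BPE/SentencePiece training.

def sampling_ratios(sentence_counts, temperature=5.0):
    """Map raw per-language counts to sampling probabilities p_i ~ (n_i/N)^(1/T)."""
    total = sum(sentence_counts.values())
    scaled = {lang: (n / total) ** (1.0 / temperature)
              for lang, n in sentence_counts.items()}
    norm = sum(scaled.values())
    return {lang: s / norm for lang, s in scaled.items()}

counts = {"en": 1_000_000, "de": 300_000, "jv": 30_000}   # hypothetical corpus sizes
print(sampling_ratios(counts, temperature=5.0))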
487
+ 0:15:01.021 --> 0:15:04.068
488
+ Of course, this is not going to solve everything.
489
+
490
+ 0:15:04.007 --> 0:15:09.589
491
+ Even if we get a perfectly uniform distribution
492
+ out of all the languages out, there is not
493
+
494
+ 0:15:09.589 --> 0:15:13.454
495
+ going to mean that we are ending up with a
496
+ perfect vocabulary.
497
+
498
+ 0:15:14.154 --> 0:15:20.068
499
+ There are also language differences read,
500
+ so if you consider more European languages.
501
+
502
+ 0:15:20.180 --> 0:15:27.081
503
+ There will be many shared subcomponents like
504
+ how you write a certain word, somewhat similar.
505
+
506
+ 0:15:27.267 --> 0:15:34.556
507
+ But then there are other languages with completely
508
+ different scripts like Arabic, Cyrillic scripts
509
+
510
+ 0:15:34.556 --> 0:15:40.594
511
+ or Eastern Asian scripts where you get a vocabulary
512
+ like the characters set with.
513
+
514
+ 0:15:40.940 --> 0:15:43.531
515
+ Tens of thousands of characters.
516
+
517
+ 0:15:43.453 --> 0:15:50.356
518
+ So these are also individual concerns that
519
+ one has to think about my building specific
520
+
521
+ 0:15:50.356 --> 0:15:51.070
522
+ systems.
523
+
524
+ 0:15:51.591 --> 0:16:02.660
525
+ But overall, the rule of thumb is that when
526
+ you do a mottling tokenizer vocabulary, there's
527
+
528
+ 0:16:02.660 --> 0:16:04.344
529
+ more or less.
530
+
531
+ 0:16:05.385 --> 0:16:17.566
532
+ And there's actually some paper showing that
533
+ the performance of the final system is going
534
+
535
+ 0:16:17.566 --> 0:16:25.280
536
+ to start to degrade if you have a disproportionate
537
+ data.
538
+
539
+ 0:16:27.207 --> 0:16:33.186
540
+ Of course there is currently the trend of
541
+ using pre-train models.
542
+
543
+ 0:16:33.095 --> 0:16:39.891
544
+ If you take a pre-train model somewhere then
545
+ you don't have this concern.
546
+
547
+ 0:16:40.580 --> 0:16:47.810
548
+ Making sure that you use the same organizers
549
+ that they used so that there is no train test
550
+
551
+ 0:16:47.810 --> 0:16:48.287
552
+ time.
553
+
554
+ 0:16:48.888 --> 0:16:53.634
555
+ Yeah for a pre-trainer, we're going to talk
556
+ about a little bit later as well.
557
+
558
+ 0:16:54.734 --> 0:16:59.928
559
+ Multilingual Vocabulary: Alright, so now where's
560
+ a multilingual vocabulary?
561
+
562
+ 0:17:00.920 --> 0:17:04.187
563
+ There are several good things, obviously.
564
+
565
+ 0:17:04.109 --> 0:17:10.909
566
+ So one thing is that if we have words that
567
+ are in the textful form like we said, there
568
+
569
+ 0:17:10.909 --> 0:17:16.224
570
+ are European languages that share some vocabulary,
571
+ then it's great.
572
+
573
+ 0:17:16.146 --> 0:17:19.899
574
+ Then we have the first step towards knowledge.
575
+
576
+ 0:17:20.000 --> 0:17:30.464
577
+ For example, the word pineapple for some reason
578
+ is also in Eastern European languages.
579
+
580
+ 0:17:30.344 --> 0:17:34.918
581
+ In Cyrillic scripts that's also the.
582
+
583
+ 0:17:36.116 --> 0:17:42.054
584
+ But however, there is also ambiguity if you've
585
+ embracing together or dye.
586
+
587
+ 0:17:41.973 --> 0:17:46.067
588
+ Of course, they mean different things for
589
+ German.
590
+
591
+ 0:17:46.246 --> 0:17:53.276
592
+ Then, of course, that's possible to rely on
593
+ further context.
594
+
595
+ 0:17:53.161 --> 0:17:59.156
596
+ It's not a problem, it's something to think
597
+ about.
598
+
599
+ 0:18:00.200 --> 0:18:11.061
600
+ And when we go higher to cover more vocabulary
601
+ entries, we might need to go bigger in the
602
+
603
+ 0:18:11.061 --> 0:18:13.233
604
+ vocabulary count.
605
+
606
+ 0:18:13.653 --> 0:18:28.561
607
+ So there is always sort of a bottleneck as
608
+ the number of languages increase.
609
+
610
+ 0:18:30.110 --> 0:18:32.836
611
+ Right, so what is the result?
612
+
613
+ 0:18:32.745 --> 0:18:38.290
614
+ What are these crustling over inventings actually
615
+ learning?
616
+
617
+ 0:18:40.160 --> 0:18:44.658
618
+ So normally to inspect them it's quite hard.
619
+
620
+ 0:18:44.558 --> 0:18:53.854
621
+ It's like high dimensional vectors with dimensions,
622
+ but researchers also try to project it.
623
+
624
+ 0:18:54.454 --> 0:19:05.074
625
+ So in this case it is a little bit small,
626
+ but in this case for English and French there
627
+
628
+ 0:19:05.074 --> 0:19:07.367
629
+ are many injuries.
630
+
631
+ 0:19:07.467 --> 0:19:20.014
632
+ My example is like different words with the
633
+ same word in morphological forms.
634
+
635
+ 0:19:19.853 --> 0:19:26.131
636
+ Basically, it's like a morphological.
637
+
638
+ 0:19:26.546 --> 0:19:32.727
639
+ There are also words in different languages
640
+ like think there is research for English and
641
+
642
+ 0:19:32.727 --> 0:19:33.282
643
+ French.
644
+
645
+ 0:19:33.954 --> 0:19:41.508
646
+ So the take away from this plot is that somehow
647
+ we learn a bit of semantic meanings beyond
648
+
649
+ 0:19:41.508 --> 0:19:43.086
650
+ the textual forms.
651
+
652
+ 0:19:45.905 --> 0:19:50.851
653
+ But then this looks good and this gives us
654
+ hope.
655
+
656
+ 0:19:52.252 --> 0:20:05.240
657
+ That if we consider what is the baseline here,
658
+ the baseline we compare to is a bilingual system
659
+
660
+ 0:20:05.240 --> 0:20:09.164
661
+ without any multilinguality.
662
+
663
+ 0:20:10.290 --> 0:20:18.214
664
+ This looks good because if we compare for
665
+ many Central European languages, Eastern and
666
+
667
+ 0:20:18.214 --> 0:20:27.413
668
+ Central European languages to English, we compare:
669
+ And we see that the Mini Two English has actually
670
+
671
+ 0:20:27.413 --> 0:20:30.601
672
+ always gained quite a bit over it.
673
+
674
+ 0:20:31.751 --> 0:20:38.876
675
+ But there is also later investigation on whether
676
+ it is actually out of mountain linguality or
677
+
678
+ 0:20:38.876 --> 0:20:39.254
679
+ not.
680
+
681
+ 0:20:39.639 --> 0:20:46.692
682
+ So this is a spoiler won't tell much about
683
+ it until the second half, but just remember
684
+
685
+ 0:20:46.692 --> 0:20:47.908
686
+ there is this.
687
+
688
+ 0:20:49.449 --> 0:20:53.548
689
+ Many to Many Translations: Now we move on to many-to-many
690
+ translations.
691
+
692
+ 0:20:53.479 --> 0:21:01.785
693
+ Let's recall in a normal transformer or any
694
+ encoder decoder setup.
695
+
696
+ 0:21:02.242 --> 0:21:08.839
697
+ We have an inkluder that creates sort of contextual
698
+ representation for the sort of sentence.
699
+
700
+ 0:21:09.949 --> 0:21:17.787
701
+ Is more or less the context for generating
702
+ the target sentence red.
703
+
704
+ 0:21:17.672 --> 0:21:28.381
705
+ Now on the target side we get the first open,
706
+ then we feed it again and then get the second
707
+
708
+ 0:21:28.381 --> 0:21:29.545
709
+ decoding.
710
+
711
+ 0:21:31.651 --> 0:21:35.039
712
+ And now we have multiple target languages.
713
+
714
+ 0:21:34.960 --> 0:21:39.059
715
+ Does anybody see a problem with this architecture?
716
+
717
+ 0:21:48.268 --> 0:21:57.791
718
+ Specifically, it's in the decoder, so now
719
+ have a German sentiments encoded.
720
+
721
+ 0:21:57.666 --> 0:22:01.930
722
+ It now want to generate Spanish.
723
+
724
+ 0:22:07.367 --> 0:22:11.551
725
+ So the problem is how does the model know
726
+ which language to generate?
727
+
728
+ 0:22:12.112 --> 0:22:24.053
729
+ If you just give it a generic start token,
730
+ there is nowhere where we are telling the model.
731
+
732
+ 0:22:24.944 --> 0:22:30.277
733
+ So that this can only be a guess, and this
734
+ model will definitely not run well.
735
+
736
+ 0:22:32.492 --> 0:22:40.021
737
+ So this comes to the question: How do we indicate
738
+ the one's intended language to the model?
739
+
740
+ 0:22:41.441 --> 0:22:52.602
741
+ One first idea is what people tried is basically
742
+ now in a source where not only including the
743
+
744
+ 0:22:52.602 --> 0:22:53.552
745
+ source.
746
+
747
+ 0:22:53.933 --> 0:23:01.172
748
+ To Spanish things like this, so basically
749
+ the source is already informed.
750
+
751
+ 0:23:01.074 --> 0:23:11.818
752
+ The source sentence is already supplemented
753
+ with: Now this is also called a target forcing
754
+
755
+ 0:23:11.818 --> 0:23:19.257
756
+ in the sense that we try to force it to give
757
+ the right target.
758
+
759
+ 0:23:20.080 --> 0:23:24.622
760
+ This is one approach.
761
+
762
+ 0:23:24.416 --> 0:23:38.047
763
+ Another approach is basically based on the
764
+ idea that if we have.
765
+
766
+ 0:23:38.438 --> 0:23:52.177
767
+ So if we create a context of our world, the
768
+ incode output shouldn't really differ.
769
+
770
+ 0:23:52.472 --> 0:24:02.397
771
+ So out of this motivation people have moved
772
+ this signaling mechanism.
773
+
774
+ 0:24:02.255 --> 0:24:09.914
775
+ They basically replaced the traditional start
776
+ token.
777
+
778
+ 0:24:10.330 --> 0:24:17.493
779
+ So here we are not kids starting into the
780
+ generic start talking anymore instead language
781
+
782
+ 0:24:17.493 --> 0:24:18.298
783
+ specific.
784
+
785
+ 0:24:18.938 --> 0:24:21.805
786
+ So this is also another way to achieve this.
787
+
788
+ 0:24:23.283 --> 0:24:27.714
789
+ But there are still more challenging cases.
790
+
791
+ 0:24:27.614 --> 0:24:35.536
792
+ Sometimes here it can be called as General
793
+ English or German when it's there.
794
+
795
+ 0:24:35.435 --> 0:24:39.703
796
+ Later on it goes further and further on.
797
+
798
+ 0:24:40.320 --> 0:24:46.752
799
+ Basically this information is not strong enough
800
+ to always enforce the target language, especially
801
+
802
+ 0:24:46.752 --> 0:24:48.392
803
+ in zero shot conditions.
804
+
805
+ 0:24:48.327 --> 0:24:54.142
806
+ We'll look into this later so we'll get this
807
+ kind of target translation into generating
808
+
809
+ 0:24:54.142 --> 0:24:57.843
810
+ and generating and then going into some wrong
811
+ language.
812
+
813
+ 0:24:59.219 --> 0:25:12.542
814
+ So another technique actually developed here
815
+ some years ago was to inject this language.
816
+
817
+ 0:25:12.872 --> 0:25:19.834
818
+ So when we are feeding doing the auto-aggressive
819
+ decoding normally, we only feed the upherb.
820
+
821
+ 0:25:20.000 --> 0:25:22.327
822
+ Into the depoter.
823
+
824
+ 0:25:22.197 --> 0:25:33.676
825
+ But if we also add a language embedding for
826
+ the target language, on top of that we have
827
+
828
+ 0:25:33.676 --> 0:25:37.067
829
+ the language information.
830
+
831
+ 0:25:37.397 --> 0:25:44.335
832
+ And this has shown to perform quite a bit
833
+ better, especially in conditions where the
834
+
835
+ 0:25:44.335 --> 0:25:44.906
836
+ model.
837
+
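A minimal sketch of the three signalling options just described: a target-language token on the source side, a language-specific start token, and a language embedding injected at every decoder step. The token spellings and vector sizes below are illustrative assumptions, not a fixed convention of any particular toolkit.

# Minimal sketch of the three ways to tell the model which target language to
# produce. Token strings like "<2es>" and "<bos_es>" are illustrative choices.

def source_with_target_token(src_tokens, tgt_lang):
    """(1) Target forcing: prepend a target-language token to the source."""
    return [f"<2{tgt_lang}>"] + src_tokens

def decoder_start_token(tgt_lang):
    """(2) Replace the generic <bos> with a language-specific start token."""
    return f"<bos_{tgt_lang}>"

def decoder_step_input(prev_token_embedding, lang_embedding):
    """(3) Inject a target-language embedding at every decoder step
    (here simply added element-wise to the previous token's embedding)."""
    return [p + l for p, l in zip(prev_token_embedding, lang_embedding)]

print(source_with_target_token(["ich", "gehe", "heim"], "es"))
print(decoder_start_token("es"))
print(decoder_step_input([0.1, 0.2, 0.3], [0.01, 0.0, -0.02]))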
838
+ 0:25:46.126 --> 0:25:55.015
839
+ So yeah, we introduced three ways to enforce
840
+ the target language. And now with this we're
841
+
842
+ 0:25:55.015 --> 0:26:02.621
843
+ going to move on to the more interesting case
844
+ of many-to-many translations.
845
+
846
+ 0:26:03.503 --> 0:26:13.183
847
+ Zero-Shot Translation: So here we just consider
848
+ a system that translates two directions: English
849
+
850
+ 0:26:13.183 --> 0:26:15.554
851
+ to English and English.
852
+
853
+ 0:26:16.676 --> 0:26:21.416
854
+ Now we have target languages read.
855
+
856
+ 0:26:21.280 --> 0:26:29.498
857
+ Can you see where we're enforcing the target
858
+ language here?
859
+
860
+ 0:26:29.361 --> 0:26:33.475
861
+ In this case what technique?
862
+
863
+ 0:26:34.934 --> 0:26:45.338
864
+ So here we are enforcing the characteristic
865
+ language with the yelling we train this system.
866
+
867
+ 0:26:46.526 --> 0:26:59.567
868
+ And at the inference time we are able to generate
869
+ English to French, but in addition to this
870
+
871
+ 0:26:59.567 --> 0:27:12.048
872
+ we are also able to: We will be able to do
873
+ zero shot inference that basically translates
874
+
875
+ 0:27:12.048 --> 0:27:17.937
876
+ a direction that is not seen in training.
877
+
878
+ 0:27:19.319 --> 0:27:25.489
879
+ So this is so called zero shot translation
880
+ using a modeling wall system.
881
+
882
+ 0:27:26.606 --> 0:27:34.644
883
+ Of course, we have to reach several things
884
+ before we are able to control the language,
885
+
886
+ 0:27:34.644 --> 0:27:36.769
887
+ otherwise it's no use.
888
+
889
+ 0:27:37.317 --> 0:27:51.087
890
+ Second, we should also have some kind of language
891
+ independent representation.
892
+
893
+ 0:27:51.731 --> 0:27:53.196
894
+ Why is this?
895
+
896
+ 0:27:53.083 --> 0:27:55.028
897
+ Why is this big?
898
+
899
+ 0:27:54.914 --> 0:28:00.637
900
+ Because if women drink generally French up
901
+ here?
902
+
903
+ 0:28:00.940 --> 0:28:05.870
904
+ It was trained to translate from some English.
905
+
906
+ 0:28:07.187 --> 0:28:15.246
907
+ But now we use Anchored Germans in the French,
908
+ so intuitively we need these representations
909
+
910
+ 0:28:15.246 --> 0:28:22.429
911
+ to be similar enough, not that they are so
912
+ far attracted that we cannot use this.
913
+
914
+ 0:28:25.085 --> 0:28:32.059
915
+ So there are several works out there showing
916
+ that if you do a standard transformer architecture
917
+
918
+ 0:28:32.059 --> 0:28:39.107
919
+ this language independent property is not really
920
+ there and you need to add additional approaches
921
+
922
+ 0:28:39.107 --> 0:28:40.633
923
+ in order to enforce.
924
+
925
+ 0:28:41.201 --> 0:28:50.863
926
+ So you can, for example, add an additional
927
+ training objective: That says, we invoked SARSN,
928
+
929
+ 0:28:50.863 --> 0:29:00.211
930
+ be invoked by German, and the invoked English
931
+ have to be the same or be as close to each
932
+
933
+ 0:29:00.211 --> 0:29:02.207
934
+ other as possible.
935
+
936
+ 0:29:02.882 --> 0:29:17.576
937
+ So if we take the output and the output for
938
+ another language, how can we formulate this
939
+
940
+ 0:29:17.576 --> 0:29:18.745
941
+ as an.
942
+
943
+ 0:29:20.981 --> 0:29:27.027
944
+ We can take the translation to the encoder
945
+ and whatever you translate.
946
+
947
+ 0:29:26.942 --> 0:29:32.819
948
+ The embeddings also must be similar and that's
949
+ the great direction.
950
+
951
+ 0:29:33.253 --> 0:29:42.877
952
+ So one thing to take care of here is the length
953
+ for the same sentence in German and English
954
+
955
+ 0:29:42.877 --> 0:29:44.969
956
+ is not necessarily.
957
+
958
+ 0:29:45.305 --> 0:30:00.858
959
+ So if we just do a word to word matching,
960
+ we can always do pulling to a fixed length
961
+
962
+ 0:30:00.858 --> 0:30:03.786
963
+ representation.
964
+
965
+ 0:30:04.004 --> 0:30:08.392
966
+ Or there are more advanced techniques that
967
+ involve some alignments.
968
+
969
+ 0:30:08.848 --> 0:30:23.456
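A minimal sketch of such a similarity objective, assuming mean pooling over the encoder states of the source sentence and its translation; in training this term would be added to the normal translation loss with some weight. Everything here is plain Python for illustration.

# Auxiliary objective pushing the encoder outputs of a sentence and its
# translation towards each other. Because the two sides have different lengths,
# both are mean-pooled to a single vector before taking a distance.

def mean_pool(states):
    """Average a list of hidden-state vectors (length x dim) into one vector."""
    length, dim = len(states), len(states[0])
    return [sum(vec[i] for vec in states) / length for i in range(dim)]

def similarity_loss(src_states, tgt_states):
    """Squared L2 distance between the pooled source and target representations."""
    s, t = mean_pool(src_states), mean_pool(tgt_states)
    return sum((a - b) ** 2 for a, b in zip(s, t))

# toy usage: a 3-token source and a 2-token target with 2-dimensional states
src = [[0.2, 0.1], [0.4, 0.3], [0.0, 0.2]]
tgt = [[0.3, 0.2], [0.1, 0.2]]
print(similarity_loss(src, tgt))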
970
+ So this is useful in the sense that in this
971
+ part in experiments we have shown it improves
972
+
973
+ 0:30:23.456 --> 0:30:27.189
974
+ zero shot translation.
975
+
976
+ 0:30:27.447 --> 0:30:36.628
977
+ This is on the data condition of English to
978
+ Malay, Java and Filipino, so kind of made to
979
+
980
+ 0:30:36.628 --> 0:30:39.722
981
+ low resource language family.
982
+
983
+ 0:30:40.100 --> 0:30:50.876
984
+ And there we assume that we get parallel English
985
+ to all of them, but among all these.
986
+
987
+ 0:30:51.451 --> 0:31:03.592
988
+ So the blue bar is a Vanilla Transformer model,
989
+ and the purple bar is when we add a language.
990
+
991
+ 0:31:04.544 --> 0:31:12.547
992
+ You see that in supervised conditions it's
993
+ not changing much, but in zero shots there's
994
+
995
+ 0:31:12.547 --> 0:31:13.183
996
+ quite.
997
+
998
+ 0:31:15.215 --> 0:31:22.649
999
+ Yeah, so far we said zero shots is doable
1000
+ and it's even more achievable if we enforce
1001
+
1002
+ 0:31:22.649 --> 0:31:26.366
1003
+ some language independent representations.
1004
+
1005
+ 0:31:26.279 --> 0:31:29.778
1006
+ However, there's one practical concern.
1007
+
1008
+ 0:31:29.690 --> 0:31:33.803
1009
+ Don't know if you also had the same question.
1010
+
1011
+ 0:31:34.514 --> 0:31:39.835
1012
+ If you have two languages, you don't have
1013
+ direct parallel.
1014
+
1015
+ 0:31:39.745 --> 0:31:43.895
1016
+ One's into English and one's out of English.
1017
+
1018
+ 0:31:45.685 --> 0:31:52.845
1019
+ It's actually this kind of approach is called
1020
+ pivoting as in pivoting over an intermediate
1021
+
1022
+ 0:31:52.845 --> 0:31:53.632
1023
+ language.
1024
+
1025
+ 0:31:55.935 --> 0:32:00.058
1026
+ Yeah, that it definitely has advantages in
1027
+ the sense that we're going.
1028
+
1029
+ 0:32:00.440 --> 0:32:11.507
1030
+ Now if we go over these two steps every direction
1031
+ was trained with supervised data so you could
1032
+
1033
+ 0:32:11.507 --> 0:32:18.193
1034
+ always assume that when we are working with
1035
+ a supervised.
1036
+
1037
+ 0:32:18.718 --> 0:32:26.868
1038
+ So in this case we can expect more robust
1039
+ inference time behavior.
1040
+
1041
+ 0:32:26.747 --> 0:32:31.616
1042
+ However, there are also disadvantages.
1043
+
1044
+ 0:32:31.531 --> 0:32:38.860
1045
+ An inference where passing through the model
1046
+ ties so that's doubling the inference time
1047
+
1048
+ 0:32:38.860 --> 0:32:39.943
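A minimal sketch of this two-step pivoting, with stub translation functions standing in for the two supervised models; the doubled decoding cost and the information loss through the pivot are visible directly in the structure.

# Pivoting: to translate German into French without direct parallel data,
# first decode German -> English, then English -> French. The translate_*
# functions are placeholder stubs, not real models; anything the pivot
# language cannot express (such as gender marking) is lost in the middle.

def translate_de_en(text: str) -> str:
    return {"ich gehe heim": "i am going home"}.get(text, text)   # stub model

def translate_en_fr(text: str) -> str:
    return {"i am going home": "je rentre à la maison"}.get(text, text)   # stub model

def pivot_translate(text_de: str) -> str:
    english = translate_de_en(text_de)      # first pass: source -> pivot
    return translate_en_fr(english)         # second pass: pivot -> target

print(pivot_translate("ich gehe heim"))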
1049
+ computation.
1050
+
1051
+ 0:32:40.500 --> 0:32:47.878
1052
+ You might think okay doubling then what, but
1053
+ if you consider if your company like Google,
1054
+
1055
+ 0:32:47.878 --> 0:32:54.929
1056
+ Google Translate and all your life traffic
1057
+ suddenly becomes twice as big, this is not
1058
+
1059
+ 0:32:54.929 --> 0:33:00.422
1060
+ something scalable that you want to see, especially
1061
+ in production.
1062
+
1063
+ 0:33:01.641 --> 0:33:11.577
1064
+ A problem with this is making information
1065
+ loss because if we go over these games when
1066
+
1067
+ 0:33:11.577 --> 0:33:20.936
1068
+ a chain of kids pass the word to each other,
1069
+ in the end it's losing information.
1070
+
1071
+ 0:33:22.082 --> 0:33:24.595
1072
+ Can give it an example here.
1073
+
1074
+ 0:33:24.509 --> 0:33:27.765
1075
+ It's also from a master thesis here.
1076
+
1077
+ 0:33:27.677 --> 0:33:30.321
1078
+ It's on gender preservation.
1079
+
1080
+ 0:33:30.770 --> 0:33:39.863
1081
+ Basically, some languages like Italian and
1082
+ French have different word forms based on the
1083
+
1084
+ 0:33:39.863 --> 0:33:40.782
1085
+ speaker.
1086
+
1087
+ 0:33:41.001 --> 0:33:55.987
1088
+ So if a male person says feel alienated, this
1089
+ word for alienated would be exclusive and a
1090
+
1091
+ 0:33:55.987 --> 0:33:58.484
1092
+ female person.
1093
+
1094
+ 0:34:00.620 --> 0:34:05.730
1095
+ Now imagine that we pivot through anguish.
1096
+
1097
+ 0:34:05.611 --> 0:34:08.641
1098
+ The information is lost.
1099
+
1100
+ 0:34:08.520 --> 0:34:11.917
1101
+ We don't know what gender.
1102
+
1103
+ 0:34:12.492 --> 0:34:19.626
1104
+ When we go out into branch again, there are
1105
+ different forms.
1106
+
1107
+ 0:34:19.509 --> 0:34:29.177
1108
+ Depending on the speaker gender, we can: So
1109
+ this is one problem.
1110
+
1111
+ 0:34:31.871 --> 0:34:44.122
1112
+ This is especially the case because English
1113
+ compared to many other languages is relatively
1114
+
1115
+ 0:34:44.122 --> 0:34:45.199
1116
+ simple.
1117
+
1118
+ 0:34:45.205 --> 0:34:53.373
1119
+ Gendered where it forms like this, it also
1120
+ doesn't have many cases, so going through English
1121
+
1122
+ 0:34:53.373 --> 0:34:56.183
1123
+ many information would be lost.
1124
+
1125
+ 0:34:57.877 --> 0:35:12.796
1126
+ And another thing is if you have similar languages
1127
+ that you are translating out of my systems
1128
+
1129
+ 0:35:12.796 --> 0:35:15.494
1130
+ that translates.
1131
+
1132
+ 0:35:16.496 --> 0:35:24.426
1133
+ This is the output of going from Dutch to
1134
+ German again.
1135
+
1136
+ 0:35:24.284 --> 0:35:30.235
1137
+ If you read the German, how many of you?
1138
+
1139
+ 0:35:32.552 --> 0:35:51.679
1140
+ Good and the problem here is that we are going
1141
+ over English and then the English to German.
1142
+
1143
+ 0:35:51.831 --> 0:36:06.332
1144
+ However, if we go direct in this case zero
1145
+ shot translation you see that word forgive.
1146
+
1147
+ 0:36:06.546 --> 0:36:09.836
1148
+ In this case, the outward translation is better.
1149
+
1150
+ 0:36:10.150 --> 0:36:20.335
1151
+ And we believe this has to do with using the
1152
+ language similarity between the two languages.
1153
+
1154
+ 0:36:20.225 --> 0:36:26.759
1155
+ There is also quantitative results we found
1156
+ when born in.
1157
+
1158
+ 0:36:27.988 --> 0:36:33.780
1159
+ The models are always doing better when translating
1160
+ similar languages compared to the.
1161
+
1162
+ 0:36:35.535 --> 0:36:42.130
1163
+ Summary: Yeah, so in this first half what we
1164
+ talked about basically first, we started with
1165
+
1166
+ 0:36:42.130 --> 0:36:49.838
1167
+ how motilinguality or motilingual machine translation
1168
+ could enable knowledge transfer between languages
1169
+
1170
+ 0:36:49.838 --> 0:36:53.987
1171
+ and help with conditions where we don't have
1172
+ much data.
1173
+
1174
+ 0:36:55.235 --> 0:37:02.826
1175
+ Now it looks at three types of multilingual
1176
+ translation, so one is many to one, one to
1177
+
1178
+ 0:37:02.826 --> 0:37:03.350
1179
+ many.
1180
+
1181
+ 0:37:05.285 --> 0:37:13.397
1182
+ We got there first about a shared vocabulary
1183
+ based on different languages and how these
1184
+
1185
+ 0:37:13.397 --> 0:37:22.154
1186
+ cross lingual word embeddings capture semantic
1187
+ meanings rather than just on a text proof form.
1188
+
1189
+ 0:37:25.505 --> 0:37:37.637
1190
+ Then we looked at how to signal the target
1191
+ language, how to ask for the model to generate,
1192
+
1193
+ 0:37:37.637 --> 0:37:43.636
1194
+ and then we looked at zero shot translation.
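As a quick, concrete illustration of that target-language signal: many multilingual systems simply prepend a tag for the desired output language to the source sentence. This is a minimal sketch; the `<2xx>` tag format is just an assumed example convention.

```python
# Minimal sketch of signaling the target language with a prepended tag.
# The "<2xx>" format is an assumed convention for illustration only.
def add_target_tag(source_sentence: str, target_lang: str) -> str:
    return f"<2{target_lang}> {source_sentence}"

print(add_target_tag("How are you?", "de"))  # -> "<2de> How are you?"
```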
1195
+
1196
+ 0:37:45.325 --> 0:37:57.395
1197
+ MultilingualityNow before we go into the second
1198
+ half, are there questions about the first part? Okay,
1199
+
1200
+ 0:37:57.395 --> 0:37:58.166
1201
+ good.
1202
+
1203
+ 0:38:00.140 --> 0:38:10.932
1204
+ In the second half of this lecture we'll be
1205
+ looking into challenges like what is still
1206
+
1207
+ 0:38:10.932 --> 0:38:12.916
1208
+ unsolved about.
1209
+
1210
+ 0:38:13.113 --> 0:38:18.620
1211
+ There are some aspects to look at it.
1212
+
1213
+ 0:38:18.475 --> 0:38:26.593
1214
+ The first is modeling, the second is more
1215
+ engineering.
1216
+
1217
+ 0:38:28.248 --> 0:38:33.002
1218
+ Okay, so we talked about this question several
1219
+ times.
1220
+
1221
+ 0:38:32.914 --> 0:38:35.610
1222
+ How does multilinguality help?
1223
+
1224
+ 0:38:35.520 --> 0:38:37.411
1225
+ Where does it help?
1226
+
1227
+ 0:38:38.298 --> 0:38:45.416
1228
+ Here want to show results of an experiment
1229
+ based on over a hundred languages.
1230
+
1231
+ 0:38:46.266 --> 0:38:58.603
1232
+ Here you can see the data amount so they use
1233
+ parallel data to English and it's very.
1234
+
1235
+ 0:38:58.999 --> 0:39:00.514
1236
+ This is already log scale.
1237
+
1238
+ 0:39:00.961 --> 0:39:12.982
1239
+ So for higher resource languages like English
1240
+ to French, German to Spanish you get over billion
1241
+
1242
+ 0:39:12.982 --> 0:39:14.359
1243
+ sentences.
1244
+
1245
+ 0:39:14.254 --> 0:39:21.003
1246
+ In parallel, and when we go more to the right
1247
+ to the more low resource spectrum on the other
1248
+
1249
+ 0:39:21.003 --> 0:39:26.519
1250
+ hand, there are languages that maybe many of
1251
+ us have never heard of, like.
1252
+
1253
+ 0:39:26.466 --> 0:39:29.589
1254
+ Do You Want to Move Back?
1255
+
1256
+ 0:39:30.570 --> 0:39:33.270
1257
+ Hawaiian Indians have heard of it.
1258
+
1259
+ 0:39:34.414 --> 0:39:39.497
1260
+ So on that spectrum we only have like thirty
1261
+ thousand sentences.
1262
+
1263
+ 0:39:40.400 --> 0:39:48.389
1264
+ So what this means is when we train, we have
1265
+ to up sample these guys.
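To make the up-sampling concrete, here is a small sketch of temperature-based sampling, one common way to do it; the data sizes and the temperature value are made-up numbers, not the ones from the slide.

```python
# Temperature-based sampling: raise each language's data share to the power 1/T.
# With T > 1 the distribution is flattened, so low-resource languages are sampled
# far more often than their raw share of the data. All numbers are illustrative.
sizes = {"fr": 1_000_000_000, "hi": 5_000_000, "haw": 30_000}
T = 5.0
weights = {lang: n ** (1.0 / T) for lang, n in sizes.items()}
total = sum(weights.values())
probs = {lang: w / total for lang, w in weights.items()}
print(probs)
```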
1266
+
1267
+ 0:39:48.275 --> 0:39:51.589
1268
+ The model didn't even know.
1269
+
1270
+ 0:39:52.732 --> 0:40:05.777
1271
+ Yeah, so on this graph on how we read it is
1272
+ this horizontal line and zero is basically
1273
+
1274
+ 0:40:05.777 --> 0:40:07.577
1275
+ indicating.
1276
+
1277
+ 0:40:07.747 --> 0:40:14.761
1278
+ Because we want to see where multilinguality
1279
+ helps, we compare to what happens when there
1280
+
1281
+ 0:40:14.761 --> 0:40:15.371
1282
+ is not.
1283
+
1284
+ 0:40:16.356 --> 0:40:29.108
1285
+ So upper like higher than the zero line it
1286
+ means we're gaining.
1287
+
1288
+ 0:40:29.309 --> 0:40:34.154
1289
+ The same like for these languages.
1290
+
1291
+ 0:40:34.015 --> 0:40:40.802
1292
+ This side means we are a high resource for
1293
+ the.
1294
+
1295
+ 0:40:40.981 --> 0:40:46.675
1296
+ Yeah sorry, think I've somehow removed the
1297
+ the x-axis labels.
1298
+
1299
+ 0:40:48.008 --> 0:40:58.502
1300
+ Yeah alright, what happens now if we look
1301
+ at many into English?
1302
+
1303
+ 0:40:58.698 --> 0:41:08.741
1304
+ On the low resource spectrum, by going multilingual
1305
+ we gain a lot over the bilingual system.
1306
+
1307
+ 0:41:10.010 --> 0:41:16.658
1308
+ Overall, if you consider the average for all
1309
+ of the languages, it's still a gain.
1310
+
1311
+ 0:41:17.817 --> 0:41:27.301
1312
+ Now we're looking at the green line so you
1313
+ can ignore the blue line.
1314
+
1315
+ 0:41:27.164 --> 0:41:32.253
1316
+ Basically we have to do upsampling.
1317
+
1318
+ 0:41:33.753 --> 0:41:41.188
1319
+ Yeah, so if you just even consider the average,
1320
+ it's still a gain over the bilingual systems.
1321
+
1322
+ 0:41:42.983 --> 0:41:57.821
1323
+ However, if we go to the English to many systems
1324
+ looking at the gains, we only get minor improvements.
1325
+
1326
+ 0:41:59.039 --> 0:42:12.160
1327
+ So why is it the case that going multilingual
1328
+ isn't really helping universally?
1329
+
1330
+ 0:42:16.016 --> 0:42:18.546
1331
+ Do you have some intuitions on yeah?
1332
+
1333
+ 0:42:18.698 --> 0:42:38.257
1334
+ It's easier to understand something that generates
1335
+ if we consider what the model has to generate.
1336
+
1337
+ 0:42:38.718 --> 0:42:40.091
1338
+ I See It Like.
1339
+
1340
+ 0:42:40.460 --> 0:42:49.769
1341
+ Generating is a bit like writing or speaking,
1342
+ while inputing on the source side is more like
1343
+
1344
+ 0:42:49.769 --> 0:42:50.670
1345
+ reading.
1346
+
1347
+ 0:42:50.650 --> 0:42:57.971
1348
+ So one is more passive and the other is more
1349
+ active and don't know if you have similar experience.
1350
+
1351
+ 0:42:57.897 --> 0:43:05.116
1352
+ I think speaking and writing is always a little
1353
+ bit more difficult than just passively listening
1354
+
1355
+ 0:43:05.116 --> 0:43:06.009
1356
+ or reading.
1357
+
1358
+ 0:43:05.934 --> 0:43:09.805
1359
+ But this is a very hand-wavy kind of understanding.
1360
+
1361
+ 0:43:10.390 --> 0:43:11.854
1362
+ And fed.
1363
+
1364
+ 0:43:12.032 --> 0:43:20.718
1365
+ In terms of the model, if we consider what
1366
+ is the difference for the target side for many
1367
+
1368
+ 0:43:20.718 --> 0:43:26.703
1369
+ to English: One difference is that there's
1370
+ a data difference.
1371
+
1372
+ 0:43:27.167 --> 0:43:33.438
1373
+ So if you just consider a many-to-English system
1374
+ with German to English and Spanish to English,.
1375
+
1376
+ 0:43:34.975 --> 0:43:44.321
1377
+ One thing we have to keep in mind is that
1378
+ the parallel data is not all the same, so on
1379
+
1380
+ 0:43:44.321 --> 0:43:49.156
1381
+ the target side there are different English.
1382
+
1383
+ 0:43:49.769 --> 0:43:54.481
1384
+ So the situation rather looks like this.
1385
+
1386
+ 0:43:54.366 --> 0:43:59.196
1387
+ What this means is that we are going to.
1388
+
1389
+ 0:44:00.820 --> 0:44:04.635
1390
+ We also add more data on the target side for
1391
+ English.
1392
+
1393
+ 0:44:06.967 --> 0:44:18.581
1394
+ Now since the target side data is not identical,
1395
+ how do we do a controlled experiment to remove
1396
+
1397
+ 0:44:18.581 --> 0:44:21.121
1398
+ the multilinguality?
1399
+
1400
+ 0:44:24.644 --> 0:44:42.794
1401
+ So what people tried as a control experiment
1402
+ is to keep all the English same as the above
1403
+
1404
+ 0:44:42.794 --> 0:44:44.205
1405
+ setup.
1406
+
1407
+ 0:44:44.684 --> 0:44:49.700
1408
+ So they take the English on English data of
1409
+ the same branch to German.
1410
+
1411
+ 0:44:50.090 --> 0:44:55.533
1412
+ And then the general synthetic data for Germans.
1413
+
1414
+ 0:44:55.422 --> 0:45:05.843
1415
+ So now we have a bilingual system again, but
1416
+ on the target side we still have the previously
1417
+
1418
+ 0:45:05.843 --> 0:45:08.420
1419
+ enriched English data.
1420
+
1421
+ 0:45:10.290 --> 0:45:25.092
1422
+ Now back to this picture that we've seen before,
1423
+ this mysterious orange line here is basically
1424
+
1425
+ 0:45:25.092 --> 0:45:26.962
1426
+ the result.
1427
+
1428
+ 0:45:27.907 --> 0:45:36.594
1429
+ And somewhat strikingly, and perhaps sadly for
1430
+ believers of multilinguality.
1431
+
1432
+ 0:45:36.476 --> 0:45:39.182
1433
+ This is also gaining.
1434
+
1435
+ 0:45:41.001 --> 0:45:52.775
1436
+ So what this means is for the many English
1437
+ is gaining not really because of multilinguality
1438
+
1439
+ 0:45:52.775 --> 0:45:55.463
1440
+ but just because of.
1441
+
1442
+ 0:45:55.976 --> 0:46:10.650
1443
+ And this means that there is still quite a
1444
+ lot to do if we really want to gain from just
1445
+
1446
+ 0:46:10.650 --> 0:46:13.618
1447
+ shared knowledge.
1448
+
1449
+ 0:46:14.514 --> 0:46:27.599
1450
+ But this also gives hope because there are
1451
+ still many things to research in this area
1452
+
1453
+ 0:46:27.599 --> 0:46:28.360
1454
+ now.
1455
+
1456
+ 0:46:28.708 --> 0:46:40.984
1457
+ So we've seen adding more languages helps
1458
+ with a somewhat data-related side effect, but can it also hurt?
1459
+
1460
+ 0:46:40.848 --> 0:46:45.626
1461
+ So if we just add more languages.
1462
+
1463
+ 0:46:47.007 --> 0:46:48.408
1464
+ We've seen this.
1465
+
1466
+ 0:46:48.325 --> 0:46:52.696
1467
+ This is the picture for the many-to-English
1468
+ system.
1469
+
1470
+ 0:46:53.793 --> 0:47:09.328
1471
+ Comparing to this bilingual baseline, we see
1472
+ that for these high resource languages we are
1473
+
1474
+ 0:47:09.328 --> 0:47:12.743
1475
+ not doing as great.
1476
+
1477
+ 0:47:15.956 --> 0:47:18.664
1478
+ So why are we losing here?
1479
+
1480
+ 0:47:18.564 --> 0:47:25.287
1481
+ It's been shown that this performance loss
1482
+ is somewhat related.
1483
+
1484
+ 0:47:26.026 --> 0:47:37.373
1485
+ In the sense that the model has to learn so
1486
+ much that at some point it has to sacrifice
1487
+
1488
+ 0:47:37.373 --> 0:47:39.308
1489
+ capacity from.
1490
+
1491
+ 0:47:41.001 --> 0:47:57.081
1492
+ So what to do to basically grow a bigger brain
1493
+ to tackle this is to add some dedicated capacity
1494
+
1495
+ 0:47:57.081 --> 0:47:59.426
1496
+ per language.
1497
+
1498
+ 0:48:00.100 --> 0:48:15.600
1499
+ Here it's like a simplified graph of a transformer
1500
+ architecture, so this is the encoder within
1501
+
1502
+ 0:48:15.600 --> 0:48:16.579
1503
+ time.
1504
+
1505
+ 0:48:17.357 --> 0:48:27.108
1506
+ But additionally here these little colorful
1507
+ blocks are now the language-specific kind
1508
+
1509
+ 0:48:27.108 --> 0:48:28.516
1510
+ of capacity.
1511
+
1512
+ 0:48:29.169 --> 0:48:42.504
1513
+ They are language specific in the sense that
1514
+ if you get the Chinese to English, the pattern.
1515
+
1516
+ 0:48:43.103 --> 0:48:54.900
1517
+ We are also going to language specific parts
1518
+ that in this case consists of a down projection.
1519
+
1520
+ 0:48:56.416 --> 0:49:07.177
1521
+ So this is also called adaptors, something
1522
+ that is plugged into an existing model and
1523
+
1524
+ 0:49:07.177 --> 0:49:11.556
1525
+ it adapts towards a specific task.
1526
+
1527
+ 0:49:12.232 --> 0:49:22.593
1528
+ And this is conditionally activated in the
1529
+ sense that if you get a different input sentence.
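To make this concrete, here is a minimal sketch of such a bottleneck adapter in PyTorch; the hidden sizes, language codes, and where exactly it is plugged in are assumptions for illustration, not the exact recipe from the paper.

```python
# Sketch of a language-specific bottleneck adapter: a down projection, a non-linearity,
# an up projection, and a residual connection, selected by the input's language.
import torch
import torch.nn as nn

class Adapter(nn.Module):
    def __init__(self, d_model=512, bottleneck=64):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.down = nn.Linear(d_model, bottleneck)  # down projection
        self.up = nn.Linear(bottleneck, d_model)    # up projection back to model size

    def forward(self, x):
        # residual connection keeps the adapter close to the identity at the start
        return x + self.up(torch.relu(self.down(self.norm(x))))

# one adapter per language, conditionally activated based on the input sentence's language
adapters = nn.ModuleDict({lang: Adapter() for lang in ["zh", "en", "de"]})

def apply_adapter(hidden_states, lang):
    return adapters[lang](hidden_states)
```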
1530
+
1531
+ 0:49:27.307 --> 0:49:34.173
1532
+ So this was first proposed by some folks
1533
+ at Google.
1534
+
1535
+ 0:49:34.058 --> 0:49:36.696
1536
+ Does this scale well?
1537
+
1538
+ 0:49:39.619 --> 0:49:56.621
1539
+ Yes exactly, so this is a translation-pair-specific
1540
+ kind of adapter, and this is not going to scale
1541
+
1542
+ 0:49:56.621 --> 0:49:57.672
1543
+ well.
1544
+
1545
+ 0:49:58.959 --> 0:50:13.676
1546
+ So this also brought people to try some more
1547
+ simple architecture.
1548
+
1549
+ 0:50:16.196 --> 0:50:22.788
1550
+ Yeah, this is also an alternative, in this
1551
+ case called monolingual adapters.
1552
+
1553
+ 0:50:24.184 --> 0:50:32.097
1554
+ Any of these adapters so again have this low
1555
+ resource.
1556
+
1557
+ 0:50:31.953 --> 0:50:42.027
1558
+ The zero line is bilingual baseline, but the
1559
+ lines are interpolated.
1560
+
1561
+ 0:50:43.783 --> 0:50:48.767
1562
+ The red one is the original
1563
+ multilingual model.
1564
+
1565
+ 0:50:49.929 --> 0:50:57.582
1566
+ And if we put the adapters in like a basic
1567
+ version of the adapter, that is the blue line.
1568
+
1569
+ 0:50:58.078 --> 0:51:08.582
1570
+ You see that it is gaining performance for the
1571
+ high resource languages.
1572
+
1573
+ 0:51:08.432 --> 0:51:16.089
1574
+ If they even scale a lot, this further increases.
1575
+
1576
+ 0:51:16.556 --> 0:51:22.770
1577
+ So this is also a side kind of this.
1578
+
1579
+ 0:51:23.103 --> 0:51:27.807
1580
+ From the side shows that it's really a capacity
1581
+ bottleneck.
1582
+
1583
+ 0:51:28.488 --> 0:51:30.590
1584
+ Like If You Eleanor.
1585
+
1586
+ 0:51:31.151 --> 0:51:34.313
1587
+ Resource they regain their performance.
1588
+
1589
+ 0:51:38.959 --> 0:51:50.514
1590
+ For smaller languages, but it's just.
1591
+
1592
+ 0:51:50.770 --> 0:52:03.258
1593
+ Think in the original multilingual model, the smaller
1594
+ languages they weren't constrained by capacity.
1595
+
1596
+ 0:52:05.445 --> 0:52:13.412
1597
+ So guess for the smaller languages, the difficulty
1598
+ is more the data rather than the model capacity.
1599
+
1600
+ 0:52:13.573 --> 0:52:26.597
1601
+ So in general you always want to have more
1602
+ or less data matching your model capacity.
1603
+
1604
+ 0:52:27.647 --> 0:52:33.255
1605
+ Yeah, here think the bigger challenge for
1606
+ lower roots was the data.
1607
+
1608
+ 0:52:34.874 --> 0:52:39.397
1609
+ You also mention it a little bit.
1610
+
1611
+ 0:52:39.264 --> 0:52:46.982
1612
+ Are these adapters per language or how many
1613
+ adapters do?
1614
+
1615
+ 0:52:47.267 --> 0:52:55.378
1616
+ And do we have to design them differently
1617
+ so that we learn to share more like a language
1618
+
1619
+ 0:52:55.378 --> 0:52:56.107
1620
+ family?
1621
+
1622
+ 0:52:56.576 --> 0:53:15.680
1623
+ So one downside of the adaptor we talked about
1624
+ is that basically there is no way to go over.
1625
+
1626
+ 0:53:16.516 --> 0:53:29.862
1627
+ Routing or LearningSo then a recent kind of
1628
+ additional approach for these language specific
1629
+
1630
+ 0:53:29.862 --> 0:53:36.100
1631
+ capacity is so-called routing, or learning where to route.
1632
+
1633
+ 0:53:36.256 --> 0:53:42.438
1634
+ Basically, we have these language specific
1635
+ components.
1636
+
1637
+ 0:53:42.326 --> 0:53:45.875
1638
+ We also have a shared adapter.
1639
+
1640
+ 0:53:45.760 --> 0:53:52.148
1641
+ The model should learn: So in this case maybe
1642
+ we could imagine for the lower resource case
1643
+
1644
+ 0:53:52.148 --> 0:53:54.044
1645
+ that we just talked about.
1646
+
1647
+ 0:53:54.094 --> 0:54:04.838
1648
+ It makes sense to go to the shared part because there's not much
1649
+ language-specific to learn anyway, and then it's
1650
+
1651
+ 0:54:04.838 --> 0:54:10.270
1652
+ better to make use of similarity with other.
1653
+
1654
+ 0:54:11.111 --> 0:54:30.493
1655
+ So this architecture is more data driven instead
1656
+ of what we specify prior to training.
1657
+
1658
+ 0:54:31.871 --> 0:54:33.998
1659
+ So how do we learn this?
1660
+
1661
+ 0:54:35.095 --> 0:54:49.286
1662
+ Basically, in terms of the mask, we want to
1663
+ basically have a binary value that routes either
1664
+
1665
+ 0:54:49.286 --> 0:54:50.548
1666
+ to the.
1667
+
1668
+ 0:54:51.311 --> 0:54:56.501
1669
+ But how do we get a value of zero or one? What
1670
+ can we do?
1671
+
1672
+ 0:54:56.402 --> 0:54:58.503
1673
+ We can use a sigmoid.
1674
+
1675
+ 0:54:58.999 --> 0:55:13.376
1676
+ However, one thing is we don't want to get
1677
+ stuck in the middle, so we don't want values in between.
1678
+
1679
+ 0:55:14.434 --> 0:55:28.830
1680
+ It is also bad because it is not going to
1681
+ be the same at training and test time, by the way.
1682
+
1683
+ 0:55:31.151 --> 0:55:50.483
1684
+ So here the question is how do we force basically
1685
+ the model to always go there prior to activation?
1686
+
1687
+ 0:55:54.894 --> 0:56:02.463
1688
+ Found it interesting because it sounds like
1689
+ a trick for me.
1690
+
1691
+ 0:56:02.337 --> 0:56:05.497
1692
+ This approach has been.
1693
+
1694
+ 0:56:06.026 --> 0:56:15.844
1695
+ So what they do is prior to going through
1696
+ this activation, they add some Gaussian noise.
1697
+
1698
+ 0:56:17.257 --> 0:56:31.610
1699
+ If there is always noise prior to activation
1700
+ then the model will be encouraged to preserve
1701
+
1702
+ 0:56:31.610 --> 0:56:34.291
1703
+ the information.
1704
+
1705
+ 0:56:36.356 --> 0:56:44.067
1706
+ Was a very interesting thing that found out
1707
+ while preparing this, so wanted to share this
1708
+
1709
+ 0:56:44.067 --> 0:56:44.410
1710
+ as.
1711
+
1712
+ 0:56:44.544 --> 0:56:48.937
1713
+ So basically you can create a binary gate
1714
+ with this technique.
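Here is a small sketch of that trick; the exact noise distribution and the module names are assumptions, but the idea is the same: noise before the sigmoid during training, a hard zero/one decision at test time.

```python
# Learned routing between a shared and a language-specific branch with a "binary" gate.
# Adding noise before the sigmoid pushes the gate logit away from zero, so the gate
# saturates towards 0 or 1 and the hard threshold used at test time matches training.
import torch

def routed_output(x, shared_module, lang_module, gate_logit, training=True):
    if training:
        g = torch.sigmoid(gate_logit + torch.randn_like(gate_logit))  # noisy soft gate
    else:
        g = (gate_logit > 0).float()                                  # hard binary gate
    return g * lang_module(x) + (1 - g) * shared_module(x)
```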
1715
+
1716
+ 0:56:50.390 --> 0:57:01.747
1717
+ And if you add these language specific routing:
1718
+ Here they also have some that can control how
1719
+
1720
+ 0:57:01.747 --> 0:57:07.788
1721
+ much is shared and how much is language specific.
1722
+
1723
+ 0:57:07.727 --> 0:57:16.374
1724
+ Here the results of the routing are shown with
1725
+ the red and orange lines, so.
1726
+
1727
+ 0:57:16.576 --> 0:57:22.752
1728
+ So you can see that for one-to-many and many
1729
+ to one there are in both cases quite some gains.
1730
+
1731
+ 0:57:23.063 --> 0:57:30.717
1732
+ So that is the overall picture and just find
1733
+ the idea of the routing quite interesting.
1734
+
1735
+ 0:57:30.991 --> 0:57:32.363
1736
+ And UM.
1737
+
1738
+ 0:57:32.212 --> 0:57:38.348
1739
+ It's also getting a bit more increasingly
1740
+ used as there are the so called mixture of
1741
+
1742
+ 0:57:38.348 --> 0:57:39.431
1743
+ expert models.
1744
+
1745
+ 0:57:39.499 --> 0:57:51.801
1746
+ The model learns where to route the input
1747
+ so they are all conditionally activated when
1748
+
1749
+ 0:57:51.801 --> 0:57:53.074
1750
+ you are.
1751
+
1752
+ 0:57:53.213 --> 0:57:59.089
1753
+ But this is not really something specific
1754
+ to multilinguality, so won't talk too much
1755
+
1756
+ 0:57:59.089 --> 0:57:59.567
1757
+ about.
1758
+
1759
+ 0:58:00.620 --> 0:58:02.115
1760
+ No.
1761
+
1762
+ 0:58:01.761 --> 0:58:09.640
1763
+ From this part, the takeaway is first that we talked about
1764
+ the existence of the capacity bottleneck.
1765
+
1766
+ 0:58:10.570 --> 0:58:19.808
1767
+ Where we can partly compensate by adapters
1768
+ or adding language specific capacity, there's
1769
+
1770
+ 0:58:19.808 --> 0:58:23.026
1771
+ the idea of negative transfer.
1772
+
1773
+ 0:58:24.844 --> 0:58:35.915
1774
+ When we add any additional capacity, how can
1775
+ we improve the knowledge sharing?
1776
+
1777
+ 0:58:38.318 --> 0:58:46.662
1778
+ Also, for this one too many directions that
1779
+ seem to be hopeless for multilinguality, can
1780
+
1781
+ 0:58:46.662 --> 0:58:47.881
1782
+ we actually?
1783
+
1784
+ 0:58:49.129 --> 0:58:52.171
1785
+ Yeah, these are all open things still in the
1786
+ area.
1787
+
1788
+ 0:58:53.673 --> 0:59:04.010
1789
+ Data ScarcityNow next part, I'm going to talk
1790
+ about some data challenges for multilingual models.
1791
+
1792
+ 0:59:03.895 --> 0:59:07.667
1793
+ We talked about modeling.
1794
+
1795
+ 0:59:08.488 --> 0:59:14.967
1796
+ But there are these lower resource languages
1797
+ that don't have well curated parallel data.
1798
+
1799
+ 0:59:16.216 --> 0:59:27.539
1800
+ As an alternative, people resort to crawled data
1801
+ from the Internet, there's a lot of noise.
1802
+
1803
+ 0:59:27.927 --> 0:59:36.244
1804
+ And in this paper last year they did some
1805
+ manual analyses of several popular crawled data
1806
+
1807
+ 0:59:36.244 --> 0:59:36.811
1808
+ sets.
1809
+
1810
+ 0:59:37.437 --> 0:59:55.262
1811
+ And you'll see that there are a lot of wrong
1812
+ translations, non-linguistic contents, pornographic
1813
+
1814
+ 0:59:55.262 --> 0:59:57.100
1815
+ contents.
1816
+
1817
+ 0:59:57.777 --> 1:00:04.661
1818
+ So as you can imagine, they say you are what you eat.
1819
+
1820
+ 1:00:04.512 --> 1:00:20.028
1821
+ If you use this kind of data to train a model,
1822
+ you can: So there are also many techniques
1823
+
1824
+ 1:00:20.028 --> 1:00:28.820
1825
+ for filtering and filtering these noisy data
1826
+ sets.
1827
+
1828
+ 1:00:29.809 --> 1:00:36.982
1829
+ So to filter these out we can use an additional
1830
+ classifier that is basically trained to classify
1831
+
1832
+ 1:00:36.982 --> 1:00:43.496
1833
+ which language the sentences are in, and then kick out
1834
+ all the sentences with the wrong language.
1835
+
1836
+ 1:00:45.105 --> 1:00:49.331
1837
+ Another thing is the length ratio.
1838
+
1839
+ 1:00:49.211 --> 1:01:00.202
1840
+ Basically, the assumption there is that if
1841
+ two sentences are translations of each other,.
1842
+
1843
+ 1:01:01.901 --> 1:01:08.718
1844
+ So often people use maybe a ratio of three
1845
+ and then it eliminates the rest.
1846
+
1847
+ 1:01:09.909 --> 1:01:20.187
1848
+ Also, the other idea maybe similar to the
1849
+ language classifier is basically to have an
1850
+
1851
+ 1:01:20.187 --> 1:01:24.540
1852
+ allowed character set per language.
1853
+
1854
+ 1:01:24.419 --> 1:01:28.293
1855
+ So if you're trying to filter.
1856
+
1857
+ 1:01:28.568 --> 1:01:34.622
1858
+ Don't know, Cyrillic scripts or Arabic scripts,
1859
+ then it's maybe a good idea to remove them.
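Putting the heuristics above together, a toy filter could look like the following; `detect_language` stands in for whatever language-ID classifier is used, and the ratio and script rules are example values only.

```python
# Toy parallel-data filter: language check, length ratio, and allowed character set.
import re

LATIN_ONLY = re.compile(r"^[\x00-\x7F\u00C0-\u024F\s]+$")  # rough Latin-script check

def keep_pair(src, tgt, src_lang, tgt_lang, detect_language, max_ratio=3.0):
    if detect_language(src) != src_lang or detect_language(tgt) != tgt_lang:
        return False                                   # wrong language on either side
    ratio = max(len(src), len(tgt)) / max(1, min(len(src), len(tgt)))
    if ratio > max_ratio:
        return False                                   # unlikely to be mutual translations
    if src_lang in {"en", "de", "fr"} and not LATIN_ONLY.match(src):
        return False                                   # unexpected script for this language
    return True
```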
1860
+
1861
+ 1:01:35.775 --> 1:01:43.123
1862
+ This is not all there are many other ideas
1863
+ using some pre-trained neural networks to compare
1864
+
1865
+ 1:01:43.123 --> 1:01:50.629
1866
+ the representations, but just to give you an
1867
+ idea of what our basic techniques were filtering.
1868
+
1869
+ 1:01:50.991 --> 1:01:53.458
1870
+ Is quite important.
1871
+
1872
+ 1:01:53.335 --> 1:02:02.467
1873
+ We have seen in our experience that if you
1874
+ do these thoroughly there is.
1875
+
1876
+ 1:02:03.883 --> 1:02:17.814
1877
+ So after all, even if we do web crawling,
1878
+ there is still a bit of data scarcity problem.
1879
+
1880
+ 1:02:18.118 --> 1:02:30.760
1881
+ So there are many bad things that can happen
1882
+ when there's too little training data.
1883
+
1884
+ 1:02:30.609 --> 1:02:35.430
1885
+ The first is low performances.
1886
+
1887
+ 1:02:35.735 --> 1:02:49.859
1888
+ So they did it on many English system index
1889
+ languages, all together with here means: So
1890
+
1891
+ 1:02:49.859 --> 1:03:04.144
1892
+ we really need to get that area of a lot of
1893
+ data in order to get that ideal performance.
1894
+
1895
+ 1:03:04.884 --> 1:03:20.639
1896
+ There are also many horrible things that can
1897
+ happen in general when you train a model across
1898
+
1899
+ 1:03:20.639 --> 1:03:24.874
1900
+ different training runs.
1901
+
1902
+ 1:03:26.946 --> 1:03:36.733
1903
+ So one solution to tackle this problem, the
1904
+ data scarcity problem, is by fine tuning some
1905
+
1906
+ 1:03:36.733 --> 1:03:38.146
1907
+ pre-trained.
1908
+
1909
+ 1:03:38.979 --> 1:03:46.245
1910
+ And basically the idea is you've got the pre-trained
1911
+ model that can already do translation.
1912
+
1913
+ 1:03:46.846 --> 1:03:54.214
1914
+ Then you fine-tune it on your own training data
1915
+ and you end up with a more specialized model.
1916
+
1917
+ 1:03:55.155 --> 1:03:59.369
1918
+ So why does pretraining help?
1919
+
1920
+ 1:03:59.228 --> 1:04:11.436
1921
+ One argument is that if you do pretraining
1922
+ then the model has seen more data and
1923
+
1924
+ 1:04:11.436 --> 1:04:12.714
1925
+ learned.
1926
+
1927
+ 1:04:13.313 --> 1:04:19.135
1928
+ Say more generalizable representations that
1929
+ can help more downstream tasks.
1930
+
1931
+ 1:04:19.719 --> 1:04:28.063
1932
+ So in this case we are basically trying to
1933
+ make use of the more meaningful and generalizable
1934
+
1935
+ 1:04:28.063 --> 1:04:29.499
1936
+ representation.
1937
+
1938
+ 1:04:30.490 --> 1:04:45.103
1939
+ So for machine translation there are several
1940
+ open source models out there that can handle
1941
+
1942
+ 1:04:45.103 --> 1:04:46.889
1943
+ languages.
1944
+
1945
+ 1:04:48.188 --> 1:04:49.912
1946
+ Two hundred model.
1947
+
1948
+ 1:04:49.822 --> 1:04:53.404
1949
+ They also cover two hundred languages.
1950
+
1951
+ 1:04:53.312 --> 1:04:57.631
1952
+ That means that's quite a lot of translation.
1953
+
1954
+ 1:04:57.978 --> 1:05:06.218
1955
+ However, one thing to remember is that these
1956
+ models are more like a, how do you call it.
1957
+
1958
+ 1:05:06.146 --> 1:05:12.812
1959
+ Jack of all trades is a master of none, in the
1960
+ sense that they are very good as coverage,
1961
+
1962
+ 1:05:12.812 --> 1:05:20.498
1963
+ but if you look at specific translation directions
1964
+ they might be not as good as dedicated models.
1965
+
1966
+ 1:05:21.521 --> 1:05:34.170
1967
+ So here I'm going to have some results by
1968
+ comparing random initialization versus the
1969
+
1970
+ 1:05:34.170 --> 1:05:36.104
1971
+ first thing.
1972
+
1973
+ 1:05:36.396 --> 1:05:46.420
1974
+ The third line is the result of basically
1975
+ fine-tuning a pre-trained model that is one of the
1976
+
1977
+ 1:05:46.420 --> 1:05:47.342
1978
+ family.
1979
+
1980
+ 1:05:47.947 --> 1:05:51.822
1981
+ So in this case you could see the.
1982
+
1983
+ 1:05:51.831 --> 1:05:58.374
1984
+ If we just look at the second line, that is
1985
+ the pre trade model out of the box, you see
1986
+
1987
+ 1:05:58.374 --> 1:06:04.842
1988
+ that if we just use it out of the box, the
1989
+ performance everywhere isn't super great as
1990
+
1991
+ 1:06:04.842 --> 1:06:06.180
1992
+ dedicated models.
1993
+
1994
+ 1:06:07.867 --> 1:06:22.305
1995
+ But then, here the X means English:
1996
+ So the first takeaway here is that if we do
1997
+
1998
+ 1:06:22.305 --> 1:06:31.539
1999
+ pre-train fine-tuning, we gain again when we do it into
2000
+ English,.
2001
+
2002
+ 1:06:33.433 --> 1:06:40.438
2003
+ Here is that we are forgetting.
2004
+
2005
+ 1:06:40.219 --> 1:06:50.514
2006
+ When we do further training there is no data.
2007
+
2008
+ 1:06:50.770 --> 1:07:04.865
2009
+ So even if we initialize from the pre-trained model
2010
+ and continue training, if we don't see translation.
2011
+
2012
+ 1:07:05.345 --> 1:07:13.826
2013
+ So this is bad; machine learning people termed
2014
+ it catastrophic forgetting, in the sense that
2015
+
2016
+ 1:07:13.826 --> 1:07:20.115
2017
+ if you have a model that is trained to do some
2018
+ task and then you.
2019
+
2020
+ 1:07:20.860 --> 1:07:22.487
2021
+ This Is Also Pretty Bad.
2022
+
2023
+ 1:07:24.244 --> 1:07:32.341
2024
+ Is especially bad if you consider training
2025
+ data actually grows over time.
2026
+
2027
+ 1:07:32.231 --> 1:07:35.408
2028
+ It's not like you have one.
2029
+
2030
+ 1:07:36.336 --> 1:07:46.756
2031
+ So in practice we do not always train systems
2032
+ from scratch, so it's more like you have an
2033
+
2034
+ 1:07:46.756 --> 1:07:54.951
2035
+ existing system and later we want to expand
2036
+ the translation coverage.
2037
+
2038
+ 1:07:57.277 --> 1:08:08.932
2039
+ Here and the key question is how do we continue
2040
+ training from an existing system in doing so?
2041
+
2042
+ 1:08:09.909 --> 1:08:12.288
2043
+ Approaches.
2044
+
2045
+ 1:08:12.090 --> 1:08:27.948
2046
+ One very simple one is to include a portion
2047
+ of your previous training so that.
2048
+
2049
+ 1:08:28.148 --> 1:08:34.333
2050
+ So if you consider you have an English German
2051
+ system and now you want to expand it to English
2052
+
2053
+ 1:08:34.333 --> 1:08:34.919
2054
+ French,.
2055
+
2056
+ 1:08:36.036 --> 1:08:42.308
2057
+ Like so nice going English, French and English
2058
+ German, so when you train it you still include
2059
+
2060
+ 1:08:42.308 --> 1:08:45.578
2061
+ a small proportion of your previous German
2062
+ data.
2063
+
2064
+ 1:08:45.512 --> 1:08:51.118
2065
+ Hopefully your model is not forgetting that
2066
+ much about the previously learned German.
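A minimal sketch of this replay idea, assuming we extend an English-German model to English-French; the sampling rate is an arbitrary example.

```python
# Mix a small fraction of the old (English-German) data into every batch of new
# (English-French) data so the model is reminded of what it already learned.
import random

def mixed_batches(new_data, old_data, old_fraction=0.1, batch_size=32):
    while True:
        batch = []
        for _ in range(batch_size):
            if old_data and random.random() < old_fraction:
                batch.append(random.choice(old_data))   # replayed old pair
            else:
                batch.append(random.choice(new_data))   # new language pair
        yield batch
```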
2067
+
2068
+ 1:08:53.073 --> 1:08:58.876
2069
+ Idea here is what we saw earlier.
2070
+
2071
+ 1:08:58.705 --> 1:09:09.803
2072
+ We can also add adaptors and only train them
2073
+ while keeping the.
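In code, this amounts to freezing everything except the newly added adapter weights; a sketch, assuming a PyTorch model whose adapter parameters carry "adapter" in their names (a naming convention we assume here):

```python
# Train only the adapters, keep the generic multilingual model untouched.
def freeze_all_but_adapters(model):
    for name, param in model.named_parameters():
        param.requires_grad = "adapter" in name  # only adapter weights stay trainable
```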
2074
+
2075
+ 1:09:10.170 --> 1:09:26.860
2076
+ So this means we're going to end up with a
2077
+ generic model that was not anyhow changed.
2078
+
2079
+ 1:09:27.447 --> 1:09:37.972
2080
+ So in this way it's also more module and more
2081
+ suitable to the incremental learning kind of.
2082
+
2083
+ 1:09:38.758 --> 1:09:49.666
2084
+ Right in this part, the takeaways guess are
2085
+ first data filtering.
2086
+
2087
+ 1:09:49.501 --> 1:09:55.125
2088
+ As Internet data is very noisy.
2089
+
2090
+ 1:09:56.496 --> 1:10:05.061
2091
+ Second, it's about fine-tuning pre-trained models
2092
+ and how we can or cannot avoid catastrophic
2093
+
2094
+ 1:10:05.061 --> 1:10:06.179
2095
+ forgetting.
2096
+
2097
+ 1:10:07.247 --> 1:10:15.866
2098
+ And of course open questions would include
2099
+ how can we do incremental learning with these
2100
+
2101
+ 1:10:15.866 --> 1:10:19.836
2102
+ multilingual machine translation models?
2103
+
2104
+ 1:10:20.860 --> 1:10:30.247
2105
+ Engineering ChallengesSo with this in mind
2106
+ would like to briefly cover several engineering
2107
+
2108
+ 1:10:30.247 --> 1:10:39.531
2109
+ challenges when we talk about: Yeah, earlier
2110
+ we also briefly talked about how being multilingual
2111
+
2112
+ 1:10:39.531 --> 1:10:49.021
2113
+ means sometimes you have to scale up, you have
2114
+ to make your models bigger just to have that
2115
+
2116
+ 1:10:49.021 --> 1:10:51.394
2117
+ capacity to deal with.
2118
+
2119
+ 1:10:52.472 --> 1:10:59.262
2120
+ This means the model sizes are getting bigger
2121
+ and sometimes having one single is not enough
2122
+
2123
+ 1:10:59.262 --> 1:11:00.073
2124
+ to handle.
2125
+
2126
+ 1:11:00.400 --> 1:11:08.914
2127
+ Here wanted to introduce ideas of going parallel
2128
+ and scaling up.
2129
+
2130
+ 1:11:08.783 --> 1:11:12.848
2131
+ The first is so called model.
2132
+
2133
+ 1:11:14.434 --> 1:11:18.859
2134
+ Don't know if you also had this in other like
2135
+ maury cue related courses.
2136
+
2137
+ 1:11:20.220 --> 1:11:30.639
2138
+ Okay, so the idea of data parallel is basically
2139
+ we train in parallel.
2140
+
2141
+ 1:11:30.790 --> 1:11:35.852
2142
+ We put our model onto several GPUs.
2143
+
2144
+ 1:11:35.707 --> 1:11:47.133
2145
+ We send the same model there and then when
2146
+ we get the training data we split.
2147
+
2148
+ 1:11:48.108 --> 1:11:54.594
2149
+ So each on each of these we are doing the
2150
+ forward and backward pass in parallel.
2151
+
2152
+ 1:11:55.355 --> 1:12:07.779
2153
+ Then after we get the gradients, all these GPUs
2154
+ will be synchronized and the gradients will
2155
+
2156
+ 1:12:07.779 --> 1:12:09.783
2157
+ be aggregated.
2158
+
2159
+ 1:12:11.691 --> 1:12:27.127
2160
+ We are having a bigger batch size in effect,
2161
+ so this would be much faster than, for example,
2162
+
2163
+ 1:12:27.127 --> 1:12:31.277
2164
+ doing all these smaller.
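Conceptually, a data-parallel step looks like the sketch below. The `compute_gradients` / `apply_gradients` calls are placeholders; real frameworks (for example PyTorch's DistributedDataParallel) handle the gradient synchronization for you.

```python
# One data-parallel training step: split the batch over replicas, compute gradients
# in parallel, average ("all-reduce") them, then do a single synchronized update.
def data_parallel_step(replicas, optimizer, batch):
    shards = [batch[i::len(replicas)] for i in range(len(replicas))]
    grads = [rep.compute_gradients(shard) for rep, shard in zip(replicas, shards)]
    averaged = [sum(per_param) / len(grads) for per_param in zip(*grads)]
    optimizer.apply_gradients(averaged)
```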
2165
+
2166
+ 1:12:32.772 --> 1:12:45.252
2167
+ That is, if your model itself is too big to
2168
+ fit onto a single GPU, so you have to split
2169
+
2170
+ 1:12:45.252 --> 1:12:46.084
2171
+ this.
2172
+
2173
+ 1:12:46.486 --> 1:12:51.958
2174
+ And honestly, the model itself, unless you're
2175
+ going for those.
2176
+
2177
+ 1:12:51.891 --> 1:12:55.500
2178
+ Huge models the industry made these days.
2179
+
2180
+ 1:12:55.414 --> 1:13:03.198
2181
+ I've never run into a situation where the
2182
+ single model itself does not fit onto one GPU
2183
+
2184
+ 1:13:03.198 --> 1:13:03.717
2185
+ here.
2186
+
2187
+ 1:13:03.631 --> 1:13:08.476
2188
+ Realistically, it's more the what is memory
2189
+ consuming.
2190
+
2191
+ 1:13:08.528 --> 1:13:14.871
2192
+ It is more the backward pass and the optimizer
2193
+ states that need to be stored.
2194
+
2195
+ 1:13:15.555 --> 1:13:22.193
2196
+ So but still there are people training gigantic
2197
+ models where they have to go model parallel.
2198
+
2199
+ 1:13:22.602 --> 1:13:35.955
2200
+ This means you have a model consisting of
2201
+ all those orange parts, but it doesn't fit on one GPU, so you
2202
+
2203
+ 1:13:35.955 --> 1:13:40.714
2204
+ split it, for example the next several layers onto another device.
2205
+
2206
+ 1:13:41.581 --> 1:13:51.787
2207
+ So this means when you do the forward pass
2208
+ you have to wait and to finish before doing.
2209
+
2210
+ 1:13:52.532 --> 1:14:11.193
2211
+ And this kind of implementation is sometimes
2212
+ a bit architecture-specific.
2213
+
2214
+ 1:14:12.172 --> 1:14:17.177
2215
+ Right, so there's one more thing when scaling
2216
+ up.
2217
+
2218
+ 1:14:17.077 --> 1:14:19.184
2219
+ Want it to mention.
2220
+
2221
+ 1:14:20.080 --> 1:14:25.687
2222
+ We also talked about it briefly earlier.
2223
+
2224
+ 1:14:25.550 --> 1:14:34.032
2225
+ We said that when we go multilingual we need
2226
+ a vocabulary that.
2227
+
2228
+ 1:14:34.614 --> 1:14:40.867
2229
+ And can give you some numbers.
2230
+
2231
+ 1:14:40.665 --> 1:14:53.578
2232
+ Most of the pre-trained multilingual models here
2233
+ use a vocabulary.
2234
+
2235
+ 1:14:53.933 --> 1:14:58.454
2236
+ Normally each vector is.
2237
+
2238
+ 1:14:58.273 --> 1:15:10.754
2239
+ This means just the word embedding table alone
2240
+ is times parameters.
2241
+
2242
+ 1:15:11.011 --> 1:15:18.620
2243
+ This means just for the embedding table alone
2244
+ it's already taking million parameters of the.
2245
+
2246
+ 1:15:19.859 --> 1:15:28.187
2247
+ And this is often one of the largest parts
2248
+ of the machine.
2249
+
2250
+ 1:15:28.046 --> 1:15:31.299
2251
+ This also comes with.
2252
+
2253
+ 1:15:31.651 --> 1:15:43.891
2254
+ So one question is how can we efficiently
2255
+ represent a multilingual vocabulary?
2256
+
2257
+ 1:15:43.736 --> 1:15:49.008
2258
+ Are there better ways than just?
2259
+
2260
+ 1:15:50.750 --> 1:16:00.526
2261
+ There are many ideas out there people tried, maybe
2262
+ not all targeted at multilinguality, but think.
2263
+
2264
+ 1:16:00.840 --> 1:16:03.635
2265
+ So one is byte-level representation.
2266
+
2267
+ 1:16:03.743 --> 1:16:11.973
2268
+ So the idea there is if we train with data
2269
+ they're all stored on computers, so all their
2270
+
2271
+ 1:16:11.973 --> 1:16:15.579
2272
+ characters must be represented in bytes.
2273
+
2274
+ 1:16:15.486 --> 1:16:23.717
2275
+ So they want to then not using subwords, not
2276
+ using characters, but using bytes instead.
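A tiny example of what byte-level representation means in practice (the text is arbitrary):

```python
# Every string becomes a sequence of UTF-8 byte values (0-255), so one small vocabulary
# covers all languages, but sequences get longer, especially for non-Latin scripts.
text = "Übersetzung 翻译"
byte_ids = list(text.encode("utf-8"))
print(len(text), len(byte_ids))  # 14 characters vs. 19 byte tokens
```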
2277
+
2278
+ 1:16:25.905 --> 1:16:27.693
2279
+ Do You See Some Downsides?
2280
+
2281
+ 1:16:31.791 --> 1:16:38.245
2282
+ There are some languages that are easier to
2283
+ represent than others.
2284
+
2285
+ 1:16:38.148 --> 1:16:40.561
2286
+ That's definitely true.
2287
+
2288
+ 1:16:41.081 --> 1:16:44.981
2289
+ So if you have a sentence normally of five
2290
+ words,.
2291
+
2292
+ 1:16:46.246 --> 1:16:59.899
2293
+ You think about if we split it into characters,
2294
+ how many characters we have, and each character
2295
+
2296
+ 1:16:59.899 --> 1:17:04.166
2297
+ that would be how many bites.
2298
+
2299
+ 1:17:04.424 --> 1:17:15.749
2300
+ And then it's more to model, it's more for
2301
+ the model to learn, and it's also a bigger
2302
+
2303
+ 1:17:15.749 --> 1:17:19.831
2304
+ sequence to give to the model.
2305
+
2306
+ 1:17:20.260 --> 1:17:22.038
2307
+ Yeah.
2308
+
2309
+ 1:17:21.941 --> 1:17:31.232
2310
+ Visual representation is also quite interesting,
2311
+ so some people argued that we don't want to
2312
+
2313
+ 1:17:31.232 --> 1:17:35.428
2314
+ have a fixed discrete vocabulary anymore.
2315
+
2316
+ 1:17:35.328 --> 1:17:41.923
2317
+ Instead, we want to do it like OCR, like reading
2318
+ them as images.
2319
+
2320
+ 1:17:42.942 --> 1:17:55.403
2321
+ We'll look at one example for this next: Then
2322
+ another idea is how if you can distill the
2323
+
2324
+ 1:17:55.403 --> 1:18:03.943
2325
+ vocabulary as in learning some more compact
2326
+ representation,.
2327
+
2328
+ 1:18:04.284 --> 1:18:12.554
2329
+ But next wanted to show you an example of
2330
+ pixel inputs for multilingual machine translation.
2331
+
2332
+ 1:18:12.852 --> 1:18:29.757
2333
+ If you look at the picture, all the characters
2334
+ that are marked with red are actually not.
2335
+
2336
+ 1:18:32.772 --> 1:18:48.876
2337
+ They are actually from a different script
2338
+ for the model and let it do the subword tokenization.
2339
+
2340
+ 1:18:52.852 --> 1:19:04.373
2341
+ You would get maybe mostly characters out
2342
+ of it because I guess in the pre existing vocabulary
2343
+
2344
+ 1:19:04.373 --> 1:19:07.768
2345
+ there won't be Latin H and.
2346
+
2347
+ 1:19:07.707 --> 1:19:16.737
2348
+ So you'll get characters out of it, which
2349
+ means it's probably going to be more difficult
2350
+
2351
+ 1:19:16.737 --> 1:19:18.259
2352
+ for the model.
2353
+
2354
+ 1:19:20.140 --> 1:19:28.502
2355
+ Yeah, so the motivation for pixel inputs is
2356
+ that there is more sharing across languages.
2357
+
2358
+ 1:19:30.010 --> 1:19:37.773
2359
+ Here basically illustrates an embedding table
2360
+ for subwords and saying if you have sentences
2361
+
2362
+ 1:19:37.773 --> 1:19:45.705
2363
+ in the Latin script like French and English
2364
+ then it's going to take certain proportions
2365
+
2366
+ 1:19:45.705 --> 1:19:48.152
2367
+ of this big embedding table.
2368
+
2369
+ 1:19:48.328 --> 1:19:56.854
2370
+ While for Arabic and Chinese it's yet again
2371
+ another,.
2372
+
2373
+ 1:19:56.796 --> 1:20:09.037
2374
+ That is not joined with the previous one if
2375
+ we want to have shared representations for
2376
+
2377
+ 1:20:09.037 --> 1:20:11.992
2378
+ different languages.
2379
+
2380
+ 1:20:12.692 --> 1:20:18.531
2381
+ On the other hand, if we're going with pixels,
2382
+ there's definitely more sharing.
2383
+
2384
+ 1:20:22.362 --> 1:20:30.911
2385
+ There's a difference though to a standard
2386
+ kind of normal machine translation pipeline.
2387
+
2388
+ 1:20:32.252 --> 1:20:47.581
2389
+ If you have this brace then how do we go with
2390
+ images into a translation model?
2391
+
2392
+ 1:20:50.690 --> 1:20:58.684
2393
+ We still have to tokenize it somehow, so in
2394
+ this case they do an overlapping sliding window.
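A small sketch of that overlapping sliding window over the rendered text image; the window and stride sizes are made-up values.

```python
# Cut a rendered text image of width `image_width` pixels into overlapping patches;
# each patch would then be embedded, e.g. by a small convolutional block.
def window_offsets(image_width, window=32, stride=24):   # stride < window => overlap
    return list(range(0, max(1, image_width - window + 1), stride))

print(window_offsets(200))  # start positions of the patches for a 200-pixel rendering
```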
2395
+
2396
+ 1:20:59.259 --> 1:21:13.636
2397
+ Since it's more visual, we're using some kind
2398
+ of convolution blocks before going into these
2399
+
2400
+ 1:21:13.636 --> 1:21:14.730
2401
+ black.
2402
+
2403
+ 1:21:15.035 --> 1:21:25.514
2404
+ So here wanted to show that if you go with
2405
+ these more specialist architectures we get
2406
+
2407
+ 1:21:25.514 --> 1:21:27.829
2408
+ pixels and that's.
2409
+
2410
+ 1:21:30.050 --> 1:21:31.310
2411
+ There's also one downside.
2412
+
2413
+ 1:21:31.431 --> 1:21:51.380
2414
+ If we go with pixels and present teachings,
2415
+ what are our challenges?
2416
+
2417
+ 1:21:52.993 --> 1:22:00.001
2418
+ Exactly so as they beat us others here, also
2419
+ pointing out here for their experiments.
2420
+
2421
+ 1:22:01.061 --> 1:22:08.596
2422
+ They only consider a one target language,
2423
+ and this is also on their target side.
2424
+
2425
+ 1:22:08.503 --> 1:22:10.648
2426
+ It's not pixel based.
2427
+
2428
+ 1:22:11.131 --> 1:22:31.033
2429
+ So this is definitely, in my opinion, very
2430
+ interesting steps towards more shared representations.
2431
+
2432
+ 1:22:31.831 --> 1:22:40.574
2433
+ Yeah, so with this kind of out of the box
2434
+ approach just wanted to summarize today's lecture.
2435
+
2436
+ 1:22:41.962 --> 1:22:53.158
2437
+ First, I think we saw why multilinguality is cool,
2438
+ why there are several open challenges out there
2439
+
2440
+ 1:22:53.158 --> 1:22:53.896
2441
+ that.
2442
+
2443
+ 1:22:55.355 --> 1:23:03.601
2444
+ We also saw, like several approaches, how
2445
+ to realize and implement a multilingual machine translation
2446
+
2447
+ 1:23:03.601 --> 1:23:11.058
2448
+ system, and yeah, lastly, we've seen quite
2449
+ some over challenges on what is unsolved.
2450
+
2451
+ 1:23:11.691 --> 1:23:22.403
2452
+ Yeah, so with this want to thank you for being
2453
+ here today and I'm up there if you want.
2454
+
2455
+ 1:23:26.106 --> 1:23:29.727
2456
+ If you have questions, how will we also share
2457
+ with the moment?
2458
+
demo_data/lectures/Lecture-10-13.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8dc282db3512e8731326f1898c8dd757c40f33bd1468ffae249a9374f76fe28
3
+ size 122197601
demo_data/lectures/Lecture-11-15.06.2023/English.vtt ADDED
The diff for this file is too large to render. See raw diff
 
demo_data/lectures/Lecture-11-15.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:018f7b42f2225e9ea6d68c39e22111b3d3e172c045fde57e3dfd6b2ca3df4198
3
+ size 123175586
demo_data/lectures/Lecture-12-20.06.2023/English.vtt ADDED
The diff for this file is too large to render. See raw diff
 
demo_data/lectures/Lecture-12-20.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e86b4df900483ac17cf6e78c131d83ab5f7df2a0790c7ae034502bdce61554f3
3
+ size 158173841
demo_data/lectures/Lecture-13-04.07.2023/English.vtt ADDED
@@ -0,0 +1,2699 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.641 --> 0:00:06.289
4
+ IntroductionHey, so welcome again to today's lecture
5
+ on machine translation.
6
+
7
+ 0:00:07.968 --> 0:00:15.152
8
+ This week we'll have a bit of different focus,
9
+ so the last two weeks or so we have been looking into.
10
+
11
+ 0:00:15.655 --> 0:00:28.073
12
+ How we can improve our system by having more
13
+ data, other data sources, or using them to
14
+
15
+ 0:00:28.073 --> 0:00:30.331
16
+ more efficient.
17
+
18
+ 0:00:30.590 --> 0:00:38.046
19
+ And we'll have a bit more of that next week
20
+ with the anti-travised and the context.
21
+
22
+ 0:00:38.338 --> 0:00:47.415
23
+ So that we are shifting from this idea of
24
+ we treat each sentence independently, but treat
25
+
26
+ 0:00:47.415 --> 0:00:49.129
27
+ the translation.
28
+
29
+ 0:00:49.129 --> 0:00:58.788
30
+ Because maybe you can remember from the beginning,
31
+ there are phenomena in machine translation
32
+
33
+ 0:00:58.788 --> 0:01:02.143
34
+ that you cannot correctly check.
35
+
36
+ 0:01:03.443 --> 0:01:14.616
37
+ However, today we want to more look into what
38
+ challenges arise, specifically when we're practically
39
+
40
+ 0:01:14.616 --> 0:01:16.628
41
+ applying machine.
42
+
43
+ 0:01:17.017 --> 0:01:23.674
44
+ And this block will be a total of four different
45
+ lectures.
46
+
47
+ 0:01:23.561 --> 0:01:29.495
48
+ Types of Biases in Machine TranslationWhat
49
+ type of biases are in machine translation can.
50
+
51
+ 0:01:29.729 --> 0:01:37.646
52
+ Just then can we try to improve this, but
53
+ of course the first focus can be at least the.
54
+
55
+ 0:01:37.717 --> 0:01:41.375
56
+ And this, of course, gets more and more important.
57
+
58
+ 0:01:41.304 --> 0:01:48.302
59
+ The more often you apply this type of technology,
60
+ when it was mainly a basic research tool which
61
+
62
+ 0:01:48.302 --> 0:01:53.785
63
+ you were using in a research environment, it's
64
+ not directly that important.
65
+
66
+ 0:01:54.054 --> 0:02:00.370
67
+ But once you apply it to the question, is
68
+ it performed the same for everybody or is it
69
+
70
+ 0:02:00.370 --> 0:02:04.436
71
+ performance of some people less good than other
72
+ people?
73
+
74
+ 0:02:04.364 --> 0:02:10.463
75
+ Does it have specific challenges and we are
76
+ seeing that especially in translation?
77
+
78
+ 0:02:10.710 --> 0:02:13.420
79
+ We have the major challenge.
80
+
81
+ 0:02:13.326 --> 0:02:20.334
82
+ We have the grammatical gender and this is
83
+ not the same in all languages.
84
+
85
+ 0:02:20.520 --> 0:02:35.431
86
+ In English, it's not clear if you talk about
87
+ some person, if it's male or female, and so
88
+
89
+ 0:02:35.431 --> 0:02:39.787
90
+ hopefully you've learned.
91
+
92
+ 0:02:41.301 --> 0:02:49.419
93
+ Just as a brief view, so based on this one
94
+ aspect of application will then have two other
95
+
96
+ 0:02:49.419 --> 0:02:57.807
97
+ aspects: On Thursday we'll look into adaptation,
98
+ so how can we adapt to specific situations?
99
+
100
+ 0:02:58.718 --> 0:03:09.127
101
+ Because we have seen that your systems perform
102
+ well when the test case is similar to the training
103
+
104
+ 0:03:09.127 --> 0:03:15.181
105
+ case, it's always the case you should get training
106
+ data.
107
+
108
+ 0:03:16.036 --> 0:03:27.577
109
+ However, in practical applications, it's not
110
+ always possible to collect really the best
111
+
112
+ 0:03:27.577 --> 0:03:31.642
113
+ fitting data, so in that case.
114
+
115
+ 0:03:32.092 --> 0:03:39.269
116
+ And then the third larger group of applications
117
+ will then be speech translation.
118
+
119
+ 0:03:39.181 --> 0:03:42.993
120
+ What do we have to change in our machine?
121
+
122
+ 0:03:43.323 --> 0:03:53.569
123
+ If we are now not translating text, but if
124
+ we want to translate speech, that will be more
125
+
126
+ 0:03:53.569 --> 0:03:54.708
127
+ lectures.
128
+
129
+ 0:04:00.180 --> 0:04:12.173
130
+ So what are we talking about when we are talking
131
+ about bias from a definition point?
132
+
133
+ 0:04:12.092 --> 0:04:21.799
134
+ Means we are introducing systematic errors
135
+ when testing, and then we encourage the selection
136
+
137
+ 0:04:21.799 --> 0:04:24.408
138
+ of the specific answers.
139
+
140
+ 0:04:24.804 --> 0:04:36.862
141
+ The most prominent case, which is analyzed
142
+ most in the research community, is a bias based
143
+
144
+ 0:04:36.862 --> 0:04:38.320
145
+ on gender.
146
+
147
+ 0:04:38.187 --> 0:04:43.359
148
+ One example: she works in a hospital.
149
+
150
+ 0:04:43.523 --> 0:04:50.787
151
+ It is not directly able to assess whether
152
+ this is now a point or a friend.
153
+
154
+ 0:04:51.251 --> 0:05:07.095
155
+ And although in this one even there is, it's
156
+ possible to ambiguate this based on the context.
157
+
158
+ 0:05:07.127 --> 0:05:14.391
159
+ However, there is yeah, this relation to learn
160
+ is of course not that easy.
161
+
162
+ 0:05:14.614 --> 0:05:27.249
163
+ So the system might also learn more like shortcut
164
+ connections, which might be that in your training
165
+
166
+ 0:05:27.249 --> 0:05:31.798
167
+ data most of the doctors are males.
168
+
169
+ 0:05:32.232 --> 0:05:41.725
170
+ That is like that was too bigly analyzed and
171
+ biased, and we'll focus on that also in this.
172
+
173
+ 0:05:41.641 --> 0:05:47.664
174
+ In this lecture, however, of course, the system
175
+ might be a lot of other biases too, which have
176
+
177
+ 0:05:47.664 --> 0:05:50.326
178
+ been partly investigated in other fields.
179
+
180
+ 0:05:50.263 --> 0:05:53.498
181
+ But I think machine translation is not that
182
+ much.
183
+
184
+ 0:05:53.813 --> 0:05:57.637
185
+ For example, it can be based on your originals.
186
+
187
+ 0:05:57.737 --> 0:06:09.405
188
+ So there is an example for a sentiment analysis
189
+ that's a bit prominent.
190
+
191
+ 0:06:09.243 --> 0:06:15.081
192
+ A sentiment analysis means you're.
193
+
194
+ 0:06:15.035 --> 0:06:16.788
195
+ Like you're seeing it in reviews.
196
+
197
+ 0:06:17.077 --> 0:06:24.045
198
+ And then you can show that with baseline models,
199
+ if the name is Mohammed then the sentiment
200
+
201
+ 0:06:24.045 --> 0:06:30.786
202
+ in a lot of systems will be more negative than
203
+ if it's like a traditional European name.
204
+
205
+ 0:06:31.271 --> 0:06:33.924
206
+ Are with foods that is simple.
207
+
208
+ 0:06:33.839 --> 0:06:36.453
209
+ It's this type of restaurant.
210
+
211
+ 0:06:36.366 --> 0:06:38.809
212
+ It's positive and another.
213
+
214
+ 0:06:39.319 --> 0:06:49.510
215
+ You have other aspects, so we have seen this.
216
+
217
+ 0:06:49.289 --> 0:06:59.485
218
+ We have done some experiments in Vietnamese.
219
+
220
+ 0:06:59.559 --> 0:07:11.040
221
+ And then, for example, you can analyze that
222
+ if he's German, it will address him more
223
+
224
+ 0:07:11.040 --> 0:07:18.484
225
+ formally, while if he is North Korean it'll use
226
+ an informal.
227
+
228
+ 0:07:18.838 --> 0:07:24.923
229
+ So these are also possible types of biases.
230
+
231
+ 0:07:24.785 --> 0:07:31.012
232
+ However, this is difficult types of biases.
233
+
234
+ 0:07:31.251 --> 0:07:38.903
235
+ However, especially in translation, the bias
236
+ for gender is the most challenging because
237
+
238
+ 0:07:38.903 --> 0:07:42.989
239
+ we are treating gender in different languages.
240
+
241
+ 0:07:45.405 --> 0:07:46.930
242
+ Why is this challenging?
243
+
244
+ 0:07:48.148 --> 0:07:54.616
245
+ The reason for that is that there is a translation
246
+ mismatch and we have, I mean, one reason for
247
+
248
+ 0:07:54.616 --> 0:08:00.140
249
+ that is there's a translation mismatch and
250
+ that's the most challenging situation.
251
+
252
+ 0:08:00.073 --> 0:08:05.733
253
+ So there is there is different information
254
+ in the Sears language or in the target.
255
+
256
+ 0:08:06.046 --> 0:08:08.832
257
+ So if we have the English word 'player'.
258
+
259
+ 0:08:09.029 --> 0:08:12.911
260
+ It's there is no information about the gender
261
+ in there.
262
+
263
+ 0:08:12.842 --> 0:08:19.043
264
+ However, if you want to translate in German,
265
+ you cannot easily generate a word without a
266
+
267
+ 0:08:19.043 --> 0:08:20.437
268
+ gender information.
269
+
270
+ 0:08:20.367 --> 0:08:27.057
271
+ Of course, you could do something like 'Spieler*in',
272
+ but that sounds a bit weird if you're talking.
273
+
274
+ 0:08:27.027 --> 0:08:29.006
275
+ About a specific person.
276
+
277
+ 0:08:28.927 --> 0:08:32.333
278
+ Then you should use the appropriate font.
279
+
280
+ 0:08:32.692 --> 0:08:44.128
281
+ And so it's most challenging translation as
282
+ always in this situation where you have less
283
+
284
+ 0:08:44.128 --> 0:08:50.939
285
+ information on the source side but more information.
286
+
287
+ 0:08:51.911 --> 0:08:57.103
288
+ Similar things like if you think about Japanese,
289
+ for example where there's different formality
290
+
291
+ 0:08:57.103 --> 0:08:57.540
292
+ levels.
293
+
294
+ 0:08:57.485 --> 0:09:02.291
295
+ If in German there is no formality or like
296
+ two only or in English there's no formality
297
+
298
+ 0:09:02.291 --> 0:09:02.677
299
+ level.
300
+
301
+ 0:09:02.862 --> 0:09:08.139
302
+ And now you have to estimate the formality
303
+ level.
304
+
305
+ 0:09:08.034 --> 0:09:10.830
306
+ Of course, it takes some.
307
+
308
+ 0:09:10.722 --> 0:09:13.845
309
+ It's not directly possible.
310
+
311
+ 0:09:14.094 --> 0:09:20.475
312
+ What nowadays systems are doing is at least
313
+ assess.
314
+
315
+ 0:09:20.352 --> 0:09:27.472
316
+ This is a situation where don't have enough
317
+ information.
318
+
319
+ 0:09:27.567 --> 0:09:28.656
320
+ Translation.
321
+
322
+ 0:09:28.572 --> 0:09:34.939
323
+ So here you have that suggesting it can be
324
+ doctor or doctora in Spanish.
325
+
326
+ 0:09:35.115 --> 0:09:37.051
327
+ So that is a possibility.
328
+
329
+ 0:09:36.977 --> 0:09:41.597
330
+ However, it is of course very, very challenging
331
+ to find out.
332
+
333
+ 0:09:42.062 --> 0:09:46.130
334
+ Is there two really different meanings, or
335
+ is it not the case?
336
+
337
+ 0:09:46.326 --> 0:09:47.933
338
+ You can do the big rule base here.
339
+
340
+ 0:09:47.887 --> 0:09:49.496
341
+ Maybe don't know how they did it.
342
+
343
+ 0:09:49.990 --> 0:09:57.469
344
+ You can, of course, if you are focusing on
345
+ gender, the source and the target is different,
346
+
347
+ 0:09:57.469 --> 0:09:57.879
348
+ and.
349
+
350
+ 0:09:58.118 --> 0:10:05.799
351
+ But if you want to do it more general, it's
352
+ not that easy because there's always.
353
+
354
+ 0:10:06.166 --> 0:10:18.255
355
+ But it's not clear if these are really different
356
+ or if there's only slight differences.
357
+
358
+ 0:10:22.142 --> 0:10:36.451
359
+ Between that another reason why there is a
360
+ bias in there is typically the system tries
361
+
362
+ 0:10:36.451 --> 0:10:41.385
363
+ to always do the most simple thing.
364
+
365
+ 0:10:42.262 --> 0:10:54.483
366
+ And also in your training data there are unintended
367
+ shortcuts or clues only in the training data
368
+
369
+ 0:10:54.483 --> 0:10:59.145
370
+ because you sample them in some way.
371
+
372
+ 0:10:59.379 --> 0:11:06.257
373
+ This example, if she works in a hospital and
374
+ my friend is a nurse, then it might be that
375
+
376
+ 0:11:06.257 --> 0:11:07.184
377
+ one friend.
378
+
379
+ 0:11:08.168 --> 0:11:18.979
380
+ Male and female because it has learned that
381
+ in your training data a doctor is male and a nurse
382
+
383
+ 0:11:18.979 --> 0:11:20.802
384
+ is doing this.
385
+
386
+ 0:11:20.880 --> 0:11:29.587
387
+ And of course, if we are doing maximum likelihood
388
+ approximation as we are doing it in general,
389
+
390
+ 0:11:29.587 --> 0:11:30.962
391
+ we are always predicting the most probable output.
392
+
393
+ 0:11:30.951 --> 0:11:43.562
394
+ So that means if in your training data this
395
+ correlation is maybe in the case then your
396
+
397
+ 0:11:43.562 --> 0:11:48.345
398
+ predictions are always the same.
399
+
400
+ 0:11:48.200 --> 0:11:50.386
401
+ It typically.
402
+
403
+ 0:11:55.035 --> 0:12:06.007
404
+ What does it mean, of course, if we are having
405
+ this type of biases and if we are applying them?
406
+
407
+ 0:12:05.925 --> 0:12:14.821
408
+ It might be that the benefit of machine translation
409
+ rise, so more and more people can benefit from
410
+
411
+ 0:12:14.821 --> 0:12:20.631
412
+ the ability to talk to people in different
413
+ languages and so on.
414
+
415
+ 0:12:20.780 --> 0:12:27.261
416
+ But if you more often use it, problems of
417
+ the system also get more and more important.
418
+
419
+ 0:12:27.727 --> 0:12:36.984
420
+ And so if we are seeing that these problems
421
+ and people nowadays only start to analyze these
422
+
423
+ 0:12:36.984 --> 0:12:46.341
424
+ problems partly, also because if it hasn't
425
+ been used, it's not that important if the quality
426
+
427
+ 0:12:46.341 --> 0:12:47.447
428
+ is so bad.
429
+
430
+ 0:12:47.627 --> 0:12:51.907
431
+ Version or is mixing it all the time like
432
+ we have seen in old systems.
433
+
434
+ 0:12:51.847 --> 0:12:52.996
435
+ Then, of course,.
436
+
437
+ 0:12:53.053 --> 0:12:57.303
438
+ The issue is not that you have biased issues
439
+ that you at first need to create a right view.
440
+
441
+ 0:12:57.637 --> 0:13:10.604
442
+ So only with the wide application of the good
443
+ quality this becomes important, and then of
444
+
445
+ 0:13:10.604 --> 0:13:15.359
446
+ course you should look into how to address it.
447
+
448
+ 0:13:15.355 --> 0:13:21.355
449
+ Challenges in Machine Translation: In order
450
+ to first get aware of what are the challenges,
451
+
452
+ 0:13:21.355 --> 0:13:24.591
453
+ and that is a general idea not only about bias.
454
+
455
+ 0:13:24.764 --> 0:13:31.868
456
+ Of course, we have learned about BLEU scores,
457
+ so how can you evaluate the overall quality, and
458
+
459
+ 0:13:31.868 --> 0:13:36.006
460
+ they are very important, either BLEU or any
461
+ of that.
462
+
463
+ 0:13:35.928 --> 0:13:40.379
464
+ However, they are somehow giving us a general
465
+ overview.
466
+
467
+ 0:13:40.560 --> 0:13:58.410
468
+ And if we want to improve our systems, of
469
+ course it's important that we also do more
470
+
471
+ 0:13:58.410 --> 0:14:00.510
472
+ detailed.
473
+
474
+ 0:14:00.340 --> 0:14:05.828
475
+ Test sets which are very challenging in order
476
+ to see how good these systems really are.
477
+
478
+ 0:14:06.446 --> 0:14:18.674
479
+ Of course, one last reminder to that if you
480
+ do a challenge test set, it's typically good
481
+
482
+ 0:14:18.674 --> 0:14:24.581
483
+ to keep track of your general performance.
484
+
485
+ 0:14:24.784 --> 0:14:28.648
486
+ You don't want to improve normally then on
487
+ the general quality.
488
+
489
+ 0:14:28.688 --> 0:14:41.555
490
+ So if you build a system which will mitigate
491
+ some biases then the aim is that if you evaluate
492
+
493
+ 0:14:41.555 --> 0:14:45.662
494
+ it on the challenging biases.
495
+
496
+ 0:14:45.745 --> 0:14:53.646
497
+ You don't need to get better because the aggregated
498
+ versions don't really measure that aspect well,
499
+
500
+ 0:14:53.646 --> 0:14:57.676
501
+ but if you significantly drop in performance
502
+ then.
503
+
504
+ 0:15:00.000 --> 0:15:19.164
505
+ What are, in general, the harms people report
506
+ about, or why should you care about this?
507
+
508
+ 0:15:19.259 --> 0:15:23.598
509
+ And you're even then amplifying this type
510
+ of stereotypes.
511
+
512
+ 0:15:23.883 --> 0:15:33.879
513
+ And that is not what you want to achieve with
514
+ using this technology.
515
+
516
+ 0:15:33.734 --> 0:15:39.388
517
+ It's not working through some groups.
518
+
519
+ 0:15:39.819 --> 0:15:47.991
520
+ And secondly what is referred to as allocational
521
+ harms.
522
+
523
+ 0:15:47.845 --> 0:15:54.123
524
+ The system might not perform as well for some groups.
525
+
526
+ 0:15:54.314 --> 0:16:00.193
527
+ So another example of which we would like
528
+ to see is that sometimes the translation depends
529
+
530
+ 0:16:00.193 --> 0:16:01.485
531
+ on who is speaking.
532
+
533
+ 0:16:01.601 --> 0:16:03.463
534
+ So here you have it in French.
535
+
536
+ 0:16:03.723 --> 0:16:16.359
537
+ I cannot say it, but the word 'happy' in French has
538
+ to be expressed differently, whether it's a
539
+
540
+ 0:16:16.359 --> 0:16:20.902
541
+ male person or a female person.
542
+
543
+ 0:16:21.121 --> 0:16:28.917
544
+ It's nearly impossible to guess that or it's
545
+ impossible, so then you always select one.
546
+
547
+ 0:16:29.189 --> 0:16:37.109
548
+ And of course, since we do greedy search,
549
+ it will always generate the same, so you will
550
+
551
+ 0:16:37.109 --> 0:16:39.449
552
+ have a worse performance.
553
+
554
+ 0:16:39.779 --> 0:16:46.826
555
+ And of course not what we want to achieve
556
+ in average.
557
+
558
+ 0:16:46.696 --> 0:16:54.006
559
+ You might be then good, but you also have
560
+ the ability.
561
+
562
+ 0:16:54.234 --> 0:17:08.749
563
+ This is a biased problem or an interface problem
564
+ because mean you can say well.
565
+
566
+ 0:17:09.069 --> 0:17:17.358
567
+ And if you do it, we still have a system that
568
+ generates unusable output.
569
+
570
+ 0:17:17.244 --> 0:17:24.059
571
+ If you don't tell it what you want to do,
572
+ so in this case.
573
+
574
+ 0:17:24.244 --> 0:17:27.173
575
+ So in this case it's like if we don't have
576
+ enough information.
577
+
578
+ 0:17:27.467 --> 0:17:34.629
579
+ So you have to adapt your system in some way
580
+ that it can either access the information or output both options.
581
+
582
+ 0:17:34.894 --> 0:17:46.144
583
+ But yeah, how you mean there's different ways
584
+ of how to improve over that first thing is
585
+
586
+ 0:17:46.144 --> 0:17:47.914
587
+ you find out.
588
+
589
+ 0:17:48.688 --> 0:17:53.826
590
+ Then there is different ways of addressing
591
+ them, and they of course differ.
592
+
593
+ 0:17:53.759 --> 0:17:57.546
594
+ Is it a situation where the information is
595
+ available?
596
+
597
+ 0:17:58.038 --> 0:18:12.057
598
+ That's the first case we have, or is it a
599
+ situation where we don't have the information
600
+
601
+ 0:18:12.057 --> 0:18:13.332
602
+ either?
603
+
604
+ 0:18:14.154 --> 0:18:28.787
605
+ Or should give the system maybe the opportunity
606
+ to output those or say don't know this is still
607
+
608
+ 0:18:28.787 --> 0:18:29.701
609
+ open.
610
+
611
+ 0:18:29.769 --> 0:18:35.470
612
+ And even if they have enough information,
613
+ need this additional information, but they
614
+
615
+ 0:18:35.470 --> 0:18:36.543
616
+ are just doing.
617
+
618
+ 0:18:36.776 --> 0:18:51.132
619
+ Which is a bit based on how we find that there
620
+ is research on that, but it's not that easy
621
+
622
+ 0:18:51.132 --> 0:18:52.710
623
+ to solve.
624
+
625
+ 0:18:52.993 --> 0:19:05.291
626
+ But in general, detecting whether we have enough information
627
+ to do a good translation or whether information is
628
+
629
+ 0:19:05.291 --> 0:19:06.433
630
+ missing?
631
+
632
+ 0:19:09.669 --> 0:19:18.951
633
+ But before we come on how we will address
634
+ it or try to change it, and before we look
635
+
636
+ 0:19:18.951 --> 0:19:22.992
637
+ at how we can assess it, of course,.
638
+
639
+ 0:19:23.683 --> 0:19:42.820
640
+ And therefore I wanted to do a bit of a review
641
+ on how gender is represented in languages.
642
+
643
+ 0:19:43.743 --> 0:19:48.920
644
+ Of course, you can have a more fine-grained view.
645
+
646
+ 0:19:48.791 --> 0:20:00.571
647
+ It's not that everything in the group is the
648
+ same, but in general you have a large group.
649
+
650
+ 0:20:01.381 --> 0:20:08.347
651
+ For example, you even don't say ishi or but
652
+ it's just one word for it written.
653
+
654
+ 0:20:08.259 --> 0:20:16.101
655
+ Oh, don't know how it's pronounced, so you
656
+ cannot say from a sentence whether it's ishi
657
+
658
+ 0:20:16.101 --> 0:20:16.725
659
+ or it.
660
+
661
+ 0:20:17.937 --> 0:20:29.615
662
+ Of course, there are some exceptions for whether
663
+ it's a difference between male and female.
664
+
665
+ 0:20:29.488 --> 0:20:35.965
666
+ They have different names for brother and
667
+ sister.
668
+
669
+ 0:20:36.036 --> 0:20:41.772
670
+ So normally you cannot infer whether this
671
+ is a male speaker or speaking about a male
672
+
673
+ 0:20:41.772 --> 0:20:42.649
674
+ or a female.
675
+
676
+ 0:20:44.304 --> 0:20:50.153
677
+ Examples for these languages are, for example,
678
+ Finnish and Turkish.
679
+
680
+ 0:20:50.067 --> 0:21:00.205
681
+ There are more languages, but these are examples. Then
682
+ we have notional gender languages where
683
+
684
+ 0:21:00.205 --> 0:21:05.935
685
+ there's some gender information in there, but
686
+ it's.
687
+
688
+ 0:21:05.905 --> 0:21:08.169
689
+ And this is an example.
690
+
691
+ 0:21:08.075 --> 0:21:15.150
692
+ This is English, which is in that way a nice
693
+ example because most people.
694
+
695
+ 0:21:15.415 --> 0:21:20.164
696
+ So you have there some lexical gender and pronominal
697
+ gender.
698
+
699
+ 0:21:20.083 --> 0:21:23.305
700
+ I mean mamadeta there she-hee and him.
701
+
702
+ 0:21:23.643 --> 0:21:31.171
703
+ And very few words are marked like actor and
704
+ actress, but in general most words are not
705
+
706
+ 0:21:31.171 --> 0:21:39.468
707
+ marked, so it's teacher and lecturer and friend,
708
+ so in all these words the gender is not marked,
709
+
710
+ 0:21:39.468 --> 0:21:41.607
711
+ and so you cannot infer.
712
+
713
+ 0:21:42.622 --> 0:21:48.216
714
+ So the initial Turkish sentence here would
715
+ be translated to either he is a good friend
716
+
717
+ 0:21:48.216 --> 0:21:49.373
718
+ or she is a good.
719
+
720
+ 0:21:51.571 --> 0:22:05.222
721
+ In this case you would have them gender information
722
+ in there, but of course there's a good friend.
723
+
724
+ 0:22:07.667 --> 0:22:21.077
725
+ And then finally there are the grammatical
726
+ gender languages where each noun has a gender.
727
+
728
+ 0:22:20.926 --> 0:22:25.301
729
+ That's the case in Spanish.
730
+
731
+ 0:22:26.186 --> 0:22:34.025
732
+ This is mostly formal, but at least if you're
733
+ talking about a human that also agrees.
734
+
735
+ 0:22:34.214 --> 0:22:38.209
736
+ Of course, it's like the sun.
737
+
738
+ 0:22:38.076 --> 0:22:50.464
739
+ There is no clear thing why the sun should
740
+ be female, and in other language it's different.
741
+
742
+ 0:22:50.390 --> 0:22:56.100
743
+ The matching, and then you also have more
744
+ agreements with this that makes things more
745
+
746
+ 0:22:56.100 --> 0:22:56.963
747
+ complicated.
748
+
749
+ 0:22:57.958 --> 0:23:08.571
750
+ Here he is a good friend and the good is also
751
+ depending on whether it's male or female, so it's
752
+
753
+ 0:23:08.571 --> 0:23:17.131
754
+ changing also based on the gender so you have
755
+ a lot of gender information.
756
+
757
+ 0:23:17.777 --> 0:23:21.364
758
+ Get them, but do you always get them correctly?
759
+
760
+ 0:23:21.289 --> 0:23:25.101
761
+ It might be that they're in English, for example.
762
+
763
+ 0:23:28.748 --> 0:23:36.154
764
+ And since this is the case, and you need to
765
+ like often express the gender even though you
766
+
767
+ 0:23:36.154 --> 0:23:37.059
768
+ might not.
769
+
770
+ 0:23:37.377 --> 0:23:53.030
771
+ Aware of it or it's not possible, there are
772
+ some ways in German how to mark neutral forms.
773
+
774
+ 0:23:54.194 --> 0:24:03.025
775
+ But then it's again from the machine learning
776
+ side of view, of course quite challenging because
777
+
778
+ 0:24:03.025 --> 0:24:05.417
779
+ you only want to use the.
780
+
781
+ 0:24:05.625 --> 0:24:11.108
782
+ If it's known to the reader you want to use
783
+ the correct, the not mutual form but either
784
+
785
+ 0:24:11.108 --> 0:24:12.354
786
+ the male or female.
787
+
788
+ 0:24:13.013 --> 0:24:21.771
789
+ So they are assessing what is known to the
790
+ reader as a challenge which needs to in some
791
+
792
+ 0:24:21.771 --> 0:24:23.562
793
+ way be addressed.
794
+
795
+ 0:24:26.506 --> 0:24:30.887
796
+ Here why does that happen?
797
+
798
+ 0:24:30.725 --> 0:24:42.086
799
+ Three reasons we have that in a bit so one
800
+ is, of course, that your.
801
+
802
+ 0:24:42.162 --> 0:24:49.003
803
+ For example: If you look at the Europarl corpus,
804
+ which is an important resource for doing machine
805
+
806
+ 0:24:49.003 --> 0:24:49.920
807
+ translation.
808
+
809
+ 0:24:50.010 --> 0:24:59.208
810
+ Then there's only thirty percent of the speakers
811
+ are female, and so if you train a model on
812
+
813
+ 0:24:59.208 --> 0:25:06.606
814
+ that data, if you're translating to French,
815
+ there will be a male version.
816
+
817
+ 0:25:06.746 --> 0:25:10.762
818
+ And so you'll just have a lot more, like seventy
819
+ percent, of your male forms in it.
820
+
821
+ 0:25:10.971 --> 0:25:18.748
822
+ And that will be Yep will make the model therefore
823
+ from this data sub.
824
+
825
+ 0:25:18.898 --> 0:25:25.882
826
+ And of course this will be in the data for
827
+ a very long time.
828
+
829
+ 0:25:25.768 --> 0:25:33.669
830
+ So if there's more female speakers in the
831
+ European Parliament, but.
832
+
833
+ 0:25:33.933 --> 0:25:42.338
834
+ But we are training on historical data, so
835
+ even if there is for a long time, it will not
836
+
837
+ 0:25:42.338 --> 0:25:43.377
838
+ be in the.
839
+
840
+ 0:25:46.346 --> 0:25:57.457
841
+ Then besides this preexisting bias in the data, there
842
+ are of course technical biases which will amplify
843
+
844
+ 0:25:57.457 --> 0:25:58.800
845
+ this type of bias.
846
+
847
+ 0:25:59.039 --> 0:26:04.027
848
+ So one we already address, that's for example
849
+ sampling or beam search.
850
+
851
+ 0:26:03.957 --> 0:26:06.418
852
+ You get the most probable output.
853
+
854
+ 0:26:06.646 --> 0:26:16.306
855
+ So if there's a bias in your model, it will
856
+ amplify that not only in the case we had before,
857
+
858
+ 0:26:16.306 --> 0:26:19.423
859
+ and produce the male version.
860
+
861
+ 0:26:20.040 --> 0:26:32.873
862
+ So if you have the same source sentence like
863
+ 'I am happy' and in your training data it will
864
+
865
+ 0:26:32.873 --> 0:26:38.123
866
+ be male and female; if you're doing beam search you always get the more frequent one.
867
+
868
+ 0:26:38.418 --> 0:26:44.510
869
+ So in that way by doing this type of algorithmic
870
+ design you will amplify the bias.
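As a rough illustration of this amplification effect, here is a toy sketch with made-up numbers (the 70/30 split only echoes the Europarl statistic mentioned earlier): if the model learned to prefer the male form 70% of the time, always taking the argmax outputs it 100% of the time, while sampling roughly preserves the training ratio.

```python
import random

# Made-up conditional distribution for an ambiguous source like "I am happy"
# when translating into a gender-marking target language.
p = {"male_form": 0.7, "female_form": 0.3}

def argmax_decode():
    return max(p, key=p.get)                       # what greedy/beam-like decoding does

def sample_decode():
    return random.choices(list(p), weights=list(p.values()))[0]

n = 10_000
greedy = sum(argmax_decode() == "male_form" for _ in range(n)) / n
sampled = sum(sample_decode() == "male_form" for _ in range(n)) / n
print(f"male form: training share 0.70, argmax {greedy:.2f}, sampling {sampled:.2f}")
# argmax amplifies 0.70 -> 1.00, sampling stays close to the data distribution
```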
871
+
872
+ 0:26:44.604 --> 0:26:59.970
873
+ Another use case is if you think about a multilingual
874
+ machine translation, for example if you are
875
+
876
+ 0:26:59.970 --> 0:27:04.360
877
+ now doing a pivot language.
878
+
879
+ 0:27:04.524 --> 0:27:13.654
880
+ But if you're first translating to English, this
881
+ information might get lost and then you translate
882
+
883
+ 0:27:13.654 --> 0:27:14.832
884
+ to Spanish.
885
+
886
+ 0:27:15.075 --> 0:27:21.509
887
+ So while in general in this class there is
888
+ not this type of bias there,.
889
+
890
+ 0:27:22.922 --> 0:27:28.996
891
+ You might introduce it because you might have
892
+ good reasons for doing a modular system because
893
+
894
+ 0:27:28.996 --> 0:27:31.968
895
+ you don't have enough training data or so on.
896
+
897
+ 0:27:31.903 --> 0:27:37.570
898
+ It's performing better in average, but of
899
+ course by doing this choice you'll introduce
900
+
901
+ 0:27:37.570 --> 0:27:40.045
902
+ an additional type of bias into your.
903
+
904
+ 0:27:45.805 --> 0:27:52.212
905
+ And then there is what people refer to as
906
+ emergent bias, and that is, if you use a system
907
+
908
+ 0:27:52.212 --> 0:27:58.903
909
+ for a different use case as we see in, generally
910
+ it is the case that is performing worse, but
911
+
912
+ 0:27:58.903 --> 0:28:02.533
913
+ then of course you can have even more challenging biases.
914
+
915
+ 0:28:02.942 --> 0:28:16.196
916
+ So the extreme case would be if you train
917
+ a system only on male speakers, then of course
918
+
919
+ 0:28:16.196 --> 0:28:22.451
920
+ it will perform worse on female speakers.
921
+
922
+ 0:28:22.902 --> 0:28:36.287
923
+ So, of course, if you're doing this type of
924
+ problem, if you use a system for a different
925
+
926
+ 0:28:36.287 --> 0:28:42.152
927
+ situation than it was originally designed for, then.
928
+
929
+ 0:28:44.004 --> 0:28:54.337
930
+ And with this we would then go for type of
931
+ evaluation, but before we are looking at how
932
+
933
+ 0:28:54.337 --> 0:28:56.333
934
+ we can evaluate.
935
+
936
+ 0:29:00.740 --> 0:29:09.484
937
+ Stereotypes in Machine Translation: Before we
938
+ want to look into how we can improve the system,
939
+
940
+ 0:29:09.484 --> 0:29:13.527
941
+ think yeah, maybe at the moment most work.
942
+
943
+ 0:29:13.954 --> 0:29:21.659
944
+ And the one thing is the system trying to
945
+ look into stereotypes.
946
+
947
+ 0:29:21.541 --> 0:29:26.167
948
+ So how does a system use stereotypes?
949
+
950
+ 0:29:26.466 --> 0:29:29.443
951
+ So if you have the Hungarian sentence,.
952
+
953
+ 0:29:29.729 --> 0:29:33.805
954
+ Which should be he is an engineer or she is
955
+ an engineer.
956
+
957
+ 0:29:35.375 --> 0:29:43.173
958
+ And you cannot guess that because we saw that
959
+ he and she is not different in Hungarian.
960
+
961
+ 0:29:43.423 --> 0:29:57.085
962
+ Then you can have a test set where you have
963
+ these type of ailanomal occupations.
964
+
965
+ 0:29:56.977 --> 0:30:03.862
966
+ You have statistics from how is the distribution
967
+ by gender so you can automatically generate
968
+
969
+ 0:30:03.862 --> 0:30:04.898
970
+ the sentence.
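A minimal sketch of how such a challenge set could be generated automatically (the template, occupation list and statistics below are made-up placeholders, not the actual benchmark data): occupations with known gender statistics are slotted into a fixed sentence pattern, labelled stereotypical or anti-stereotypical.

```python
# Toy generator for an occupation challenge set (illustrative only).
occupations = {"engineer": 0.15, "nurse": 0.90}   # made-up share of female workers

def make_examples():
    examples = []
    for job, female_share in occupations.items():
        article = "an" if job[0] in "aeiou" else "a"
        for pronoun, gender in [("him", "male"), ("her", "female")]:
            src = f"I've known {pronoun} for a long time, my friend works as {article} {job}."
            stereotypical = (gender == "female") == (female_share > 0.5)
            examples.append((src, gender, "stereotypical" if stereotypical else "anti-stereotypical"))
    return examples

for src, gender, kind in make_examples():
    print(gender, kind, "->", src)
```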
971
+
972
+ 0:30:04.985 --> 0:30:21.333
973
+ Then you could put in jobs which are mostly
974
+ done by a man and then you can check how is
975
+
976
+ 0:30:21.333 --> 0:30:22.448
977
+ your.
978
+
979
+ 0:30:22.542 --> 0:30:33.276
980
+ That is one type of evaluating stereotypes
981
+ that one of the most famous benchmarks called
982
+
983
+ 0:30:33.276 --> 0:30:42.322
984
+ WinoMT is exactly doing. The second type of evaluation
985
+ is about gender preservation.
986
+
987
+ 0:30:42.342 --> 0:30:51.201
988
+ So that is exactly what we have seen beforehand.
989
+
990
+ 0:30:51.020 --> 0:31:00.244
991
+ If this information is not in the text itself,
992
+
993
+ 0:31:00.320 --> 0:31:01.875
994
+ Gender as a speaker.
995
+
996
+ 0:31:02.062 --> 0:31:04.450
997
+ And how good does a system do that?
998
+
999
+ 0:31:04.784 --> 0:31:09.675
1000
+ And we'll see there's, for example, one benchmark
1001
+ on this.
1002
+
1003
+ 0:31:09.592 --> 0:31:15.762
1004
+ For example: For Arabic there is one benchmark
1005
+ on this foot: Audio because if you're now think
1006
+
1007
+ 0:31:15.762 --> 0:31:16.801
1008
+ already of the.
1009
+
1010
+ 0:31:17.157 --> 0:31:25.257
1011
+ From when we're talking about speech translation,
1012
+ it might be interesting because in the speech
1013
+
1014
+ 0:31:25.257 --> 0:31:32.176
1015
+ signal you should have a better guess on whether
1016
+ it's a male or a female speaker.
1017
+
1018
+ 0:31:32.432 --> 0:31:38.928
1019
+ So but mean current systems, mostly you can
1020
+ always add, and they will just first transcribe.
1021
+
1022
+ 0:31:42.562 --> 0:31:45.370
1023
+ Yes, so how do these benchmarks?
1024
+
1025
+ 0:31:45.305 --> 0:31:51.356
1026
+ Look like that, the first one is here.
1027
+
1028
+ 0:31:51.201 --> 0:32:02.839
1029
+ There's an occupation test where it looks
1030
+ like a simple test set because.
1031
+
1032
+ 0:32:03.023 --> 0:32:10.111
1033
+ So I've known either hurry him or pronounce
1034
+ the name for a long time.
1035
+
1036
+ 0:32:10.010 --> 0:32:13.557
1037
+ My friend works as an occupation.
1038
+
1039
+ 0:32:13.833 --> 0:32:16.771
1040
+ So that is like all sentences in that look
1041
+ like that.
1042
+
1043
+ 0:32:17.257 --> 0:32:28.576
1044
+ So in this case you haven't had the biggest
1045
+ work in here, which is friends.
1046
+
1047
+ 0:32:28.427 --> 0:32:33.346
1048
+ So your only checking later is.
1049
+
1050
+ 0:32:34.934 --> 0:32:46.981
1051
+ This can be inferred from whether it's her
1052
+ or her or her, or if it's a proper name, so
1053
+
1054
+ 0:32:46.981 --> 0:32:55.013
1055
+ can you infer it from the name, and then you
1056
+ can compare.
1057
+
1058
+ 0:32:55.115 --> 0:33:01.744
1059
+ So is this because the job description is
1060
+ nearer to friend.
1061
+
1062
+ 0:33:01.633 --> 0:33:06.939
1063
+ Does the system get disturbed by this type
1064
+ of.
1065
+
1066
+ 0:33:08.828 --> 0:33:14.753
1067
+ And there you can then automatically assess
1068
+ yeah this type.
1069
+
1070
+ 0:33:14.774 --> 0:33:18.242
1071
+ Of course, that's what said at the beginning.
1072
+
1073
+ 0:33:18.167 --> 0:33:24.837
1074
+ You shouldn't only rely on that because if
1075
+ you only rely on it you can easily trick the
1076
+
1077
+ 0:33:24.837 --> 0:33:25.444
1078
+ system.
1079
+
1080
+ 0:33:25.368 --> 0:33:31.888
1081
+ So one type of sentence is translated, but
1082
+ of course it can give you very important.
1083
+
1084
+ 0:33:33.813 --> 0:33:35.309
1085
+ Any questions yeah.
1086
+
1087
+ 0:33:36.736 --> 0:33:44.553
1088
+ Much like the evaluation of stereotype, we
1089
+ want the system to agree with stereotypes because
1090
+
1091
+ 0:33:44.553 --> 0:33:46.570
1092
+ it increases precision.
1093
+
1094
+ 0:33:46.786 --> 0:33:47.979
1095
+ No, no, no.
1096
+
1097
+ 0:33:47.880 --> 0:33:53.088
1098
+ In this case, if we say oh yeah, he is an
1099
+ engineer.
1100
+
1101
+ 0:33:52.988 --> 0:34:01.602
1102
+ From the example, it's probably the most likely
1103
+ translation, probably in more cases.
1104
+
1105
+ 0:34:02.702 --> 0:34:08.611
1106
+ Now there is two things, so yeah yeah, so
1107
+ there is two ways of evaluating.
1108
+
1109
+ 0:34:08.533 --> 0:34:15.594
1110
+ The one thing is in this case he's using that
1111
+ he's an engineer, but there is conflicting
1112
+
1113
+ 0:34:15.594 --> 0:34:19.879
1114
+ information that in this case the engineer
1115
+ is female.
1116
+
1117
+ 0:34:20.380 --> 0:34:21.890
1118
+ So anything was.
1119
+
1120
+ 0:34:22.342 --> 0:34:29.281
1121
+ Information yes, so that is the one in the
1122
+ other case.
1123
+
1124
+ 0:34:29.155 --> 0:34:38.746
1125
+ Typically it's not evaluated in that, but
1126
+ in that time you really want it.
1127
+
1128
+ 0:34:38.898 --> 0:34:52.732
1129
+ That's why most of those cases you have evaluated
1130
+ in scenarios where you have context information.
1131
+
1132
+ 0:34:53.453 --> 0:34:58.878
1133
+ How to deal with the other thing is even more
1134
+ challenging to one case where it is the case
1135
+
1136
+ 0:34:58.878 --> 0:35:04.243
1137
+ is what I said before is when it's about the
1138
+ speaker so that the speech translation test.
1139
+
1140
+ 0:35:04.584 --> 0:35:17.305
1141
+ And there they try to look in a way that can
1142
+ you use, so use the audio also as input.
1143
+
1144
+ 0:35:18.678 --> 0:35:20.432
1145
+ Yeah.
1146
+
1147
+ 0:35:20.640 --> 0:35:30.660
1148
+ So if we have a reference where she is an
1149
+ engineer okay, are there efforts to adjust
1150
+
1151
+ 0:35:30.660 --> 0:35:37.497
1152
+ the metric so that our transmissions go into
1153
+ the correct?
1154
+
1155
+ 0:35:37.379 --> 0:35:38.689
1156
+ We don't.
1157
+
1158
+ 0:35:38.618 --> 0:35:40.389
1159
+ Only done for mean this is evaluation.
1160
+
1161
+ 0:35:40.344 --> 0:35:42.388
1162
+ You are not pushing the model for anything.
1163
+
1164
+ 0:35:43.023 --> 0:35:53.458
1165
+ But if you want to do it in training, that
1166
+ you're not doing it this way.
1167
+
1168
+ 0:35:53.315 --> 0:35:58.465
1169
+ I'm not aware of any direct model.
1170
+
1171
+ 0:35:58.638 --> 0:36:04.146
1172
+ Because you have to find out, is it known
1173
+ in this scenario or not?
1174
+
1175
+ 0:36:05.725 --> 0:36:12.622
1176
+ So at least I'm not aware of there's like
1177
+ the directive doing training try to assess
1178
+
1179
+ 0:36:12.622 --> 0:36:13.514
1180
+ more than.
1181
+
1182
+ 0:36:13.813 --> 0:36:18.518
1183
+ Mean there is data augmentation in the way
1184
+ that is done.
1185
+
1186
+ 0:36:18.436 --> 0:36:23.967
1187
+ Think we'll have that later, so what you can
1188
+ do is generate more.
1189
+
1190
+ 0:36:24.144 --> 0:36:35.355
1191
+ You can do that automatically or there's ways
1192
+ of biasing so that you can try to make your
1193
+
1194
+ 0:36:35.355 --> 0:36:36.600
1195
+ training.
1196
+
1197
+ 0:36:36.957 --> 0:36:46.228
1198
+ That's typically not done with focusing on
1199
+ scenarios where you check before or do have
1200
+
1201
+ 0:36:46.228 --> 0:36:47.614
1202
+ information.
1203
+
1204
+ 0:36:49.990 --> 0:36:58.692
1205
+ Mean, but for everyone it's not clear and
1206
+ agree with you in this scenario, the normal
1207
+
1208
+ 0:36:58.692 --> 0:37:01.222
1209
+ evaluation system where.
1210
+
1211
+ 0:37:01.341 --> 0:37:07.006
1212
+ Maybe you could say it shouldn't do always
1213
+ the same but have a distribution like a training
1214
+
1215
+ 0:37:07.006 --> 0:37:12.733
1216
+ data or something like that because otherwise
1217
+ we're amplifying but that current system can't
1218
+
1219
+ 0:37:12.733 --> 0:37:15.135
1220
+ do current systems can't predict both.
1221
+
1222
+ 0:37:15.073 --> 0:37:17.377
1223
+ That's why we see all the beginning.
1224
+
1225
+ 0:37:17.314 --> 0:37:20.864
1226
+ They have this extra interface where they
1227
+ then propose.
1228
+
1229
+ 0:37:24.784 --> 0:37:33.896
1230
+ Another thing is the WinoMT test set, and
1231
+ it started from a challenge set for co-reference
1232
+
1233
+ 0:37:33.896 --> 0:37:35.084
1234
+ resolution.
1235
+
1236
+ 0:37:34.985 --> 0:37:43.503
1237
+ Co-reference resolution means we have a pronoun
1238
+ like 'him' or 'her' and we need to find out what it refers to.
1239
+
1240
+ 0:37:43.823 --> 0:37:53.620
1241
+ So you have 'the doctor asked the nurse to help
1242
+ her in the procedure, and now her does not
1243
+
1244
+ 0:37:53.620 --> 0:37:55.847
1245
+ refer to the nurse.
1246
+
1247
+ 0:37:56.556 --> 0:38:10.689
1248
+ And there you of course have the same type
1249
+ of stewardesses and the same type of buyers
1250
+
1251
+ 0:38:10.689 --> 0:38:15.237
1252
+ as the machine translation.
1253
+
1254
+ 0:38:16.316 --> 0:38:25.165
1255
+ And no think that normally yeah mean maybe
1256
+ that's also biased.
1257
+
1258
+ 0:38:27.687 --> 0:38:37.514
1259
+ No, but if you ask somebody, I guess if you
1260
+ ask somebody, then I mean syntectically it's
1261
+
1262
+ 0:38:37.514 --> 0:38:38.728
1263
+ ambiguous.
1264
+
1265
+ 0:38:38.918 --> 0:38:50.248
1266
+ If you ask somebody to help, then the horror
1267
+ has to refer to that.
1268
+
1269
+ 0:38:50.079 --> 0:38:54.990
1270
+ So it should also help the.
1271
+
1272
+ 0:38:56.396 --> 0:38:57.469
1273
+ Of the time.
1274
+
1275
+ 0:38:57.386 --> 0:39:03.907
1276
+ The doctor is female and says please have
1277
+ me in the procedure, but the other.
1278
+
1279
+ 0:39:04.904 --> 0:39:09.789
1280
+ Oh, you mean that it's helping the third person.
1281
+
1282
+ 0:39:12.192 --> 0:39:16.140
1283
+ Yeah, agree that it could also be yes.
1284
+
1285
+ 0:39:16.039 --> 0:39:19.037
1286
+ Don't know how easy that is.
1287
+
1288
+ 0:39:18.933 --> 0:39:21.109
1289
+ Only know the test.
1290
+
1291
+ 0:39:21.321 --> 0:39:31.820
1292
+ Then guess yeah, then you need a situation
1293
+ context where you know the situation, the other
1294
+
1295
+ 0:39:31.820 --> 0:39:34.589
1296
+ person having problems.
1297
+
1298
+ 0:39:36.936 --> 0:39:42.251
1299
+ Yeah no yeah that is like here when there
1300
+ is additional ambiguity in there.
1301
+
1302
+ 0:39:45.465 --> 0:39:48.395
1303
+ See that pure text models is not always okay.
1304
+
1305
+ 0:39:48.331 --> 0:39:51.136
1306
+ How full mean there is a lot of work also.
1307
+
1308
+ 0:39:52.472 --> 0:40:00.119
1309
+ Will not cover that in the lecture, but there
1310
+ are things like multimodal machine translation
1311
+
1312
+ 0:40:00.119 --> 0:40:07.109
1313
+ where you try to add pictures or something
1314
+ like that to have more context, and then.
1315
+
1316
+ 0:40:10.370 --> 0:40:23.498
1317
+ Yeah, it starts with this, so in order to
1318
+ evaluate that what it does is that you translate
1319
+
1320
+ 0:40:23.498 --> 0:40:25.229
1321
+ the system.
1322
+
1323
+ 0:40:25.305 --> 0:40:32.310
1324
+ It's doing stereotyping so the doctor is male
1325
+ and the nurse is female.
1326
+
1327
+ 0:40:32.492 --> 0:40:42.362
1328
+ And then you're using word alignment, and
1329
+ then you check whether this gender maps with
1330
+
1331
+ 0:40:42.362 --> 0:40:52.345
1332
+ the annotated gender of there, and that is
1333
+ how you evaluate in this WinoMT setting.
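A toy version of this evaluation step (the word list and the alignment are stand-ins; the real pipeline uses automatic word alignment and morphological analysis): look up the target word aligned to the annotated entity and check whether its grammatical gender matches the gold gender.

```python
# Hypothetical German gendered forms for a few occupations.
GENDER_OF = {"Arzt": "male", "Aerztin": "female",
             "Krankenpfleger": "male", "Krankenschwester": "female"}

def correct_gender(hypothesis, alignment, src_index, gold_gender):
    """alignment maps source token index -> target token index (assumed given)."""
    tgt_word = hypothesis.split()[alignment[src_index]]
    return GENDER_OF.get(tgt_word.strip(".,")) == gold_gender

hyp = "Der Arzt bat die Krankenschwester um Hilfe ."
# Gold annotation: the doctor (source index 1) is female in this sentence.
print(correct_gender(hyp, alignment={1: 1}, src_index=1, gold_gender="female"))  # False
```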
1334
+
1335
+ 0:40:52.832 --> 0:40:59.475
1336
+ Mean, as you see, you're only focusing on
1337
+ the situation where you can or where the gender
1338
+
1339
+ 0:40:59.475 --> 0:41:00.214
1340
+ is known.
1341
+
1342
+ 0:41:00.140 --> 0:41:06.915
1343
+ Why for this one you don't do any evaluation,
1344
+ but because nurses can in that case be those
1345
+
1346
+ 0:41:06.915 --> 0:41:08.703
1347
+ and you cannot, as has.
1348
+
1349
+ 0:41:08.728 --> 0:41:19.112
1350
+ The benchmarks are at the moment designed
1351
+ in a way that you only evaluate things that
1352
+
1353
+ 0:41:19.112 --> 0:41:20.440
1354
+ are known.
1355
+
1356
+ 0:41:23.243 --> 0:41:25.081
1357
+ Then yeah, you can have a look.
1358
+
1359
+ 0:41:25.024 --> 0:41:28.905
1360
+ For example, here what people are looking
1361
+ is you can do the first.
1362
+
1363
+ 0:41:28.847 --> 0:41:32.150
1364
+ Oh well, the currency, how often does it do
1365
+ it correct?
1366
+
1367
+ 0:41:32.552 --> 0:41:41.551
1368
+ And there you see these numbers are a bit
1369
+ older.
1370
+
1371
+ 0:41:41.367 --> 0:41:51.838
1372
+ There's more work on that, but this is the
1373
+ first color.
1374
+
1375
+ 0:41:51.731 --> 0:42:01.311
1376
+ Because they do it like in this test, they
1377
+ do it twice, one with him and one with her.
1378
+
1379
+ 0:42:01.201 --> 0:42:04.838
1380
+ So the chance is fifty percent.
1381
+
1382
+ 0:42:05.065 --> 0:42:12.097
1383
+ Except somehow here, the one system seems
1384
+ to be quite good there that everything.
1385
+
1386
+ 0:42:13.433 --> 0:42:30.863
1387
+ What you can also do is look at the difference,
1388
+ where you need to predict female and the difference.
1389
+
1390
+ 0:42:30.850 --> 0:42:40.338
1391
+ It's more often correct on the male forms
1392
+ than on the female forms, and you see that
1393
+
1394
+ 0:42:40.338 --> 0:42:43.575
1395
+ it's except for this system.
1396
+
1397
+ 0:42:43.603 --> 0:42:53.507
1398
+ So would assume that they maybe in this one
1399
+ language did some type of method in there.
1400
+
1401
+ 0:42:55.515 --> 0:42:57.586
1402
+ If you are more often mean there is like.
1403
+
1404
+ 0:42:58.178 --> 0:43:01.764
1405
+ It's not a lot lower, there's one.
1406
+
1407
+ 0:43:01.662 --> 0:43:08.893
1408
+ I don't know why, but if you're always to
1409
+ the same then it should be.
1410
+
1411
+ 0:43:08.789 --> 0:43:14.679
1412
+ You seem to be counter intuitive, so maybe
1413
+ it's better.
1414
+
1415
+ 0:43:15.175 --> 0:43:18.629
1416
+ Don't know exactly how yes, but it's, it's
1417
+ true.
1418
+
1419
+ 0:43:19.019 --> 0:43:20.849
1420
+ Mean, there's very few cases.
1421
+
1422
+ 0:43:20.788 --> 0:43:22.686
1423
+ I also don't know for Russian.
1424
+
1425
+ 0:43:22.624 --> 0:43:27.480
1426
+ I mean, there is, I think, mainly for Russian
1427
+ where you have very low numbers.
1428
+
1429
+ 0:43:27.418 --> 0:43:30.076
1430
+ I mean, I would say like forty five or so.
1431
+
1432
+ 0:43:30.014 --> 0:43:32.878
1433
+ There can be more about renting and sampling.
1434
+
1435
+ 0:43:32.816 --> 0:43:37.287
1436
+ I don't know if they have even more gender
1437
+ or if they have a new tool.
1438
+
1439
+ 0:43:37.224 --> 0:43:38.424
1440
+ I don't think so.
1441
+
1442
+ 0:43:40.040 --> 0:43:46.901
1443
+ Then you have typically even a stronger bias
1444
+ here where you not do the differentiation between
1445
+
1446
+ 0:43:46.901 --> 0:43:53.185
1447
+ how often is it correct for me and the female,
1448
+ but you are distinguishing between the.
1449
+
1450
+ 0:43:53.553 --> 0:44:00.503
1451
+ So you're here, for you can check for each
1452
+ occupation, which is the most important.
1453
+
1454
+ 0:44:00.440 --> 0:44:06.182
1455
+ A comment one based on statistics, and then
1456
+ you take that on the one side and the anti
1457
+
1458
+ 0:44:06.182 --> 0:44:12.188
1459
+ stereotypically on the other side, and you
1460
+ see that not in all cases but in a lot of cases
1461
+
1462
+ 0:44:12.188 --> 0:44:16.081
1463
+ that null probabilities are even higher than
1464
+ on the other.
1465
+
1466
+ 0:44:21.061 --> 0:44:24.595
1467
+ Ah, I'm telling you there's something.
1468
+
1469
+ 0:44:28.668 --> 0:44:32.850
1470
+ But it has to be for a doctor.
1471
+
1472
+ 0:44:32.715 --> 0:44:39.597
1473
+ For example, for a doctor there three don't
1474
+ know.
1475
+
1476
+ 0:44:40.780 --> 0:44:44.275
1477
+ Yeah, but guess here it's mainly imminent
1478
+ job description.
1479
+
1480
+ 0:44:44.215 --> 0:44:45.108
1481
+ So yeah, but.
1482
+
1483
+ 0:44:50.050 --> 0:45:01.145
1484
+ And then there is the Arabic capital gender
1485
+ corpus where it is about more assessing how
1486
+
1487
+ 0:45:01.145 --> 0:45:03.289
1488
+ strong a singer.
1489
+
1490
+ 0:45:03.483 --> 0:45:09.445
1491
+ How that is done is the open subtitles.
1492
+
1493
+ 0:45:09.296 --> 0:45:18.690
1494
+ Corpus is like a corpus of subtitles generated
1495
+ by volunteers.
1496
+
1497
+ 0:45:18.558 --> 0:45:23.426
1498
+ For the Words Like I Mean Myself.
1499
+
1500
+ 0:45:23.303 --> 0:45:30.670
1501
+ And mine, and then they annotated the Arabic
1502
+ sentences, whether here I refer to as a female
1503
+
1504
+ 0:45:30.670 --> 0:45:38.198
1505
+ and masculine, or whether it's ambiguous, and
1506
+ then from the male and female one they generate
1507
+
1508
+ 0:45:38.198 --> 0:45:40.040
1509
+ types of translations.
1510
+
1511
+ 0:45:43.703 --> 0:45:51.921
1512
+ And then a bit more different test sets as
1513
+ the last one that is referred to as the machine.
1514
+
1515
+ 0:45:52.172 --> 0:45:57.926
1516
+ Corpus, which is based on these lectures.
1517
+
1518
+ 0:45:57.789 --> 0:46:05.464
1519
+ In general, this lecture is very important
1520
+ because it.
1521
+
1522
+ 0:46:05.765 --> 0:46:22.293
1523
+ And here is also interesting because you also
1524
+ have the audio signal, and it's done in the
1525
+
1526
+ 0:46:22.293 --> 0:46:23.564
1527
+ worst.
1528
+
1529
+ 0:46:23.763 --> 0:46:27.740
1530
+ In the first case is where it can only be
1531
+ determined based on the speaker.
1532
+
1533
+ 0:46:27.968 --> 0:46:30.293
1534
+ So something like am a good speaker.
1535
+
1536
+ 0:46:30.430 --> 0:46:32.377
1537
+ You cannot do that correctly.
1538
+
1539
+ 0:46:32.652 --> 0:46:36.970
1540
+ However, if you would have the audio signal
1541
+ you should have a much better guess.
1542
+
1543
+ 0:46:37.257 --> 0:46:47.812
1544
+ So it wasn't evaluated, especially machine
1545
+ translation and speech translation system,
1546
+
1547
+ 0:46:47.812 --> 0:46:53.335
1548
+ which take this into account or, of course,.
1549
+
1550
+ 0:46:57.697 --> 0:47:04.265
1551
+ The second thing is where you can do it based
1552
+ on the context.
1553
+
1554
+ 0:47:04.159 --> 0:47:08.717
1555
+ In this case we are not using artificial.
1556
+
1557
+ 0:47:11.011 --> 0:47:15.550
1558
+ Cope from the from the real data, so it's
1559
+ not like artificial creative data, but.
1560
+
1561
+ 0:47:15.815 --> 0:47:20.939
1562
+ Of course, in a lot more work you have to
1563
+ somehow find these in the corpus and use them
1564
+
1565
+ 0:47:20.939 --> 0:47:21.579
1566
+ as a test.
1567
+
1568
+ 0:47:21.601 --> 0:47:27.594
1569
+ Is something she got together with two of
1570
+ her dearest friends, this older woman, and
1571
+
1572
+ 0:47:27.594 --> 0:47:34.152
1573
+ then, of course, here friends can we get from
1574
+ the context, but it might be that some systems
1575
+
1576
+ 0:47:34.152 --> 0:47:36.126
1577
+ ignore that that should be.
1578
+
1579
+ 0:47:36.256 --> 0:47:43.434
1580
+ So you have two test sets in there, two types
1581
+ of benchmarks, and you want to determine which
1582
+
1583
+ 0:47:43.434 --> 0:47:43.820
1584
+ one.
1585
+
1586
+ 0:47:47.787 --> 0:47:54.443
1587
+ Modeling in Machine Translation: Yes, this is
1588
+ how we can evaluate it, so the next question
1589
+
1590
+ 0:47:54.443 --> 0:48:01.397
1591
+ is how can we improve our systems because that's
1592
+ normally how we do evaluation and why we do
1593
+
1594
+ 0:48:01.397 --> 0:48:04.238
1595
+ evaluation so before we go into that?
1596
+
1597
+ 0:48:08.508 --> 0:48:22.685
1598
+ One idea is to do what is referred to as modeling,
1599
+ so the idea is somehow change the model in
1600
+
1601
+ 0:48:22.685 --> 0:48:24.495
1602
+ a way that.
1603
+
1604
+ 0:48:24.965 --> 0:48:38.271
1605
+ And yes, one idea is, of course, if we are
1606
+ giving him more information, the system doesn't
1607
+
1608
+ 0:48:38.271 --> 0:48:44.850
1609
+ need to do a guess without this information.
1610
+
1611
+ 0:48:44.724 --> 0:48:47.253
1612
+ In order to just ambiguate the bias,.
1613
+
1614
+ 0:48:47.707 --> 0:48:59.746
1615
+ The first thing is you can do that on the
1616
+ sentence level, for example, especially if
1617
+
1618
+ 0:48:59.746 --> 0:49:03.004
1619
+ you have the speakers.
1620
+
1621
+ 0:49:03.063 --> 0:49:14.585
1622
+ You can annotate the sentence with whether
1623
+ a speaker is male or female, and then you
1624
+
1625
+ 0:49:14.585 --> 0:49:26.505
1626
+ can: Here we're seeing one thing which is very
1627
+ successful in neuromachine translation and
1628
+
1629
+ 0:49:26.505 --> 0:49:30.743
1630
+ other kinds of neural networks.
1631
+
1632
+ 0:49:31.711 --> 0:49:39.546
1633
+ However, in neuromachine translation, since
1634
+ we have no longer the strong correlation between
1635
+
1636
+ 0:49:39.546 --> 0:49:47.043
1637
+ input and output, the nice thing is you can
1638
+ normally put everything into your input, and
1639
+
1640
+ 0:49:47.043 --> 0:49:50.834
1641
+ if you have enough data, it's well balanced.
1642
+
1643
+ 0:49:51.151 --> 0:50:00.608
1644
+ So how you can do it here is you can add the
1645
+ token here saying female or male if the speaker
1646
+
1647
+ 0:50:00.608 --> 0:50:01.523
1648
+ is male.
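A minimal sketch of this kind of tagging (the tag strings and the example sentence are illustrative, not the lecture's actual setup): the gender is simply prepended as an extra pseudo-token on the source side, both when building the training data and at inference time.

```python
def tag_source(src, speaker_gender=None):
    """Prepend a pseudo-token such as '<female>' so the model can use it to disambiguate."""
    if speaker_gender is None:              # gender unknown: leave the sentence untouched
        return src
    return f"<{speaker_gender}> {src}"

print(tag_source("I am happy.", "female"))  # -> "<female> I am happy."
print(tag_source("I am happy."))            # -> "I am happy."
```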
1649
+
1650
+ 0:50:01.881 --> 0:50:07.195
1651
+ So, of course, this is no longer for human
1652
+ correct translation.
1653
+
1654
+ 0:50:07.112 --> 0:50:09.855
1655
+ It's like female Madam because.
1656
+
1657
+ 0:50:10.090 --> 0:50:22.951
1658
+ If you are doing the same thing then the translation
1659
+ would not be to translate female but can use
1660
+
1661
+ 0:50:22.951 --> 0:50:25.576
1662
+ it to disambiguate.
1663
+
1664
+ 0:50:25.865 --> 0:50:43.573
1665
+ And so this type of tagging is a very commonly
1666
+ used method in order to add more information.
1667
+
1668
+ 0:50:47.107 --> 0:50:54.047
1669
+ So this is first of all a very good thing,
1670
+ a very easy one.
1671
+
1672
+ 0:50:53.931 --> 0:50:57.637
1673
+ You don't have to change your.
1674
+
1675
+ 0:50:58.018 --> 0:51:04.581
1676
+ For example, has also been done if you think
1677
+ about formality in German.
1678
+
1679
+ 0:51:04.490 --> 0:51:11.479
1680
+ Whether you have to produce or, you can: We'll
1681
+ see it on Thursday.
1682
+
1683
+ 0:51:11.375 --> 0:51:19.621
1684
+ It's a very common approach for domains, so
1685
+ you put in the domain beforehand.
1686
+
1687
+ 0:51:19.515 --> 0:51:24.592
1688
+ This is from a Twitter or something like that.
1689
+
1690
+ 0:51:24.904 --> 0:51:36.239
1691
+ Of course, it only learns it if it has seen
1692
+ it and it dees them out, but in this case you
1693
+
1694
+ 0:51:36.239 --> 0:51:38.884
1695
+ don't need an equal.
1696
+
1697
+ 0:51:39.159 --> 0:51:42.593
1698
+ But however, it's still like challenging to
1699
+ get this availability.
1700
+
1701
+ 0:51:42.983 --> 0:51:55.300
1702
+ If you would do that on the first of all,
1703
+ of course, it only works if you really have
1704
+
1705
+ 0:51:55.300 --> 0:52:02.605
1706
+ data from speaking because otherwise it's unclear.
1707
+
1708
+ 0:52:02.642 --> 0:52:09.816
1709
+ You would only have the text and you would
1710
+ not easily see whether it is the male or the
1711
+
1712
+ 0:52:09.816 --> 0:52:14.895
1713
+ female speaker because this information has
1714
+ been removed from.
1715
+
1716
+ 0:52:16.456 --> 0:52:18.745
1717
+ Does anybody of you have an idea of how it
1718
+ fits?
1719
+
1720
+ 0:52:20.000 --> 0:52:25.480
1721
+ Manage that and still get the data of whether
1722
+ it's a male or a female speaking.
1723
+
1724
+ 0:52:32.152 --> 0:52:34.270
1725
+ Can do a small trick.
1726
+
1727
+ 0:52:34.174 --> 0:52:37.836
1728
+ We can just look on the target side.
1729
+
1730
+ 0:52:37.937 --> 0:52:43.573
1731
+ Mean this is, of course, only important if
1732
+ in the target side this is the case.
1733
+
1734
+ 0:52:44.004 --> 0:52:50.882
1735
+ So for your training data you can annotate
1736
+ it based on your target site in German you
1737
+
1738
+ 0:52:50.882 --> 0:52:51.362
1739
+ know.
1740
+
1741
+ 0:52:51.282 --> 0:52:58.383
1742
+ In German you don't know but in Spanish for
1743
+ example you know because different and then
1744
+
1745
+ 0:52:58.383 --> 0:53:00.400
1746
+ you can use grammatical.
1747
+
1748
+ 0:53:00.700 --> 0:53:10.964
1749
+ Of course, at test time you would still need to
1750
+ do that more interface decision.
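A toy heuristic in the spirit of this trick (the word lists are tiny stand-ins; a real setup would use a morphological analyser or richer rules): inspect gendered forms on the target side of the training data and derive from them the tag to attach to the source sentence.

```python
# Toy target-side labelling for an English->Spanish training corpus.
FEMALE_CUES = {"cansada", "encantada", "doctora"}
MALE_CUES = {"cansado", "encantado", "doctor"}

def speaker_tag_from_target(target_sentence):
    tokens = {w.strip(".,!?").lower() for w in target_sentence.split()}
    if tokens & FEMALE_CUES and not tokens & MALE_CUES:
        return "<female>"
    if tokens & MALE_CUES and not tokens & FEMALE_CUES:
        return "<male>"
    return ""                               # ambiguous: leave the source untagged

src, tgt = "I am tired.", "Estoy cansada."
print(f"{speaker_tag_from_target(tgt)} {src}".strip())  # -> "<female> I am tired."
```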
1751
+
1752
+ 0:53:13.954 --> 0:53:18.854
1753
+ And: You can, of course, do it even more advanced.
1754
+
1755
+ 0:53:18.898 --> 0:53:30.659
1756
+ You can even try to add these information
1757
+ to each word, so you're not doing it for the
1758
+
1759
+ 0:53:30.659 --> 0:53:32.687
1760
+ full sentence.
1761
+
1762
+ 0:53:32.572 --> 0:53:42.129
1763
+ If it's unknown, if it's female or if it's
1764
+ male, you know word alignment so you can't
1765
+
1766
+ 0:53:42.129 --> 0:53:42.573
1767
+ do.
1768
+
1769
+ 0:53:42.502 --> 0:53:55.919
1770
+ Here then you can do a word alignment, which
1771
+ is of course not always perfect, but roughly
1772
+
1773
+ 0:53:55.919 --> 0:53:59.348
1774
+ then you can annotate.
1775
+
1776
+ 0:54:01.401 --> 0:54:14.165
1777
+ Now you have these type of inputs where you
1778
+ have one information per word, but on the one
1779
+
1780
+ 0:54:14.165 --> 0:54:16.718
1781
+ end you have the.
1782
+
1783
+ 0:54:17.517 --> 0:54:26.019
1784
+ This has been used before in other scenarios,
1785
+ so you might not put in the gender, but in
1786
+
1787
+ 0:54:26.019 --> 0:54:29.745
1788
+ general this can be other information.
1789
+
1790
+ 0:54:30.090 --> 0:54:39.981
1791
+ And people refer to that or have used that
1792
+ as a factored translation model, so what you
1793
+
1794
+ 0:54:39.981 --> 0:54:42.454
1795
+ may do is you factor.
1796
+
1797
+ 0:54:42.742 --> 0:54:45.612
1798
+ You have the word itself.
1799
+
1800
+ 0:54:45.501 --> 0:54:48.513
1801
+ You might have the gender.
1802
+
1803
+ 0:54:48.401 --> 0:54:55.988
1804
+ You could have more information like don't
1805
+ know, the part of speech.
1806
+
1807
+ 0:54:56.316 --> 0:54:58.564
1808
+ And then you have an embedding for each of
1809
+ them.
1810
+
1811
+ 0:54:59.199 --> 0:55:03.599
1812
+ And you concatenate them, and then you have
1813
+ this concatenated embedding.
1814
+
1815
+ 0:55:03.563 --> 0:55:09.947
1816
+ Which says okay, this is a female plumber
1817
+ or a male plumber or so on.
1818
+
1819
+ 0:55:09.856 --> 0:55:18.032
1820
+ This has additional information and then you
1821
+ can train this factory model where you have
1822
+
1823
+ 0:55:18.032 --> 0:55:22.534
1824
+ the ability to give the model extra information.
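A minimal PyTorch-style sketch of such a factored input representation (vocabulary sizes, dimensions and the factor inventory are arbitrary placeholders): each word embedding is concatenated with the embedding of its gender factor before being fed to the encoder.

```python
import torch
import torch.nn as nn

class FactoredEmbedding(nn.Module):
    """Concatenate a word embedding with a factor (e.g. gender) embedding."""
    def __init__(self, vocab_size=10000, factor_size=3, d_word=500, d_factor=12):
        super().__init__()
        self.word = nn.Embedding(vocab_size, d_word)
        self.factor = nn.Embedding(factor_size, d_factor)   # 0=neutral, 1=male, 2=female

    def forward(self, word_ids, factor_ids):
        return torch.cat([self.word(word_ids), self.factor(factor_ids)], dim=-1)

emb = FactoredEmbedding()
words = torch.tensor([[5, 42, 7]])        # token ids of a short sentence (made up)
facts = torch.tensor([[0, 2, 0]])         # the middle word marked as female
print(emb(words, facts).shape)            # torch.Size([1, 3, 512])
```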
1825
+
1826
+ 0:55:23.263 --> 0:55:35.702
1827
+ And of course now if you are training this
1828
+ way directly you always need to have this information.
1829
+
1830
+ 0:55:36.576 --> 0:55:45.396
1831
+ So that might not be the best way if you want
1832
+ to use a translation system and sometimes don't
1833
+
1834
+ 0:55:45.396 --> 0:55:45.959
1835
+ have.
1836
+
1837
+ 0:55:46.866 --> 0:55:57.987
1838
+ So any idea of how you can train it or what
1839
+ machine learning technique you can use to deal
1840
+
1841
+ 0:55:57.987 --> 0:55:58.720
1842
+ with.
1843
+
1844
+ 0:56:03.263 --> 0:56:07.475
1845
+ Mainly despite it already, many of your things.
1846
+
1847
+ 0:56:14.154 --> 0:56:21.521
1848
+ Drop out so you sometimes put information
1849
+ in there and then you can use dropouts to inputs.
1850
+
1851
+ 0:56:21.861 --> 0:56:27.599
1852
+ Is sometimes put in this information in there,
1853
+ sometimes not, and the system is then able
1854
+
1855
+ 0:56:27.599 --> 0:56:28.874
1856
+ to deal with those.
1857
+
1858
+ 0:56:28.811 --> 0:56:34.776
1859
+ If it doesn't have the information, it's doing
1860
+ some of the best it can do, but if it has the
1861
+
1862
+ 0:56:34.776 --> 0:56:39.203
1863
+ information, it can use the information and
1864
+ maybe do a more rounded.
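A small sketch of this dropout idea (the drop rate is an arbitrary choice): during training the gender tag is randomly removed from some examples, so the model learns to translate both with and without the extra information.

```python
import random

def maybe_drop_tag(tagged_source, drop_prob=0.3):
    """Randomly strip a leading '<male>'/'<female>' tag from a training example."""
    tokens = tagged_source.split()
    if tokens and tokens[0] in ("<male>", "<female>") and random.random() < drop_prob:
        return " ".join(tokens[1:])
    return tagged_source

random.seed(0)
for _ in range(3):
    print(maybe_drop_tag("<female> I am happy ."))
```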
1865
+
1866
+ 0:56:46.766 --> 0:56:52.148
1867
+ Context Based Machine Translation: So then there
1868
+ is, of course, more ways to try to do a moderately
1869
+
1870
+ 0:56:52.148 --> 0:56:52.807
1871
+ biased one.
1872
+
1873
+ 0:56:52.993 --> 0:57:01.690
1874
+ We will only want to mention here because
1875
+ you'll have a full lecture on that next week
1876
+
1877
+ 0:57:01.690 --> 0:57:08.188
1878
+ and that is referred to where context based
1879
+ machine translation.
1880
+
1881
+ 0:57:08.728 --> 0:57:10.397
1882
+ Good, and in this other ones, but.
1883
+
1884
+ 0:57:10.750 --> 0:57:16.830
1885
+ If you translate several sentences well, of
1886
+ course, there are more situations where you
1887
+
1888
+ 0:57:16.830 --> 0:57:17.866
1889
+ can disambiguate.
1890
+
1891
+ 0:57:18.118 --> 0:57:23.996
1892
+ Because it might be that the information is
1893
+ not in the current sentence, but it's in the
1894
+
1895
+ 0:57:23.996 --> 0:57:25.911
1896
+ previous sentence or before.
1897
+
1898
+ 0:57:26.967 --> 0:57:33.124
1899
+ If you have the mean with the speaker maybe
1900
+ not, but if it's referring to, you can core
1901
+
1902
+ 0:57:33.124 --> 0:57:33.963
1903
+ references.
1904
+
1905
+ 0:57:34.394 --> 0:57:40.611
1906
+ They are often referring to things in the
1907
+ previous sentence so you can use them in order
1908
+
1909
+ 0:57:40.611 --> 0:57:44.104
1910
+ to: And that can be done basically and very
1911
+ easy.
1912
+
1913
+ 0:57:44.034 --> 0:57:47.438
1914
+ You'll see more advanced options, but the
1915
+ main.
1916
+
1917
+ 0:57:48.108 --> 0:57:58.516
1918
+ Mean, no machine translation is a sequence
1919
+ to sequence model, which can use any input
1920
+
1921
+ 0:57:58.516 --> 0:58:02.993
1922
+ sequence to output sequence mapping.
1923
+
1924
+ 0:58:02.872 --> 0:58:04.337
1925
+ So now at.
1926
+
1927
+ 0:58:04.484 --> 0:58:11.281
1928
+ So then you can do, for example, five to five
1929
+ translations, or also five to one, and so on.
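One simple way to feed such extra context, sketched below under the assumption that sentences are joined with a special separator token (the details vary between systems): the previous sentences are concatenated to the current one and the model is trained on the longer sequence.

```python
SEP = " <sep> "   # assumed separator token added to the vocabulary

def with_context(sentences, i, window=2):
    """Return sentence i prefixed by up to `window` previous sentences."""
    start = max(0, i - window)
    return SEP.join(sentences[start:i + 1])

doc = ["She is a doctor.", "I met her yesterday.", "I am going to the bank."]
print(with_context(doc, 2))
# -> "She is a doctor. <sep> I met her yesterday. <sep> I am going to the bank."
```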
1930
+
1931
+ 0:58:11.811 --> 0:58:19.211
1932
+ This is not a method like only dedicated to
1933
+ bias, of course, but the hope is.
1934
+
1935
+ 0:58:19.139 --> 0:58:25.534
1936
+ If you're using this because I mean bias often,
1937
+ we have seen that it rises in situations where
1938
+
1939
+ 0:58:25.534 --> 0:58:27.756
1940
+ we're not having enough context.
1941
+
1942
+ 0:58:27.688 --> 0:58:32.940
1943
+ So the idea is if we generally increase our
1944
+ context, it will also help this.
1945
+
1946
+ 0:58:32.932 --> 0:58:42.378
1947
+ Of course, it will help other situations where
1948
+ you need context to disambiguate.
1949
+
1950
+ 0:58:43.603 --> 0:58:45.768
1951
+ Get There If You're Saying I'm Going to the
1952
+ Bank.
1953
+
1954
+ 0:58:46.286 --> 0:58:54.761
1955
+ It's not directly from this sentence clear
1956
+ whether it's the financial institute or the bench
1957
+
1958
+ 0:58:54.761 --> 0:58:59.093
1959
+ for sitting, but maybe if you say afterward,.
1960
+
1961
+ 0:59:02.322 --> 0:59:11.258
1962
+ And then there is in generally a very large
1963
+ amount of work on debiasing the word embeddings.
1964
+
1965
+ 0:59:11.161 --> 0:59:20.098
1966
+ So the one I hear like, I mean, I think that
1967
+ partly comes from the fact that like a first.
1968
+
1969
+ 0:59:21.041 --> 0:59:26.925
1970
+ Or that first research was done often on inspecting
1971
+ the word embeddings and seeing whether they
1972
+
1973
+ 0:59:26.925 --> 0:59:32.503
1974
+ are biased or not, and people found out how
1975
+ there is some bias in there, and then the idea
1976
+
1977
+ 0:59:32.503 --> 0:59:38.326
1978
+ is oh, if you remove them from the word embedded
1979
+ in already, then maybe your system later will
1980
+
1981
+ 0:59:38.326 --> 0:59:39.981
1982
+ not have that strong of a.
1983
+
1984
+ 0:59:40.520 --> 0:59:44.825
1985
+ So how can that work?
1986
+
1987
+ 0:59:44.629 --> 0:59:56.360
1988
+ Or maybe first, how do word embeddings encode
1989
+ bias in there?
1990
+
1991
+ 0:59:56.161 --> 0:59:57.221
1992
+ So.
1993
+
1994
+ 0:59:57.137 --> 1:00:06.152
1995
+ So you can look at the word embedding, and
1996
+ then you can compare the distance of the word
1997
+
1998
+ 1:00:06.152 --> 1:00:11.116
1999
+ compared: And there's like interesting findings.
2000
+
2001
+ 1:00:11.015 --> 1:00:18.285
2002
+ For example, you have the difference in occupation
2003
+ and how similar.
2004
+
2005
+ 1:00:18.678 --> 1:00:33.068
2006
+ And of course it's not a perfect correlation,
2007
+ but you see some type of correlation: jobs
2008
+
2009
+ 1:00:33.068 --> 1:00:37.919
2010
+ which have a high occupation.
2011
+
2012
+ 1:00:37.797 --> 1:00:41.387
2013
+ They also are more similar to the word what
2014
+ we're going to be talking about.
2015
+
2016
+ 1:00:43.023 --> 1:00:50.682
2017
+ Maybe a secretary is also a bit difficult,
2018
+ but because yeah maybe it's more often.
2019
+
2020
+ 1:00:50.610 --> 1:00:52.438
2021
+ Done in general by by women.
2022
+
2023
+ 1:00:52.375 --> 1:00:58.208
2024
+ However, there is a secretary like the Secretary
2025
+ of State or so, the German minister, which
2026
+
2027
+ 1:00:58.208 --> 1:01:03.406
2028
+ I of course know that many so in the statistics
2029
+ they are not counting that often.
2030
+
2031
+ 1:01:03.543 --> 1:01:11.576
2032
+ But in the data they of course occur quite often,
2033
+ so there's different ways of different meanings.
2034
+
2035
+ 1:01:14.154 --> 1:01:23.307
2036
+ So how can you not try to remove this type
2037
+ of bias?
2038
+
2039
+ 1:01:23.131 --> 1:01:32.992
2040
+ One way is the idea of hard-debiasing the
2041
+ embeddings.
2042
+
2043
+ 1:01:33.113 --> 1:01:39.354
2044
+ So if you remember on word embeddings think
2045
+ we have this image that you can do the difference
2046
+
2047
+ 1:01:39.354 --> 1:01:44.931
2048
+ between man and woman and add this difference
2049
+ to 'king' and then you end up near 'queen'.
2050
+
2051
+ 1:01:45.865 --> 1:01:57.886
2052
+ So here's the idea we want to remove this
2053
+ gender information from some things which should
2054
+
2055
+ 1:01:57.886 --> 1:02:00.132
2056
+ not have gender.
2057
+
2058
+ 1:02:00.120 --> 1:02:01.386
2059
+ The word engineer.
2060
+
2061
+ 1:02:01.320 --> 1:02:06.854
2062
+ There is no information about the gender in
2063
+ that, so you should remove this type.
2064
+
2065
+ 1:02:07.347 --> 1:02:16.772
2066
+ Of course, you first need to find out where
2067
+ this information is and you can.
2068
+
2069
+ 1:02:17.037 --> 1:02:23.603
2070
+ However, normally if you do the difference
2071
+ like the subspace by only one example, it's
2072
+
2073
+ 1:02:23.603 --> 1:02:24.659
2074
+ not the best.
2075
+
2076
+ 1:02:24.924 --> 1:02:31.446
2077
+ So you can do the same thing for things like
2078
+ brother and sister, man and dad, and then you
2079
+
2080
+ 1:02:31.446 --> 1:02:38.398
2081
+ can somehow take the average of these differences
2082
+ saying this is a vector which maps a male from
2083
+
2084
+ 1:02:38.398 --> 1:02:39.831
2085
+ to the female form.
2086
+
2087
+ 1:02:40.660 --> 1:02:50.455
2088
+ And then you can try to neutralize this gender
2089
+ information on this dimension.
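A compact numpy sketch of this neutralisation step (random stand-in vectors again; the full hard-debiasing method also uses a PCA over several definitional pairs and an extra equalisation step): estimate a gender direction from word pairs and project it out of words that should be gender-neutral.

```python
import numpy as np

rng = np.random.default_rng(1)
emb = {w: rng.normal(size=50) for w in
       ["he", "she", "man", "woman", "brother", "sister", "engineer", "nurse"]}

# 1) Estimate the gender direction as the average difference of definitional pairs.
pairs = [("he", "she"), ("man", "woman"), ("brother", "sister")]
g = np.mean([emb[m] - emb[f] for m, f in pairs], axis=0)
g /= np.linalg.norm(g)

# 2) Neutralise: remove the component along g from words that should carry no gender.
def neutralize(v, direction):
    return v - (v @ direction) * direction

for w in ["engineer", "nurse"]:
    emb[w] = neutralize(emb[w], g)
    print(w, "component along gender direction:", round(float(emb[w] @ g), 6))  # ~0.0
```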
2090
+
2091
+ 1:02:50.490 --> 1:02:57.951
2092
+ You can find it's subspace or dimensional.
2093
+
2094
+ 1:02:57.777 --> 1:03:08.884
2095
+ It would be a line, but now this is dimensional,
2096
+ and then you.
2097
+
2098
+ 1:03:08.728 --> 1:03:13.104
2099
+ Representation: Where you remove this type
2100
+ of embellishment.
2101
+
2102
+ 1:03:15.595 --> 1:03:18.178
2103
+ This is, of course, quite strong of the questions.
2104
+
2105
+ 1:03:18.128 --> 1:03:19.058
2106
+ How good does it?
2107
+
2108
+ 1:03:19.006 --> 1:03:20.714
2109
+ Thanks tell them for one other.
2110
+
2111
+ 1:03:20.880 --> 1:03:28.256
2112
+ But it's an idea we are trying to after learning
2113
+ before we are using the word embeddings for
2114
+
2115
+ 1:03:28.256 --> 1:03:29.940
2116
+ machine translation.
2117
+
2118
+ 1:03:29.859 --> 1:03:37.303
2119
+ We are trying to remove the gender information
2120
+ from the jobs and then have a representation
2121
+
2122
+ 1:03:37.303 --> 1:03:38.679
2123
+ which hopefully.
2124
+
2125
+ 1:03:40.240 --> 1:03:45.047
2126
+ A similar idea is the one of gender-neutral
2127
+ GloVe.
2128
+
2129
+ 1:03:44.949 --> 1:03:50.250
2130
+ Glove is another technique to learn word embeddings.
2131
+
2132
+ 1:03:50.750 --> 1:03:52.870
2133
+ Think we discussed one shortly.
2134
+
2135
+ 1:03:52.804 --> 1:03:56.183
2136
+ It was word2vec, which was one of the first
2137
+ one.
2138
+
2139
+ 1:03:56.456 --> 1:04:04.383
2140
+ But there are other of course methods how
2141
+ you can train word embeddings and glove as
2142
+
2143
+ 1:04:04.383 --> 1:04:04.849
2144
+ one.
2145
+
2146
+ 1:04:04.756 --> 1:04:07.464
2147
+ The idea is we're training.
2148
+
2149
+ 1:04:07.747 --> 1:04:19.007
2150
+ At least this is somehow a bit separated,
2151
+ so where you have part of the vector is gender
2152
+
2153
+ 1:04:19.007 --> 1:04:20.146
2154
+ neutral.
2155
+
2156
+ 1:04:20.300 --> 1:04:29.247
2157
+ What you need therefore is three sets of words,
2158
+ so you have male words, female words, and neutral words.
2159
+
2160
+ 1:04:29.769 --> 1:04:39.071
2161
+ And then you're trying to learn some type
2162
+ of vector where some dimensions are not.
2163
+
2164
+ 1:04:39.179 --> 1:04:51.997
2165
+ So the idea is can learn a representation
2166
+ where at least know that this part is gender
2167
+
2168
+ 1:04:51.997 --> 1:04:56.123
2169
+ neutral and the other part.
2170
+
2171
+ 1:05:00.760 --> 1:05:03.793
2172
+ How can we do that?
2173
+
2174
+ 1:05:03.641 --> 1:05:12.363
2175
+ How can we change the system to learn anything
2176
+ specific?
2177
+
2178
+ 1:05:12.210 --> 1:05:20.476
2179
+ Nearly in all cases this works by the loss
2180
+ function.
2181
+
2182
+ 1:05:20.520 --> 1:05:26.206
2183
+ And that is more a general approach in machine
2184
+ translation.
2185
+
2186
+ 1:05:26.111 --> 1:05:30.567
2187
+ The general loss function is we are learning.
2188
+
2189
+ 1:05:31.111 --> 1:05:33.842
2190
+ Here is the same idea.
2191
+
2192
+ 1:05:33.723 --> 1:05:44.378
2193
+ You have the general loss function in order
2194
+ to learn good embeddings and then you try to
2195
+
2196
+ 1:05:44.378 --> 1:05:48.688
2197
+ introduce additional loss function.
2198
+
2199
+ 1:05:48.969 --> 1:05:58.213
2200
+ Yes, I think yes, yes, that's the solution,
2201
+ and how you make sure that if I have training
2202
+
2203
+ 1:05:58.213 --> 1:06:07.149
2204
+ data where all nurses are female, how do you make sure
2205
+ that the algorithm puts it into neutral?
2206
+
2207
+ 1:06:07.747 --> 1:06:12.448
2208
+ And you need, so this is like for only the
2209
+ first learning of word embeddings.
2210
+
2211
+ 1:06:12.388 --> 1:06:18.019
2212
+ Then the idea is if you have word embeddings
2213
+ where the gender is separate and then you train
2214
+
2215
+ 1:06:18.019 --> 1:06:23.711
2216
+ on top of that machine translation where you
2217
+ don't change the embeddings, it should hopefully
2218
+
2219
+ 1:06:23.711 --> 1:06:25.225
2220
+ be less and less biased.
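A tiny sketch of the step just described — keeping the (debiased) embeddings fixed while the translation model is trained on top. PyTorch is assumed, and the matrix is random only so the snippet runs:

```python
import torch
import torch.nn as nn

# stand-in for an already debiased (vocab x dim) embedding matrix
debiased_matrix = torch.randn(1000, 300)

# freeze=True keeps the vectors fixed, so later MT training cannot
# re-introduce the gender correlations into them
emb = nn.Embedding.from_pretrained(debiased_matrix, freeze=True)
```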
2221
+
2222
+ 1:06:25.865 --> 1:06:33.465
2223
+ And in order to train that yes you need additional
2224
+ information, so this information needs to be
2225
+
2226
+ 1:06:33.465 --> 1:06:40.904
2227
+ hand-defined and they can't be general, so
2228
+ you need to have a list of these are male persons
2229
+
2230
+ 1:06:40.904 --> 1:06:44.744
2231
+ or males these are nouns for females and these.
2232
+
2233
+ 1:06:49.429 --> 1:06:52.575
2234
+ So the first step, of course, we still want
2235
+ to have good word embeddings.
2236
+
2237
+ 1:06:54.314 --> 1:07:04.100
2238
+ So you have the normal objective function
2239
+ of the word embedding.
2240
+
2241
+ 1:07:03.949 --> 1:07:09.524
2242
+ It's something like the similarity.
2243
+
2244
+ 1:07:09.849 --> 1:07:19.751
2245
+ How it's exactly derived is not that important
2246
+ because we're not interested in GloVe itself,
2247
+
2248
+ 1:07:19.751 --> 1:07:23.195
2249
+ but you have any loss function.
2250
+
2251
+ 1:07:23.087 --> 1:07:26.857
2252
+ Of course, you have to keep that.
2253
+
2254
+ 1:07:27.167 --> 1:07:38.977
2255
+ And then there are three more loss functions
2256
+ that you can add: So the one is you take the
2257
+
2258
+ 1:07:38.977 --> 1:07:51.325
2259
+ average value of all the male words and the
2260
+ average word embedding of all the female words.
2261
+
2262
+ 1:07:51.731 --> 1:08:00.066
2263
+ So the good thing about this is we don't always
2264
+ need to have for one word the male and the
2265
+
2266
+ 1:08:00.066 --> 1:08:05.837
2267
+ female worship, so it's only like we have a
2268
+ set of male words.
2269
+
2270
+ 1:08:06.946 --> 1:08:21.719
2271
+ So this is just saying yeah, we want these
2272
+ two should be somehow similar to each other.
2273
+
2274
+ 1:08:21.551 --> 1:08:25.421
2275
+ It shouldn't be that.
2276
+
2277
+ 1:08:30.330 --> 1:08:40.081
2278
+ Should be the other one, or think this should
2279
+ be it.
2280
+
2281
+ 1:08:39.897 --> 1:08:45.975
2282
+ This is agenda, the average of.
2283
+
2284
+ 1:08:45.945 --> 1:09:01.206
2285
+ The average should be the same, but if you're
2286
+ looking at the gender dimension, the female should be at the other end.
2287
+
2288
+ 1:09:01.681 --> 1:09:06.959
2289
+ This is like on these dimensions, the male
2290
+ should be on the one and the female on the
2291
+
2292
+ 1:09:06.959 --> 1:09:07.388
2293
+ other.
2294
+
2295
+ 1:09:07.627 --> 1:09:16.123
2296
+ The same yeah, this gender information should
2297
+ be there, so you're pushing all the males to
2298
+
2299
+ 1:09:16.123 --> 1:09:17.150
2300
+ the one side and all the females to the other.
2301
+
2302
+ 1:09:21.541 --> 1:09:23.680
2303
+ Then the neutral words should be.
2304
+
2305
+ 1:09:23.604 --> 1:09:30.389
2306
+ If you have that you see the neutral words,
2307
+ they should be in the middle of between the
2308
+
2309
+ 1:09:30.389 --> 1:09:32.008
2310
+ male and the female.
2311
+
2312
+ 1:09:32.012 --> 1:09:48.261
2313
+ So you take the middle point between all
2314
+ male and female words and just somehow putting
2315
+
2316
+ 1:09:48.261 --> 1:09:51.691
2317
+ the neutral words.
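A rough sketch of the three extra loss terms described above, added on top of whatever the base GloVe objective is. The split into one reserved gender dimension and the three word lists are illustrative assumptions, not the exact published formulation:

```python
import numpy as np

def gender_penalties(E, male_idx, female_idx, neutral_idx, g=1):
    neutral_part, gender_part = E[:, :-g], E[:, -g:]   # last g dims carry gender

    # 1) averages of male and female words should match on the neutral part
    l_avg = np.sum((neutral_part[male_idx].mean(0) - neutral_part[female_idx].mean(0)) ** 2)

    # 2) on the gender dimension, push male words to one end and female to the other
    l_push = np.sum((gender_part[male_idx] - 1.0) ** 2) + np.sum((gender_part[female_idx] + 1.0) ** 2)

    # 3) neutral words should sit at the midpoint between male and female words
    mid = 0.5 * (gender_part[male_idx].mean(0) + gender_part[female_idx].mean(0))
    l_mid = np.sum((gender_part[neutral_idx] - mid) ** 2)

    return l_avg, l_push, l_mid
```

These terms would simply be added, with some weights, to the base embedding loss.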
2318
+
2319
+ 1:09:52.912 --> 1:09:56.563
2320
+ And then you're learning them, and then you
2321
+ can apply them in different ways.
2322
+
2323
+ 1:09:57.057 --> 1:10:03.458
2324
+ So you have this a bit in the pre-training
2325
+ thing.
2326
+
2327
+ 1:10:03.330 --> 1:10:10.337
2328
+ You can use the pre-trained embeddings on
2329
+ the output.
2330
+
2331
+ 1:10:10.208 --> 1:10:23.179
2332
+ Or you can use them, and then you can analyze
2333
+ what happens instead of training them directly.
2334
+
2335
+ 1:10:23.041 --> 1:10:30.506
2336
+ If have this additional loss, which tries
2337
+ to optimize.
2338
+
2339
+ 1:10:32.432 --> 1:10:42.453
2340
+ And then it was evaluated exactly on the sentences
2341
+ we had at the beginning where it is about know
2342
+
2343
+ 1:10:42.453 --> 1:10:44.600
2344
+ her for a long time.
2345
+
2346
+ 1:10:44.498 --> 1:10:48.693
2347
+ My friend works as an accounting clerk.
2348
+
2349
+ 1:10:48.788 --> 1:10:58.049
2350
+ So all these examples are not very difficult
2351
+ to translate, but the question is how often
2352
+
2353
+ 1:10:58.049 --> 1:10:58.660
2354
+ does?
2355
+
2356
+ 1:11:01.621 --> 1:11:06.028
2357
+ That it's not that complicated as you see
2358
+ here, so even the baseline.
2359
+
2360
+ 1:11:06.366 --> 1:11:10.772
2361
+ If you're doing nothing it is working quite well;
2362
+ it's most challenging.
2363
+
2364
+ 1:11:10.709 --> 1:11:16.401
2365
+ It seems overall in the situation where it's
2366
+ a name, so for he and him he has learned the
2367
+
2368
+ 1:11:16.401 --> 1:11:22.282
2369
+ correlation because that's maybe not surprisingly
2370
+ because this correlation occurs more often
2371
+
2372
+ 1:11:22.282 --> 1:11:23.927
2373
+ than with any name there.
2374
+
2375
+ 1:11:24.044 --> 1:11:31.749
2376
+ If you have a name that you can extract, that
2377
+ is talking about Mary, that's female is a lot
2378
+
2379
+ 1:11:31.749 --> 1:11:34.177
2380
+ harder to extract than this.
2381
+
2382
+ 1:11:34.594 --> 1:11:40.495
2383
+ So you'll see already in the baseline this
2384
+ is yeah, not working, not working.
2385
+
2386
+ 1:11:43.403 --> 1:11:47.159
2387
+ And for all the other cases it's working very
2388
+ well.
2389
+
2390
+ 1:11:47.787 --> 1:11:53.921
2391
+ Where the best one is achieved here with
2392
+ hard debiasing both on the encoder and on the decoder.
2393
+
2394
+ 1:11:57.077 --> 1:12:09.044
2395
+ It makes sense that a hard debiasing on the
2396
+ decoder doesn't really work because there you
2397
+
2398
+ 1:12:09.044 --> 1:12:12.406
2399
+ have gender information.
2400
+
2401
+ 1:12:14.034 --> 1:12:17.406
2402
+ For GloVe it seems to already work here.
2403
+
2404
+ 1:12:17.323 --> 1:12:20.204
2405
+ That's maybe surprising and yeah.
2406
+
2407
+ 1:12:20.260 --> 1:12:28.263
2408
+ So there is no clear else we don't have numbers
2409
+ for that doesn't really work well on the other.
2410
+
2411
+ 1:12:28.179 --> 1:12:30.517
2412
+ So how much do I use then?
2413
+
2414
+ 1:12:33.693 --> 1:12:44.720
2415
+ Then as a last way of improving that is a
2416
+ bit what we had mentioned before.
2417
+
2418
+ 1:12:44.575 --> 1:12:48.499
2419
+ That is what is referred.
2420
+
2421
+ 1:12:48.488 --> 1:12:59.133
2422
+ One problem is the bias in the data so you
2423
+ can adapt your data so you can just try to
2424
+
2425
+ 1:12:59.133 --> 1:13:01.485
2426
+ find equal amount.
2427
+
2428
+ 1:13:01.561 --> 1:13:11.368
2429
+ In your data like you adapt your data and
2430
+ then you fine-tune your model on the smaller but
2431
+
2432
+ 1:13:11.368 --> 1:13:12.868
2433
+ you can try.
2434
+
2435
+ 1:13:18.298 --> 1:13:19.345
2436
+ This is line okay.
2437
+
2438
+ 1:13:19.290 --> 1:13:21.584
2439
+ We have access to the data to the model.
2440
+
2441
+ 1:13:21.528 --> 1:13:23.041
2442
+ We can improve our model.
2443
+
2444
+ 1:13:24.564 --> 1:13:31.328
2445
+ One situation we haven't talked a lot about
2446
+ but another situation might also be and that's
2447
+
2448
+ 1:13:31.328 --> 1:13:37.942
2449
+ even getting more important is oh you want
2450
+ to work with a model which you don't have but
2451
+
2452
+ 1:13:37.942 --> 1:13:42.476
2453
+ you want to improve the model without having
2454
+ access so when.
2455
+
2456
+ 1:13:42.862 --> 1:13:49.232
2457
+ Nowadays there are a lot of companies who
2458
+ are not developing their own system but they're
2459
+
2460
+ 1:13:49.232 --> 1:13:52.983
2461
+ using or something like that or machine translation.
2462
+
2463
+ 1:13:53.313 --> 1:13:59.853
2464
+ So there is interest that you might not be
2465
+ able to find children with models completely.
2466
+
2467
+ 1:14:00.080 --> 1:14:10.068
2468
+ So the question is, can you do some type of
2469
+ black box adaptation of a system that takes
2470
+
2471
+ 1:14:10.068 --> 1:14:20.055
2472
+ the black box system but tries to improve it
2473
+ in some ways through: There's some ways of
2474
+
2475
+ 1:14:20.055 --> 1:14:21.417
2476
+ doing that.
2477
+
2478
+ 1:14:21.304 --> 1:14:30.328
2479
+ One is called black box injection and that's
2480
+ what is referred to as prompt.
2481
+
2482
+ 1:14:30.730 --> 1:14:39.793
2483
+ So the problem is if you have sentences you
2484
+ don't have information about the speakers.
2485
+
2486
+ 1:14:39.689 --> 1:14:43.130
2487
+ So how can you put information?
2488
+
2489
+ 1:14:43.984 --> 1:14:53.299
2490
+ And what we know from a large language model,
2491
+ we just prompt them, and you can do that.
2492
+
2493
+ 1:14:53.233 --> 1:14:59.545
2494
+ Translating directly, I love you, you said
2495
+ she said to him, I love you, and then of course
2496
+
2497
+ 1:14:59.545 --> 1:15:01.210
2498
+ you have to strip away.
2499
+
2500
+ 1:15:01.181 --> 1:15:06.629
2501
+ I mean, you cannot prevent the model from
2502
+ translating that, but you should be able to
2503
+
2504
+ 1:15:06.629 --> 1:15:08.974
2505
+ see what is the translation of this.
2506
+
2507
+ 1:15:08.910 --> 1:15:14.849
2508
+ One can strip that away, and now the system
2509
+ had hopefully the information that it's somebody
2510
+
2511
+ 1:15:14.849 --> 1:15:15.552
2512
+ like that.
2513
+
2514
+ 1:15:15.488 --> 1:15:17.023
2515
+ The speaker is female.
2516
+
2517
+ 1:15:18.198 --> 1:15:23.222
2518
+ Because you're no longer translating love
2519
+ you, but you're translating the sentence she
2520
+
2521
+ 1:15:23.222 --> 1:15:24.261
2522
+ said to him love.
2523
+
2524
+ 1:15:24.744 --> 1:15:37.146
2525
+ And so you insert this information as contextual
2526
+ information around it and don't have to change
2527
+
2528
+ 1:15:37.146 --> 1:15:38.567
2529
+ the model.
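A minimal sketch of this black-box injection; `translate` stands for whatever opaque MT API is available (hypothetical here), and the wrapping and stripping is the whole trick:

```python
def translate_with_speaker_gender(sentence, speaker_is_female, translate):
    # wrap the sentence in a context that reveals the speaker's gender,
    # send it through the unchanged system, then strip the context again
    prefix = "She said to him: " if speaker_is_female else "He said to her: "
    output = translate(prefix + '"' + sentence + '"')
    return output.split('"')[1] if '"' in output else output

# dummy "translator" that just echoes its input, only to show the mechanics
print(translate_with_speaker_gender("I love you.", True, lambda s: s))
```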
2530
+
2531
+ 1:15:41.861 --> 1:15:54.518
2532
+ Researches in Machine Translation: The last idea
2533
+ is to do what is referred to as lattice rescoring,
2534
+
2535
+ 1:15:54.518 --> 1:16:01.115
2536
+ so the idea there is you generate a translation.
2537
+
2538
+ 1:16:01.481 --> 1:16:18.547
2539
+ And now you have an additional component which
2540
+ tries to add possibilities where gender information
2541
+
2542
+ 1:16:18.547 --> 1:16:21.133
2543
+ might be lost.
2544
+
2545
+ 1:16:21.261 --> 1:16:29.687
2546
+ It's just a graph in this way, a simplified
2547
+ graph where there's always one word between
2548
+
2549
+ 1:16:29.687 --> 1:16:31.507
2550
+ two nodes and you.
2551
+
2552
+ 1:16:31.851 --> 1:16:35.212
2553
+ So you have something like Zi is an ads or
2554
+ a Zi is an ads.
2555
+
2556
+ 1:16:35.535 --> 1:16:41.847
2557
+ And then you can generate all possible variants.
2558
+
2559
+ 1:16:41.718 --> 1:16:49.320
2560
+ Then, of course, we're not done because the
2561
+ final output.
2562
+
2563
+ 1:16:50.530 --> 1:16:56.999
2564
+ Then you can re-score the system by a gender
2565
+ de-biased model.
2566
+
2567
+ 1:16:56.895 --> 1:17:03.414
2568
+ So the nice thing is why why don't we directly
2569
+ use our model?
2570
+
2571
+ 1:17:03.309 --> 1:17:10.356
2572
+ The idea is our model, which is only focusing
2573
+ on gender debiasing.
2574
+
2575
+ 1:17:10.530 --> 1:17:16.470
2576
+ It can be, for example, if it's just trained
2577
+ on some synthetical data, it will not be that
2578
+
2579
+ 1:17:16.470 --> 1:17:16.862
2580
+ well.
2581
+
2582
+ 1:17:16.957 --> 1:17:21.456
2583
+ But what we can do then is now you can rescore
2584
+ the possible translations in here.
2585
+
2586
+ 1:17:21.721 --> 1:17:31.090
2587
+ And here the cases of course in general structure
2588
+ is already done how to translate the words.
2589
+
2590
+ 1:17:31.051 --> 1:17:42.226
2591
+ Then you're only using the second component
2592
+ in order to rescore some variants and then
2593
+
2594
+ 1:17:42.226 --> 1:17:45.490
2595
+ get the best translation.
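A heavily simplified sketch of the lattice idea: take the first-pass output, branch on gendered alternatives, and let a separate gender-aware scorer pick the best path. The alternative table and the scorer here are dummies for illustration only:

```python
from itertools import product

def rescore_gender_variants(tokens, alternatives, score):
    # one slot per word; words with a gendered alternative get extra options
    options = [[t] + alternatives.get(t, []) for t in tokens]
    candidates = [" ".join(path) for path in product(*options)]
    # the (gender-debiased) model only has to re-rank these variants
    return max(candidates, key=score)

best = rescore_gender_variants(
    ["sie", "ist", "Arzt"],
    {"Arzt": ["Ärztin"]},                  # illustrative alternative table
    score=lambda s: float("Ärztin" in s),  # stand-in for a gender-aware scorer
)
print(best)  # "sie ist Ärztin"
```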
2596
+
2597
+ 1:17:45.925 --> 1:17:58.553
2598
+ And: As the last one there is the post processing
2599
+ so you can't have it.
2600
+
2601
+ 1:17:58.538 --> 1:18:02.830
2602
+ Mean this was one way of post-processing was
2603
+ to generate the lattice and retranslate it.
2604
+
2605
+ 1:18:03.123 --> 1:18:08.407
2606
+ But you can also have a processing, for example
2607
+ only on the target side where you have additional
2608
+
2609
+ 1:18:08.407 --> 1:18:12.236
2610
+ components with checks about the gender which
2611
+ maybe only knows gender.
2612
+
2613
+ 1:18:12.182 --> 1:18:17.073
2614
+ So it's not a machine translation component
2615
+ but more like a grammatical checker which can
2616
+
2617
+ 1:18:17.073 --> 1:18:19.193
2618
+ be used as post-processing to do that.
2619
+
2620
+ 1:18:19.579 --> 1:18:22.926
2621
+ Think about it a bit like when you use ChatGPT.
2622
+
2623
+ 1:18:22.850 --> 1:18:25.833
2624
+ There's also a lot of post processing.
2625
+
2626
+ 1:18:25.757 --> 1:18:32.618
2627
+ If you ask it directly, it would tell you
2628
+ how to build a bomb, but they have some checks
2629
+
2630
+ 1:18:32.618 --> 1:18:35.932
2631
+ either before or after to prevent such things.
2632
+
2633
+ 1:18:36.356 --> 1:18:40.580
2634
+ So often there might be an application system.
2635
+
2636
+ 1:18:40.490 --> 1:18:44.716
2637
+ There might be extra pre and post processing.
2638
+
2639
+ 1:18:48.608 --> 1:18:52.589
2640
+ And yeah, with this we're at the end of.
2641
+
2642
+ 1:18:52.512 --> 1:19:09.359
2643
+ To this lecture where we focused on the bias,
2644
+ but think a lot of these techniques we have
2645
+
2646
+ 1:19:09.359 --> 1:19:11.418
2647
+ seen here.
2648
+
2649
+ 1:19:11.331 --> 1:19:17.664
2650
+ So, on the one hand, we saw that evaluating
2651
+ just pure BLEU scores might not always be enough.
2652
+
2653
+ 1:19:17.677 --> 1:19:18.947
2654
+ Mean it's very important.
2655
+
2656
+ 1:19:20.000 --> 1:19:30.866
2657
+ Always do that, but if you want to check and
2658
+ some specific things are important, then you
2659
+
2660
+ 1:19:30.866 --> 1:19:35.696
2661
+ might have to do dedicated evaluations.
2662
+
2663
+ 1:19:36.036 --> 1:19:44.296
2664
+ If it is now translating for the President and
2665
+ it, like in German, guesses the wrong gender, it is not very
2666
+
2667
+ 1:19:44.296 --> 1:19:45.476
2668
+ appropriate.
2669
+
2670
+ 1:19:45.785 --> 1:19:53.591
2671
+ So it might be important if characteristics
2672
+ of your system are essential to have dedicated
2673
+
2674
+ 1:19:53.591 --> 1:19:54.620
2675
+ evaluation.
2676
+
2677
+ 1:19:55.135 --> 1:20:02.478
2678
+ And then if you have that, of course, it might
2679
+ be also important to develop dedicated techniques.
2680
+
2681
+ 1:20:02.862 --> 1:20:10.988
2682
+ We have seen today some ways how to mitigate biases,
2683
+ but I hope you see that a lot of these techniques
2684
+
2685
+ 1:20:10.988 --> 1:20:13.476
2686
+ you can also use to mitigate.
2687
+
2688
+ 1:20:13.573 --> 1:20:31.702
2689
+ At least related things you can adjust the
2690
+ training data you can do for other things.
2691
+
2692
+ 1:20:33.253 --> 1:20:36.022
2693
+ Before we are finishing, do we have any
2694
+ more questions?
2695
+
2696
+ 1:20:41.761 --> 1:20:47.218
2697
+ Then thanks a lot, and then we will see each
2698
+ other again on the first step.
2699
+
demo_data/lectures/Lecture-13-04.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f89fc932d5818061ea4e7490a1ea9a58c6b937b7696d69d117fca50623f0a2
3
+ size 108699463
demo_data/lectures/Lecture-14-27.06.2023/English.vtt ADDED
@@ -0,0 +1,2753 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.921 --> 0:00:14.926
4
+ Introduction: Hey, welcome to today's lecture,
5
+ what we today want to look at is how we can
6
+
7
+ 0:00:14.926 --> 0:00:16.403
8
+ make new.
9
+
10
+ 0:00:16.796 --> 0:00:26.458
11
+ So until now we have this global system, the
12
+ encoder and the decoder mostly, and we haven't
13
+
14
+ 0:00:26.458 --> 0:00:29.714
15
+ really thought about how long.
16
+
17
+ 0:00:30.170 --> 0:00:42.684
18
+ And what we, for example, know is yeah, you
19
+ can make the systems bigger in different ways.
20
+
21
+ 0:00:42.546 --> 0:00:47.088
22
+ We can make them deeper so the.
23
+
24
+ 0:00:47.407 --> 0:00:56.331
25
+ And if we have at least enough data that typically
26
+ helps you make the performance better.
27
+
28
+ 0:00:56.576 --> 0:01:00.620
29
+ But of course leads to problems that we need
30
+ more resources.
31
+
32
+ 0:01:00.554 --> 0:01:06.556
33
+ That is a problem at universities where we
34
+ have typically limited computation capacities.
35
+
36
+ 0:01:06.489 --> 0:01:11.759
37
+ So at some point you have such big models
38
+ that you cannot train them anymore.
39
+
40
+ 0:01:13.033 --> 0:01:23.792
41
+ And also for companies it is of course important
42
+ what it costs to generate a translation,
43
+
44
+ 0:01:23.792 --> 0:01:26.984
45
+ just by power consumption.
46
+
47
+ 0:01:27.667 --> 0:01:35.386
48
+ So yeah, there's different reasons why you
49
+ want to do efficient machine translation.
50
+
51
+ 0:01:36.436 --> 0:01:48.338
52
+ One reason is there are different ways of
53
+ how you can improve your machine translation
54
+
55
+ 0:01:48.338 --> 0:01:50.527
56
+ system once we.
57
+
58
+ 0:01:50.670 --> 0:01:55.694
59
+ There can be different types of data we looked
60
+ into data crawling, monolingual data.
61
+
62
+ 0:01:55.875 --> 0:01:59.024
63
+ All this data and the aim is always.
64
+
65
+ 0:01:59.099 --> 0:02:05.735
66
+ Of course, we are not just purely interested
67
+ in having more data, but the idea why we want
68
+
69
+ 0:02:05.735 --> 0:02:12.299
70
+ to have more data is that more data also means
71
+ that we have better quality because mostly
72
+
73
+ 0:02:12.299 --> 0:02:17.550
74
+ we are interested in increasing the quality
75
+ of the machine translation.
76
+
77
+ 0:02:18.838 --> 0:02:24.892
78
+ But there's also other ways of how you can
79
+ improve the quality of a machine translation.
80
+
81
+ 0:02:25.325 --> 0:02:36.450
82
+ And what is, of course, that is where most
83
+ research is focusing on.
84
+
85
+ 0:02:36.287 --> 0:02:44.471
86
+ It means all we want to build better algorithms.
87
+
88
+ 0:02:44.684 --> 0:02:48.199
89
+ Course: The other things are normally as good.
90
+
91
+ 0:02:48.124 --> 0:02:54.596
92
+ Sometimes it's easier to improve, so often
93
+ it's easier to just collect more data than
94
+
95
+ 0:02:54.596 --> 0:02:57.455
96
+ to invent some great new algorithms.
97
+
98
+ 0:02:57.380 --> 0:03:00.317
99
+ But yeah, both of them are important.
100
+
101
+ 0:03:00.920 --> 0:03:09.812
102
+ But there is this third thing, especially
103
+ with neural machine translation, and that means
104
+
105
+ 0:03:09.812 --> 0:03:11.590
106
+ we make a bigger.
107
+
108
+ 0:03:11.751 --> 0:03:16.510
109
+ Can be, as said, that we have more layers,
110
+ that we have wider layers.
111
+
112
+ 0:03:16.442 --> 0:03:19.928
113
+ Ensembles: The other thing we talked a bit about
114
+ is ensemble.
115
+
116
+ 0:03:19.870 --> 0:03:24.534
117
+ That means we are not building one new machine
118
+ translation system.
119
+
120
+ 0:03:24.965 --> 0:03:27.505
121
+ And we can easily build four.
122
+
123
+ 0:03:27.420 --> 0:03:32.319
124
+ What is the typical strategy to build different
125
+ systems?
126
+
127
+ 0:03:32.233 --> 0:03:33.188
128
+ Remember.
129
+
130
+ 0:03:35.795 --> 0:03:40.119
131
+ It should be of course a bit different if
132
+ you have the same.
133
+
134
+ 0:03:40.048 --> 0:03:44.550
135
+ If they all predict the same then combining
136
+ them doesn't help.
137
+
138
+ 0:03:44.478 --> 0:03:48.981
139
+ So what is the easiest way if you have to
140
+ build four systems?
141
+
142
+ 0:03:51.711 --> 0:04:01.747
143
+ And the Charleston's will take, but this is
144
+ the best output of a single system.
145
+
146
+ 0:04:02.362 --> 0:04:10.165
147
+ Mean now, it's really three different systems
148
+ so that you later can combine them and maybe
149
+
150
+ 0:04:10.165 --> 0:04:11.280
151
+ the average.
152
+
153
+ 0:04:11.194 --> 0:04:16.683
154
+ Ensembles typically mean that you average
155
+ all probabilities.
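A toy sketch of what averaging the ensemble's probabilities at one decoding step looks like; the numbers are made up:

```python
import numpy as np

def ensemble_step(prob_dists):
    # one row per model: average the next-token distributions, then pick the best
    avg = np.mean(prob_dists, axis=0)
    return int(np.argmax(avg)), avg

p = np.array([[0.1, 0.6, 0.2, 0.1],   # model 1
              [0.2, 0.5, 0.2, 0.1],   # model 2
              [0.3, 0.3, 0.3, 0.1]])  # model 3
print(ensemble_step(p)[0])  # token 1 wins after averaging
```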
156
+
157
+ 0:04:19.439 --> 0:04:24.227
158
+ The idea is to think about neural networks.
159
+
160
+ 0:04:24.118 --> 0:04:29.279
161
+ There's one parameter which you can easily adjust.
162
+
163
+ 0:04:29.169 --> 0:04:36.527
164
+ That's exactly the easiest way to randomize
165
+ with three different.
166
+
167
+ 0:04:37.017 --> 0:04:43.119
168
+ They have the same architecture, so all the
169
+ hyperparameters are the same, but they are
170
+
171
+ 0:04:43.119 --> 0:04:43.891
172
+ different.
173
+
174
+ 0:04:43.821 --> 0:04:46.558
175
+ They will have different predictions.
176
+
177
+ 0:04:48.228 --> 0:04:52.572
178
+ So, of course, bigger amounts.
179
+
180
+ 0:04:52.432 --> 0:05:05.300
181
+ Some of these are a bit the easiest way of
182
+ improving your quality because you don't really
183
+
184
+ 0:05:05.300 --> 0:05:08.269
185
+ have to do anything.
186
+
187
+ 0:05:08.588 --> 0:05:12.588
188
+ There is limits on that bigger models only
189
+ get better.
190
+
191
+ 0:05:12.515 --> 0:05:19.098
192
+ If you have enough training data you can't
193
+ do like a handheld layer and you will not work
194
+
195
+ 0:05:19.098 --> 0:05:24.877
196
+ on very small data but with a recent amount
197
+ of data that is the easiest thing.
198
+
199
+ 0:05:25.305 --> 0:05:33.726
200
+ However, they are challenging with making
201
+ better models, bigger motors, and that is the
202
+
203
+ 0:05:33.726 --> 0:05:34.970
204
+ computation.
205
+
206
+ 0:05:35.175 --> 0:05:44.482
207
+ So, of course, if you have a bigger model
208
+ that can mean that you have longer running
209
+
210
+ 0:05:44.482 --> 0:05:49.518
211
+ times, if you have models, you have to times.
212
+
213
+ 0:05:51.171 --> 0:05:56.685
214
+ Normally you cannot paralyze the different
215
+ layers because the input to one layer is always
216
+
217
+ 0:05:56.685 --> 0:06:02.442
218
+ the output of the previous layer, so you propagate
219
+ that so it will also increase your runtime.
220
+
221
+ 0:06:02.822 --> 0:06:10.720
222
+ Then you have to store all your models in
223
+ memory.
224
+
225
+ 0:06:10.562 --> 0:06:21.027
226
+ If you have double weights you will have:
227
+ Is more difficult to then do back propagation.
228
+
229
+ 0:06:20.909 --> 0:06:27.674
230
+ You have to store in between the activations,
231
+ so there's not only do you increase the model
232
+
233
+ 0:06:27.674 --> 0:06:31.865
234
+ in your memory, but also all these other variables
235
+ that.
236
+
237
+ 0:06:34.414 --> 0:06:36.734
238
+ And so in general it is more expensive.
239
+
240
+ 0:06:37.137 --> 0:06:54.208
241
+ And therefore there's good reasons in looking
242
+ into can we make these models sound more efficient.
243
+
244
+ 0:06:54.134 --> 0:07:00.982
245
+ So it's been through the viewer, you can have
246
+ it okay, have one and one day of training time,
247
+
248
+ 0:07:00.982 --> 0:07:01.274
249
+ or.
250
+
251
+ 0:07:01.221 --> 0:07:07.535
252
+ Forty thousand euros and then what is the
253
+ best machine translation system I can get within
254
+
255
+ 0:07:07.535 --> 0:07:08.437
256
+ this budget.
257
+
258
+ 0:07:08.969 --> 0:07:19.085
259
+ And then, of course, you can make the models
260
+ bigger, but then you have to train them shorter,
261
+
262
+ 0:07:19.085 --> 0:07:24.251
263
+ and then we can make more efficient algorithms.
264
+
265
+ 0:07:25.925 --> 0:07:31.687
266
+ Efficiency: If you think about efficiency, there's
267
+ a bit different scenarios.
268
+
269
+ 0:07:32.312 --> 0:07:43.635
270
+ So if you're more of coming from the research
271
+ community, what you'll be doing is building
272
+
273
+ 0:07:43.635 --> 0:07:47.913
274
+ a lot of models in your research.
275
+
276
+ 0:07:48.088 --> 0:07:58.645
277
+ So you're having your test set of maybe sentences,
278
+ calculating the BLEU score, then another model.
279
+
280
+ 0:07:58.818 --> 0:08:08.911
281
+ So what that means is typically you're training
282
+ on millions of sentences, so your training time
283
+
284
+ 0:08:08.911 --> 0:08:14.944
285
+ is long, maybe a day, but maybe in other cases
286
+ a week.
287
+
288
+ 0:08:15.135 --> 0:08:22.860
289
+ The testing is not really the cost efficient,
290
+ but the training is very costly.
291
+
292
+ 0:08:23.443 --> 0:08:37.830
293
+ If you are more thinking of building models
294
+ for application, the scenario is quite different.
295
+
296
+ 0:08:38.038 --> 0:08:46.603
297
+ And then you keep it running, and maybe thousands
298
+ of customers are using it in translating.
299
+
300
+ 0:08:46.510 --> 0:08:47.729
301
+ So in that.
302
+
303
+ 0:08:48.168 --> 0:08:59.577
304
+ And we will see that it is not always the
305
+ same type of challenges; you can parallelize some
306
+
307
+ 0:08:59.577 --> 0:09:07.096
308
+ things in training, which you cannot parallelize
309
+ in testing.
310
+
311
+ 0:09:07.347 --> 0:09:14.124
312
+ For example, in training you have to do back
313
+ propagation, so you have to store the activations.
314
+
315
+ 0:09:14.394 --> 0:09:23.901
316
+ Therefore, in testing we briefly discussed
317
+ that we would do it in more detail today in
318
+
319
+ 0:09:23.901 --> 0:09:24.994
320
+ training.
321
+
322
+ 0:09:25.265 --> 0:09:36.100
323
+ You know they're a target and you can process
324
+ everything in parallel while in testing.
325
+
326
+ 0:09:36.356 --> 0:09:46.741
327
+ So you can only do one word at a time, and
328
+ so you can less paralyze this.
329
+
330
+ 0:09:46.601 --> 0:09:50.536
331
+ Therefore, it's important.
332
+
333
+ 0:09:52.712 --> 0:09:55.347
334
+ Is a specific task on this.
335
+
336
+ 0:09:55.253 --> 0:10:03.158
337
+ For example, it's the efficiency task where
338
+ it's about making things as efficient.
339
+
340
+ 0:10:03.123 --> 0:10:09.230
341
+ Is possible and they can look at different
342
+ resources.
343
+
344
+ 0:10:09.117 --> 0:10:14.209
345
+ So how much deep fuel run time do you need?
346
+
347
+ 0:10:14.454 --> 0:10:19.366
348
+ See how much memory you need or you can have
349
+ a fixed memory budget and then have to build
350
+
351
+ 0:10:19.366 --> 0:10:20.294
352
+ the best system.
353
+
354
+ 0:10:20.500 --> 0:10:29.010
355
+ And here is a bit like an example of that,
356
+ so there's three teams from Edinburgh from
357
+
358
+ 0:10:29.010 --> 0:10:30.989
359
+ and they submitted.
360
+
361
+ 0:10:31.131 --> 0:10:36.278
362
+ So then, of course, if you want to know the
363
+ most efficient system you have to do a bit
364
+
365
+ 0:10:36.278 --> 0:10:36.515
366
+ of.
367
+
368
+ 0:10:36.776 --> 0:10:44.656
369
+ You want to have a better quality or more
370
+ runtime and there's not the one solution.
371
+
372
+ 0:10:44.562 --> 0:10:46.724
373
+ You can improve your.
374
+
375
+ 0:10:46.946 --> 0:10:49.662
376
+ And that you see that there are different
377
+ systems.
378
+
379
+ 0:10:49.909 --> 0:11:06.051
380
+ Here is how many words you can do for a second
381
+ on the clock, and you want to be as talk as
382
+
383
+ 0:11:06.051 --> 0:11:07.824
384
+ possible.
385
+
386
+ 0:11:08.068 --> 0:11:08.889
387
+ And you see here a bit.
388
+
389
+ 0:11:08.855 --> 0:11:09.985
390
+ This is a little bit different.
391
+
392
+ 0:11:11.051 --> 0:11:27.717
393
+ You want to be there on the top right corner
394
+ and you can get a score of something between
395
+
396
+ 0:11:27.717 --> 0:11:29.014
397
+ words.
398
+
399
+ 0:11:30.250 --> 0:11:34.161
400
+ Two hundred and fifty thousand, then you'll
401
+ ever come and score zero point three.
402
+
403
+ 0:11:34.834 --> 0:11:41.243
404
+ There is, of course, any bit of a decision,
405
+ but the question is, like how far can you again?
406
+
407
+ 0:11:41.174 --> 0:11:47.756
408
+ Some of all these points on this line would
409
+ be winners because they are somehow most efficient
410
+
411
+ 0:11:47.756 --> 0:11:53.923
412
+ in a way that there's no system which achieves
413
+ the same quality with less computational.
414
+
415
+ 0:11:57.657 --> 0:12:04.119
416
+ Resources: So there's the one question of which
417
+ resources are you interested.
418
+
419
+ 0:12:04.034 --> 0:12:07.362
420
+ Are you running it on CPU or GPU?
421
+
422
+ 0:12:07.264 --> 0:12:11.671
423
+ There's different ways of parallelizing stuff.
424
+
425
+ 0:12:14.654 --> 0:12:20.777
426
+ Another dimension is how you process your
427
+ data.
428
+
429
+ 0:12:20.649 --> 0:12:27.157
430
+ There's really the best processing and streaming.
431
+
432
+ 0:12:27.647 --> 0:12:34.672
433
+ So in batch processing you have the whole
434
+ document available so you can translate all
435
+
436
+ 0:12:34.672 --> 0:12:39.981
437
+ sentences in perimeter and then you're interested
438
+ in throughput.
439
+
440
+ 0:12:40.000 --> 0:12:43.844
441
+ But you can then process, for example, especially
442
+ in GPUs.
443
+
444
+ 0:12:43.778 --> 0:12:49.772
445
+ That's interesting, you're not translating
446
+ one sentence at a time, but you're translating
447
+
448
+ 0:12:49.772 --> 0:12:56.099
449
+ one hundred sentences or so in parallel, so
450
+ you have one more dimension where you can paralyze
451
+
452
+ 0:12:56.099 --> 0:12:57.964
453
+ and then be more efficient.
454
+
455
+ 0:12:58.558 --> 0:13:14.863
456
+ On the other hand, for example sorts of documents,
457
+ so we learned that if you do batch processing
458
+
459
+ 0:13:14.863 --> 0:13:16.544
460
+ you have.
461
+
462
+ 0:13:16.636 --> 0:13:24.636
463
+ Then, of course, it makes sense to sort the
464
+ sentences in order to have the minimum thing
465
+
466
+ 0:13:24.636 --> 0:13:25.535
467
+ attached.
468
+
469
+ 0:13:27.427 --> 0:13:32.150
470
+ The other scenario is more the streaming scenario
471
+ where you do life translation.
472
+
473
+ 0:13:32.512 --> 0:13:40.212
474
+ So in that case you can't wait for the whole
475
+ document to pass, but you have to do.
476
+
477
+ 0:13:40.520 --> 0:13:49.529
478
+ And then, for example, that's especially in
479
+ situations like speech translation, and then
480
+
481
+ 0:13:49.529 --> 0:13:53.781
482
+ you're interested in things like latency.
483
+
484
+ 0:13:53.680 --> 0:14:00.362
485
+ So how much do you have to wait to get the
486
+ output of a sentence?
487
+
488
+ 0:14:06.566 --> 0:14:15.703
489
+ Finally, there is the thing about the implementation:
490
+ Today we're mainly looking at different algorithms,
491
+
492
+ 0:14:15.703 --> 0:14:23.115
493
+ different models of how you can model them
494
+ in your machine translation system, but of
495
+
496
+ 0:14:23.115 --> 0:14:29.235
497
+ course for the same algorithms there's also
498
+ different implementations.
499
+
500
+ 0:14:29.489 --> 0:14:38.643
501
+ So, for example, for a machine translation
502
+ this tool could be very fast.
503
+
504
+ 0:14:38.638 --> 0:14:46.615
505
+ So they have like coded a lot of the operations
506
+ very low resource, not low resource, low level
507
+
508
+ 0:14:46.615 --> 0:14:49.973
509
+ on the directly on the QDAC kernels in.
510
+
511
+ 0:14:50.110 --> 0:15:00.948
512
+ So the same attention network is typically
513
+ more efficient in that type of algorithm.
514
+
515
+ 0:15:00.880 --> 0:15:02.474
516
+ Than in in any other.
517
+
518
+ 0:15:03.323 --> 0:15:13.105
519
+ Of course, it might be other disadvantages,
520
+ so if you're a little worker or have worked
521
+
522
+ 0:15:13.105 --> 0:15:15.106
523
+ in the practical.
524
+
525
+ 0:15:15.255 --> 0:15:22.604
526
+ Because it's normally easier to understand,
527
+ easier to change, and so on, but there is again
528
+
529
+ 0:15:22.604 --> 0:15:23.323
530
+ a train.
531
+
532
+ 0:15:23.483 --> 0:15:29.440
533
+ You have to think about, do you want to include
534
+ this into my study or comparison or not?
535
+
536
+ 0:15:29.373 --> 0:15:36.450
537
+ Should it be like I compare different implementations
538
+ and I also find the most efficient implementation?
539
+
540
+ 0:15:36.383 --> 0:15:39.148
541
+ Or is it only about the pure algorithm?
542
+
543
+ 0:15:42.742 --> 0:15:50.355
544
+ Yeah, when building these systems there is
545
+ a different trade-off to do.
546
+
547
+ 0:15:50.850 --> 0:15:56.555
548
+ So there's one of the traders between memory
549
+ and throughput, so how many words can generate
550
+
551
+ 0:15:56.555 --> 0:15:57.299
552
+ per second.
553
+
554
+ 0:15:57.557 --> 0:16:03.351
555
+ So typically you can easily like increase
556
+ your scruple by increasing the batch size.
557
+
558
+ 0:16:03.643 --> 0:16:06.899
559
+ So that means you are translating more sentences
560
+ in parallel.
561
+
562
+ 0:16:07.107 --> 0:16:09.241
563
+ And GPUs are very good at that stuff.
564
+
565
+ 0:16:09.349 --> 0:16:15.161
566
+ It should translate one sentence or one hundred
567
+ sentences, not the same time, but its.
568
+
569
+ 0:16:15.115 --> 0:16:20.784
570
+ Rough are very similar because they are at
571
+ this efficient metrics multiplication so that
572
+
573
+ 0:16:20.784 --> 0:16:24.415
574
+ you can do the same operation on all sentences
575
+ parallel.
576
+
577
+ 0:16:24.351 --> 0:16:30.133
578
+ So typically that means if you increase your
579
+ benchmark you can do more things in parallel
580
+
581
+ 0:16:30.133 --> 0:16:31.996
582
+ and you will translate more.
583
+
584
+ 0:16:31.952 --> 0:16:33.370
585
+ Second.
586
+
587
+ 0:16:33.653 --> 0:16:43.312
588
+ On the other hand, with this advantage, of
589
+ course you will need higher badge sizes and
590
+
591
+ 0:16:43.312 --> 0:16:44.755
592
+ more memory.
593
+
594
+ 0:16:44.965 --> 0:16:56.452
595
+ To begin with, the other problem is that you
596
+ have such big models that you can only translate
597
+
598
+ 0:16:56.452 --> 0:16:59.141
599
+ with lower bed sizes.
600
+
601
+ 0:16:59.119 --> 0:17:08.466
602
+ If you are running out of memory with translating,
603
+ one idea to go on that is to decrease your.
604
+
605
+ 0:17:13.453 --> 0:17:24.456
606
+ Then there is the thing about quality in Screwport,
607
+ of course, and before it's like larger models,
608
+
609
+ 0:17:24.456 --> 0:17:28.124
610
+ but in generally higher quality.
611
+
612
+ 0:17:28.012 --> 0:17:31.906
613
+ The first one is always this way.
614
+
615
+ 0:17:32.092 --> 0:17:38.709
616
+ Course: Not always larger model helps you
617
+ have over fitting at some point, but in generally.
618
+
619
+ 0:17:43.883 --> 0:17:52.901
620
+ And with this a bit on this training and testing
621
+ thing we had before.
622
+
623
+ 0:17:53.113 --> 0:17:58.455
624
+ So it wears all the difference between training
625
+ and testing, and for the encoder and decoder.
626
+
627
+ 0:17:58.798 --> 0:18:09.553
628
+ So if we are looking at what mentioned before
629
+ at training time, we have a source sentence
630
+
631
+ 0:18:09.553 --> 0:18:17.201
632
+ here: And how this is processed on a is not
633
+ the attention here.
634
+
635
+ 0:18:17.081 --> 0:18:21.840
636
+ That's a tubical transformer.
637
+
638
+ 0:18:22.162 --> 0:18:31.626
639
+ And how we can do that on a is that we can
640
+ paralyze the ear ever since.
641
+
642
+ 0:18:31.494 --> 0:18:40.512
643
+ The first thing to know is: So that is, of
644
+ course, not in all cases.
645
+
646
+ 0:18:40.382 --> 0:18:49.184
647
+ We'll later talk about speech translation
648
+ where we might want to translate.
649
+
650
+ 0:18:49.389 --> 0:18:56.172
651
+ Without the general case in, it's like you
652
+ have the full sentence you want to translate.
653
+
654
+ 0:18:56.416 --> 0:19:02.053
655
+ So the important thing is we are here everything
656
+ available on the source side.
657
+
658
+ 0:19:03.323 --> 0:19:13.524
659
+ And then this was one of the big advantages
660
+ that you can remember back of transformer.
661
+
662
+ 0:19:13.407 --> 0:19:15.759
663
+ There are several.
664
+
665
+ 0:19:16.156 --> 0:19:25.229
666
+ But the other one is now that we can calculate
667
+ the full layer.
668
+
669
+ 0:19:25.645 --> 0:19:29.318
670
+ There is no dependency between this and this
671
+ state or this and this state.
672
+
673
+ 0:19:29.749 --> 0:19:36.662
674
+ So we always did like here to calculate the
675
+ key value and query, and based on that you
676
+
677
+ 0:19:36.662 --> 0:19:37.536
678
+ calculate.
679
+
680
+ 0:19:37.937 --> 0:19:46.616
681
+ Which means we can do all these calculations
682
+ here in parallel and in parallel.
683
+
684
+ 0:19:48.028 --> 0:19:55.967
685
+ And there, of course, is this very efficiency
686
+ because again for GPS it's too bigly possible
687
+
688
+ 0:19:55.967 --> 0:20:00.887
689
+ to do these things in parallel and one after
690
+ each other.
691
+
692
+ 0:20:01.421 --> 0:20:10.311
693
+ And then we can also for each layer one by
694
+ one, and then we calculate here the encoder.
695
+
696
+ 0:20:10.790 --> 0:20:21.921
697
+ In training now an important thing is that
698
+ for the decoder we have the full sentence available
699
+
700
+ 0:20:21.921 --> 0:20:28.365
701
+ because we know this is the target we should
702
+ generate.
703
+
704
+ 0:20:29.649 --> 0:20:33.526
705
+ We have models now in a different way.
706
+
707
+ 0:20:33.426 --> 0:20:38.299
708
+ This hidden state is only on the previous
709
+ ones.
710
+
711
+ 0:20:38.598 --> 0:20:51.887
712
+ And the first thing here depends only on this
713
+ information, so you see if you remember we
714
+
715
+ 0:20:51.887 --> 0:20:56.665
716
+ had this masked self-attention.
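As a reminder of what that masked self-attention does, a minimal illustration of the causal mask (numpy, purely illustrative):

```python
import numpy as np

def causal_mask(length):
    # position i may only attend to positions <= i, which is what lets
    # training process all target positions in parallel
    return np.tril(np.ones((length, length), dtype=bool))

print(causal_mask(4).astype(int))
```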
717
+
718
+ 0:20:56.896 --> 0:21:04.117
719
+ So that means, of course, we can only calculate
720
+ the decoder once the encoder is done, but that's.
721
+
722
+ 0:21:04.444 --> 0:21:06.656
723
+ Percent can calculate the end quarter.
724
+
725
+ 0:21:06.599 --> 0:21:08.926
726
+ Then we can calculate here the decoder.
727
+
728
+ 0:21:09.569 --> 0:21:25.566
729
+ But again in training we have x, y and that
730
+ is available so we can calculate everything
731
+
732
+ 0:21:25.566 --> 0:21:27.929
733
+ in parallel.
734
+
735
+ 0:21:28.368 --> 0:21:40.941
736
+ So the interesting thing or advantage of transformer
737
+ is in training.
738
+
739
+ 0:21:40.759 --> 0:21:46.414
740
+ We can do it for the decoder.
741
+
742
+ 0:21:46.866 --> 0:21:54.457
743
+ That means you will have more calculations
744
+ because you can only calculate one layer at
745
+
746
+ 0:21:54.457 --> 0:22:02.310
747
+ a time, but for example the length which is
748
+ too bigly quite long or doesn't really matter
749
+
750
+ 0:22:02.310 --> 0:22:03.270
751
+ that much.
752
+
753
+ 0:22:05.665 --> 0:22:10.704
754
+ However, in testing this situation is different.
755
+
756
+ 0:22:10.602 --> 0:22:13.280
757
+ In testing we only have.
758
+
759
+ 0:22:13.713 --> 0:22:21.427
760
+ So this means we start with a sense: We don't
761
+ know the full sentence yet because we ought
762
+
763
+ 0:22:21.427 --> 0:22:29.054
764
+ to regularly generate that so for the encoder
765
+ we have the same here but for the decoder.
766
+
767
+ 0:22:29.409 --> 0:22:39.598
768
+ In this case we only have the first and the
769
+ second instinct, but only for all states in
770
+
771
+ 0:22:39.598 --> 0:22:40.756
772
+ parallel.
773
+
774
+ 0:22:41.101 --> 0:22:51.752
775
+ And then we can do the next step for y because
776
+ we are putting our most probable one.
777
+
778
+ 0:22:51.626 --> 0:22:58.646
779
+ We do greedy search or beam search, but you
780
+ cannot do.
781
+
782
+ 0:23:03.663 --> 0:23:16.838
783
+ Yes, so if we are interesting in making things
784
+ more efficient for testing, which we see, for
785
+
786
+ 0:23:16.838 --> 0:23:22.363
787
+ example in the scenario of really our.
788
+
789
+ 0:23:22.642 --> 0:23:34.286
790
+ It makes sense that we think about our architecture
791
+ and that we are currently working on attention
792
+
793
+ 0:23:34.286 --> 0:23:35.933
794
+ based models.
795
+
796
+ 0:23:36.096 --> 0:23:44.150
797
+ The decoder there is some of the most time
798
+ spent testing and testing.
799
+
800
+ 0:23:44.035 --> 0:23:47.146
801
+ It's similar, but during.
802
+
803
+ 0:23:47.167 --> 0:23:50.248
804
+ Nothing about beam search.
805
+
806
+ 0:23:50.134 --> 0:23:59.835
807
+ It might be even more complicated because
808
+ in beam search you have to try different.
809
+
810
+ 0:24:02.762 --> 0:24:15.140
811
+ So the question is what can you now do in
812
+ order to make your model more efficient and
813
+
814
+ 0:24:15.140 --> 0:24:21.905
815
+ better in translation in these types of cases?
816
+
817
+ 0:24:24.604 --> 0:24:30.178
818
+ And the one thing is to look into the encoded
819
+ decoder trailer.
820
+
821
+ 0:24:30.690 --> 0:24:43.898
822
+ And then until now we typically assume that
823
+ the depth of the encoder and the depth of the
824
+
825
+ 0:24:43.898 --> 0:24:48.154
826
+ decoder is roughly the same.
827
+
828
+ 0:24:48.268 --> 0:24:55.553
829
+ So if you haven't thought about it, you just
830
+ take what is running well.
831
+
832
+ 0:24:55.452 --> 0:24:57.683
833
+ You would try to do.
834
+
835
+ 0:24:58.018 --> 0:25:04.148
836
+ However, we saw now that there is a quite
837
+ big challenge and the runtime is a lot longer
838
+
839
+ 0:25:04.148 --> 0:25:04.914
840
+ than here.
841
+
842
+ 0:25:05.425 --> 0:25:14.018
843
+ The question is also the case for the calculations,
844
+ or do we have there the same issue that we
845
+
846
+ 0:25:14.018 --> 0:25:21.887
847
+ only get the good quality if we are having
848
+ high and high, so we know that making these
849
+
850
+ 0:25:21.887 --> 0:25:25.415
851
+ more depths is increasing our quality.
852
+
853
+ 0:25:25.425 --> 0:25:31.920
854
+ But what we haven't talked about is really
855
+ important that we increase the depth the same
856
+
857
+ 0:25:31.920 --> 0:25:32.285
858
+ way.
859
+
860
+ 0:25:32.552 --> 0:25:41.815
861
+ So what we can put instead also do is something
862
+ like this where you have a deep encoder and
863
+
864
+ 0:25:41.815 --> 0:25:42.923
865
+ a shallow.
866
+
867
+ 0:25:43.163 --> 0:25:57.386
868
+ So that would be that you, for example, have
869
+ instead of having layers on the encoder, and
870
+
871
+ 0:25:57.386 --> 0:25:59.757
872
+ layers on the.
873
+
874
+ 0:26:00.080 --> 0:26:10.469
875
+ So in this case the overall depth from start
876
+ to end would be similar and so hopefully.
877
+
878
+ 0:26:11.471 --> 0:26:21.662
879
+ But we could a lot more things hear parallelized,
880
+ and hear what is costly at the end during decoding
881
+
882
+ 0:26:21.662 --> 0:26:22.973
883
+ the decoder.
884
+
885
+ 0:26:22.872 --> 0:26:29.331
886
+ Because that does change in an outer regressive
887
+ way, there we.
888
+
889
+ 0:26:31.411 --> 0:26:33.727
890
+ And that that can be analyzed.
891
+
892
+ 0:26:33.652 --> 0:26:38.744
893
+ So here is some examples: Where people have
894
+ done all this.
895
+
896
+ 0:26:39.019 --> 0:26:55.710
897
+ So here it's mainly interested on the orange
898
+ things, which is auto-regressive about the
899
+
900
+ 0:26:55.710 --> 0:26:57.607
901
+ speed up.
902
+
903
+ 0:26:57.717 --> 0:27:15.031
904
+ You have the system, so agree is not exactly
905
+ the same, but it's similar.
906
+
907
+ 0:27:15.055 --> 0:27:23.004
908
+ It's always the case if you look at speed
909
+ up.
910
+
911
+ 0:27:22.831 --> 0:27:31.647
912
+ Think they put a speed of so that's the baseline.
913
+
914
+ 0:27:31.771 --> 0:27:35.348
915
+ So between and times as fast.
916
+
917
+ 0:27:35.229 --> 0:27:42.623
918
+ If you switch from a system to where you have
919
+ layers in the.
920
+
921
+ 0:27:42.782 --> 0:27:52.309
922
+ You see that although you have slightly more
923
+ parameters, more calculations are also roughly
924
+
925
+ 0:27:52.309 --> 0:28:00.283
926
+ the same, but you can speed out because now
927
+ during testing you can paralyze.
928
+
929
+ 0:28:02.182 --> 0:28:09.754
930
+ The other thing is that you're speeding up,
931
+ but if you look at the performance it's similar,
932
+
933
+ 0:28:09.754 --> 0:28:13.500
934
+ so sometimes you improve, sometimes you lose.
935
+
936
+ 0:28:13.419 --> 0:28:20.422
937
+ There's a bit of losing English to Romania,
938
+ but in general the quality is very slow.
939
+
940
+ 0:28:20.680 --> 0:28:30.343
941
+ So you see that you can keep a similar performance
942
+ while improving your speed by just having different.
943
+
944
+ 0:28:30.470 --> 0:28:34.903
945
+ And you also see the encoder layers from speed.
946
+
947
+ 0:28:34.811 --> 0:28:38.125
948
+ They don't really metal that much.
949
+
950
+ 0:28:38.030 --> 0:28:38.712
951
+ Most.
952
+
953
+ 0:28:38.979 --> 0:28:50.319
954
+ Because if you compare the 12th system to
955
+ the 6th system you have a lower performance
956
+
957
+ 0:28:50.319 --> 0:28:57.309
958
+ with 6th and colder layers but the speed is
959
+ similar.
960
+
961
+ 0:28:57.897 --> 0:29:02.233
962
+ And see the huge decrease is it maybe due
963
+ to a lack of data.
964
+
965
+ 0:29:03.743 --> 0:29:11.899
966
+ Good idea would say it's not the case.
967
+
968
+ 0:29:11.690 --> 0:29:23.195
969
+ Romanian English should have the same number
970
+ of data.
971
+
972
+ 0:29:24.224 --> 0:29:31.184
973
+ Maybe it's just that something in that language.
974
+
975
+ 0:29:31.042 --> 0:29:40.704
976
+ If you generate Romanian maybe they need more
977
+ target dependencies.
978
+
979
+ 0:29:42.882 --> 0:29:46.263
980
+ The Wine's the Eye Also Don't Know Any Sex
981
+ People Want To.
982
+
983
+ 0:29:47.887 --> 0:29:49.034
984
+ There could be yeah the.
985
+
986
+ 0:29:49.889 --> 0:30:02.316
987
+ As the maybe if you go from like a movie sphere
988
+ to a hybrid sphere, you can: It's very much
989
+
990
+ 0:30:02.316 --> 0:30:12.447
991
+ easier to expand the vocabulary to English,
992
+ but it must be the vocabulary.
993
+
994
+ 0:30:13.333 --> 0:30:21.147
995
+ Have to check, but would assume that in this
996
+ case the system is not retrained, but it's
997
+
998
+ 0:30:21.147 --> 0:30:22.391
999
+ trained with.
1000
+
1001
+ 0:30:22.902 --> 0:30:30.213
1002
+ And that's why I was assuming that they have
1003
+ the same, but maybe you'll write that in this
1004
+
1005
+ 0:30:30.213 --> 0:30:35.595
1006
+ piece, for example, if they were pre-trained,
1007
+ the decoder English.
1008
+
1009
+ 0:30:36.096 --> 0:30:43.733
1010
+ But don't remember exactly if they do something
1011
+ like that, but that could be a good.
1012
+
1013
+ 0:30:45.325 --> 0:30:52.457
1014
+ So this is one of the easiest ways to speed
1015
+ up.
1016
+
1017
+ 0:30:52.314 --> 0:31:01.446
1018
+ You just switch to hyperparameters, not to
1019
+ implement anything.
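In, for example, PyTorch this really is just a hyperparameter choice; a minimal sketch with illustrative sizes, not the exact models from the comparison above:

```python
import torch.nn as nn

# deep encoder, shallow decoder: the decoder is the autoregressive (slow) part,
# so most of the depth is moved to the encoder
model = nn.Transformer(
    d_model=512, nhead=8,
    num_encoder_layers=12,
    num_decoder_layers=2,
)
```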
1020
+
1021
+ 0:31:02.722 --> 0:31:08.340
1022
+ Architecture: Of course, there's other ways
1023
+ of doing that.
1024
+
1025
+ 0:31:08.242 --> 0:31:11.809
1026
+ We'll look into two things.
1027
+
1028
+ 0:31:11.682 --> 0:31:16.527
1029
+ The other thing is the architecture.
1030
+
1031
+ 0:31:16.796 --> 0:31:28.154
1032
+ We are now at some of the baselines that we
1033
+ are doing.
1034
+
1035
+ 0:31:28.488 --> 0:31:39.978
1036
+ However, in translation in the decoder side,
1037
+ it might not be the best solution.
1038
+
1039
+ 0:31:39.834 --> 0:31:41.857
1040
+ There is no.
1041
+
1042
+ 0:31:42.222 --> 0:31:47.130
1043
+ So we can use different types of architectures,
1044
+ also in the encoder and the.
1045
+
1046
+ 0:31:47.747 --> 0:31:52.475
1047
+ And there's two ways of what you could do
1048
+ different, or there's more ways.
1049
+
1050
+ 0:31:52.912 --> 0:31:54.825
1051
+ We will look into two todays.
1052
+
1053
+ 0:31:54.761 --> 0:31:58.843
1054
+ The one is average attention, which is a very
1055
+ simple solution.
1056
+
1057
+ 0:31:59.419 --> 0:32:01.464
1058
+ You can do as it says.
1059
+
1060
+ 0:32:01.375 --> 0:32:04.527
1061
+ It's not really attending anymore.
1062
+
1063
+ 0:32:04.437 --> 0:32:08.760
1064
+ It's just like equal attendance to everything.
1065
+
1066
+ 0:32:09.249 --> 0:32:23.422
1067
+ And the other idea, which is currently done
1068
+ in most systems which are optimized to efficiency,
1069
+
1070
+ 0:32:23.422 --> 0:32:24.913
1071
+ is we're.
1072
+
1073
+ 0:32:25.065 --> 0:32:32.623
1074
+ But on the decoder side we are then not using
1075
+ transformer or self attention, but we are using
1076
+
1077
+ 0:32:32.623 --> 0:32:39.700
1078
+ recurrent neural network because they are the
1079
+ disadvantage of recurrent neural network.
1080
+
1081
+ 0:32:39.799 --> 0:32:48.353
1082
+ And then the recurrent is normally easier
1083
+ to calculate because it only depends on inputs,
1084
+
1085
+ 0:32:48.353 --> 0:32:49.684
1086
+ the input on.
1087
+
1088
+ 0:32:51.931 --> 0:33:02.190
1089
+ So what is the difference between decoding
1090
+ and why is the tension maybe not sufficient
1091
+
1092
+ 0:33:02.190 --> 0:33:03.841
1093
+ for decoding?
1094
+
1095
+ 0:33:04.204 --> 0:33:14.390
1096
+ If we want to populate the new state, we only
1097
+ have to look at the input and the previous
1098
+
1099
+ 0:33:14.390 --> 0:33:15.649
1100
+ state, so.
1101
+
1102
+ 0:33:16.136 --> 0:33:19.029
1103
+ We are more conditional here networks.
1104
+
1105
+ 0:33:18.955 --> 0:33:20.000
1106
+ We have the.
1107
+
1108
+ 0:33:19.980 --> 0:33:31.291
1109
+ Dependency to a fixed number of previous ones,
1110
+ but that's rarely used for decoding.
1111
+
1112
+ 0:33:31.156 --> 0:33:39.776
1113
+ In contrast, in transformer we have this large
1114
+ dependency, so.
1115
+
1116
+ 0:33:40.000 --> 0:33:52.760
1117
+ So from t minus one to y t so that is somehow
1118
+ and mainly not very efficient in this way mean
1119
+
1120
+ 0:33:52.760 --> 0:33:56.053
1121
+ it's very good because.
1122
+
1123
+ 0:33:56.276 --> 0:34:03.543
1124
+ However, the disadvantage is that we also
1125
+ have to do all these calculations, so if we
1126
+
1127
+ 0:34:03.543 --> 0:34:10.895
1128
+ more view from the point of view of efficient
1129
+ calculation, this might not be the best.
1130
+
1131
+ 0:34:11.471 --> 0:34:20.517
1132
+ So the question is, can we change our architecture
1133
+ to keep some of the advantages but make things
1134
+
1135
+ 0:34:20.517 --> 0:34:21.994
1136
+ more efficient?
1137
+
1138
+ 0:34:24.284 --> 0:34:31.131
1139
+ The one idea is what is called the average
1140
+ attention, and the interesting thing is this
1141
+
1142
+ 0:34:31.131 --> 0:34:32.610
1143
+ work surprisingly.
1144
+
1145
+ 0:34:33.013 --> 0:34:38.917
1146
+ So the only idea what you're doing is doing
1147
+ the decoder.
1148
+
1149
+ 0:34:38.813 --> 0:34:42.592
1150
+ You're not doing attention anymore.
1151
+
1152
+ 0:34:42.487 --> 0:34:46.794
1153
+ The attention weights are all the same.
1154
+
1155
+ 0:34:47.027 --> 0:35:00.723
1156
+ So you don't calculate with query and key
1157
+ the different weights, and then you just take
1158
+
1159
+ 0:35:00.723 --> 0:35:03.058
1160
+ equal weights.
1161
+
1162
+ 0:35:03.283 --> 0:35:07.585
1163
+ So here would be one third from this, one
1164
+ third from this, and one third.
1165
+
1166
+ 0:35:09.009 --> 0:35:14.719
1167
+ And while it is sufficient you can now do
1168
+ precalculation and things get more efficient.
1169
+
1170
+ 0:35:15.195 --> 0:35:18.803
1171
+ So first go the formula that's maybe not directed
1172
+ here.
1173
+
1174
+ 0:35:18.979 --> 0:35:38.712
1175
+ So the difference here is that your new hidden
1176
+ state is the sum of all the hidden states, then.
1177
+
1178
+ 0:35:38.678 --> 0:35:40.844
1179
+ So here would be with this.
1180
+
1181
+ 0:35:40.767 --> 0:35:45.023
1182
+ It would be one third of this plus one third
1183
+ of this.
1184
+
1185
+ 0:35:46.566 --> 0:35:57.162
1186
+ But if you calculate it this way, it's not
1187
+ yet being more efficient because you still
1188
+
1189
+ 0:35:57.162 --> 0:36:01.844
1190
+ have to sum over here all the hidden.
1191
+
1192
+ 0:36:04.524 --> 0:36:22.932
1193
+ But you can now easily speed up these things
1194
+ by having an in between value, which is just
1195
+
1196
+ 0:36:22.932 --> 0:36:24.568
1197
+ always.
1198
+
1199
+ 0:36:25.585 --> 0:36:30.057
1200
+ If you take this as ten to one, you take this
1201
+ one class this one.
1202
+
1203
+ 0:36:30.350 --> 0:36:36.739
1204
+ Because this one then was before this, and
1205
+ this one was this, so in the end.
1206
+
1207
+ 0:36:37.377 --> 0:36:49.545
1208
+ So now this one is not the final one in order
1209
+ to get the final one to do the average.
1210
+
1211
+ 0:36:49.404 --> 0:36:50.158
1212
+ So.
1213
+
1214
+ 0:36:50.430 --> 0:37:00.264
1215
+ But then if you do this calculation with speed
1216
+ up you can do it with a fixed number of steps.
1217
+
1218
+ 0:37:00.180 --> 0:37:11.300
1219
+ Instead of the sum, which depends on the length, so
1220
+ you only have to do calculations to calculate
1221
+
1222
+ 0:37:11.300 --> 0:37:12.535
1223
+ this one.
1224
+
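A minimal sketch of the running-sum trick just described (not from the lecture materials; NumPy is assumed and all names are illustrative): the cumulative state is updated once per step, so each position costs a constant number of operations instead of a sum over the whole history.

```python
import numpy as np

def average_attention(Y):
    """Y: (T, d) decoder layer inputs; returns the averaged states.

    g_t = (1/t) * sum_{k<=t} y_k, computed with a running sum
    g~_t = g~_{t-1} + y_t, so the per-step cost does not grow with t.
    """
    out = np.empty_like(Y)
    running = np.zeros(Y.shape[1])
    for t in range(Y.shape[0]):
        running += Y[t]              # g~_t = g~_{t-1} + y_t
        out[t] = running / (t + 1)   # equal weight 1/t for every previous position
    return out
```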
1225
+ 0:37:12.732 --> 0:37:21.718
1226
+ Can you do the lakes and the lakes?
1227
+
1228
+ 0:37:21.469 --> 0:37:32.707
1229
+ For example, light bulb here now takes and.
1230
+
1231
+ 0:37:32.993 --> 0:37:38.762
1232
+ That's a very good point and that's why this
1233
+ is now in the image.
1234
+
1235
+ 0:37:38.675 --> 0:37:44.533
1236
+ It's not very good so this is the one with
1237
+ tilder and the tilder.
1238
+
1239
+ 0:37:44.884 --> 0:37:57.895
1240
+ So this one is just the sum of these two,
1241
+ because this is just this one.
1242
+
1243
+ 0:37:58.238 --> 0:38:08.956
1244
+ So the sum of this is exactly as the sum of
1245
+ these, and the sum of these is the sum of here.
1246
+
1247
+ 0:38:08.840 --> 0:38:15.133
1248
+ So you only do the sum in here, and the multiplying.
1249
+
1250
+ 0:38:15.255 --> 0:38:22.145
1251
+ So what you can mainly do here is you can
1252
+ do it more mathematically.
1253
+
1254
+ 0:38:22.045 --> 0:38:31.532
1255
+ You can do this by taking the one over t out of the
1256
+ sum, and then you can calculate the sum different.
1257
+
1258
+ 0:38:36.256 --> 0:38:42.443
1259
+ That maybe looks a bit weird and simple, so
1260
+ we were all talking about this great attention
1261
+
1262
+ 0:38:42.443 --> 0:38:47.882
1263
+ that we can focus on different parts, and a
1264
+ bit surprising on this work is now.
1265
+
1266
+ 0:38:47.814 --> 0:38:53.322
1267
+ In the end it might also work well without
1268
+ really attending and just doing equal weights.
1269
+
1270
+ 0:38:53.954 --> 0:38:56.164
1271
+ Mean it's not that easy.
1272
+
1273
+ 0:38:56.376 --> 0:38:58.261
1274
+ It's like sometimes this is working.
1275
+
1276
+ 0:38:58.210 --> 0:39:00.452
1277
+ There are also reports where it doesn't work that well.
1278
+
1279
+ 0:39:01.481 --> 0:39:05.848
1280
+ But I think it's an interesting way and it
1281
+ maybe shows that a lot of.
1282
+
1283
+ 0:39:05.805 --> 0:39:10.624
1284
+ Things in the self-attention or in the transformer paper
1285
+ which are more put as like yet.
1286
+
1287
+ 0:39:10.563 --> 0:39:15.890
1288
+ These are some hyperparameters around it,
1289
+ like that you do the layer norm in between,
1290
+
1291
+ 0:39:15.890 --> 0:39:21.769
1292
+ and that you do a feed forward before, and
1293
+ things like that, that these are also all important,
1294
+
1295
+ 0:39:21.769 --> 0:39:25.566
1296
+ and that the right set up around that is also
1297
+ very important.
1298
+
1299
+ 0:39:28.969 --> 0:39:38.598
1300
+ The other thing you can do in the end is not
1301
+ completely different from this one.
1302
+
1303
+ 0:39:38.479 --> 0:39:42.524
1304
+ It's just like a very different.
1305
+
1306
+ 0:39:42.942 --> 0:39:54.338
1307
+ And that is a recurrent network which also
1308
+ has this type of highway connection that can
1309
+
1310
+ 0:39:54.338 --> 0:40:01.330
1311
+ ignore the recurrent unit and directly put
1312
+ the input.
1313
+
1314
+ 0:40:01.561 --> 0:40:10.770
1315
+ It's not really adding out, but if you see
1316
+ the hitting step is your input, but what you
1317
+
1318
+ 0:40:10.770 --> 0:40:15.480
1319
+ can do is somehow directly go to the output.
1320
+
1321
+ 0:40:17.077 --> 0:40:28.390
1322
+ These are the four components of the simple
1323
+ recurrent unit, and the unit is motivated by GRUs
1324
+
1325
+ 0:40:28.390 --> 0:40:33.418
1326
+ and by LSTMs, which we have seen before.
1327
+
1328
+ 0:40:33.513 --> 0:40:43.633
1329
+ And that has proven to be very good for RNNs,
1330
+ which allows you to have a gate on your state.
1331
+
1332
+ 0:40:44.164 --> 0:40:48.186
1333
+ In this thing we have two gates, the reset
1334
+ gate and the forget gate.
1335
+
1336
+ 0:40:48.768 --> 0:40:57.334
1337
+ So first we have the general structure which
1338
+ has a cell state.
1339
+
1340
+ 0:40:57.198 --> 0:41:01.282
1341
+ Here we have the cell state.
1342
+
1343
+ 0:41:01.361 --> 0:41:09.661
1344
+ And then this goes next, and we always get
1345
+ the different cell states over the times that.
1346
+
1347
+ 0:41:10.030 --> 0:41:11.448
1348
+ This is the cell state.
1349
+
1350
+ 0:41:11.771 --> 0:41:16.518
1351
+ How do we now calculate that just assume we
1352
+ have an initial cell state here?
1353
+
1354
+ 0:41:17.017 --> 0:41:19.670
1355
+ But the first thing is we're doing the forget
1356
+ gate.
1357
+
1358
+ 0:41:20.060 --> 0:41:34.774
1359
+ The forget gate models whether the new cell
1360
+ state mainly depend on the previous cell state
1361
+
1362
+ 0:41:34.774 --> 0:41:40.065
1363
+ or should it depend on our input.
1364
+
1365
+ 0:41:40.000 --> 0:41:41.356
1366
+ Like Add to Them.
1367
+
1368
+ 0:41:41.621 --> 0:41:42.877
1369
+ How can we model that?
1370
+
1371
+ 0:41:44.024 --> 0:41:45.599
1372
+ First we were at a cocktail.
1373
+
1374
+ 0:41:45.945 --> 0:41:52.151
1375
+ The forget gate is depending on the state at t minus one.
1376
+
1377
+ 0:41:52.006 --> 0:41:56.485
1378
+ You also see here the formula.
1379
+
1380
+ 0:41:57.057 --> 0:42:01.963
1381
+ So we are multiplying both the cell state
1382
+ and our input.
1383
+
1384
+ 0:42:01.877 --> 0:42:04.893
1385
+ With some weights we are getting.
1386
+
1387
+ 0:42:05.105 --> 0:42:08.472
1388
+ We are adding some bias vector and then
1389
+ we are doing a sigmoid on that.
1390
+
1391
+ 0:42:08.868 --> 0:42:13.452
1392
+ So in the end we have numbers between zero
1393
+ and one saying for each dimension.
1394
+
1395
+ 0:42:13.853 --> 0:42:22.041
1396
+ Like how much if it's near to zero we will
1397
+ mainly use the new input.
1398
+
1399
+ 0:42:21.922 --> 0:42:31.891
1400
+ If it's near to one we will keep the previous state
1401
+ and ignore the input at this dimension.
1402
+
1403
+ 0:42:33.313 --> 0:42:40.173
1404
+ And by this motivation we can then create
1405
+ here the new cell state, and here you see
1406
+
1407
+ 0:42:40.173 --> 0:42:41.141
1408
+ the formula.
1409
+
1410
+ 0:42:41.601 --> 0:42:55.048
1411
+ So you take your forget gate and multiply
1412
+ it with your previous cell state.
1413
+
1414
+ 0:42:54.841 --> 0:43:00.435
1415
+ So if my was around then.
1416
+
1417
+ 0:43:00.800 --> 0:43:07.405
1418
+ In the other case, when the value was others,
1419
+ that's what you added.
1420
+
1421
+ 0:43:07.309 --> 0:43:10.949
1422
+ Then you're adding a transformation.
1423
+
1424
+ 0:43:11.351 --> 0:43:24.284
1425
+ So if this value was maybe zero then you're
1426
+ putting most of the information from the input in.
1427
+
1428
+ 0:43:25.065 --> 0:43:26.947
1429
+ Is already your element?
1430
+
1431
+ 0:43:26.872 --> 0:43:30.540
1432
+ The only question is now based on your element.
1433
+
1434
+ 0:43:30.463 --> 0:43:32.072
1435
+ What is the output?
1436
+
1437
+ 0:43:33.253 --> 0:43:47.951
1438
+ And there you have another opportunity so
1439
+ you can either take the output or instead you
1440
+
1441
+ 0:43:47.951 --> 0:43:50.957
1442
+ prefer the input.
1443
+
1444
+ 0:43:52.612 --> 0:43:58.166
1445
+ So is the value also the same for the reset
1446
+ gate and the forget gate?
1447
+
1448
+ 0:43:58.087 --> 0:43:59.422
1449
+ Yes, the movie.
1450
+
1451
+ 0:44:00.900 --> 0:44:10.004
1452
+ Yes exactly so the matrices are different
1453
+ and therefore it can be and that should be
1454
+
1455
+ 0:44:10.004 --> 0:44:16.323
1456
+ and maybe there is sometimes you want to have
1457
+ information.
1458
+
1459
+ 0:44:16.636 --> 0:44:23.843
1460
+ So here again we have this vector with values
1461
+ between zero and one which says controlling how
1462
+
1463
+ 0:44:23.843 --> 0:44:25.205
1464
+ the information.
1465
+
1466
+ 0:44:25.505 --> 0:44:36.459
1467
+ And then the output is calculated here similar
1468
+ to a cell stage, but again input is from.
1469
+
1470
+ 0:44:36.536 --> 0:44:45.714
1471
+ So either the reset gate decides should give
1472
+ what is currently stored in there, or.
1473
+
1474
+ 0:44:46.346 --> 0:44:58.647
1475
+ So it's not exactly as the thing we had before,
1476
+ with the residual connections where we added
1477
+
1478
+ 0:44:58.647 --> 0:45:01.293
1479
+ up, but here we do.
1480
+
1481
+ 0:45:04.224 --> 0:45:08.472
1482
+ This is the general idea of a simple recurrent
1483
+ neural network.
1484
+
1485
+ 0:45:08.405 --> 0:45:13.094
1486
+ Then we will now look at how we can make things
1487
+ even more efficient.
1488
+
1489
+ 0:45:13.026 --> 0:45:17.106
1490
+ But first do you have more questions on how
1491
+ it is working?
1492
+
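As a rough sketch of one step of such a simple recurrent unit (assuming an SRU-style parametrization; the exact formulas on the slides may differ slightly, and the names here are illustrative):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sru_step(x, c_prev, Wx, Wf, bf, Wr, br):
    """One step: the forget gate mixes the old cell state and the new input,
    the reset gate mixes the cell state and the raw input (highway path)."""
    x_tilde = Wx @ x                       # transformed input
    f = sigmoid(Wf @ x + bf)               # forget gate, values in [0, 1] per dimension
    c = f * c_prev + (1.0 - f) * x_tilde   # new cell state
    r = sigmoid(Wr @ x + br)               # reset gate
    h = r * c + (1.0 - r) * x              # output with highway connection to x
    return h, c
```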
1493
+ 0:45:23.063 --> 0:45:38.799
1494
+ Now these calculations are a bit where things
1495
+ get more efficient because this somehow.
1496
+
1497
+ 0:45:38.718 --> 0:45:43.177
1498
+ It depends on all the other dimensions for the
1499
+ second one also.
1500
+
1501
+ 0:45:43.423 --> 0:45:48.904
1502
+ Because if you do a matrix multiplication
1503
+ with a vector like for the output vector, each
1504
+
1505
+ 0:45:48.904 --> 0:45:52.353
1506
+ dimension of the output vector depends on all
1507
+ the other.
1508
+
1509
+ 0:45:52.973 --> 0:46:06.561
1510
+ The cell state here depends because this one
1511
+ is used here, and somehow the first dimension
1512
+
1513
+ 0:46:06.561 --> 0:46:11.340
1514
+ of the cell state only depends.
1515
+
1516
+ 0:46:11.931 --> 0:46:17.973
1517
+ In order to make that, of course, is sometimes
1518
+ again making things less parallelizable if things
1519
+
1520
+ 0:46:17.973 --> 0:46:18.481
1521
+ depend.
1522
+
1523
+ 0:46:19.359 --> 0:46:35.122
1524
+ We can easily make that different by changing
1525
+ from the matrix product to an element-wise product.
1526
+
1527
+ 0:46:35.295 --> 0:46:51.459
1528
+ So you do first, just like inside here, you
1529
+ take like the first dimension, my second dimension.
1530
+
1531
+ 0:46:52.032 --> 0:46:53.772
1532
+ Is, of course, narrow.
1533
+
1534
+ 0:46:53.696 --> 0:46:59.295
1535
+ This should be reset or this should be because
1536
+ it should be a different.
1537
+
1538
+ 0:46:59.899 --> 0:47:12.053
1539
+ Now the first dimension only depends on the
1540
+ first dimension, so you don't have dependencies
1541
+
1542
+ 0:47:12.053 --> 0:47:16.148
1543
+ any longer between dimensions.
1544
+
1545
+ 0:47:18.078 --> 0:47:25.692
1546
+ Maybe it gets a bit clearer if you see about
1547
+ it in this way, so what we have to do now.
1548
+
1549
+ 0:47:25.966 --> 0:47:31.911
1550
+ First, we have to do a matrix multiplication
1551
+ on to gather and to get the.
1552
+
1553
+ 0:47:32.292 --> 0:47:38.041
1554
+ And then we only have the element wise operations
1555
+ where we take this output.
1556
+
1557
+ 0:47:37.966 --> 0:47:38.722
1558
+ We take.
1559
+
1560
+ 0:47:39.179 --> 0:47:42.978
1561
+ Minus one and our original.
1562
+
1563
+ 0:47:42.842 --> 0:47:52.750
1564
+ Here we only have element-wise operations which
1565
+ can be optimally parallelized.
1566
+
1567
+ 0:47:53.273 --> 0:48:07.603
1568
+ So here we can additionally parallelize
1569
+ across the dimensions and don't have to do that sequentially.
1570
+
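A sketch of that reordering (illustrative NumPy code, not the lecture's reference implementation): the three projections are computed for the whole sequence with batched matrix multiplications, and the remaining sequential loop contains only element-wise operations.

```python
import numpy as np

def sru_sequence(X, c0, Wx, Wf, bf, Wr, br):
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    Xt = X @ Wx.T                  # (T, d) transformed inputs, parallel over time
    F = sigmoid(X @ Wf.T + bf)     # (T, d) forget gates, parallel over time
    R = sigmoid(X @ Wr.T + br)     # (T, d) reset gates, parallel over time
    H, c = np.empty_like(X), c0
    for t in range(X.shape[0]):    # only cheap element-wise work stays sequential
        c = F[t] * c + (1.0 - F[t]) * Xt[t]
        H[t] = R[t] * c + (1.0 - R[t]) * X[t]
    return H
```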
1571
+ 0:48:09.929 --> 0:48:24.255
1572
+ Yeah, but this you can do like in parallel
1573
+ again for all xts.
1574
+
1575
+ 0:48:24.544 --> 0:48:33.014
1576
+ Here you can't do it in parallel, but you
1577
+ only have to do it at each step, and then you
1578
+
1579
+ 0:48:33.014 --> 0:48:34.650
1580
+ can parallelize.
1581
+
1582
+ 0:48:35.495 --> 0:48:39.190
1583
+ But this maybe for the dimension.
1584
+
1585
+ 0:48:39.081 --> 0:48:42.036
1586
+ Maybe it's also important.
1587
+
1588
+ 0:48:41.926 --> 0:48:45.898
1589
+ I don't know if they have tried it.
1590
+
1591
+ 0:48:45.787 --> 0:48:55.386
1592
+ I assume it's not only for dimension reduction,
1593
+ but it's hard because you can easily.
1594
+
1595
+ 0:49:01.001 --> 0:49:08.164
1596
+ People have even like made the second thing
1597
+ even more easy.
1598
+
1599
+ 0:49:08.044 --> 0:49:10.214
1600
+ So there is this.
1601
+
1602
+ 0:49:10.093 --> 0:49:17.897
1603
+ This is how we have the highway connections
1604
+ in the transformer.
1605
+
1606
+ 0:49:17.776 --> 0:49:20.708
1607
+ Then it's like you do.
1608
+
1609
+ 0:49:20.780 --> 0:49:24.789
1610
+ So that is like how things are put together
1611
+ as a transformer.
1612
+
1613
+ 0:49:25.125 --> 0:49:39.960
1614
+ And that is a similar and simple recurring
1615
+ neural network where you do exactly the same
1616
+
1617
+ 0:49:39.960 --> 0:49:44.512
1618
+ for the so you don't have.
1619
+
1620
+ 0:49:46.326 --> 0:49:47.503
1621
+ This type of things.
1622
+
1623
+ 0:49:49.149 --> 0:50:01.196
1624
+ And with this we are at the end of how to
1625
+ make efficient architectures before we go to
1626
+
1627
+ 0:50:01.196 --> 0:50:02.580
1628
+ the next.
1629
+
1630
+ 0:50:13.013 --> 0:50:23.004
1631
+ Teacher Models: Between the encoder, the decoder,
1632
+ and the architectures there is a next technique
1633
+
1634
+ 0:50:23.004 --> 0:50:28.977
1635
+ which is used in nearly all deep learning very
1636
+ successfully.
1637
+
1638
+ 0:50:29.449 --> 0:50:43.463
1639
+ So the idea is can we extract the knowledge
1640
+ from a large network into a smaller one, but
1641
+
1642
+ 0:50:43.463 --> 0:50:45.983
1643
+ it performs similarly.
1644
+
1645
+ 0:50:47.907 --> 0:50:53.217
1646
+ And the nice thing is that this really works,
1647
+ and it may be very, very surprising.
1648
+
1649
+ 0:50:53.673 --> 0:51:03.000
1650
+ So the idea is that we have a large strong
1651
+ model which we train for long, and the question
1652
+
1653
+ 0:51:03.000 --> 0:51:07.871
1654
+ is: Can that help us to train a smaller model?
1655
+
1656
+ 0:51:08.148 --> 0:51:16.296
1657
+ So can what we refer to as teacher model tell
1658
+ us better to build a small student model than
1659
+
1660
+ 0:51:16.296 --> 0:51:17.005
1661
+ before.
1662
+
1663
+ 0:51:17.257 --> 0:51:27.371
1664
+ So what we're before in it as a student model,
1665
+ we learn from the data and that is how we train
1666
+
1667
+ 0:51:27.371 --> 0:51:28.755
1668
+ our systems.
1669
+
1670
+ 0:51:29.249 --> 0:51:37.949
1671
+ The question is: Can we train this small model
1672
+ better if we are not only learning from the
1673
+
1674
+ 0:51:37.949 --> 0:51:46.649
1675
+ data, but we are also learning from a large
1676
+ model which has been trained maybe in the same
1677
+
1678
+ 0:51:46.649 --> 0:51:47.222
1679
+ data?
1680
+
1681
+ 0:51:47.667 --> 0:51:55.564
1682
+ So that you have then in the end a smaller
1683
+ model that is somehow better performing than.
1684
+
1685
+ 0:51:55.895 --> 0:51:59.828
1686
+ And maybe that's on the first view.
1687
+
1688
+ 0:51:59.739 --> 0:52:05.396
1689
+ Very very surprising because it has seen the
1690
+ same data so it should have learned the same
1691
+
1692
+ 0:52:05.396 --> 0:52:11.053
1693
+ so the baseline model trained only on the data
1694
+ and the student teacher knowledge to still
1695
+
1696
+ 0:52:11.053 --> 0:52:11.682
1697
+ model it.
1698
+
1699
+ 0:52:11.619 --> 0:52:17.387
1700
+ They all have seen only this data because
1701
+ your teacher modeling was also trained typically
1702
+
1703
+ 0:52:17.387 --> 0:52:19.162
1704
+ only on this data, however.
1705
+
1706
+ 0:52:20.580 --> 0:52:30.071
1707
+ It has by now shown that by many ways the
1708
+ model trained in the teacher-student framework
1709
+
1710
+ 0:52:30.071 --> 0:52:32.293
1711
+ is performing better.
1712
+
1713
+ 0:52:33.473 --> 0:52:40.971
1714
+ A bit of an explanation when we see how that
1715
+ works.
1716
+
1717
+ 0:52:40.827 --> 0:52:46.141
1718
+ There's different ways of doing it.
1719
+
1720
+ 0:52:45.993 --> 0:52:47.199
1721
+ Maybe.
1722
+
1723
+ 0:52:47.567 --> 0:52:51.501
1724
+ So how does it work?
1725
+
1726
+ 0:52:51.314 --> 0:53:04.787
1727
+ This is our student network, the normal one,
1728
+ some type of neural network.
1729
+
1730
+ 0:53:04.597 --> 0:53:06.147
1731
+ We're.
1732
+
1733
+ 0:53:06.586 --> 0:53:17.050
1734
+ So we are training the model to predict the
1735
+ same thing as we are doing that by calculating.
1736
+
1737
+ 0:53:17.437 --> 0:53:23.173
1738
+ The cross-entropy loss was defined in a way
1739
+ where saying all the probabilities for the
1740
+
1741
+ 0:53:23.173 --> 0:53:25.332
1742
+ correct word should be as high.
1743
+
1744
+ 0:53:25.745 --> 0:53:32.207
1745
+ So you are calculating your output probabilities
1746
+ always, and each time step you have an alphabet
1747
+
1748
+ 0:53:32.207 --> 0:53:33.055
1749
+ probability.
1750
+
1751
+ 0:53:32.990 --> 0:53:38.639
1752
+ What is the most probable in the next word
1753
+ and your training signal is put as much of
1754
+
1755
+ 0:53:38.639 --> 0:53:43.368
1756
+ your probability mass to the correct word to
1757
+ the word that is there in.
1758
+
1759
+ 0:53:43.903 --> 0:53:51.367
1760
+ And this is achieved by this cross-entropy
1761
+ loss, which sums over all training
1762
+
1763
+ 0:53:51.367 --> 0:53:58.664
1764
+ examples and all positions, sums over the
1765
+ full vocabulary, and then this one is this
1766
+
1767
+ 0:53:58.664 --> 0:54:03.947
1768
+ one if this current word is the k-th word
1769
+ in the vocabulary.
1770
+
1771
+ 0:54:04.204 --> 0:54:18.001
1772
+ And then we take here the log probability
1773
+ of that, so what we made me do is: We have
1774
+
1775
+ 0:54:18.001 --> 0:54:27.200
1776
+ this matrix here, so each position times your
1777
+ vocabulary size.
1778
+
1779
+ 0:54:27.507 --> 0:54:38.656
1780
+ In the end what you just do is some of these
1781
+ three log probabilities, and then you want
1782
+
1783
+ 0:54:38.656 --> 0:54:40.785
1784
+ to have as much.
1785
+
1786
+ 0:54:41.041 --> 0:54:54.614
1787
+ So although this is a sum over this matrix
1788
+ here, in the end of each dimension you.
1789
+
1790
+ 0:54:54.794 --> 0:55:06.366
1791
+ So that is a normal cross-entropy loss that
1792
+ we have discussed at the very beginning of
1793
+
1794
+ 0:55:06.366 --> 0:55:07.016
1795
+ how.
1796
+
1797
+ 0:55:08.068 --> 0:55:15.132
1798
+ So what can we do differently in the teacher
1799
+ network?
1800
+
1801
+ 0:55:15.001 --> 0:55:23.376
1802
+ We also have a teacher network which is trained
1803
+ on large data.
1804
+
1805
+ 0:55:24.224 --> 0:55:35.957
1806
+ And of course this distribution might be better
1807
+ than the one from the small model because it's.
1808
+
1809
+ 0:55:36.456 --> 0:55:40.941
1810
+ So in this case we have now the training signal
1811
+ from the teacher network.
1812
+
1813
+ 0:55:41.441 --> 0:55:46.262
1814
+ And it's the same way as we had before.
1815
+
1816
+ 0:55:46.142 --> 0:55:56.483
1817
+ The only difference is we're training not
1818
+ towards the ground truth probability distribution
1819
+
1820
+ 0:55:56.483 --> 0:55:59.160
1821
+ here, which is sharp.
1822
+
1823
+ 0:55:59.299 --> 0:56:11.303
1824
+ That's also a probability, so this word has
1825
+ a high probability, but have some probability.
1826
+
1827
+ 0:56:12.612 --> 0:56:19.577
1828
+ And that is the main difference.
1829
+
1830
+ 0:56:19.366 --> 0:56:30.345
1831
+ Typically you do like the interpolation of
1832
+ these.
1833
+
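As a sketch of this word-level distillation objective (assuming a simple interpolation weight alpha; the names are illustrative and not tied to a specific toolkit):

```python
import numpy as np

def log_softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

def distillation_loss(student_logits, teacher_probs, gold_ids, alpha=0.5):
    """student_logits: (T, V); teacher_probs: (T, V) soft targets;
    gold_ids: (T,) reference word indices; alpha: interpolation weight."""
    logp = log_softmax(student_logits)
    ce_gold = -logp[np.arange(len(gold_ids)), gold_ids].mean()  # ground-truth signal
    ce_teacher = -(teacher_probs * logp).sum(axis=-1).mean()    # soft teacher signal
    return alpha * ce_gold + (1.0 - alpha) * ce_teacher
```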
1834
+ 0:56:33.213 --> 0:56:38.669
1835
+ Because there's more information contained
1836
+ in the distribution than in the ground truth,
1837
+
1838
+ 0:56:38.669 --> 0:56:44.187
1839
+ because it encodes more information about the
1840
+ language, because language always has more
1841
+
1842
+ 0:56:44.187 --> 0:56:47.907
1843
+ options to put alone, that's the same sentence
1844
+ yes exactly.
1845
+
1846
+ 0:56:47.845 --> 0:56:53.115
1847
+ So there's ambiguity in there that is encoded
1848
+ hopefully very well in the teacher.
1849
+
1850
+ 0:56:53.513 --> 0:56:57.257
1851
+ Trade you two networks so better than a student
1852
+ network you have in there from your learner.
1853
+
1854
+ 0:56:57.537 --> 0:57:05.961
1855
+ So maybe often there's only one correct word,
1856
+ but it might be two or three, and then all
1857
+
1858
+ 0:57:05.961 --> 0:57:10.505
1859
+ of these three have a probability distribution.
1860
+
1861
+ 0:57:10.590 --> 0:57:21.242
1862
+ And then is the main advantage or one explanation
1863
+ of why it's better to train from the teacher.
1864
+
1865
+ 0:57:21.361 --> 0:57:32.652
1866
+ Of course, it's good to also keep the signal
1867
+ in there because then you can prevent it because
1868
+
1869
+ 0:57:32.652 --> 0:57:33.493
1870
+ crazy.
1871
+
1872
+ 0:57:37.017 --> 0:57:49.466
1873
+ Any more questions on the first type of knowledge
1874
+ distillation, also distribution changes.
1875
+
1876
+ 0:57:50.550 --> 0:58:02.202
1877
+ Coming around again, this would put it a bit
1878
+ different, so this is not a solution to maintenance
1879
+
1880
+ 0:58:02.202 --> 0:58:04.244
1881
+ or distribution.
1882
+
1883
+ 0:58:04.744 --> 0:58:12.680
1884
+ But don't think it's performing worse than
1885
+ only doing the ground truth because they also.
1886
+
1887
+ 0:58:13.113 --> 0:58:21.254
1888
+ So it's more like it's not improving you would
1889
+ assume it's similarly helping you, but.
1890
+
1891
+ 0:58:21.481 --> 0:58:28.145
1892
+ Of course, if you now have a teacher, maybe
1893
+ you have no data on your target domain,
1894
+
1895
+ 0:58:28.145 --> 0:58:28.524
1896
+ but.
1897
+
1898
+ 0:58:28.888 --> 0:58:39.895
1899
+ Then you can use this one which is not the
1900
+ ground truth but helpful to learn better for
1901
+
1902
+ 0:58:39.895 --> 0:58:42.147
1903
+ the distribution.
1904
+
1905
+ 0:58:46.326 --> 0:58:57.012
1906
+ The second idea is to do sequence level knowledge
1907
+ distillation, so what we have in this case
1908
+
1909
+ 0:58:57.012 --> 0:59:02.757
1910
+ is we have looked at each position independently.
1911
+
1912
+ 0:59:03.423 --> 0:59:05.436
1913
+ Mean, we do that often.
1914
+
1915
+ 0:59:05.352 --> 0:59:10.930
1916
+ We are not generating a lot of sequences,
1917
+ but that has a problem.
1918
+
1919
+ 0:59:10.845 --> 0:59:13.932
1920
+ We have this propagation of errors.
1921
+
1922
+ 0:59:13.846 --> 0:59:16.765
1923
+ We start with one area and then.
1924
+
1925
+ 0:59:17.237 --> 0:59:27.419
1926
+ So if we are doing word-level knowledge distillation,
1927
+ we are treating each word in the sentence independently.
1928
+
1929
+ 0:59:28.008 --> 0:59:32.091
1930
+ So we are not trying to like somewhat model
1931
+ the dependency between.
1932
+
1933
+ 0:59:32.932 --> 0:59:47.480
1934
+ We can try to do that by sequence level knowledge
1935
+ distillation, but the problem is, of course:
1936
+
1937
+ 0:59:47.847 --> 0:59:53.478
1938
+ So we can that for each position we can get
1939
+ a distribution over all the words at this.
1940
+
1941
+ 0:59:53.793 --> 1:00:05.305
1942
+ But if we want to have a distribution of all
1943
+ possible target sentences, that's not possible
1944
+
1945
+ 1:00:05.305 --> 1:00:06.431
1946
+ because.
1947
+
1948
+ 1:00:08.508 --> 1:00:15.940
1949
+ Yeah, so we can then again do a bit of a hack
1950
+ on that.
1951
+
1952
+ 1:00:15.805 --> 1:00:23.240
1953
+ If we can't have a distribution of all sentences,
1954
+ it.
1955
+
1956
+ 1:00:23.843 --> 1:00:30.764
1957
+ So what we can do is you can now use the
1958
+ teacher network and sample different translations.
1959
+
1960
+ 1:00:31.931 --> 1:00:39.327
1961
+ And now we can do different ways to train
1962
+ them.
1963
+
1964
+ 1:00:39.173 --> 1:00:49.345
1965
+ We can use them as their probability, the
1966
+ easiest one to assume.
1967
+
1968
+ 1:00:50.050 --> 1:00:56.373
1969
+ So what that ends to is that we're taking
1970
+ our teacher network, we're generating some
1971
+
1972
+ 1:00:56.373 --> 1:01:01.135
1973
+ translations, and these ones we're using as
1974
+ additional trading.
1975
+
1976
+ 1:01:01.781 --> 1:01:11.382
1977
+ Then we have mainly done this sequence level
1978
+ because the teacher network takes us.
1979
+
1980
+ 1:01:11.266 --> 1:01:17.515
1981
+ These are all probable translations of the
1982
+ sentence.
1983
+
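A rough pipeline sketch of sequence-level knowledge distillation as described here; `teacher.translate` and `train_student` are hypothetical helpers, not an existing API:

```python
def sequence_level_kd(teacher, student, training_data, beam_size=5):
    distilled = []
    for src, _reference in training_data:
        # Decode the source side of the training data with the teacher;
        # its output becomes the (easier, more consistent) training target.
        hyp = teacher.translate(src, beam_size=beam_size)
        distilled.append((src, hyp))
    train_student(student, distilled)   # normal MT training on the distilled pairs
    return student
```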
1984
+ 1:01:26.286 --> 1:01:34.673
1985
+ And then you can do a bit of a yeah, and you
1986
+ can try to better make a bit of an interpolated
1987
+
1988
+ 1:01:34.673 --> 1:01:36.206
1989
+ version of that.
1990
+
1991
+ 1:01:36.716 --> 1:01:42.802
1992
+ So what people have also done is like sequence
1993
+ level interpolation.
1994
+
1995
+ 1:01:42.717 --> 1:01:52.873
1996
+ You generate here several translations: But
1997
+ then you don't use all of them.
1998
+
1999
+ 1:01:52.739 --> 1:02:00.660
2000
+ You do some metrics on which of these ones.
2001
+
2002
+ 1:02:01.021 --> 1:02:12.056
2003
+ So it's a bit: rather than training on this ground
2004
+ truth, which might be improbable or unreachable
2005
+
2006
+ 1:02:12.056 --> 1:02:16.520
2007
+ because we can generate everything.
2008
+
2009
+ 1:02:16.676 --> 1:02:23.378
2010
+ And we are giving it an easier solution which
2011
+ is also good quality and training of that.
2012
+
2013
+ 1:02:23.703 --> 1:02:32.602
2014
+ So you're not training it on a very difficult
2015
+ solution, but you're training it on an easier
2016
+
2017
+ 1:02:32.602 --> 1:02:33.570
2018
+ solution.
2019
+
2020
+ 1:02:36.356 --> 1:02:38.494
2021
+ Any More Questions to This.
2022
+
2023
+ 1:02:40.260 --> 1:02:41.557
2024
+ Yeah.
2025
+
2026
+ 1:02:41.461 --> 1:02:44.296
2027
+ Good.
2028
+
2029
+ 1:02:43.843 --> 1:03:01.642
2030
+ The next idea is to look at the vocabulary, so the problem
2031
+ is we have seen that vocabulary calculations
2032
+
2033
+ 1:03:01.642 --> 1:03:06.784
2034
+ are often very time-consuming.
2035
+
2036
+ 1:03:09.789 --> 1:03:19.805
2037
+ The thing is that most of the vocabulary is
2038
+ not needed for each sentence, so in each sentence.
2039
+
2040
+ 1:03:20.280 --> 1:03:28.219
2041
+ The question is: Can we somehow easily precalculate,
2042
+ which words are probable to occur in the sentence,
2043
+
2044
+ 1:03:28.219 --> 1:03:30.967
2045
+ and then only calculate these ones?
2046
+
2047
+ 1:03:31.691 --> 1:03:34.912
2048
+ And this can be done so.
2049
+
2050
+ 1:03:34.784 --> 1:03:43.934
2051
+ For example, if you have sentenced card, it's
2052
+ probably not happening.
2053
+
2054
+ 1:03:44.164 --> 1:03:48.701
2055
+ So what you can try to do is to limit your
2056
+ vocabulary.
2057
+
2058
+ 1:03:48.618 --> 1:03:51.096
2059
+ You're considering for each.
2060
+
2061
+ 1:03:51.151 --> 1:04:04.693
2062
+ So you're no longer taking the full vocabulary
2063
+ as possible output, but you're restricting.
2064
+
2065
+ 1:04:06.426 --> 1:04:18.275
2066
+ What typically works is that we limit it by
2067
+ the most frequent words we always take because
2068
+
2069
+ 1:04:18.275 --> 1:04:23.613
2070
+ these are not so easy to align to words.
2071
+
2072
+ 1:04:23.964 --> 1:04:32.241
2073
+ We take the most frequent target words and
2074
+ then words that often align with one of the
2075
+
2076
+ 1:04:32.241 --> 1:04:32.985
2077
+ source.
2078
+
2079
+ 1:04:33.473 --> 1:04:46.770
2080
+ So for each source word you calculate the
2081
+ word alignment on your training data, and then
2082
+
2083
+ 1:04:46.770 --> 1:04:51.700
2084
+ you calculate which words occur.
2085
+
2086
+ 1:04:52.352 --> 1:04:57.680
2087
+ And then for decoding you build this union
2088
+ of maybe the source word list that other.
2089
+
2090
+ 1:04:59.960 --> 1:05:02.145
2091
+ Are like for each source work.
2092
+
2093
+ 1:05:02.075 --> 1:05:08.745
2094
+ One of the most frequent translations of these
2095
+ source words, for example for each source work
2096
+
2097
+ 1:05:08.745 --> 1:05:13.003
2098
+ like in the most frequent ones, and then the
2099
+ most frequent.
2100
+
2101
+ 1:05:13.193 --> 1:05:24.333
2102
+ In total, if you have short sentences, you
2103
+ have a lot less words, so in most cases it's
2104
+
2105
+ 1:05:24.333 --> 1:05:26.232
2106
+ not more than.
2107
+
2108
+ 1:05:26.546 --> 1:05:33.957
2109
+ And so you have dramatically reduced your
2110
+ vocabulary, and thereby can also speed up decoding.
2111
+
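A sketch of this vocabulary selection (illustrative data structures, not a specific toolkit): the candidate set is the union of globally frequent target words and, per source word, its most frequent aligned translations; only these output rows are then computed.

```python
def candidate_vocab(src_tokens, top_frequent, align_table, k_per_word=10):
    """top_frequent: set of most frequent target words (always kept);
    align_table: source word -> target words ranked by alignment counts."""
    candidates = set(top_frequent)
    for w in src_tokens:
        candidates.update(align_table.get(w, [])[:k_per_word])
    return candidates   # restrict the output softmax to these indices
```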
2112
+ 1:05:35.495 --> 1:05:43.757
2113
+ That easy does anybody see what is challenging
2114
+ here and why that might not always need.
2115
+
2116
+ 1:05:47.687 --> 1:05:54.448
2117
+ The performance is not why this might not.
2118
+
2119
+ 1:05:54.291 --> 1:06:01.842
2120
+ If you implement it, it might not be a strong.
2121
+
2122
+ 1:06:01.941 --> 1:06:06.053
2123
+ You have to store this list.
2124
+
2125
+ 1:06:05.911 --> 1:06:14.138
2126
+ You have to burn the union and of course your
2127
+ safe time.
2128
+
2129
+ 1:06:14.554 --> 1:06:21.920
2130
+ The second thing the vocabulary is used in
2131
+ our last step, so we have the hidden state,
2132
+
2133
+ 1:06:21.920 --> 1:06:23.868
2134
+ and then we calculate.
2135
+
2136
+ 1:06:24.284 --> 1:06:29.610
2137
+ Now we are not longer calculating them for
2138
+ all output words, but for a subset of them.
2139
+
2140
+ 1:06:30.430 --> 1:06:35.613
2141
+ However, this matrix multiplication is typically
2142
+ parallelized with the perfect but good.
2143
+
2144
+ 1:06:35.956 --> 1:06:46.937
2145
+ But if you not only calculate some of them,
2146
+ if you're not modeling it right, it will take
2147
+
2148
+ 1:06:46.937 --> 1:06:52.794
2149
+ as long as before because of the nature of
2150
+ the.
2151
+
2152
+ 1:06:56.776 --> 1:07:07.997
2153
+ Here for beam search there's some ideas of
2154
+ course you can go back to greedy search because
2155
+
2156
+ 1:07:07.997 --> 1:07:10.833
2157
+ that's more efficient.
2158
+
2159
+ 1:07:11.651 --> 1:07:18.347
2160
+ And better quality, and you can buffer some
2161
+ states in between, so how much buffering it's
2162
+
2163
+ 1:07:18.347 --> 1:07:22.216
2164
+ again this tradeoff between calculation and
2165
+ memory.
2166
+
2167
+ 1:07:25.125 --> 1:07:37.723
2168
+ Autoregressive Model: Then at the end of today
2169
+ what we want to look into is one last type
2170
+
2171
+ 1:07:37.723 --> 1:07:42.902
2172
+ of new machine translation approach.
2173
+
2174
+ 1:07:43.403 --> 1:07:53.621
2175
+ And the idea is what we've already seen in
2176
+ our first two steps is that this autoregressive
2177
+
2178
+ 1:07:53.621 --> 1:07:57.246
2179
+ park is taking community coding.
2180
+
2181
+ 1:07:57.557 --> 1:08:04.461
2182
+ Can process everything in parallel, but we
2183
+ are always taking the most probable and then.
2184
+
2185
+ 1:08:05.905 --> 1:08:10.476
2186
+ The question is: Do we really need to do that?
2187
+
2188
+ 1:08:10.378 --> 1:08:14.015
2189
+ Therefore, there is a bunch of work.
2190
+
2191
+ 1:08:13.917 --> 1:08:16.518
2192
+ Can we do it differently?
2193
+
2194
+ 1:08:16.418 --> 1:08:19.622
2195
+ Can we generate a full target?
2196
+
2197
+ 1:08:20.160 --> 1:08:29.417
2198
+ We'll see it's not that easy and there's still
2199
+ an open debate whether this is really faster
2200
+
2201
+ 1:08:29.417 --> 1:08:31.832
2202
+ and quality, but think.
2203
+
2204
+ 1:08:32.712 --> 1:08:45.594
2205
+ So, as said, what we have done is our encoder
2206
+ decoder where we can process our encoder color,
2207
+
2208
+ 1:08:45.594 --> 1:08:50.527
2209
+ and then the output always depends.
2210
+
2211
+ 1:08:50.410 --> 1:08:54.709
2212
+ We generate the output and then we have to
2213
+ put it here the wide because then everything
2214
+
2215
+ 1:08:54.709 --> 1:08:56.565
2216
+ depends on the purpose of the output.
2217
+
2218
+ 1:08:56.916 --> 1:09:10.464
2219
+ This is what is referred to as an auto-regressive
2220
+ model, and nearly all speech generation and
2221
+
2222
+ 1:09:10.464 --> 1:09:16.739
2223
+ language generation works in this auto-regressive way.
2224
+
2225
+ 1:09:18.318 --> 1:09:21.132
2226
+ So the motivation is, can we do that more
2227
+ efficiently?
2228
+
2229
+ 1:09:21.361 --> 1:09:31.694
2230
+ And can we somehow process all target words
2231
+ in parallel?
2232
+
2233
+ 1:09:31.513 --> 1:09:41.305
2234
+ So instead of doing it one by one, we are
2235
+ inputting.
2236
+
2237
+ 1:09:45.105 --> 1:09:46.726
2238
+ So how does it work?
2239
+
2240
+ 1:09:46.649 --> 1:09:50.589
2241
+ So let's first have a basic auto regressive
2242
+ mode.
2243
+
2244
+ 1:09:50.810 --> 1:09:53.551
2245
+ So the encoder looks as it is before.
2246
+
2247
+ 1:09:53.478 --> 1:09:58.311
2248
+ That's maybe not surprising because here we
2249
+ know we can paralyze.
2250
+
2251
+ 1:09:58.618 --> 1:10:04.592
2252
+ So we have put in here our encoder input and
2253
+ generated the encoder states, so that's exactly
2254
+
2255
+ 1:10:04.592 --> 1:10:05.295
2256
+ the same.
2257
+
2258
+ 1:10:05.845 --> 1:10:16.069
2259
+ Machine Translation: However, now we need to
2260
+ do one more thing: One challenge is what we
2261
+
2262
+ 1:10:16.069 --> 1:10:26.764
2263
+ had before and that's a challenge of natural
2264
+ language generation like machine translation.
2265
+
2266
+ 1:10:32.672 --> 1:10:38.447
2267
+ We generate until we generate this out of
2268
+ end of center stock, but if we now generate
2269
+
2270
+ 1:10:38.447 --> 1:10:44.625
2271
+ everything at once that's no longer possible,
2272
+ so we cannot generate as long because we only
2273
+
2274
+ 1:10:44.625 --> 1:10:45.632
2275
+ generated one.
2276
+
2277
+ 1:10:46.206 --> 1:10:58.321
2278
+ So the question is how can we now determine
2279
+ how long the sequence is, and we can also accelerate.
2280
+
2281
+ 1:11:00.000 --> 1:11:06.384
2282
+ Yes, but there would be one idea, and there
2283
+ is other work which tries to do that.
2284
+
2285
+ 1:11:06.806 --> 1:11:15.702
2286
+ However, in here there's some work already
2287
+ done before and maybe you remember we had the
2288
+
2289
+ 1:11:15.702 --> 1:11:20.900
2290
+ IBM models and there was this concept of fertility.
2291
+
2292
+ 1:11:21.241 --> 1:11:26.299
2293
+ The concept of fertility means: for
2294
+ one source word, into how many target words does
2295
+
2296
+ 1:11:26.299 --> 1:11:27.104
2297
+ it translate?
2298
+
2299
+ 1:11:27.847 --> 1:11:34.805
2300
+ And exactly that we try to do here, and that
2301
+ means we are calculating like at the top we
2302
+
2303
+ 1:11:34.805 --> 1:11:36.134
2304
+ are calculating.
2305
+
2306
+ 1:11:36.396 --> 1:11:42.045
2307
+ So it says word is translated into word.
2308
+
2309
+ 1:11:41.908 --> 1:11:54.173
2310
+ Word might be translated into words into,
2311
+ so we're trying to predict in how many words.
2312
+
2313
+ 1:11:55.935 --> 1:12:10.314
2314
+ And then the end of the anchor, so this is
2315
+ like a length estimation.
2316
+
2317
+ 1:12:10.105 --> 1:12:15.532
2318
+ You can do it otherwise.
2319
+
2320
+ 1:12:16.236 --> 1:12:24.526
2321
+ You initialize your decoder input and we know
2322
+ it's good with word embeddings so we're trying
2323
+
2324
+ 1:12:24.526 --> 1:12:28.627
2325
+ to do the same thing and what people then do.
2326
+
2327
+ 1:12:28.538 --> 1:12:35.225
2328
+ They initialize it again with word embedding
2329
+ but in the frequency of the.
2330
+
2331
+ 1:12:35.315 --> 1:12:36.460
2332
+ So we have the cartilage.
2333
+
2334
+ 1:12:36.896 --> 1:12:47.816
2335
+ So one has two, so twice the is and then one
2336
+ is, so that is then our initialization.
2337
+
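A sketch of this fertility-based initialization (illustrative; the lecture's exact setup may differ): each source embedding is copied according to its predicted fertility, which also fixes the output length.

```python
def init_decoder_input(src_embeddings, fertilities):
    decoder_input = []
    for emb, fert in zip(src_embeddings, fertilities):
        decoder_input.extend([emb] * fert)   # fertility 2 copies the embedding twice,
                                             # fertility 0 drops the source word
    return decoder_input                     # target length = sum(fertilities)
```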
2338
+ 1:12:48.208 --> 1:12:57.151
2339
+ In other words, if you don't predict fertilities
2340
+ but predict lengths, you can just initialize
2341
+
2342
+ 1:12:57.151 --> 1:12:57.912
2343
+ second.
2344
+
2345
+ 1:12:58.438 --> 1:13:07.788
2346
+ This often works a bit better, but that's
2347
+ the other.
2348
+
2349
+ 1:13:07.611 --> 1:13:16.436
2350
+ Now you have everything in training and testing.
2351
+
2352
+ 1:13:16.656 --> 1:13:18.621
2353
+ This is all available at once.
2354
+
2355
+ 1:13:20.280 --> 1:13:31.752
2356
+ Then we can generate everything in parallel,
2357
+ so we have the decoder stack, and that is now
2358
+
2359
+ 1:13:31.752 --> 1:13:33.139
2360
+ as before.
2361
+
2362
+ 1:13:35.395 --> 1:13:41.555
2363
+ And then we're doing the translation predictions
2364
+ here on top of it in order to do.
2365
+
2366
+ 1:13:43.083 --> 1:13:59.821
2367
+ And then we are predicting here the target
2368
+ words and once predicted, and that is the basic
2369
+
2370
+ 1:13:59.821 --> 1:14:00.924
2371
+ idea.
2372
+
2373
+ 1:14:01.241 --> 1:14:08.171
2374
+ Machine translation: Where the idea is, we
2375
+ don't have to do one by one what we're.
2376
+
2377
+ 1:14:10.210 --> 1:14:13.900
2378
+ So this looks really, really, really great.
2379
+
2380
+ 1:14:13.816 --> 1:14:20.314
2381
+ On the first view there's one challenge with
2382
+ this, and this is the baseline.
2383
+
2384
+ 1:14:20.230 --> 1:14:27.572
2385
+ Of course there's some improvements, but in
2386
+ general the quality is often significant.
2387
+
2388
+ 1:14:28.068 --> 1:14:32.075
2389
+ So here you see the baseline models.
2390
+
2391
+ 1:14:31.967 --> 1:14:38.468
2392
+ You have a loss of ten blue points or something
2393
+ like that.
2394
+
2395
+ 1:14:38.878 --> 1:14:40.230
2396
+ So why does it change?
2397
+
2398
+ 1:14:40.171 --> 1:14:41.642
2399
+ So why is it happening?
2400
+
2401
+ 1:14:43.903 --> 1:14:56.250
2402
+ If you look at the errors there is repetitive
2403
+ tokens, so you have like or things like that.
2404
+
2405
+ 1:14:56.536 --> 1:15:01.995
2406
+ Broken sentences or disfluent sentences, so that's
2407
+ exactly where autoregressive models are
2408
+
2409
+ 1:15:01.995 --> 1:15:04.851
2410
+ very good, we say that's a bit of a problem.
2411
+
2412
+ 1:15:04.788 --> 1:15:07.392
2413
+ They generate very fluent translations.
2414
+
2415
+ 1:15:07.387 --> 1:15:10.898
2416
+ Translation: Sometimes there doesn't have
2417
+ to do anything with the input.
2418
+
2419
+ 1:15:11.411 --> 1:15:14.047
2420
+ But generally it really looks always very
2421
+ fluent.
2422
+
2423
+ 1:15:14.995 --> 1:15:20.865
2424
+ Here exactly the opposite, so the problem
2425
+ is that we don't have really fluent translations.
2426
+
2427
+ 1:15:21.421 --> 1:15:26.123
2428
+ And that is mainly due to the challenge that
2429
+ we have this independent assumption.
2430
+
2431
+ 1:15:26.646 --> 1:15:35.873
2432
+ So in this case, the probability of Y of the
2433
+ second position is independent of the probability
2434
+
2435
+ 1:15:35.873 --> 1:15:40.632
2436
+ of X, so we don't know what was there generated.
2437
+
2438
+ 1:15:40.535 --> 1:15:43.743
2439
+ We're just generating it there.
2440
+
2441
+ 1:15:43.964 --> 1:15:55.439
2442
+ You can see it also in a bit of examples.
2443
+
2444
+ 1:15:55.166 --> 1:16:03.646
2445
+ You can over-penalize shifts.
2446
+
2447
+ 1:16:04.024 --> 1:16:10.566
2448
+ And the problem is this is already an improvement
2449
+ again, but this is also similar to.
2450
+
2451
+ 1:16:11.071 --> 1:16:21.017
2452
+ So you can, for example, translate heeded
2453
+ back, or maybe you could also translate it
2454
+
2455
+ 1:16:21.017 --> 1:16:31.197
2456
+ with: But on their feeling down in feeling
2457
+ down, if the first position thinks of their
2458
+
2459
+ 1:16:31.197 --> 1:16:34.591
2460
+ feeling done and the second.
2461
+
2462
+ 1:16:35.075 --> 1:16:42.908
2463
+ So each position here and that is one of the
2464
+ main issues here doesn't know what the other.
2465
+
2466
+ 1:16:43.243 --> 1:16:53.846
2467
+ And for example, if you are translating something
2468
+ with, you can often translate things in two
2469
+
2470
+ 1:16:53.846 --> 1:16:58.471
2471
+ ways: German with a different agreement.
2472
+
2473
+ 1:16:58.999 --> 1:17:02.058
2474
+ And then here where you have to decide do
2475
+ a used jet.
2476
+
2477
+ 1:17:02.162 --> 1:17:05.460
2478
+ Interpretator: It doesn't know which word
2479
+ it has to select.
2480
+
2481
+ 1:17:06.086 --> 1:17:14.789
2482
+ Mean, of course, it knows a hidden state,
2483
+ but in the end you have a probability distribution.
2484
+
2485
+ 1:17:16.256 --> 1:17:20.026
2486
+ And that is the important thing in the outer
2487
+ regressive month.
2488
+
2489
+ 1:17:19.966 --> 1:17:24.295
2490
+ You know that because you have put it in you
2491
+ here, you don't know that.
2492
+
2493
+ 1:17:24.235 --> 1:17:29.624
2494
+ If it's equal probable here to two, you don't
2495
+ Know Which Is Selected, and of course that
2496
+
2497
+ 1:17:29.624 --> 1:17:32.833
2498
+ depends on what should be the latest traction
2499
+ under.
2500
+
2501
+ 1:17:33.333 --> 1:17:39.554
2502
+ Yep, that's the undershift, and we're going
2503
+ to last last the next time.
2504
+
2505
+ 1:17:39.467 --> 1:17:40.007
2506
+ Yes.
2507
+
2508
+ 1:17:40.840 --> 1:17:44.935
2509
+ Doesn't this also appear in and like now we're
2510
+ talking about physical training?
2511
+
2512
+ 1:17:46.586 --> 1:17:48.412
2513
+ The thing is in the auto regress.
2514
+
2515
+ 1:17:48.358 --> 1:17:50.185
2516
+ If you give it the correct one,.
2517
+
2518
+ 1:17:50.450 --> 1:17:55.827
2519
+ So if you predict here comma what the reference
2520
+ is feeling then you tell the model here.
2521
+
2522
+ 1:17:55.767 --> 1:17:59.540
2523
+ The last one was feeling and then it knows
2524
+ it has to be done.
2525
+
2526
+ 1:17:59.479 --> 1:18:04.045
2527
+ But here it doesn't know that because it doesn't
2528
+ get as input as a right.
2529
+
2530
+ 1:18:04.204 --> 1:18:24.286
2531
+ Yes, that's a bit depending on what.
2532
+
2533
+ 1:18:24.204 --> 1:18:27.973
2534
+ But in training, of course, you just try to
2535
+ make the highest one the current one.
2536
+
2537
+ 1:18:31.751 --> 1:18:38.181
2538
+ So what you can do is things like CTC loss
2539
+ which can adjust for this.
2540
+
2541
+ 1:18:38.089 --> 1:18:42.809
2542
+ So then you can also have this shifted correction.
2543
+
2544
+ 1:18:42.716 --> 1:18:50.584
2545
+ If you're doing this type of correction in
2546
+ the CTC loss you don't get full penalty.
2547
+
2548
+ 1:18:50.930 --> 1:18:58.486
2549
+ Just shifted by one, so it's a bit of a different
2550
+ loss, which is mainly used in, but.
2551
+
2552
+ 1:19:00.040 --> 1:19:03.412
2553
+ It can be used in order to address this problem.
2554
+
2555
+ 1:19:04.504 --> 1:19:13.844
2556
+ The other problem is that outer regressively
2557
+ we have the label buyers that tries to disimmigrate.
2558
+
2559
+ 1:19:13.749 --> 1:19:20.517
2560
+ That's the example did before was if you translate
2561
+ thank you to Dung.
2562
+
2563
+ 1:19:20.460 --> 1:19:31.925
2564
+ And then it might end up because it learns
2565
+ in the first position and the second also.
2566
+
2567
+ 1:19:32.492 --> 1:19:43.201
2568
+ In order to prevent that, it would be helpful
2569
+ for one output, only one output, so that makes
2570
+
2571
+ 1:19:43.201 --> 1:19:47.002
2572
+ the system already better learn.
2573
+
2574
+ 1:19:47.227 --> 1:19:53.867
2575
+ Might be that for slightly different inputs
2576
+ you have different outputs, but for the same.
2577
+
2578
+ 1:19:54.714 --> 1:19:57.467
2579
+ That we can luckily very easily solve.
2580
+
2581
+ 1:19:59.119 --> 1:19:59.908
2582
+ And it's done.
2583
+
2584
+ 1:19:59.855 --> 1:20:04.117
2585
+ We just learned the technique about it, which
2586
+ is called knowledge distillation.
2587
+
2588
+ 1:20:04.985 --> 1:20:13.398
2589
+ So what we can do and the easiest solution
2590
+ to improve your non-autoregressive model is to
2591
+
2592
+ 1:20:13.398 --> 1:20:16.457
2593
+ train an auto regressive model.
2594
+
2595
+ 1:20:16.361 --> 1:20:22.959
2596
+ Then you decode your whole training data
2597
+ with this model and then.
2598
+
2599
+ 1:20:23.603 --> 1:20:27.078
2600
+ While the main advantage of that is that this
2601
+ is more consistent,.
2602
+
2603
+ 1:20:27.407 --> 1:20:33.995
2604
+ So for the same input you always have the
2605
+ same output.
2606
+
2607
+ 1:20:33.875 --> 1:20:41.903
2608
+ So you have to make your training data more
2609
+ consistent and learn.
2610
+
2611
+ 1:20:42.482 --> 1:20:54.471
2612
+ So there is another advantage of knowledge
2613
+ distillation and that advantage is you have
2614
+
2615
+ 1:20:54.471 --> 1:20:59.156
2616
+ more consistent training signals.
2617
+
2618
+ 1:21:04.884 --> 1:21:10.630
2619
+ There's another to make the things more easy
2620
+ at the beginning.
2621
+
2622
+ 1:21:10.539 --> 1:21:16.469
2623
+ There's this plants model, black model where
2624
+ you do more masks.
2625
+
2626
+ 1:21:16.756 --> 1:21:26.080
2627
+ So during training, especially at the beginning,
2628
+ you give some correct solutions at the beginning.
2629
+
2630
+ 1:21:28.468 --> 1:21:38.407
2631
+ And there is this tokens at a time, so the
2632
+ idea is to establish other regressive training.
2633
+
2634
+ 1:21:40.000 --> 1:21:50.049
2635
+ And some targets are open, so you always predict
2636
+ only like first auto regression is K.
2637
+
2638
+ 1:21:50.049 --> 1:21:59.174
2639
+ It puts one, so you always have one input
2640
+ and one output, then you do partial.
2641
+
2642
+ 1:21:59.699 --> 1:22:05.825
2643
+ So in that way you can slowly learn what is
2644
+ a good and what is a bad answer.
2645
+
2646
+ 1:22:08.528 --> 1:22:10.862
2647
+ It doesn't sound very impressive.
2648
+
2649
+ 1:22:10.793 --> 1:22:12.536
2650
+ Don't contact me anyway.
2651
+
2652
+ 1:22:12.466 --> 1:22:15.326
2653
+ Go all over your training data several.
2654
+
2655
+ 1:22:15.875 --> 1:22:20.655
2656
+ You can even switch in between.
2657
+
2658
+ 1:22:20.506 --> 1:22:29.321
2659
+ There is a homework on this thing where you
2660
+ try to start.
2661
+
2662
+ 1:22:31.271 --> 1:22:41.563
2663
+ You have to learn so there's a whole work
2664
+ on that so this is often happening and it doesn't
2665
+
2666
+ 1:22:41.563 --> 1:22:46.598
2667
+ mean it's less efficient but still it helps.
2668
+
2669
+ 1:22:49.389 --> 1:22:57.979
2670
+ For later maybe here are some examples of
2671
+ how much things help.
2672
+
2673
+ 1:22:57.845 --> 1:23:04.961
2674
+ Maybe one point here is that it's really important.
2675
+
2676
+ 1:23:05.365 --> 1:23:13.787
2677
+ Here's the translation performance and speed.
2678
+
2679
+ 1:23:13.604 --> 1:23:24.410
2680
+ One point which is a point is if you compare
2681
+ researchers.
2682
+
2683
+ 1:23:24.784 --> 1:23:33.880
2684
+ So yeah, if you're compared to one very weak
2685
+ baseline transformer even with beam search,
2686
+
2687
+ 1:23:33.880 --> 1:23:40.522
2688
+ then you're ten times slower than a very strong
2689
+ auto regressive.
2690
+
2691
+ 1:23:40.961 --> 1:23:50.047
2692
+ If you make a strong baseline then it's going
2693
+ down to depending on times and here like: You
2694
+
2695
+ 1:23:50.047 --> 1:23:53.504
2696
+ have a lot of different speed ups.
2697
+
2698
+ 1:23:53.405 --> 1:24:03.262
2699
+ Generally, it makes a strong baseline and
2700
+ not very simple transformer.
2701
+
2702
+ 1:24:07.407 --> 1:24:19.020
2703
+ Half Precision: Yeah, with this one last thing
2704
+ that you can do to speed up things and also
2705
+
2706
+ 1:24:19.020 --> 1:24:25.936
2707
+ reduce your memory is what is called half precision.
2708
+
2709
+ 1:24:26.326 --> 1:24:29.139
2710
+ And especially for decoding issues for training.
2711
+
2712
+ 1:24:29.081 --> 1:24:31.150
2713
+ Sometimes it also gets less stable.
2714
+
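A small PyTorch sketch of half-precision inference (the model here is a stand-in, not the lecture's system); for training, mixed precision with loss scaling is the usual, more stable choice.

```python
import torch
import torch.nn as nn

model = nn.Linear(512, 512)          # stand-in for a full translation model
inputs = torch.randn(8, 512)

if torch.cuda.is_available():
    # Convert the weights to fp16 once, cast the inputs on the fly.
    model = model.half().cuda()
    with torch.no_grad():
        output = model(inputs.half().cuda())
else:
    output = model(inputs)

# For training, frameworks typically use mixed precision
# (torch.cuda.amp.autocast + GradScaler) rather than pure fp16,
# because pure half-precision training tends to be less stable.
```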
2715
+ 1:24:32.592 --> 1:24:45.184
2716
+ With this we close nearly wait a bit, so what
2717
+ you should remember is that efficient machine
2718
+
2719
+ 1:24:45.184 --> 1:24:46.963
2720
+ translation.
2721
+
2722
+ 1:24:47.007 --> 1:24:51.939
2723
+ We have, for example, looked at knowledge
2724
+ distillation.
2725
+
2726
+ 1:24:51.851 --> 1:24:55.967
2727
+ We have looked at non auto regressive models.
2728
+
2729
+ 1:24:55.877 --> 1:24:57.671
2730
+ We have different.
2731
+
2732
+ 1:24:58.898 --> 1:25:02.383
2733
+ For today and then only requests.
2734
+
2735
+ 1:25:02.281 --> 1:25:08.432
2736
+ So if you haven't done so, please fill out
2737
+ the evaluation.
2738
+
2739
+ 1:25:08.388 --> 1:25:20.127
2740
+ So now if you have done so think then you
2741
+ should have and with the online people hopefully.
2742
+
2743
+ 1:25:20.320 --> 1:25:29.758
2744
+ Only possibility to tell us what things are
2745
+ good and what not the only one but the most
2746
+
2747
+ 1:25:29.758 --> 1:25:30.937
2748
+ efficient.
2749
+
2750
+ 1:25:31.851 --> 1:25:35.871
2751
+ So think of all the students doing it in this
2752
+ case okay and then thank.
2753
+
demo_data/lectures/Lecture-14-27.06.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59f384b3137c89cb3f00f2020badb6eb5ff6de5043bd9e015adab92072e27e62
3
+ size 113488295
demo_data/lectures/Lecture-15-11.07.2023/English.vtt ADDED
@@ -0,0 +1,2295 @@
1
+ WEBVTT
2
+
3
+ 0:00:00.060 --> 0:00:07.061
4
+ Introduction: OK good, so today's lecture is
5
+ on unsupervised machine translation, so what
6
+
7
+ 0:00:07.061 --> 0:00:13.512
8
+ you have seen so far is different techniques
9
+ around supervised NMT, so you are.
10
+
11
+ 0:00:13.593 --> 0:00:18.552
12
+ Data right so let's say in English coppers
13
+ you are one file and then in German you have
14
+
15
+ 0:00:18.552 --> 0:00:23.454
16
+ another file which is sentence to sentence
17
+ la and then you try to build systems around
18
+
19
+ 0:00:23.454 --> 0:00:23.679
20
+ it.
21
+
22
+ 0:00:24.324 --> 0:00:30.130
23
+ But what's different about this lecture is
24
+ that you assume that you have no final data
25
+
26
+ 0:00:30.130 --> 0:00:30.663
27
+ at all.
28
+
29
+ 0:00:30.597 --> 0:00:37.120
30
+ You only have monolingual data and the question
31
+ is how can we build systems to translate between
32
+
33
+ 0:00:37.120 --> 0:00:39.406
34
+ these two languages right and so.
35
+
36
+ 0:00:39.359 --> 0:00:44.658
37
+ This is a bit more realistic scenario because
38
+ you have so many languages in the world.
39
+
40
+ 0:00:44.597 --> 0:00:50.288
41
+ You cannot expect to have parallel data between
42
+ all the two languages and so, but in typical
43
+
44
+ 0:00:50.288 --> 0:00:55.612
45
+ cases you have newspapers and so on, which
46
+ is like monolingual files, and the question
47
+
48
+ 0:00:55.612 --> 0:00:57.999
49
+ is can we build something around them?
50
+
51
+ 0:00:59.980 --> 0:01:01.651
52
+ They like said for today.
53
+
54
+ 0:01:01.586 --> 0:01:05.844
55
+ First we'll start up with the interactions,
56
+ so why do we need it?
57
+
58
+ 0:01:05.780 --> 0:01:11.549
59
+ and also some infusion on how these models
60
+ work before going into the technical details.
61
+
62
+ 0:01:11.484 --> 0:01:17.303
63
+ I want to also go through an example, which
64
+ kind of gives you more understanding on how
65
+
66
+ 0:01:17.303 --> 0:01:19.264
67
+ people came up with these models.
68
+
69
+ 0:01:20.820 --> 0:01:23.905
70
+ Then the rest of the lecture is going to be
71
+ two parts.
72
+
73
+ 0:01:23.849 --> 0:01:26.045
74
+ One is we're going to translate words.
75
+
76
+ 0:01:25.989 --> 0:01:29.949
77
+ We're not going to care about how can we translate
78
+ the full sentence.
79
+
80
+ 0:01:29.893 --> 0:01:35.136
81
+ But given to monolingual files, how can we
82
+ get a dictionary basically, which is much easier
83
+
84
+ 0:01:35.136 --> 0:01:37.814
85
+ than generating something in a sentence level?
86
+
87
+ 0:01:38.698 --> 0:01:43.533
88
+ Then we're going to go into the harder case,
89
+ which is unsupervised sentence-level translation.
90
+
91
+ 0:01:44.204 --> 0:01:50.201
92
+ And here what you'll see is what are the training
93
+ objectives which are quite different than the
94
+
95
+ 0:01:50.201 --> 0:01:55.699
96
+ word translation and also where it doesn't
97
+ but because this is also quite important and
98
+
99
+ 0:01:55.699 --> 0:02:01.384
100
+ it's one of the reasons why unsupervised MT is
101
+ not used anymore, because the limitations kind
102
+
103
+ 0:02:01.384 --> 0:02:03.946
104
+ of go away from the realistic use cases.
105
+
106
+ 0:02:04.504 --> 0:02:06.922
107
+ And then that leads to the multilingual
108
+ models.
109
+
110
+ 0:02:06.873 --> 0:02:07.131
111
+ So.
112
+
113
+ 0:02:07.807 --> 0:02:12.915
114
+ People are trying to do to build systems for
115
+ languages that will not have any parallel data.
116
+
117
+ 0:02:12.860 --> 0:02:17.689
118
+ Is use multilingual models and combine with
119
+ these training objectives to get better at
120
+
121
+ 0:02:17.689 --> 0:02:17.911
122
+ it.
123
+
124
+ 0:02:17.856 --> 0:02:18.152
125
+ So.
126
+
127
+ 0:02:18.658 --> 0:02:24.396
128
+ People are not trying to build bilingual systems
129
+ currently for unsupervised machine translation,
130
+
131
+ 0:02:24.396 --> 0:02:30.011
132
+ but I think it's good to know how they came
133
+ to hear this point and what they're doing now.
134
+
135
+ 0:02:30.090 --> 0:02:34.687
136
+ You also see some patterns overlapping which
137
+ people are using.
138
+
139
+ 0:02:36.916 --> 0:02:41.642
140
+ So as you said before, and you probably hear
141
+ it multiple times now is that we have seven
142
+
143
+ 0:02:41.642 --> 0:02:43.076
144
+ thousand languages around.
145
+
146
+ 0:02:43.903 --> 0:02:49.460
147
+ Can be different dialects in someone, so it's
148
+ quite hard to distinguish what's the language,
149
+
150
+ 0:02:49.460 --> 0:02:54.957
151
+ but you can typically approximate that seven
152
+ thousand and that leads to twenty five million
153
+
154
+ 0:02:54.957 --> 0:02:59.318
155
+ pairs, which is the obvious reason why we do
156
+ not have any parallel data.
157
+
158
+ 0:03:00.560 --> 0:03:06.386
159
+ So you want to build an empty system for all
160
+ possible language pests and the question is
161
+
162
+ 0:03:06.386 --> 0:03:07.172
163
+ how can we?
164
+
165
+ 0:03:08.648 --> 0:03:13.325
166
+ The typical use case, but there are actually
167
+ quite few interesting use cases than what you
168
+
169
+ 0:03:13.325 --> 0:03:14.045
170
+ would expect.
171
+
172
+ 0:03:14.614 --> 0:03:20.508
173
+ One is the animal languages, which is the
174
+ real thing that's happening right now with.
175
+
176
+ 0:03:20.780 --> 0:03:26.250
177
+ The dog but with dolphins and so on, but I
178
+ couldn't find a picture that could show this,
179
+
180
+ 0:03:26.250 --> 0:03:31.659
181
+ but if you are interested in stuff like this
182
+ you can check out the website where people
183
+
184
+ 0:03:31.659 --> 0:03:34.916
185
+ are actually trying to understand how animals
186
+ speak.
187
+
188
+ 0:03:35.135 --> 0:03:37.356
189
+ It's Also a Bit More About.
190
+
191
+ 0:03:37.297 --> 0:03:44.124
192
+ Knowing what the animals want to say but may
193
+ not die dead but still people are trying to
194
+
195
+ 0:03:44.124 --> 0:03:44.661
196
+ do it.
197
+
198
+ 0:03:45.825 --> 0:03:50.689
199
+ More realistic thing that's happening is the
200
+ translation of programming languages.
201
+
202
+ 0:03:51.371 --> 0:03:56.963
203
+ And so this is quite a quite good scenario
204
+ for unsupervised NMT, is that you have
205
+
206
+ 0:03:56.963 --> 0:04:02.556
207
+ a lot of code available online right in C++
208
+ and in Python and the question is how can
209
+
210
+ 0:04:02.556 --> 0:04:08.402
211
+ we translate by just looking at the code alone
212
+ and no parallel functions and so on and this
213
+
214
+ 0:04:08.402 --> 0:04:10.754
215
+ is actually quite good right now so.
216
+
217
+ 0:04:12.032 --> 0:04:16.111
218
+ See how these techniques were applied to do
219
+ the programming translation.
220
+
221
+ 0:04:18.258 --> 0:04:23.882
222
+ And then you can also think of language as
223
+ something that is quite common so you can take
224
+
225
+ 0:04:23.882 --> 0:04:24.194
226
+ off.
227
+
228
+ 0:04:24.132 --> 0:04:29.594
229
+ Think of formal sentences in English as one
230
+ language and informal sentences in English
231
+
232
+ 0:04:29.594 --> 0:04:35.433
233
+ as another language and then learn the kind
234
+ to stay between them and then it kind of becomes
235
+
236
+ 0:04:35.433 --> 0:04:37.380
237
+ a style plan for a problem so.
238
+
239
+ 0:04:38.358 --> 0:04:43.042
240
+ Although it's translation, you can consider
241
+ different characteristics of a language and
242
+
243
+ 0:04:43.042 --> 0:04:46.875
244
+ then separate them as two different languages
245
+ and then try to map them.
246
+
247
+ 0:04:46.822 --> 0:04:52.022
248
+ So it's not only about languages, but you
249
+ can also do quite cool things by using unsupervised
250
+
251
+ 0:04:52.022 --> 0:04:54.327
252
+ techniques, which are quite possible also.
253
+
254
+ 0:04:56.256 --> 0:04:56.990
255
+ I am so.
256
+
257
+ 0:04:56.909 --> 0:05:04.292
258
+ This is kind of TV modeling for many of the
259
+ use cases that we have for ours, ours and MD.
260
+
261
+ 0:05:04.210 --> 0:05:11.835
262
+ But before we go into the modeling of these
263
+ systems, what I want you to do is look at these
264
+
265
+ 0:05:11.835 --> 0:05:12.415
266
+ dummy.
267
+
268
+ 0:05:13.813 --> 0:05:19.720
269
+ We have text and language one, text and language
270
+ two right, and nobody knows what these languages
271
+
272
+ 0:05:19.720 --> 0:05:20.082
273
+ mean.
274
+
275
+ 0:05:20.021 --> 0:05:23.719
276
+ They completely are made up right, and the
277
+ question is also.
278
+
279
+ 0:05:23.658 --> 0:05:29.344
280
+ They're not parallel lines, so the first line
281
+ here and the first line is not a line, they're
282
+
283
+ 0:05:29.344 --> 0:05:30.811
284
+ just monolingual files.
285
+
286
+ 0:05:32.052 --> 0:05:38.281
287
+ And now think about how can you translate
288
+ the word M1 from language one to language two,
289
+
290
+ 0:05:38.281 --> 0:05:41.851
291
+ and this kind of you see how we try to model
292
+ this.
293
+
294
+ 0:05:42.983 --> 0:05:47.966
295
+ Would take your time and then think of how
296
+ can you translate more into language two?
297
+
298
+ 0:06:41.321 --> 0:06:45.589
299
+ About the model, if you ask somebody who doesn't
300
+ know anything about machine translation right,
301
+
302
+ 0:06:45.589 --> 0:06:47.411
303
+ and then you ask them to translate more.
304
+
305
+ 0:07:01.201 --> 0:07:10.027
306
+ But it's also not quite easy if you think
307
+ of the way that I made this example is relatively
308
+
309
+ 0:07:10.027 --> 0:07:10.986
310
+ easy, so.
311
+
312
+ 0:07:11.431 --> 0:07:17.423
313
+ Basically, the first two sentences are these
314
+ two: A, B, C is E, and G cured up the U, V
315
+
316
+ 0:07:17.423 --> 0:07:21.849
317
+ is L, A, A, C, S, and S, on and this is used
318
+ towards the German.
319
+
320
+ 0:07:22.662 --> 0:07:25.241
321
+ And then when you join these two words, it's.
322
+
323
+ 0:07:25.205 --> 0:07:32.445
324
+ English German the third line and the last
325
+ line, and then the fourth line is the first
326
+
327
+ 0:07:32.445 --> 0:07:38.521
328
+ line, so German language, English, and then
329
+ speak English, speak German.
330
+
331
+ 0:07:38.578 --> 0:07:44.393
332
+ So this is how I made up the example
333
+ and what the intuition here is that you assume
334
+
335
+ 0:07:44.393 --> 0:07:50.535
336
+ that the languages have a fundamental structure
337
+ right and it's the same across all languages.
338
+
339
+ 0:07:51.211 --> 0:07:57.727
340
+ Doesn't matter what language you are thinking
341
+ of words kind of you have in the same way join
342
+
343
+ 0:07:57.727 --> 0:07:59.829
344
+ together is the same way and.
345
+
346
+ 0:07:59.779 --> 0:08:06.065
347
+ And plasma sign thinks the same way but this
348
+ is not a realistic assumption for sure but
349
+
350
+ 0:08:06.065 --> 0:08:12.636
351
+ it's actually a decent one to make and if you
352
+ can think of this like if you can assume this
353
+
354
+ 0:08:12.636 --> 0:08:16.207
355
+ then we can model systems in an unsupervised
356
+ way.
357
+
358
+ 0:08:16.396 --> 0:08:22.743
359
+ So this is the intuition that I want to give,
360
+ and you can see that whenever assumptions fail,
361
+
362
+ 0:08:22.743 --> 0:08:23.958
363
+ the systems fail.
364
+
365
+ 0:08:23.891 --> 0:08:29.824
366
+ So in practice whenever we go far away from
367
+ these assumptions, the systems try to more
368
+
369
+ 0:08:29.824 --> 0:08:30.778
370
+ time to fail.
371
+
372
+ 0:08:33.753 --> 0:08:39.711
373
+ So the example that I gave was actually perfect
374
+ mapping right, so it never really sticks bad.
375
+
376
+ 0:08:39.648 --> 0:08:45.321
377
+ They have the same number of words, same sentence
378
+ structure, perfect mapping, and so on.
379
+
380
+ 0:08:45.257 --> 0:08:50.995
381
+ This doesn't happen, but let's assume that
382
+ this happens and try to see how we can moral.
383
+
384
+ 0:08:53.493 --> 0:08:59.017
385
+ Unsupervised word translation: Okay, now let's
386
+ go a bit more formal, so what you want to do
387
+
388
+ 0:08:59.017 --> 0:09:01.042
389
+ is unsupervised word translation.
390
+
391
+ 0:09:01.901 --> 0:09:08.773
392
+ Here the task is that we have input data as
393
+ monolingual data, so a bunch of sentences in
394
+
395
+ 0:09:08.773 --> 0:09:15.876
396
+ one file and a bunch of sentences another file
397
+ in two different languages, and the question
398
+
399
+ 0:09:15.876 --> 0:09:18.655
400
+ is how can we get a bilingual dictionary?
401
+
402
+ 0:09:19.559 --> 0:09:25.134
403
+ So if you look at the picture you see that
404
+ it's just kind of projected down into two dimension
405
+
406
+ 0:09:25.134 --> 0:09:30.358
407
+ planes, but it's basically when you map them
408
+ into a plot you see that the words that are
409
+
410
+ 0:09:30.358 --> 0:09:35.874
411
+ parallel are closer together, and the question
412
+ is how can we do it just looking at two files?
413
+
414
+ 0:09:36.816 --> 0:09:42.502
415
+ And you can say that what we want to basically
416
+ do is create a dictionary in the end given
417
+
418
+ 0:09:42.502 --> 0:09:43.260
419
+ two files.
420
+
421
+ 0:09:43.197 --> 0:09:45.410
422
+ So this is the task that we want.
423
+
424
+ 0:09:46.606 --> 0:09:52.262
425
+ And the first step on how we do this is to
426
+ learn word vectors, and this can be whatever
427
+
428
+ 0:09:52.262 --> 0:09:56.257
429
+ techniques that you have seen before, word2vec,
430
+ GloVe, or so on.
431
+
432
+ 0:09:56.856 --> 0:10:00.699
433
+ So you take a monolingual data and try to
434
+ learn word embeddings.
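To make this first step concrete, here is a minimal sketch of training monolingual embeddings with gensim's word2vec; the file names are hypothetical placeholders for the two monolingual corpora (one tokenized sentence per line), not files from this repository.

    from gensim.models import Word2Vec

    # Minimal sketch: train independent monolingual embeddings per language.
    def train_embeddings(path, dim=300):
        sentences = [line.split() for line in open(path, encoding="utf-8")]
        model = Word2Vec(sentences, vector_size=dim, window=5, min_count=5, sg=1)
        return model.wv  # one vector per word

    src_vectors = train_embeddings("mono.en.txt")  # hypothetical English corpus
    tgt_vectors = train_embeddings("mono.de.txt")  # hypothetical German corpus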
435
+
436
+ 0:10:02.002 --> 0:10:07.675
437
+ Then you plot them into a graph, and then
438
+ typically what you would see is that they're
439
+
440
+ 0:10:07.675 --> 0:10:08.979
441
+ not aligned at all.
442
+
443
+ 0:10:08.914 --> 0:10:14.693
444
+ One word space is somewhere, and one word
445
+ space is somewhere else, and this is what you
446
+
447
+ 0:10:14.693 --> 0:10:18.043
448
+ would typically expect to see in the in the
449
+ image.
450
+
451
+ 0:10:19.659 --> 0:10:23.525
452
+ Now our assumption was that both lines we
453
+ just have the same.
454
+
455
+ 0:10:23.563 --> 0:10:28.520
456
+ Culture and so that we can use this information
457
+ to learn the mapping between these two spaces.
458
+
459
+ 0:10:30.130 --> 0:10:37.085
460
+ So before how we do it, I think this is quite
461
+ famous already, and everybody knows it a bit
462
+
463
+ 0:10:37.085 --> 0:10:41.824
464
+ more is that word embeddings capture semantic
465
+ relations right.
466
+
467
+ 0:10:41.747 --> 0:10:48.245
468
+ So the distance between man and woman is approximately
469
+ the same as king and prince.
470
+
471
+ 0:10:48.888 --> 0:10:54.620
472
+ It also holds for verb tenses, country-capital
473
+ and so on, so there are some relationships
474
+
475
+ 0:10:54.620 --> 0:11:00.286
476
+ happening in the word embedding space, which
477
+ is quite clear for at least one language.
478
+
479
+ 0:11:03.143 --> 0:11:08.082
480
+ Now if you think of this, let's say of the
481
+ English word embeddings.
482
+
483
+ 0:11:08.006 --> 0:11:14.746
484
+ Let's say the German word embeddings, and the way
485
+ king, queen, man, woman are organized is the same
486
+
487
+ 0:11:14.746 --> 0:11:17.734
488
+ as the German translation of his word.
489
+
490
+ 0:11:17.998 --> 0:11:23.336
491
+ This is the main idea is that although they
492
+ are somewhere else, the relationship is the
493
+
494
+ 0:11:23.336 --> 0:11:28.008
495
+ same between the both languages and we can
496
+ use this to to learn the mapping.
497
+
498
+ 0:11:31.811 --> 0:11:35.716
499
+ It's not only for these four words; it
500
+ happens for all the words in the language,
501
+
502
+ 0:11:35.716 --> 0:11:37.783
503
+ and so we can use this to learn the mapping.
504
+
505
+ 0:11:39.179 --> 0:11:43.828
506
+ This is the main idea is that both embeddings
507
+ have a similar shape.
508
+
509
+ 0:11:43.759 --> 0:11:48.431
510
+ It's only that they're just not aligned and
511
+ so you go to the here.
512
+
513
+ 0:11:48.362 --> 0:11:50.821
514
+ They kind of have a similar shape.
515
+
516
+ 0:11:50.751 --> 0:11:57.211
517
+ They're just in some different spaces and
518
+ what you need to do is to map them into a common
519
+
520
+ 0:11:57.211 --> 0:11:57.708
521
+ space.
522
+
523
+ 0:12:06.086 --> 0:12:12.393
524
+ We want a W such that if we multiply W with X,
525
+ they both become aligned.
526
+
527
+ 0:12:35.335 --> 0:12:41.097
528
+ That's true, but there are also many works
529
+ that have the relationship right, and we hope
530
+
531
+ 0:12:41.097 --> 0:12:43.817
532
+ that this is enough to learn the mapping.
533
+
534
+ 0:12:43.752 --> 0:12:49.823
535
+ So there's always going to be a bit of noise,
536
+ as in how when we align them they're not going
537
+
538
+ 0:12:49.823 --> 0:12:51.716
539
+ to be exactly the same, but.
540
+
541
+ 0:12:51.671 --> 0:12:57.293
542
+ What you can expect is that there are these
543
+ main works that allow us to learn the mapping,
544
+
545
+ 0:12:57.293 --> 0:13:02.791
546
+ so it's not going to be perfect, but it's an
547
+ approximation that we make to to see how it
548
+
549
+ 0:13:02.791 --> 0:13:04.521
550
+ works and then practice it.
551
+
552
+ 0:13:04.459 --> 0:13:10.078
553
+ Also, it's not that the fact that women do
554
+ not have any relationship does not affect that
555
+
556
+ 0:13:10.078 --> 0:13:10.452
557
+ much.
558
+
559
+ 0:13:10.550 --> 0:13:15.429
560
+ A lot of words usually have, so it kind of
561
+ works out in practice.
562
+
563
+ 0:13:22.242 --> 0:13:34.248
564
+ I have not heard about it, but if you want
565
+ to say something about it, I would be interested,
566
+
567
+ 0:13:34.248 --> 0:13:37.346
568
+ but we can do it later.
569
+
570
+ 0:13:41.281 --> 0:13:44.133
571
+ Usual case: This is supervised.
572
+
573
+ 0:13:45.205 --> 0:13:49.484
574
+ First way is to do a supervised word translation
575
+ where we have a dictionary right and that we
576
+
577
+ 0:13:49.484 --> 0:13:53.764
578
+ can use that to learn the mapping, but in our
579
+ case we assume that we have nothing right so
580
+
581
+ 0:13:53.764 --> 0:13:55.222
582
+ we only have monolingual data.
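For contrast, the supervised variant mentioned here has a closed-form solution: with a seed dictionary you can solve the orthogonal Procrustes problem for W. A minimal numpy sketch under that assumption (X and Y are row-aligned embedding matrices for the dictionary pairs; this is the supervised baseline, not the unsupervised method discussed next):

    import numpy as np

    def procrustes_mapping(X, Y):
        # Orthogonal Procrustes: find orthogonal W minimizing ||X W - Y||_F,
        # where row i of X and row i of Y embed a dictionary word pair.
        U, _, Vt = np.linalg.svd(X.T @ Y)
        return U @ Vt  # d x d orthogonal mapping; apply it as X @ W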
583
+
584
+ 0:13:56.136 --> 0:14:03.126
585
+ Then we need unsupervised learning to figure
586
+ out W, and we're going to use GANs to find
587
+
588
+ 0:14:03.126 --> 0:14:06.122
589
+ W, and it's quite a nice way to do it.
590
+
591
+ 0:14:08.248 --> 0:14:15.393
592
+ So just before I go into how we use it for our use
593
+ case, I'm going to go briefly over GANs right,
594
+
595
+ 0:14:15.393 --> 0:14:19.940
596
+ so we have two components: generator and discriminator.
597
+
598
+ 0:14:21.441 --> 0:14:27.052
599
+ The generator tries to generate something obviously,
600
+ and the discriminator tries to see if it's
601
+
602
+ 0:14:27.052 --> 0:14:30.752
603
+ real data or something that is generated by
604
+ the generation.
605
+
606
+ 0:14:31.371 --> 0:14:37.038
607
+ And there's like this two player game where
608
+ the generator tries to fool and the discriminator tries
609
+
610
+ 0:14:37.038 --> 0:14:41.862
611
+ not to get fooled, and they try to build these
612
+ two components and try to learn W.
613
+
614
+ 0:14:43.483 --> 0:14:53.163
615
+ Okay, so let's say we have two languages,
616
+ X and Y right, so the X language has N words
617
+
618
+ 0:14:53.163 --> 0:14:56.167
619
+ with numbering dimensions.
620
+
621
+ 0:14:56.496 --> 0:14:59.498
622
+ So what I'm reading is matrix is peak or something.
623
+
624
+ 0:14:59.440 --> 0:15:02.174
625
+ Then we have target language Y with M words.
626
+
627
+ 0:15:02.116 --> 0:15:06.945
628
+ I'm also the same amount of things I mentioned
629
+ and then we have a matrix peak or.
630
+
631
+ 0:15:07.927 --> 0:15:13.784
632
+ Basically what you're going to do is use word2vec
633
+ and learn our word embeddings.
634
+
635
+ 0:15:14.995 --> 0:15:23.134
636
+ Now we have these X embeddings, Y embeddings, and
637
+ what you want to learn is W, such that W X and
638
+
639
+ 0:15:23.134 --> 0:15:24.336
640
+ Y are aligned.
641
+
642
+ 0:15:29.209 --> 0:15:35.489
643
+ With GANs you have two steps, one is a discriminative
644
+ step and one is the the mapping step and the
645
+
646
+ 0:15:35.489 --> 0:15:41.135
647
+ discriminative step is to see if the embeddings
648
+ are from the source or mapped embedding.
649
+
650
+ 0:15:41.072 --> 0:15:44.689
651
+ So it's going to be much scary when I go to
652
+ the figure.
653
+
654
+ 0:15:46.306 --> 0:15:50.041
655
+ So we have a monolingual documents with two
656
+ different languages.
657
+
658
+ 0:15:49.983 --> 0:15:54.498
659
+ From here we get our source language embeddings,
660
+ target language embeddings right.
661
+
662
+ 0:15:54.440 --> 0:15:58.905
663
+ Then we randomly initialize the transformation
664
+ matrix W.
665
+
666
+ 0:15:58.905 --> 0:16:05.603
667
+ Then we have the discriminator which tries
668
+ to see if it's WX or Y, so it needs to know
669
+
670
+ 0:16:05.603 --> 0:16:13.379
671
+ that this is a mapped one and this is the original
672
+ language, and so if you look at the loss function
673
+
674
+ 0:16:13.379 --> 0:16:20.076
675
+ here, it's basically that source is one given
676
+ WX, so this is from the source language.
677
+
678
+ 0:16:23.543 --> 0:16:27.339
679
+ Which means it's the target language em yeah.
680
+
681
+ 0:16:27.257 --> 0:16:34.437
682
+ It's just like my figure is not that great,
683
+ but you can assume that they are totally.
684
+
685
+ 0:16:40.260 --> 0:16:43.027
686
+ So this is the kind of the loss function.
687
+
688
+ 0:16:42.961 --> 0:16:46.338
689
+ We have N source words, M target words, and
690
+ so on.
691
+
692
+ 0:16:46.272 --> 0:16:52.341
693
+ So that's why you have one by M, one by M,
694
+ and the discriminator is to just see if they're
695
+
696
+ 0:16:52.341 --> 0:16:55.742
697
+ mapped or they're from the original target
698
+ number.
699
+
700
+ 0:16:57.317 --> 0:17:04.024
701
+ And then we have the mapping step where we
702
+ train W to fool the the discriminators.
703
+
704
+ 0:17:04.564 --> 0:17:10.243
705
+ So here it's the same way, but what you're
706
+ going to just do is inverse the loss function.
707
+
708
+ 0:17:10.180 --> 0:17:15.829
709
+ So now we freeze the discriminators, so it's
710
+ important to note that in the previous sect
711
+
712
+ 0:17:15.829 --> 0:17:20.844
713
+ we freezed the transformation matrix, and here
714
+ we freezed your discriminators.
715
+
716
+ 0:17:22.482 --> 0:17:30.228
717
+ And now it's to fool the discriminated rights,
718
+ so it should predict that the source is zero
719
+
720
+ 0:17:30.228 --> 0:17:37.889
721
+ given the map numbering, and the source is
722
+ one given the target numbering, which is wrong,
723
+
724
+ 0:17:37.889 --> 0:17:40.920
725
+ which is why we're training the W.
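A minimal PyTorch sketch of these two alternating steps; the architecture and hyper-parameters here are illustrative assumptions rather than the original recipe, and x_batch / y_batch stand for batches of the pre-trained source and target word embeddings:

    import torch
    import torch.nn as nn

    d = 300
    W = nn.Linear(d, d, bias=False)                                    # mapping to learn
    D = nn.Sequential(nn.Linear(d, 512), nn.LeakyReLU(), nn.Linear(512, 1))
    opt_w = torch.optim.SGD(W.parameters(), lr=0.1)
    opt_d = torch.optim.SGD(D.parameters(), lr=0.1)
    bce = nn.BCEWithLogitsLoss()

    def discriminator_step(x_batch, y_batch):
        # D is trained to say "mapped" (1) for W x and "real target" (0) for y
        logits = torch.cat([D(W(x_batch).detach()), D(y_batch)])
        labels = torch.cat([torch.ones(len(x_batch), 1), torch.zeros(len(y_batch), 1)])
        loss = bce(logits, labels)
        opt_d.zero_grad(); loss.backward(); opt_d.step()

    def mapping_step(x_batch, y_batch):
        # W is trained to fool the discriminator: same loss with inverted labels,
        # and only W's parameters are updated
        logits = torch.cat([D(W(x_batch)), D(y_batch)])
        labels = torch.cat([torch.zeros(len(x_batch), 1), torch.ones(len(y_batch), 1)])
        loss = bce(logits, labels)
        opt_w.zero_grad(); loss.backward(); opt_w.step()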
726
+
727
+ 0:17:40.920 --> 0:17:46.308
728
+ Any questions on this okay so then how do
729
+ we know when to stop?
730
+
731
+ 0:17:46.224 --> 0:17:55.845
732
+ We just train until we reach convergence right
733
+ and then we have our W hopefully train and
734
+
735
+ 0:17:55.845 --> 0:17:59.265
736
+ map them into an aligned space.
737
+
738
+ 0:18:02.222 --> 0:18:07.097
739
+ The question is how can we evaluate this mapping?
740
+
741
+ 0:18:07.000 --> 0:18:13.902
742
+ Does anybody know what we can use to mapping
743
+ or evaluate the mapping?
744
+
745
+ 0:18:13.803 --> 0:18:15.879
746
+ How good is a word?
747
+
748
+ 0:18:28.969 --> 0:18:33.538
749
+ We use as I said we use a dictionary, at least
750
+ in the end.
751
+
752
+ 0:18:33.461 --> 0:18:40.179
753
+ We need a dictionary to evaluate, so this
754
+ is our only final, so we aren't using it at
755
+
756
+ 0:18:40.179 --> 0:18:42.600
757
+ all in attaining data and the.
758
+
759
+ 0:18:43.223 --> 0:18:49.681
760
+ One way is to check what's the precision for
761
+ our dictionary, just that.
762
+
763
+ 0:18:50.650 --> 0:18:52.813
764
+ The first nearest neighbor and see if it's
765
+ there on.
766
+
767
+ 0:18:53.573 --> 0:18:56.855
768
+ But this is quite strict because there's a
769
+ lot of noise in the embedding space right.
770
+
771
+ 0:18:57.657 --> 0:19:03.114
772
+ Not always your first neighbor is going to
773
+ be the translation, so what people also report
774
+
775
+ 0:19:03.114 --> 0:19:05.055
776
+ is precision at five and so on.
777
+
778
+ 0:19:04.994 --> 0:19:10.175
779
+ So you take the five nearest neighbors and see
780
+ if the translation is in there and so on.
781
+
782
+ 0:19:10.114 --> 0:19:15.529
783
+ So the more you increase, the more likely
784
+ that there is a translation because where I'm
785
+
786
+ 0:19:15.529 --> 0:19:16.698
787
+ being quite noisy.
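As a concrete sketch, precision@k for an induced mapping W could be computed roughly like this (numpy; gold_pairs is the held-out evaluation dictionary as (source index, target index) pairs; names are illustrative):

    import numpy as np

    def precision_at_k(W, X_src, Y_tgt, gold_pairs, k=5):
        # Map the source embeddings, normalise, and check whether the gold target
        # word is among the k nearest target neighbours by cosine similarity.
        mapped = X_src @ W
        mapped = mapped / np.linalg.norm(mapped, axis=1, keepdims=True)
        tgt = Y_tgt / np.linalg.norm(Y_tgt, axis=1, keepdims=True)
        hits = 0
        for i, j in gold_pairs:
            sims = tgt @ mapped[i]
            topk = np.argpartition(-sims, k)[:k]
            hits += int(j in topk)
        return hits / len(gold_pairs)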
788
+
789
+ 0:19:19.239 --> 0:19:25.924
790
+ What's interesting is that people have used
791
+ dictionary to to learn word translation, but
792
+
793
+ 0:19:25.924 --> 0:19:32.985
794
+ the way of doing this is much better than using
795
+ a dictionary, so somehow our assumption helps
796
+
797
+ 0:19:32.985 --> 0:19:36.591
798
+ us to to build better than a supervised system.
799
+
800
+ 0:19:39.099 --> 0:19:42.985
801
+ So as you see on the top you have precision
802
+ at one five ten.
803
+
804
+ 0:19:42.922 --> 0:19:47.310
805
+ These are the typical numbers that you report
806
+ for word translation.
807
+
808
+ 0:19:48.868 --> 0:19:55.996
809
+ But GANs are usually quite tricky to train,
810
+ and it does not converge on all language pairs,
811
+
812
+ 0:19:55.996 --> 0:20:02.820
813
+ and this kind of goes back to a assumption
814
+ that they kind of behave in the same structure
815
+
816
+ 0:20:02.820 --> 0:20:03.351
817
+ right.
818
+
819
+ 0:20:03.275 --> 0:20:07.144
820
+ But if you take a language like English and
821
+ some.
822
+
823
+ 0:20:07.087 --> 0:20:12.203
824
+ Other languages are almost very lotus, so
825
+ it's quite different from English and so on.
826
+
827
+ 0:20:12.144 --> 0:20:13.623
828
+ Then I've one language,.
829
+
830
+ 0:20:13.564 --> 0:20:18.754
831
+ So whenever whenever our assumption fails,
832
+ these unsupervised techniques always do not
833
+
834
+ 0:20:18.754 --> 0:20:21.200
835
+ converge or just give really bad scores.
836
+
837
+ 0:20:22.162 --> 0:20:27.083
838
+ And so the fact is that the monolingual embeddings
839
+ for distant languages are too far.
840
+
841
+ 0:20:27.024 --> 0:20:30.950
842
+ They do not share the same structure, and
843
+ so they do not converge.
844
+
845
+ 0:20:32.452 --> 0:20:39.380
846
+ And so I just want to mention that there is
847
+ a better retrieval technique than the nearest
848
+
849
+ 0:20:39.380 --> 0:20:41.458
850
+ neighbor, which is called.
851
+
852
+ 0:20:42.882 --> 0:20:46.975
853
+ But it's more advanced than mathematical,
854
+ so I didn't want to go in it now.
855
+
856
+ 0:20:46.921 --> 0:20:51.811
857
+ But if your interest is in some quite good
858
+ retrieval segments, you can just look at these
859
+
860
+ 0:20:51.811 --> 0:20:53.007
861
+ if you're interested.
862
+
863
+ 0:20:55.615 --> 0:20:59.216
864
+ Cure for word translation: Okay, so this is
865
+ about the word translation.
866
+
867
+ 0:20:59.167 --> 0:21:02.278
868
+ Does anybody have any questions of cure?
869
+
870
+ 0:21:06.246 --> 0:21:07.501
871
+ Was the worst answer?
872
+
873
+ 0:21:07.444 --> 0:21:12.545
874
+ It was a bit easier than a sentence right,
875
+ so you just assume that there's a mapping and
876
+
877
+ 0:21:12.545 --> 0:21:14.551
878
+ then you try to learn the mapping.
879
+
880
+ 0:21:14.493 --> 0:21:19.641
881
+ But now it's a bit more difficult because
882
+ you need to generate stuff also, which is quite
883
+
884
+ 0:21:19.641 --> 0:21:20.798
885
+ much more trickier.
886
+
887
+ 0:21:22.622 --> 0:21:28.512
888
+ Task here is that we have our input as monolingual
889
+ data for both languages as before, but
890
+
891
+ 0:21:28.512 --> 0:21:34.017
892
+ now what we want to do is instead of translating
893
+ word by word we want to do sentence.
894
+
895
+ 0:21:37.377 --> 0:21:44.002
896
+ We have word2vec now and so on to learn
897
+ word embeddings, but sentence embeddings are
898
+
899
+ 0:21:44.002 --> 0:21:50.627
900
+ actually not the site powered often, at least
901
+ when people tried to work on unsupervised
902
+
903
+ 0:21:50.627 --> 0:21:51.445
904
+ NMT before.
905
+
906
+ 0:21:52.632 --> 0:21:54.008
907
+ Now they're a bit okay.
908
+
909
+ 0:21:53.951 --> 0:21:59.028
910
+ I mean, as you've seen in the practice on
911
+ where we used places, they were quite decent.
912
+
913
+ 0:21:58.971 --> 0:22:03.007
914
+ But then it's also the case on which data
915
+ it's trained on and so on.
916
+
917
+ 0:22:02.949 --> 0:22:03.261
918
+ So.
919
+
920
+ 0:22:04.164 --> 0:22:09.666
921
+ Sentence embeddings are definitely much
922
+ harder to get than word embeddings, so this
923
+
924
+ 0:22:09.666 --> 0:22:13.776
925
+ is a bit more complicated than the task that
926
+ you've seen before.
927
+
928
+ 0:22:16.476 --> 0:22:16.994
929
+ How U.
930
+
931
+ 0:22:16.994 --> 0:22:17.216
932
+ N.
933
+
934
+ 0:22:17.216 --> 0:22:17.438
935
+ T.
936
+
937
+ 0:22:17.438 --> 0:22:19.659
938
+ WorksBefore we go into how U.
939
+
940
+ 0:22:19.659 --> 0:22:19.881
941
+ N.
942
+
943
+ 0:22:19.881 --> 0:22:20.103
944
+ M.
945
+
946
+ 0:22:20.103 --> 0:22:20.325
947
+ T.
948
+
949
+ 0:22:20.325 --> 0:22:24.470
950
+ Works, so this is your typical supervised
951
+ system right.
952
+
953
+ 0:22:24.396 --> 0:22:29.537
954
+ So we have parallel data source sentence target
955
+ sentences.
956
+
957
+ 0:22:29.447 --> 0:22:31.166
958
+ We have a source encoder.
959
+
960
+ 0:22:31.471 --> 0:22:36.709
961
+ We have a target decoder and then we try to
962
+ minimize the cross entropy loss on this parallel
963
+
964
+ 0:22:36.709 --> 0:22:37.054
965
+ data.
966
+
967
+ 0:22:37.157 --> 0:22:39.818
968
+ And this is how we train our typical system.
969
+
970
+ 0:22:43.583 --> 0:22:49.506
971
+ But now we do not have any parallel data,
972
+ and so the intuition here is that if we can
973
+
974
+ 0:22:49.506 --> 0:22:55.429
975
+ learn language independent representations
976
+ at the encoder outputs, then we can pass
977
+
978
+ 0:22:55.429 --> 0:22:58.046
979
+ it along to the decoder that we want.
980
+
981
+ 0:22:58.718 --> 0:23:03.809
982
+ It's going to get more clear in the future,
983
+ but I'm trying to give a bit more intuition
984
+
985
+ 0:23:03.809 --> 0:23:07.164
986
+ before I'm going to show you all the planning
987
+ objectives.
988
+
989
+ 0:23:08.688 --> 0:23:15.252
990
+ So I assume that we have these different encoders
991
+ right, so it's not only two, you have a bunch
992
+
993
+ 0:23:15.252 --> 0:23:21.405
994
+ of different source language encoders, a bunch
995
+ of different target language decoders, and
996
+
997
+ 0:23:21.405 --> 0:23:26.054
998
+ also I assume that the encoder is in the same
999
+ representation space.
1000
+
1001
+ 0:23:26.706 --> 0:23:31.932
1002
+ If you give a sentence in English and the
1003
+ same sentence in German, the embeddings are
1004
+
1005
+ 0:23:31.932 --> 0:23:38.313
1006
+ quite the same, so like the multilingual word embeddings
1007
+ right, and so then what we can do is, depending
1008
+
1009
+ 0:23:38.313 --> 0:23:42.202
1010
+ on the language we want, pass it to the the
1011
+ appropriate decoder.
1012
+
1013
+ 0:23:42.682 --> 0:23:50.141
1014
+ And so the kind of goal here is to find out
1015
+ a way to create language independent representations
1016
+
1017
+ 0:23:50.141 --> 0:23:52.909
1018
+ and then pass it to the decodement.
1019
+
1020
+ 0:23:54.975 --> 0:23:59.714
1021
+ Just keep in mind that you're trying to do
1022
+ language independent for some reason, but it's
1023
+
1024
+ 0:23:59.714 --> 0:24:02.294
1025
+ going to be more clear once we see how it works.
1026
+
1027
+ 0:24:05.585 --> 0:24:12.845
1028
+ So in total we have three objectives that
1029
+ we're going to try to train in our systems,
1030
+
1031
+ 0:24:12.845 --> 0:24:16.981
1032
+ so this is and all of them use monolingual
1033
+ data.
1034
+
1035
+ 0:24:17.697 --> 0:24:19.559
1036
+ So there's no parallel data at all.
1037
+
1038
+ 0:24:19.503 --> 0:24:24.448
1039
+ The first one is denoising autoencoding,
1040
+ so it's more like you add noise to
1041
+
1042
+ 0:24:24.448 --> 0:24:27.404
1043
+ the sentence, and then reconstruct the original.
1044
+
1045
+ 0:24:28.388 --> 0:24:34.276
1046
+ Then we have the on-the-fly back translation,
1047
+ so this is where you take a sentence, generate
1048
+
1049
+ 0:24:34.276 --> 0:24:39.902
1050
+ a translation, and then learn the the word
1051
+ smarting, which I'm going to show pictures
1052
+
1053
+ 0:24:39.902 --> 0:24:45.725
1054
+ later, and then we have an adversarial
1055
+ training to learn the language independent
1056
+
1057
+ 0:24:45.725 --> 0:24:46.772
1058
+ representation.
1059
+
1060
+ 0:24:47.427 --> 0:24:52.148
1061
+ So somehow we'll fill in these three tasks
1062
+ or retain on these three tasks.
1063
+
1064
+ 0:24:52.085 --> 0:24:55.324
1065
+ We somehow get an answer to President M.
1066
+
1067
+ 0:24:55.324 --> 0:24:55.561
1068
+ T.
1069
+
1070
+ 0:24:55.561 --> 0:25:02.513
1071
+ OK, so first we're going to do is denoising
1072
+ autoencoding right, so as I said we add
1073
+
1074
+ 0:25:02.513 --> 0:25:06.305
1075
+ noise to the sentence, so we take our sentence.
1076
+
1077
+ 0:25:06.826 --> 0:25:09.709
1078
+ And then there are different ways to add noise.
1079
+
1080
+ 0:25:09.649 --> 0:25:11.463
1081
+ You can shuffle words around.
1082
+
1083
+ 0:25:11.402 --> 0:25:12.621
1084
+ You can drop words.
1085
+
1086
+ 0:25:12.560 --> 0:25:18.284
1087
+ Do whatever you want to do as long as there's
1088
+ enough information to reconstruct the original
1089
+
1090
+ 0:25:18.284 --> 0:25:18.900
1091
+ sentence.
1092
+
1093
+ 0:25:19.719 --> 0:25:25.051
1094
+ And then we assume that the noised one and
1095
+ the original one are parallel data and train
1096
+
1097
+ 0:25:25.051 --> 0:25:26.687
1098
+ similar to the supervised.
1099
+
1100
+ 0:25:28.168 --> 0:25:30.354
1101
+ So we have a source sentence.
1102
+
1103
+ 0:25:30.281 --> 0:25:32.476
1104
+ We have a noisy source right.
1105
+
1106
+ 0:25:32.403 --> 0:25:37.037
1107
+ So here what basically happened is that the
1108
+ word got shuffled.
1109
+
1110
+ 0:25:36.963 --> 0:25:38.964
1111
+ One word is dropped right.
1112
+
1113
+ 0:25:38.890 --> 0:25:41.208
1114
+ So this was a noise of source.
1115
+
1116
+ 0:25:41.133 --> 0:25:47.042
1117
+ And then we treat the noise of source and
1118
+ source as a sentence pair basically.
1119
+
1120
+ 0:25:49.009 --> 0:25:53.874
1121
+ We train by optimizing the cross entropy
1122
+ loss similar to.
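A minimal sketch of such a noise function (word dropping plus a local shuffle); the exact noise model is a design choice assumed here, not prescribed by the lecture:

    import random

    def add_noise(tokens, drop_prob=0.1, shuffle_window=3):
        # Randomly drop tokens, then shuffle the survivors within a small window.
        kept = [t for t in tokens if random.random() > drop_prob] or tokens[:1]
        keys = [i + random.uniform(0, shuffle_window) for i in range(len(kept))]
        return [t for _, t in sorted(zip(keys, kept))]

    source = "the cat sat on the mat".split()
    training_pair = (add_noise(source), source)   # (noisy input, clean target)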
1123
+
1124
+ 0:25:57.978 --> 0:26:03.211
1125
+ Basically a picture to show what's happening
1126
+ and we have the nice resources.
1127
+
1128
+ 0:26:03.163 --> 0:26:09.210
1129
+ Now is the target and then we have the reconstructed
1130
+ original source and original tag and since
1131
+
1132
+ 0:26:09.210 --> 0:26:14.817
1133
+ the languages are different we have our source
1134
+ hand coded target and coded source coded.
1135
+
1136
+ 0:26:17.317 --> 0:26:20.202
1137
+ And for this task we only need monolingual
1138
+ data.
1139
+
1140
+ 0:26:20.143 --> 0:26:25.249
1141
+ We don't need any parallel data because it's
1142
+ just taking a sentence and shuffling it and
1143
+
1144
+ 0:26:25.249 --> 0:26:27.446
1145
+ reconstructing the the original one.
1146
+
1147
+ 0:26:28.848 --> 0:26:31.058
1148
+ And we are four different blocks.
1149
+
1150
+ 0:26:30.993 --> 0:26:36.842
1151
+ This is kind of very important to keep in
1152
+ mind on how we change these connections later.
1153
+
1154
+ 0:26:41.121 --> 0:26:49.093
1155
+ Then this is more like the mathematical formulation
1156
+ where you predict source given the noisy.
1157
+
1158
+ 0:26:52.492 --> 0:26:55.090
1159
+ So that was the denoising autoencoding.
1160
+
1161
+ 0:26:55.025 --> 0:26:58.404
1162
+ The second step is on-the-fly back translation.
1163
+
1164
+ 0:26:59.479 --> 0:27:06.386
1165
+ So what we do is, we put our model inference
1166
+ mode right, we take a source of sentences,
1167
+
1168
+ 0:27:06.386 --> 0:27:09.447
1169
+ and we generate a translation pattern.
1170
+
1171
+ 0:27:09.829 --> 0:27:18.534
1172
+ It might be completely wrong or maybe partially
1173
+ correct or so on, but we assume that the model
1174
+
1175
+ 0:27:18.534 --> 0:27:20.091
1176
+ knows of it and.
1177
+
1178
+ 0:27:20.680 --> 0:27:25.779
1179
+ We generate T-hat right and then what we do
1180
+ is assume that T-hat, or not assume, but T-hat
1181
+
1182
+ 0:27:25.779 --> 0:27:27.572
1183
+ and S are a sentence pair right.
1184
+
1185
+ 0:27:27.516 --> 0:27:29.927
1186
+ That's how we can handle the translation.
1187
+
1188
+ 0:27:30.530 --> 0:27:38.824
1189
+ So we train a supervised system on this sentence
1190
+ pair, so we do inference and then build a reverse
1191
+
1192
+ 0:27:38.824 --> 0:27:39.924
1193
+ translation.
1194
+
1195
+ 0:27:42.442 --> 0:27:49.495
1196
+ To be a bit more concrete, so we have a source
1197
+ sentence right, then we generate the translation,
1198
+
1199
+ 0:27:49.495 --> 0:27:55.091
1200
+ then we give the generated translation as an
1201
+ input and try to predict the.
1202
+
1203
+ 0:27:58.378 --> 0:28:03.500
1204
+ This is how we would do in practice right,
1205
+ so not before the source encoder was connected
1206
+
1207
+ 0:28:03.500 --> 0:28:08.907
1208
+ to the source decoder, but now we interchanged
1209
+ connections, so the source encoder is connected
1210
+
1211
+ 0:28:08.907 --> 0:28:10.216
1212
+ to the target decoder.
1213
+
1214
+ 0:28:10.159 --> 0:28:13.291
1215
+ The target encoder is turned into the source
1216
+ decoder.
1217
+
1218
+ 0:28:13.974 --> 0:28:20.747
1219
+ And given s we get t-hat and given t we get
1220
+ s-hat, so this is the first time.
1221
+
1222
+ 0:28:21.661 --> 0:28:24.022
1223
+ On the second time step, what you're going
1224
+ to do is reverse.
1225
+
1226
+ 0:28:24.664 --> 0:28:32.625
1227
+ So as that is here, t hat is here, and given
1228
+ s hat we are trying to predict t, and given
1229
+
1230
+ 0:28:32.625 --> 0:28:34.503
1231
+ t hat we are trying.
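Putting the two directions together, one round of on-the-fly back-translation could be sketched like this; translate and train_step are hypothetical wrappers around the two encoder/decoder pairings, not a specific library API:

    def backtranslation_round(src_batch, tgt_batch, src2tgt, tgt2src):
        # 1) inference with the current models: synthetic translations
        t_hat = [src2tgt.translate(s) for s in src_batch]
        s_hat = [tgt2src.translate(t) for t in tgt_batch]
        # 2) supervised updates in the reverse direction:
        #    synthetic (possibly wrong) input, gold monolingual output
        tgt2src.train_step(inputs=t_hat, targets=src_batch)
        src2tgt.train_step(inputs=s_hat, targets=tgt_batch)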
1232
+
1233
+ 0:28:36.636 --> 0:28:39.386
1234
+ Is this clear you have any questions on?
1235
+
1236
+ 0:28:45.405 --> 0:28:50.823
1237
+ Bit more mathematically, we try to play the
1238
+ class, give and take and so it's always the
1239
+
1240
+ 0:28:50.823 --> 0:28:53.963
1241
+ supervised NMT technique that we are trying
1242
+ to do.
1243
+
1244
+ 0:28:53.901 --> 0:28:59.684
1245
+ But you're trying to create this synthetic
1246
+ pairs that kind of help us to build an unsupervised
1247
+
1248
+ 0:28:59.684 --> 0:29:00.182
1249
+ system.
1250
+
1251
+ 0:29:02.362 --> 0:29:08.611
1252
+ Now also with maybe you can see here is that
1253
+ if the source encoder and target encoder outputs are
1254
+
1255
+ 0:29:08.611 --> 0:29:14.718
1256
+ language independent, we can always shuffle
1257
+ the connections and the translations.
1258
+
1259
+ 0:29:14.647 --> 0:29:21.252
1260
+ That's why it was important to find a way
1261
+ to generate language independent representations.
1262
+
1263
+ 0:29:21.441 --> 0:29:26.476
1264
+ And the way we try to force this language
1265
+ independence is the GAN step.
1266
+
1267
+ 0:29:27.627 --> 0:29:34.851
1268
+ So the third step kind of combines all of
1269
+ them is where we try to use a GAN to make the
1270
+
1271
+ 0:29:34.851 --> 0:29:37.959
1272
+ encoder output language independent.
1273
+
1274
+ 0:29:37.875 --> 0:29:42.826
1275
+ So here it's the same picture but from a different
1276
+ paper.
1277
+
1278
+ 0:29:42.741 --> 0:29:43.196
1279
+ So.
1280
+
1281
+ 0:29:43.343 --> 0:29:48.888
1282
+ We have X source, X target which is monolingual
1283
+ in data.
1284
+
1285
+ 0:29:48.796 --> 0:29:50.189
1286
+ We add noise.
1287
+
1288
+ 0:29:50.690 --> 0:29:54.736
1289
+ Then we encode it using the source and the
1290
+ target encoders right.
1291
+
1292
+ 0:29:54.675 --> 0:29:58.247
1293
+ Then we get the latent space Z source and
1294
+ Z target right.
1295
+
1296
+ 0:29:58.185 --> 0:30:03.451
1297
+ Then we decode and try to reconstruct the
1298
+ original one and this is the auto encoding
1299
+
1300
+ 0:30:03.451 --> 0:30:08.470
1301
+ loss which takes the X source which is the
1302
+ original one and then the translated.
1303
+
1304
+ 0:30:08.468 --> 0:30:09.834
1305
+ Predicted output.
1306
+
1307
+ 0:30:09.758 --> 0:30:16.699
1308
+ So here, it always is the auto encoding step
1309
+ where the GAN comes in is in between the
1310
+
1311
+ 0:30:16.699 --> 0:30:24.097
1312
+ encoder outputs, and here we have a discriminator
1313
+ which tries to predict which language the latent
1314
+
1315
+ 0:30:24.097 --> 0:30:25.241
1316
+ space is from.
1317
+
1318
+ 0:30:26.466 --> 0:30:33.782
1319
+ So given Z source it has to predict that the
1320
+ representation is from a language source and
1321
+
1322
+ 0:30:33.782 --> 0:30:39.961
1323
+ given Z target it has to predict the representation
1324
+ from a language target.
1325
+
1326
+ 0:30:40.520 --> 0:30:45.135
1327
+ And our headquarters are kind of teaching
1328
+ data right now, and then we have a separate
1329
+
1330
+ 0:30:45.135 --> 0:30:49.803
1331
+ network discriminator which tries to predict
1332
+ which language the latent spaces are from.
1333
+
1334
+ 0:30:53.393 --> 0:30:57.611
1335
+ And then this one is when we combine GANs
1336
+ with the autoencoding step.
1337
+
1338
+ 0:30:57.552 --> 0:31:02.765
1339
+ Then we had an on the fly back translation
1340
+ step right, and so here what we're trying to
1341
+
1342
+ 0:31:02.765 --> 0:31:03.002
1343
+ do.
1344
+
1345
+ 0:31:03.863 --> 0:31:07.260
1346
+ Is the same, basically just exactly the same.
1347
+
1348
+ 0:31:07.186 --> 0:31:12.947
1349
+ But when we are doing the training, we are
1350
+ at the adversarial laws here, so.
1351
+
1352
+ 0:31:13.893 --> 0:31:20.762
1353
+ We take our X source, generate an intermediate
1354
+ translation, so Y target and Y source right?
1355
+
1356
+ 0:31:20.690 --> 0:31:27.309
1357
+ This is the previous time step, and then we
1358
+ have to encode the new sentences and basically
1359
+
1360
+ 0:31:27.309 --> 0:31:32.765
1361
+ make them language independent or train to
1362
+ make them language independent.
1363
+
1364
+ 0:31:33.974 --> 0:31:43.502
1365
+ And then the hope is that now if we do this
1366
+ using monolingual data alone we can just switch
1367
+
1368
+ 0:31:43.502 --> 0:31:47.852
1369
+ connections and then get our translation.
1370
+
1371
+ 0:31:47.748 --> 0:31:49.619
1372
+ So the scale of.
1373
+
1374
+ 0:31:54.574 --> 0:32:03.749
1375
+ And so as I said before, GANs are quite good
1376
+ for vision right, so this is kind of like the
1377
+
1378
+ 0:32:03.749 --> 0:32:11.312
1379
+ CycleGAN approach that you might have seen
1380
+ in any computer vision course.
1381
+
1382
+ 0:32:11.911 --> 0:32:19.055
1383
+ Somehow protect that place at least not as
1384
+ promising as for merchants, and so people.
1385
+
1386
+ 0:32:18.972 --> 0:32:23.708
1387
+ What they did is to enforce this language
1388
+ independence.
1389
+
1390
+ 0:32:25.045 --> 0:32:31.226
1391
+ They try to use a shared encoder instead of
1392
+ having these different encoders right, and
1393
+
1394
+ 0:32:31.226 --> 0:32:37.835
1395
+ so this is basically the same training objectives
1396
+ as before, but what you're going to do now
1397
+
1398
+ 0:32:37.835 --> 0:32:43.874
1399
+ is learn cross language language and then use
1400
+ the single encoder for both languages.
1401
+
1402
+ 0:32:44.104 --> 0:32:49.795
1403
+ And this kind also forces them to be in the
1404
+ same space, and then you can choose whichever
1405
+
1406
+ 0:32:49.795 --> 0:32:50.934
1407
+ decoder you want.
1408
+
1409
+ 0:32:52.552 --> 0:32:58.047
1410
+ You can use GANs or you can just use a shared
1411
+ encoder and try to build your unsupervised
1412
+
1413
+ 0:32:58.047 --> 0:32:58.779
1414
+ MTT system.
1415
+
1416
+ 0:33:08.488 --> 0:33:09.808
1417
+ These are now the.
1418
+
1419
+ 0:33:09.738 --> 0:33:15.984
1420
+ The enhancements that you can do on top of
1421
+ your unsupervised system is one you can create
1422
+
1423
+ 0:33:15.984 --> 0:33:16.686
1424
+ a shared.
1425
+
1426
+ 0:33:18.098 --> 0:33:22.358
1427
+ On top of the shared encoder you can add
1428
+ your GAN loss or whatever so there's a lot
1429
+
1430
+ 0:33:22.358 --> 0:33:22.550
1431
+ of.
1432
+
1433
+ 0:33:24.164 --> 0:33:28.909
1434
+ Parallel data by word translation: The other
1435
+ thing that is more relevant right now is that
1436
+
1437
+ 0:33:28.909 --> 0:33:33.709
1438
+ you can create parallel data by word to word
1439
+ translation right because you know how to do
1440
+
1441
+ 0:33:33.709 --> 0:33:35.468
1442
+ unsupervised word translation.
1443
+
1444
+ 0:33:36.376 --> 0:33:40.548
1445
+ First step is to create parallel data, assuming
1446
+ that word translations are quite good.
1447
+
1448
+ 0:33:41.361 --> 0:33:47.162
1449
+ And then you train a supervised NMT
1450
+ model on this likely wrong parallel data,
1451
+
1452
+ 0:33:47.162 --> 0:33:50.163
1453
+ but somehow gives you a good starting point.
1454
+
1455
+ 0:33:50.097 --> 0:33:56.072
1456
+ So you build your supervised NMT system
1457
+ on the word translation data, and then you
1458
+
1459
+ 0:33:56.072 --> 0:33:59.967
1460
+ initialize it before you're doing unsupervised
1461
+ NMT.
1462
+
1463
+ 0:34:00.260 --> 0:34:05.810
1464
+ And the hope is that when you're doing the
1465
+ back translation, it's a good starting
1466
+
1467
+ 0:34:05.810 --> 0:34:11.234
1468
+ point, but it's one technique that you can
1469
+ do to improve your unsupervised NMT.
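A sketch of that warm-start step, assuming an induced source-to-target dictionary from the unsupervised word translation above; the names are illustrative:

    def word_by_word_corpus(src_sentences, dictionary):
        # Build rough synthetic parallel data by replacing each source token with
        # its dictionary translation; unknown tokens are copied through unchanged.
        pairs = []
        for sentence in src_sentences:
            translated = [dictionary.get(tok, tok) for tok in sentence.split()]
            pairs.append((sentence, " ".join(translated)))
        return pairs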
1470
+
1471
+ 0:34:17.097 --> 0:34:23.697
1472
+ Back translation technique: In the previous
1473
+ case we had: The way we know when to stop was
1474
+
1475
+ 0:34:23.697 --> 0:34:26.547
1476
+ to see convergence on the GAN training.
1477
+
1478
+ 0:34:26.472 --> 0:34:28.838
1479
+ Actually, all we want to know is when W
1480
+
1481
+ 0:34:28.838 --> 0:34:32.053
1482
+ converges, which is quite easy to know when
1483
+ to stop.
1484
+
1485
+ 0:34:31.993 --> 0:34:37.486
1486
+ But in a realistic case, we don't have any
1487
+ parallel data right, so there's no validation.
1488
+
1489
+ 0:34:37.425 --> 0:34:42.003
1490
+ Or I mean, we might have test data in the
1491
+ end, but there's no validation.
1492
+
1493
+ 0:34:43.703 --> 0:34:48.826
1494
+ How will we tune our hyper parameters in this
1495
+ case because it's not really there's nothing
1496
+
1497
+ 0:34:48.826 --> 0:34:49.445
1498
+ for us to?
1499
+
1500
+ 0:34:50.130 --> 0:34:53.326
1501
+ Or the gold data in a sense like so.
1502
+
1503
+ 0:34:53.239 --> 0:35:01.188
1504
+ How do you think we can evaluate such systems
1505
+ or how can we tune hyper parameters in this?
1506
+
1507
+ 0:35:11.711 --> 0:35:17.089
1508
+ So what you're going to do is use the back
1509
+ translation technique.
1510
+
1511
+ 0:35:17.007 --> 0:35:24.299
1512
+ It's like a common technique where you have
1513
+ nothing okay that is to use back translation
1514
+
1515
+ 0:35:24.299 --> 0:35:26.921
1516
+ somehow and what you can do is.
1517
+
1518
+ 0:35:26.839 --> 0:35:31.674
1519
+ The main idea is validate on how good the
1520
+ reconstruction.
1521
+
1522
+ 0:35:32.152 --> 0:35:37.534
1523
+ So the idea is that if you have a good system
1524
+ then the intermediate translation is quite
1525
+
1526
+ 0:35:37.534 --> 0:35:39.287
1527
+ good and going back is easy.
1528
+
1529
+ 0:35:39.227 --> 0:35:44.651
1530
+ But if it's just noise that you generate in
1531
+ the forward step then it's really hard to go
1532
+
1533
+ 0:35:44.651 --> 0:35:46.967
1534
+ back, which is kind of the main idea.
1535
+
1536
+ 0:35:48.148 --> 0:35:53.706
1537
+ So the way it works is that we take a source
1538
+ sentence, we generate a translation in target
1539
+
1540
+ 0:35:53.706 --> 0:35:59.082
1541
+ language right, and then again can state the
1542
+ generated sentence and compare it with the
1543
+
1544
+ 0:35:59.082 --> 0:36:01.342
1545
+ original one, and if they're closer.
1546
+
1547
+ 0:36:01.841 --> 0:36:09.745
1548
+ It means that we have a good system, and if
1549
+ they are far this is kind of like an unsupervised
1550
+
1551
+ 0:36:09.745 --> 0:36:10.334
1552
+ grade.
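A sketch of that round-trip criterion; src2tgt and tgt2src are hypothetical translate() wrappers around the two directions, and sacrebleu is one possible way to score the reconstruction:

    import sacrebleu

    def round_trip_score(src_sentences, src2tgt, tgt2src):
        # Translate source -> target -> back to source and measure how well
        # the round trip reconstructs the original input (higher is better).
        forward = [src2tgt.translate(s) for s in src_sentences]
        back = [tgt2src.translate(t) for t in forward]
        return sacrebleu.corpus_bleu(back, [src_sentences]).score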
1553
+
1554
+ 0:36:17.397 --> 0:36:21.863
1555
+ As far as the amount of data that you need.
1556
+
1557
+ 0:36:23.083 --> 0:36:27.995
1558
+ This was like the first initial resistance
1559
+ on on these systems is that you had.
1560
+
1561
+ 0:36:27.933 --> 0:36:32.067
1562
+ They wanted to do English and French and they
1563
+ had fifteen million.
1564
+
1565
+ 0:36:32.005 --> 0:36:37.972
1566
+ There were fifteen million monolingual sentences
1567
+ so it's quite a lot and they were able to get
1568
+
1569
+ 0:36:37.972 --> 0:36:40.582
1570
+ thirty-two BLEU on these kinds of setups.
1571
+
1572
+ 0:36:41.721 --> 0:36:47.580
1573
+ But unsurprisingly if you have zero point
1574
+ one million parallel sentences you get the same
1575
+
1576
+ 0:36:47.580 --> 0:36:48.455
1577
+ performance.
1578
+
1579
+ 0:36:48.748 --> 0:36:50.357
1580
+ So it's a lot of training.
1581
+
1582
+ 0:36:50.298 --> 0:36:55.924
1583
+ It's a lot of monolingual data, but monolingual
1584
+ data is relatively easy to obtain is the fact
1585
+
1586
+ 0:36:55.924 --> 0:37:01.251
1587
+ that the training is also quite longer than
1588
+ the supervised system, but it's unsupervised
1589
+
1590
+ 0:37:01.251 --> 0:37:04.304
1591
+ so it's kind of the trade off that you are
1592
+ making.
1593
+
1594
+ 0:37:07.367 --> 0:37:13.101
1595
+ The other thing to note is that it's English
1596
+ and French, which is very close to our exemptions.
1597
+
1598
+ 0:37:13.041 --> 0:37:18.238
1599
+ Also, the monolingual data that they took
1600
+ are kind of from similar domains and so on.
1601
+
1602
+ 0:37:18.638 --> 0:37:27.564
1603
+ So that's why they're able to build such a
1604
+ good system, but you'll see later that it fails.
1605
+
1606
+ 0:37:36.256 --> 0:37:46.888
1607
+ Voice, and so mean what people usually do
1608
+ is first build a system right using whatever
1609
+
1610
+ 0:37:46.888 --> 0:37:48.110
1611
+ parallel.
1612
+
1613
+ 0:37:48.608 --> 0:37:56.549
1614
+ Then they use monolingual data and do back
1615
+ translation, so this is always being the standard
1616
+
1617
+ 0:37:56.549 --> 0:38:04.148
1618
+ way to to improve, and what people have seen
1619
+ is that: You don't even need zero point one
1620
+
1621
+ 0:38:04.148 --> 0:38:05.429
1622
+ million right.
1623
+
1624
+ 0:38:05.344 --> 0:38:10.701
1625
+ You just need like ten thousand or so on and
1626
+ then you do the monolingual back translation
1627
+
1628
+ 0:38:10.701 --> 0:38:12.173
1629
+ and you're still better.
1630
+
1631
+ 0:38:12.114 --> 0:38:13.295
1632
+ The answer is why.
1633
+
1634
+ 0:38:13.833 --> 0:38:19.534
1635
+ The question is it's really worth trying to
1636
+ to do this or maybe it's always better to find
1637
+
1638
+ 0:38:19.534 --> 0:38:20.787
1639
+ some parallel data.
1640
+
1641
+ 0:38:20.725 --> 0:38:26.076
1642
+ I'll expand a bit of money on getting few
1643
+ parallel data and then use it to start and
1644
+
1645
+ 0:38:26.076 --> 0:38:27.776
1646
+ find to build your system.
1647
+
1648
+ 0:38:27.713 --> 0:38:33.757
1649
+ So it was kind of the understanding that bilingual
1650
+ unsupervised systems are not really that useful.
1651
+
1652
+ 0:38:50.710 --> 0:38:54.347
1653
+ The thing is that with unlabeled data.
1654
+
1655
+ 0:38:57.297 --> 0:39:05.488
1656
+ Not in an obtaining signal, so when we are
1657
+ starting basically what we want to do is first
1658
+
1659
+ 0:39:05.488 --> 0:39:13.224
1660
+ get a good translation system and then use
1661
+ an unlabeled monolingual data to improve.
1662
+
1663
+ 0:39:13.613 --> 0:39:15.015
1664
+ But if you start from U.
1665
+
1666
+ 0:39:15.015 --> 0:39:15.183
1667
+ N.
1668
+
1669
+ 0:39:15.183 --> 0:39:20.396
1670
+ Empty our model might be really bad like it
1671
+ would be somewhere translating completely wrong.
1672
+
1673
+ 0:39:20.760 --> 0:39:26.721
1674
+ And then when you find your unlabeled data,
1675
+ it basically might be harming, or maybe the
1676
+
1677
+ 0:39:26.721 --> 0:39:28.685
1678
+ same as supervised applause.
1679
+
1680
+ 0:39:28.617 --> 0:39:35.323
1681
+ So the thing is, I hope, by fine tuning on
1682
+ labeled data as first is to get a good initialization.
1683
+
1684
+ 0:39:35.835 --> 0:39:38.404
1685
+ And then use the unsupervised techniques to
1686
+ get better.
1687
+
1688
+ 0:39:38.818 --> 0:39:42.385
1689
+ But if your starting point is really bad then
1690
+ it's not.
1691
+
1692
+ 0:39:45.185 --> 0:39:47.324
1693
+ Year so as we said before.
1694
+
1695
+ 0:39:47.245 --> 0:39:52.451
1696
+ This is kind of like the self supervised training
1697
+ usually works.
1698
+
1699
+ 0:39:52.371 --> 0:39:54.777
1700
+ First we have parallel data.
1701
+
1702
+ 0:39:56.456 --> 0:39:58.062
1703
+ Source language is X.
1704
+
1705
+ 0:39:57.989 --> 0:39:59.604
1706
+ Target language is Y.
1707
+
1708
+ 0:39:59.531 --> 0:40:05.961
1709
+ In the end we want a system that does X to
1710
+ Y, not Y to X, but first we want to train a
1711
+
1712
+ 0:40:05.961 --> 0:40:10.544
1713
+ backward model as it is Y to X, so target language
1714
+ to source.
1715
+
1716
+ 0:40:11.691 --> 0:40:17.353
1717
+ Then we take our monolingual target
1718
+ sentences, use our backward model to generate
1719
+
1720
+ 0:40:17.353 --> 0:40:21.471
1721
+ synthetic source, and then we join them with
1722
+ our original data.
1723
+
1724
+ 0:40:21.406 --> 0:40:27.568
1725
+ So now we have this noisy input, but always
1726
+ the gold output, which is kind of really important
1727
+
1728
+ 0:40:27.568 --> 0:40:29.514
1729
+ when you're doing back translation.
1730
+
1731
+ 0:40:30.410 --> 0:40:36.992
1732
+ And then you can concatenate these data sets
1733
+ and then you can train your X to Y translation
1734
+
1735
+ 0:40:36.992 --> 0:40:44.159
1736
+ system and then you can always do this in multiple
1737
+ steps and usually three, four steps which kind
1738
+
1739
+ 0:40:44.159 --> 0:40:48.401
1740
+ of improves always and then finally get your
1741
+ best system.
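The loop described above, sketched with hypothetical train/translate helpers passed in as arguments (not a specific toolkit API):

    def iterative_back_translation(parallel, mono_tgt, train, rounds=3):
        # 1) backward model (target -> source) from the small gold parallel set
        backward = train(pairs=[(y, x) for x, y in parallel])
        forward = None
        for _ in range(rounds):
            # 2) synthetic source sentences for the gold monolingual targets
            synthetic = [(backward.translate(y), y) for y in mono_tgt]
            # 3) forward model (source -> target) on gold + synthetic data;
            #    the backward model can be refreshed the same way each round
            forward = train(pairs=list(parallel) + synthetic)
        return forward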
1742
+
1743
+ 0:40:49.029 --> 0:40:54.844
1744
+ The point that I'm trying to make is that
1745
+ although answers and MPs the scores that I've
1746
+
1747
+ 0:40:54.844 --> 0:41:00.659
1748
+ shown before were quite good, you probably
1749
+ can get the same performance with with fifty
1750
+
1751
+ 0:41:00.659 --> 0:41:06.474
1752
+ thousand sentences, and also the languages
1753
+ that they've shown are quite similar and the
1754
+
1755
+ 0:41:06.474 --> 0:41:08.654
1756
+ texts were from the same domain.
1757
+
1758
+ 0:41:14.354 --> 0:41:21.494
1759
+ So any questions on u n m t ok yeah.
1760
+
1761
+ 0:41:22.322 --> 0:41:28.714
1762
+ Multilinguality: So after this fact that this approach
1763
+ was already better than unsupervised NMT, what people
1764
+
1765
+ 0:41:28.714 --> 0:41:34.655
1766
+ have tried is to use this idea of multilinguality
1767
+ as you have seen in the previous lecture.
1768
+
1769
+ 0:41:34.590 --> 0:41:41.029
1770
+ The question is how can we do this knowledge
1771
+ transfer from high resource language to lower
1772
+
1773
+ 0:41:41.029 --> 0:41:42.232
1774
+ source language?
1775
+
1776
+ 0:41:44.484 --> 0:41:51.074
1777
+ One way to promote this language independent
1778
+ representations is to share the encoder and
1779
+
1780
+ 0:41:51.074 --> 0:41:57.960
1781
+ decoder for all languages, all their available
1782
+ languages, and that kind of hopefully enables
1783
+
1784
+ 0:41:57.960 --> 0:42:00.034
1785
+ the the knowledge transfer.
1786
+
1787
+ 0:42:03.323 --> 0:42:08.605
1788
+ When we're doing multilinguality, the two
1789
+ questions we need to to think of is how does
1790
+
1791
+ 0:42:08.605 --> 0:42:09.698
1792
+ the encoder know?
1793
+
1794
+ 0:42:09.637 --> 0:42:14.495
1795
+ How does the encoder encoder know which language
1796
+ that we're dealing with that?
1797
+
1798
+ 0:42:15.635 --> 0:42:20.715
1799
+ You already might have known the answer also,
1800
+ and the second question is how can we promote
1801
+
1802
+ 0:42:20.715 --> 0:42:24.139
1803
+ the encoder to generate language independent
1804
+ representations?
1805
+
1806
+ 0:42:25.045 --> 0:42:32.580
1807
+ By solving these two problems we can take
1808
+ help of high resource languages to do unsupervised
1809
+
1810
+ 0:42:32.580 --> 0:42:33.714
1811
+ translations.
1812
+
1813
+ 0:42:34.134 --> 0:42:40.997
1814
+ Typical example would be you want to do unsupervised MT
1815
+ between English and Dutch right, but you have
1816
+
1817
+ 0:42:40.997 --> 0:42:47.369
1818
+ parallel data between English and German, so
1819
+ the question is can we use this parallel data
1820
+
1821
+ 0:42:47.369 --> 0:42:51.501
1822
+ to help building an unsupervised system between English
1823
+ and Dutch?
1824
+
1825
+ 0:42:56.296 --> 0:43:01.240
1826
+ For the first one we try to take help of language
1827
+ embeddings for tokens, and this kind of is
1828
+
1829
+ 0:43:01.240 --> 0:43:05.758
1830
+ a straightforward way to know to tell them
1831
+ well which language they're dealing with.
1832
+
1833
+ 0:43:06.466 --> 0:43:11.993
1834
+ And for the second one we're going to look
1835
+ at some pre training objectives which are also
1836
+
1837
+ 0:43:11.993 --> 0:43:17.703
1838
+ kind of unsupervised so we need monolingual
1839
+ data mostly and this kind of helps us to promote
1840
+
1841
+ 0:43:17.703 --> 0:43:20.221
1842
+ the language independent representation.
1843
+
1844
+ 0:43:23.463 --> 0:43:29.954
1845
+ So the first pretraining model that we'll
1846
+ look at is XLM, which is quite famous if
1847
+
1848
+ 0:43:29.954 --> 0:43:32.168
1849
+ you haven't heard of it yet.
1850
+
1851
+ 0:43:32.552 --> 0:43:40.292
1852
+ And: The way it works is that it's basically
1853
+ a transformer encoder right, so it's like the
1854
+
1855
+ 0:43:40.292 --> 0:43:42.419
1856
+ just the encoder module.
1857
+
1858
+ 0:43:42.334 --> 0:43:44.499
1859
+ No, there's no decoder here.
1860
+
1861
+ 0:43:44.884 --> 0:43:51.481
1862
+ And what we're trying to do is mask out tokens
1863
+ in a sequence and try to predict these masked
1864
+
1865
+ 0:43:51.481 --> 0:43:52.061
1866
+ tokens.
1867
+
1868
+ 0:43:51.988 --> 0:43:55.469
1869
+ So this is typically called masked language modeling.
1870
+
1871
+ 0:43:55.996 --> 0:44:05.419
1872
+ Typical language modeling that you see is
1873
+ the standard language modeling where you predict
1874
+
1875
+ 0:44:05.419 --> 0:44:08.278
1876
+ the next token in English.
1877
+
1878
+ 0:44:08.172 --> 0:44:11.140
1879
+ Then we have the position embeddings.
1880
+
1881
+ 0:44:11.871 --> 0:44:18.774
1882
+ Then we have the token embeddings, and then
1883
+ here we have the mask token, and then we have
1884
+
1885
+ 0:44:18.774 --> 0:44:22.378
1886
+ the transformer encoder blocks to predict the masked tokens.
1887
+
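A small illustrative sketch of the masked-language-modelling objective just described; the mask rate and token strings are assumptions for the example.

```python
import random

# Mask a fraction of tokens and keep the originals as prediction targets.
def mask_tokens(tokens, mask_rate=0.15, mask_symbol="[MASK]"):
    corrupted, targets = [], []
    for tok in tokens:
        if random.random() < mask_rate:
            corrupted.append(mask_symbol)
            targets.append(tok)        # the model must predict this token
        else:
            corrupted.append(tok)
            targets.append(None)       # no loss on unmasked positions
    return corrupted, targets

print(mask_tokens("how are you doing today".split()))
```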
1888
+ 0:44:24.344 --> 0:44:30.552
1889
+ We do this for all languages using the same
1890
+ transformer encoder, and this kind of helps
1891
+
1892
+ 0:44:30.552 --> 0:44:36.760
1893
+ us to push the sentence embeddings, or
1894
+ the output of the encoder, into a common space
1895
+
1896
+ 0:44:36.760 --> 0:44:37.726
1897
+ for multiple languages.
1898
+
1899
+ 0:44:42.782 --> 0:44:49.294
1900
+ So first we train an MLM on both the
1901
+ source and target language sides, and then
1902
+
1903
+ 0:44:49.294 --> 0:44:54.928
1904
+ we use it as a starting point for the encoder
1905
+ and decoder of a UNMT system.
1906
+
1907
+ 0:44:55.475 --> 0:45:03.034
1908
+ So we take the monolingual data, build a masked
1909
+ language model on both source and target languages,
1910
+
1911
+ 0:45:03.034 --> 0:45:07.129
1912
+ and then reuse it to initialize that in
1913
+ the U.
1914
+
1915
+ 0:45:07.129 --> 0:45:07.365
1916
+ N.
1917
+
1918
+ 0:45:07.365 --> 0:45:07.601
1919
+ M.
1920
+
1921
+ 0:45:07.601 --> 0:45:07.837
1922
+ T.
1923
+
1924
+ 0:45:07.837 --> 0:45:14.688
1925
+ Here we look at two languages, but you can
1926
+ also do it with one hundred languages once.
1927
+
1928
+ 0:45:14.609 --> 0:45:20.174
1929
+ So there are pretrained checkpoints that you can
1930
+ use, which have seen quite
1931
+
1932
+ 0:45:20.174 --> 0:45:21.662
1933
+ a lot of data, and you can use
1934
+
1935
+ 0:45:21.597 --> 0:45:24.412
1936
+ them always as a starting point for your U.
1937
+
1938
+ 0:45:24.412 --> 0:45:24.608
1939
+ N.
1940
+
1941
+ 0:45:24.608 --> 0:45:27.292
1942
+ MT system, which in practice works well.
1943
+
1944
+ 0:45:31.491 --> 0:45:36.759
1945
+ One detail is that since this is an encoder
1946
+ block only, and your U.
1947
+
1948
+ 0:45:36.759 --> 0:45:36.988
1949
+ N.
1950
+
1951
+ 0:45:36.988 --> 0:45:37.217
1952
+ M.
1953
+
1954
+ 0:45:37.217 --> 0:45:37.446
1955
+ T.
1956
+
1957
+ 0:45:37.446 --> 0:45:40.347
1958
+ system is encoder-decoder, right.
1959
+
1960
+ 0:45:40.271 --> 0:45:47.517
1961
+ So there's this cross attention that's missing,
1962
+ but you can always initialize that randomly.
1963
+
1964
+ 0:45:47.440 --> 0:45:48.373
1965
+ It's fine.
1966
+
1967
+ 0:45:48.508 --> 0:45:53.077
1968
+ Not everything is initialized, but it's still
1969
+ decent.
1970
+
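A rough sketch of this kind of initialisation, with assumed module names rather than the original codebase: tensors whose names and shapes match are taken over from the pretrained MLM, and whatever has no counterpart, such as the decoder's cross-attention, keeps its random initialisation.

```python
import torch.nn as nn

def init_from_mlm(seq2seq: nn.Module, mlm: nn.Module) -> int:
    target_shapes = {k: v.shape for k, v in seq2seq.state_dict().items()}
    matching = {k: v for k, v in mlm.state_dict().items()
                if k in target_shapes and target_shapes[k] == v.shape}
    seq2seq.load_state_dict(matching, strict=False)  # the rest stays random
    return len(matching)  # number of tensors taken over from the MLM
```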
1971
+ 0:45:56.056 --> 0:46:02.141
1972
+ Then the other one is mBART,
1973
+ and here you see that this kind of builds on
1974
+
1975
+ 0:46:02.141 --> 0:46:07.597
1976
+ the unsupervised training objective, which
1977
+ is denoising auto-encoding.
1978
+
1979
+ 0:46:08.128 --> 0:46:14.337
1980
+ So what they do is they say that we don't
1981
+ even need to do the back-translation during
1982
+
1983
+ 0:46:14.337 --> 0:46:17.406
1984
+ pre-training, but you can do it later.
1985
+
1986
+ 0:46:17.335 --> 0:46:24.954
1987
+ We just do denoising auto-encoding
1988
+ on all the different languages, and that also gives
1989
+
1990
+ 0:46:24.954 --> 0:46:32.651
1991
+ you out-of-the-box good performance, so what
1992
+ we basically have here is the transformer encoder.
1993
+
1994
+ 0:46:34.334 --> 0:46:37.726
1995
+ You are trying to generate a reconstructed
1996
+ sequence.
1997
+
1998
+ 0:46:37.662 --> 0:46:38.946
1999
+ You need a decoder.
2000
+
2001
+ 0:46:39.899 --> 0:46:42.022
2002
+ So we gave an input sentence.
2003
+
2004
+ 0:46:41.952 --> 0:46:48.138
2005
+ We try to predict the masked tokens,
2006
+ or rather we try to reconstruct the original
2007
+
2008
+ 0:46:48.138 --> 0:46:52.475
2009
+ sentence from the input segments, which was
2010
+ corrupted right.
2011
+
2012
+ 0:46:52.404 --> 0:46:57.169
2013
+ So this is the same denoising objective that
2014
+ you have seen before.
2015
+
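An illustrative noising function in the spirit of this denoising pre-training (span masking plus sentence permutation); the exact corruption scheme used in the papers differs in details, this is only a sketch for intuition.

```python
import random

def corrupt(sentences, mask_symbol="[MASK]", mask_rate=0.3):
    noisy = []
    for sent in sentences:
        tokens = sent.split()
        # replace a random contiguous span by a single mask symbol
        span = max(1, int(len(tokens) * mask_rate))
        start = random.randrange(0, max(1, len(tokens) - span + 1))
        tokens[start:start + span] = [mask_symbol]
        noisy.append(" ".join(tokens))
    random.shuffle(noisy)          # permute sentence order
    return noisy                   # the model is trained to reconstruct the input

doc = ["the cat sat on the mat", "it was a sunny day"]
print(corrupt(doc), "->", doc)
```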
2016
+ 0:46:58.418 --> 0:46:59.737
2017
+ This is for English.
2018
+
2019
+ 0:46:59.674 --> 0:47:04.156
2020
+ I think this is for Japanese and then once
2021
+ we do it for all languages.
2022
+
2023
+ 0:47:04.093 --> 0:47:09.567
2024
+ I mean they have these variants with twenty
2025
+ five, fifty languages or so on, and then you can fine
2026
+
2027
+ 0:47:09.567 --> 0:47:11.795
2028
+ tune on your sentence and document level data.
2029
+
2030
+ 0:47:13.073 --> 0:47:20.454
2031
+ And so they use this for the supervised
2032
+ techniques, but you can also use this as initialization
2033
+
2034
+ 0:47:20.454 --> 0:47:25.058
2035
+ for unsupervised systems built on that, which also
2036
+ works in practice.
2037
+
2038
+ 0:47:30.790 --> 0:47:36.136
2039
+ Then we have these, so still now we kind of
2040
+ didn't see these systems benefit from the
2041
+
2042
+ 0:47:36.136 --> 0:47:38.840
2043
+ high resource language right, so as I said.
2044
+
2045
+ 0:47:38.878 --> 0:47:44.994
2046
+ You can use English-German as something for English
2047
+ to Dutch, and if you want English to Catalan, you
2048
+
2049
+ 0:47:44.994 --> 0:47:46.751
2050
+ can use English to French.
2051
+
2052
+ 0:47:48.408 --> 0:47:55.866
2053
+ One typical way to do this is to use pivot
2054
+ translation, right, where you take the.
2055
+
2056
+ 0:47:55.795 --> 0:48:01.114
2057
+ So here it's Finnish to Greek, so you translate,
2058
+ say, first from Finnish to English, then English
2059
+
2060
+ 0:48:01.114 --> 0:48:03.743
2061
+ to Greek, and then you get the translation.
2062
+
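A tiny illustration of pivoting through a high-resource language; the two translate callables are placeholders for whatever MT systems are available, not a real API.

```python
def pivot_translate(text, translate_src_to_pivot, translate_pivot_to_tgt):
    pivot_text = translate_src_to_pivot(text)     # e.g. Finnish -> English
    return translate_pivot_to_tgt(pivot_text)     # e.g. English -> Greek

# toy usage with dummy systems
fi_en = lambda s: f"<en translation of: {s}>"
en_el = lambda s: f"<el translation of: {s}>"
print(pivot_translate("hyvää huomenta", fi_en, en_el))
```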
2063
+ 0:48:04.344 --> 0:48:10.094
2064
+ What's important is that you have these different
2065
+ techniques and you can always think of which
2066
+
2067
+ 0:48:10.094 --> 0:48:12.333
2068
+ one to use given the data situation.
2069
+
2070
+ 0:48:12.273 --> 0:48:18.007
2071
+ So if it was like Finnish to Greek maybe
2072
+ pivoting is better because you might get good Finnish
2073
+
2074
+ 0:48:18.007 --> 0:48:20.020
2075
+ to English and English to Greek.
2076
+
2077
+ 0:48:20.860 --> 0:48:23.255
2078
+ Sometimes it also depends on the language
2079
+ pair.
2080
+
2081
+ 0:48:23.205 --> 0:48:27.577
2082
+ There might be some information loss and so
2083
+ on, so there are quite a few variables you
2084
+
2085
+ 0:48:27.577 --> 0:48:30.040
2086
+ need to think of and decide which system to
2087
+ use.
2088
+
2089
+ 0:48:32.752 --> 0:48:39.654
2090
+ Then there's a zero shot, which probably also
2091
+ you've seen in the multilingual lecture, and how
2092
+
2093
+ 0:48:39.654 --> 0:48:45.505
2094
+ if you can improve the language independence
2095
+ then your zero shot gets better.
2096
+
2097
+ 0:48:45.430 --> 0:48:52.107
2098
+ So maybe if you use the multilingual models
2099
+ and do zero shot directly, it's quite good.
2100
+
2101
+ 0:48:53.093 --> 0:48:58.524
2102
+ So we have zero shot and pivot, and then
2103
+ we have the unsupervised translation where
2104
+
2105
+ 0:48:58.524 --> 0:49:00.059
2106
+ we can translate between languages
2107
+
2108
+ 0:49:00.600 --> 0:49:02.762
2109
+ even when there is no parallel data.
2110
+
2111
+ 0:49:06.686 --> 0:49:07.565
2112
+ Is to solve.
2113
+
2114
+ 0:49:07.497 --> 0:49:11.960
2115
+ So sometimes what we have seen so far is that
2116
+ we basically have.
2117
+
2118
+ 0:49:15.255 --> 0:49:16.754
2119
+ To do from looking at it.
2120
+
2121
+ 0:49:16.836 --> 0:49:19.307
2122
+ These two files alone you can create a dictionary.
2123
+
2124
+ 0:49:19.699 --> 0:49:26.773
2125
+ You can build an unsupervised MT system, not
2126
+ always, but if the domains are similar and the
2127
+
2128
+ 0:49:26.773 --> 0:49:28.895
2129
+ languages are similar.
2130
+
2131
+ 0:49:28.816 --> 0:49:36.279
2132
+ But if there are distant languages, then the
2133
+ unsupervised technique doesn't usually work really
2134
+
2135
+ 0:49:36.279 --> 0:49:36.756
2136
+ well.
2137
+
2138
+ 0:49:37.617 --> 0:49:40.297
2139
+ What would be better
2140
+
2141
+ 0:49:40.720 --> 0:49:46.338
2142
+ is that if you can get some parallel
2143
+ data from somewhere or do bitext mining that
2144
+
2145
+ 0:49:46.338 --> 0:49:51.892
2146
+ we have seen in the LASER practical,
2147
+ then you can use that to initialize your
2148
+
2149
+ 0:49:51.892 --> 0:49:57.829
2150
+ system and then train, let's say, a semi-supervised
2151
+ MT system, and that would be better than
2152
+
2153
+ 0:49:57.829 --> 0:50:00.063
2154
+ just building an unsupervised one.
2155
+
2156
+ 0:50:00.820 --> 0:50:06.546
2157
+ With that we are at the end.
2158
+
2159
+ 0:50:07.207 --> 0:50:08.797
2160
+ Quickly could be.
2161
+
2162
+ 0:50:16.236 --> 0:50:25.591
2163
+ In common, they can catch the worst because
2164
+ the thing about finding a language is: And
2165
+
2166
+ 0:50:25.591 --> 0:50:35.053
2167
+ there's another joy in playing these games,
2168
+ almost in the middle of a game, and she's a
2169
+
2170
+ 0:50:35.053 --> 0:50:40.107
2171
+ characteristic too, and she is a global waver.
2172
+
2173
+ 0:50:56.916 --> 0:51:03.798
2174
+ Next talk inside and this somehow gives them
2175
+ many abilities, not only translation but other
2176
+
2177
+ 0:51:03.798 --> 0:51:08.062
2178
+ than that there are quite a few things that
2179
+ they can do.
2180
+
2181
+ 0:51:10.590 --> 0:51:17.706
2182
+ But the translation in itself usually doesn't
2183
+ work as well as if you build a
2184
+
2185
+ 0:51:17.706 --> 0:51:20.878
2186
+ specific system for your use case.
2187
+
2188
+ 0:51:22.162 --> 0:51:27.924
2189
+ I would guess that it's usually better than
2190
+ the LLM, but you can always adapt the LLM to
2191
+
2192
+ 0:51:27.924 --> 0:51:31.355
2193
+ the task that you want, and then it could be
2194
+ better.
2195
+
2196
+ 0:51:32.152 --> 0:51:37.849
2197
+ An LLM out of the box might not be the
2198
+ best choice for your task, of course.
2199
+
2200
+ 0:51:37.775 --> 0:51:44.138
2201
+ For me, I'm working on new air translation,
2202
+ so it's more about translating software.
2203
+
2204
+ 0:51:45.065 --> 0:51:50.451
2205
+ And it's quite a niche domain as well,
2206
+ and if you use the LLMs out of the box, they're
2207
+
2208
+ 0:51:50.451 --> 0:51:53.937
2209
+ actually quite bad compared to the systems
2210
+ that we built.
2211
+
2212
+ 0:51:54.414 --> 0:51:56.736
2213
+ But you can do these different techniques
2214
+ like prompting.
2215
+
2216
+ 0:51:57.437 --> 0:52:03.442
2217
+ What people usually do is few-shot prompting,
2218
+ where they give similar translation pairs in
2219
+
2220
+ 0:52:03.442 --> 0:52:08.941
2221
+ the prompt and then ask it to translate and
2222
+ then that kind of improves the performance
2223
+
2224
+ 0:52:08.941 --> 0:52:09.383
2225
+ a lot.
2226
+
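A sketch of the kind of prompt people build for this: retrieve a few similar source/target pairs and prepend them before the sentence to translate. The example pairs and language names are made up for illustration.

```python
def build_prompt(examples, source_sentence, src="English", tgt="Dutch"):
    lines = [f"Translate from {src} to {tgt}."]
    for s, t in examples:
        lines.append(f"{src}: {s}\n{tgt}: {t}")
    lines.append(f"{src}: {source_sentence}\n{tgt}:")
    return "\n\n".join(lines)

examples = [("Good morning.", "Goedemorgen."), ("How are you?", "Hoe gaat het?")]
print(build_prompt(examples, "See you tomorrow."))
```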
2227
+ 0:52:09.320 --> 0:52:15.124
2228
+ So there are different techniques that you
2229
+ can do to adapt your LLMs and then it might
2230
+
2231
+ 0:52:15.124 --> 0:52:16.400
2232
+ be better than the.
2233
+
2234
+ 0:52:16.376 --> 0:52:17.742
2235
+ task-specific system.
2236
+
2237
+ 0:52:18.418 --> 0:52:22.857
2238
+ But if you're looking for niche things, I
2239
+ don't think LLMs are that good.
2240
+
2241
+ 0:52:22.802 --> 0:52:26.268
2242
+ But if you want to do, let's say, unsupervised
2243
+ translation.
2244
+
2245
+ 0:52:26.213 --> 0:52:29.974
2246
+ In this case you can never be sure that they
2247
+ haven't seen the data.
2248
+
2249
+ 0:52:29.918 --> 0:52:35.048
2250
+ First of all, whether they have seen data in
2251
+ that language or not, and if it's on the internet,
2252
+
2253
+ 0:52:35.048 --> 0:52:36.832
2254
+ they probably did see the data.
2255
+
2256
+ 0:52:40.360 --> 0:53:00.276
2257
+ I feel like they have pretty good understanding
2258
+ of each million people.
2259
+
2260
+ 0:53:04.784 --> 0:53:09.059
2261
+ Depends on the language, but I'm pretty surprised
2262
+ that it works on a low-resource language.
2263
+
2264
+ 0:53:09.009 --> 0:53:11.122
2265
+ I would expect it to work on German and.
2266
+
2267
+ 0:53:11.972 --> 0:53:13.633
2268
+ But if you take a low-resource language,
2269
+
2270
+ 0:53:14.474 --> 0:53:20.973
2271
+ Don't think it works, and also there are quite
2272
+ a few papers where they've already showed that
2273
+
2274
+ 0:53:20.973 --> 0:53:27.610
2275
+ if you build a system yourself in the typical
2276
+ way to build a system, it's quite a bit better than
2277
+
2278
+ 0:53:27.610 --> 0:53:29.338
2279
+ the LLM.
2280
+
2281
+ 0:53:29.549 --> 0:53:34.883
2282
+ But you can always do things with LLMs to
2283
+ get better, but then I'm probably.
2284
+
2285
+ 0:53:37.557 --> 0:53:39.539
2286
+ Any more questions?
2287
+
2288
+ 0:53:41.421 --> 0:53:47.461
2289
+ So if not then we're going to end the lecture
2290
+ here and then on Thursday we're going to have
2291
+
2292
+ 0:53:47.461 --> 0:53:51.597
2293
+ document-level MT, which is also run by me, so
2294
+ thanks for coming.
2295
+
demo_data/lectures/Lecture-15-11.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62985057e3dfdb7c34a3ef8e74a9b52e9529b2a974ff62438c617e6d699b5a89
3
+ size 81272567
demo_data/lectures/Lecture-18-18.07.2023/English.vtt ADDED
@@ -0,0 +1,2738 @@
 
 
 
 
1
+ WEBVTT
2
+
3
+ 0:00:01.541 --> 0:00:06.914
4
+ IntroOkay, so welcome back to today's lecture.
5
+
6
+ 0:00:08.528 --> 0:00:23.334
7
+ We want to talk about is speech translation,
8
+ so we'll have two lectures in this week about
9
+
10
+ 0:00:23.334 --> 0:00:26.589
11
+ speech translation.
12
+
13
+ 0:00:27.087 --> 0:00:36.456
14
+ And so in the last week we'll have some exercise
15
+ and repetition.
16
+
17
+ 0:00:36.312 --> 0:00:46.692
18
+ We want to look at what is now to do when
19
+ we want to translate speech.
20
+
21
+ 0:00:46.946 --> 0:00:55.675
22
+ So we want to address the specific challenges
23
+ that occur when we switch from translating
24
+
25
+ 0:00:55.675 --> 0:00:56.754
26
+ text to translating speech.
27
+
28
+ 0:00:57.697 --> 0:01:13.303
29
+ Today we will look at the more general picture
30
+ out and build the systems.
31
+
32
+ 0:01:13.493 --> 0:01:22.219
33
+ Speech TranslationAnd then secondly an end-to-end
34
+ approach where we are going to put in audio
35
+
36
+ 0:01:22.219 --> 0:01:23.623
37
+ and generate.
38
+
39
+ 0:01:24.224 --> 0:01:41.439
40
+ Which are the main dominant systems which
41
+ are used in research and commercial systems.
42
+
43
+ 0:01:43.523 --> 0:01:56.879
44
+ More general, what is the general task of
45
+ speech translation that is shown here?
46
+
47
+ 0:01:56.714 --> 0:02:01.832
48
+ The idea is we have a speech.
49
+
50
+ 0:02:02.202 --> 0:02:12.838
51
+ Then we want to have a system which takes
52
+ this audio and then translates it into another
53
+
54
+ 0:02:12.838 --> 0:02:14.033
55
+ language.
56
+
57
+ 0:02:15.095 --> 0:02:20.694
58
+ Then it's no longer as clear the output modality.
59
+
60
+ 0:02:20.582 --> 0:02:33.378
61
+ In contrast, for humans we can typically have:
62
+ So you can either have more textual translation,
63
+
64
+ 0:02:33.378 --> 0:02:37.911
65
+ then you have subtitles, and so on.
66
+
67
+ 0:02:38.538 --> 0:02:57.010
68
+ Or you want to have it also as audio, like
69
+ it's done for human interpretation?
70
+
71
+ 0:02:57.417 --> 0:03:03.922
72
+ See there is not the one best solution, so
73
+ all of this one is always better.
74
+
75
+ 0:03:03.837 --> 0:03:09.415
76
+ It heavily depends on what is the use of what
77
+ the people prefer.
78
+
79
+ 0:03:09.929 --> 0:03:14.950
80
+ For example, you can think of if you know
81
+ a bit the source of language, but you're a
82
+
83
+ 0:03:14.950 --> 0:03:17.549
84
+ bit unsure and don't understand everything.
85
+
86
+ 0:03:17.490 --> 0:03:23.138
87
+ Then maybe text output is better for this person because
88
+ you can direct your ear to what was said and
89
+
90
+ 0:03:23.138 --> 0:03:26.706
91
+ only if you're unsure you check down with your
92
+ translation.
93
+
94
+ 0:03:27.727 --> 0:03:33.511
95
+ Are another things that might be preferable
96
+ to have a complete spoken of.
97
+
98
+ 0:03:34.794 --> 0:03:48.727
99
+ So there are both ones for a long time in
100
+ automatic systems focused mainly on text output.
101
+
102
+ 0:03:48.574 --> 0:04:06.741
103
+ In most cases. But of course you can always
104
+ hand the text to text-to-speech systems which generate
105
+
106
+ 0:04:06.741 --> 0:04:09.958
107
+ audio from that.
108
+
109
+ 0:04:12.772 --> 0:04:14.494
110
+ Why should we care about that?
111
+
112
+ 0:04:14.438 --> 0:04:15.773
113
+ Why should we do that?
114
+
115
+ 0:04:17.737 --> 0:04:24.141
116
+ There is the nice thing that yeah, with a
117
+ globalized world, we are able to now interact
118
+
119
+ 0:04:24.141 --> 0:04:25.888
120
+ with a lot more people.
121
+
122
+ 0:04:25.815 --> 0:04:29.206
123
+ You can do some conferences around the world.
124
+
125
+ 0:04:29.132 --> 0:04:31.567
126
+ We can travel around the world.
127
+
128
+ 0:04:31.671 --> 0:04:37.802
129
+ We can by Internet watch movies from all over
130
+ the world and watch TV from all over the world.
131
+
132
+ 0:04:38.618 --> 0:04:47.812
133
+ However, there is still this barrier that
134
+ is mainly to watch videos, either in English
135
+
136
+ 0:04:47.812 --> 0:04:49.715
137
+ or in a language.
138
+
139
+ 0:04:50.250 --> 0:05:00.622
140
+ So what is currently happening in order to
141
+ reach a large audience is that everybody.
142
+
143
+ 0:05:00.820 --> 0:05:07.300
144
+ So if we are going, for example, to a conferences,
145
+ these are international conferences.
146
+
147
+ 0:05:08.368 --> 0:05:22.412
148
+ However, everybody will then speak English
149
+ since that is some of the common language that
150
+
151
+ 0:05:22.412 --> 0:05:26.001
152
+ everybody understands.
153
+
154
+ 0:05:26.686 --> 0:05:32.929
155
+ So on the other hand, we cannot have
156
+ human interpreters everywhere.
157
+
158
+ 0:05:32.892 --> 0:05:37.797
159
+ You have that maybe in the European Parliament
160
+ or in important business meetings.
161
+
162
+ 0:05:38.078 --> 0:05:47.151
163
+ But this is relatively expensive, and so the
164
+ question is, can we enable communication in
165
+
166
+ 0:05:47.151 --> 0:05:53.675
167
+ your mother tongue without having to have human
168
+ interpretation?
169
+
170
+ 0:05:54.134 --> 0:06:04.321
171
+ And there like speech translation can be helpful
172
+ in order to help you bridge this gap.
173
+
174
+ 0:06:06.726 --> 0:06:22.507
175
+ In this case, there are different scenarios
176
+ of how you can apply speech translation.
177
+
178
+ 0:06:22.422 --> 0:06:29.282
179
+ That's typically more interactive than we
180
+ are talking about text translation.
181
+
182
+ 0:06:29.194 --> 0:06:32.802
183
+ Text translation is most commonly used.
184
+
185
+ 0:06:33.153 --> 0:06:41.637
186
+ Of course, nowadays there's things like chat
187
+ and so on where it could also be interactive.
188
+
189
+ 0:06:42.082 --> 0:06:48.299
190
+ In contrast to speech translation, that is
191
+ less static, so there is different ways of
192
+
193
+ 0:06:48.299 --> 0:06:48.660
194
+ how.
195
+
196
+ 0:06:49.149 --> 0:07:00.544
197
+ The one scenario is what is called consecutive translation
198
+ where you first get an input, then you translate
199
+
200
+ 0:07:00.544 --> 0:07:03.799
201
+ this fixed input, and then.
202
+
203
+ 0:07:04.944 --> 0:07:12.823
204
+ With me, which means you have always like
205
+ fixed, yeah fixed challenges which you need
206
+
207
+ 0:07:12.823 --> 0:07:14.105
208
+ to translate.
209
+
210
+ 0:07:14.274 --> 0:07:25.093
211
+ You don't need to like beat your mind what
212
+ are the boundaries where there's an end.
213
+
214
+ 0:07:25.405 --> 0:07:31.023
215
+ Also, there is no overlapping.
216
+
217
+ 0:07:30.842 --> 0:07:42.986
218
+ There is always a one-person sentence that
219
+ is getting translated.
220
+
221
+ 0:07:43.443 --> 0:07:51.181
222
+ Of course, this has a disadvantage that it
223
+ makes the conversation a lot longer because
224
+
225
+ 0:07:51.181 --> 0:07:55.184
226
+ you always have only speech and translation.
227
+
228
+ 0:07:57.077 --> 0:08:03.780
229
+ For example, if you would use that for a presentation
230
+ there would be yeah quite get quite long, if
231
+
232
+ 0:08:03.780 --> 0:08:09.738
233
+ I would just imagine you sitting here in the
234
+ lecture I would say three sentences that I
235
+
236
+ 0:08:09.738 --> 0:08:15.765
237
+ would wait for this interpreter to translate
238
+ it, then I would say the next two sentences
239
+
240
+ 0:08:15.765 --> 0:08:16.103
241
+ and.
242
+
243
+ 0:08:16.676 --> 0:08:28.170
244
+ That is why in these situations, for example,
245
+ if you have a direct conversation with a patient,
246
+
247
+ 0:08:28.170 --> 0:08:28.888
248
+ then.
249
+
250
+ 0:08:29.209 --> 0:08:32.733
251
+ But still there it's too big to be taking
252
+ them very long.
253
+
254
+ 0:08:33.473 --> 0:08:42.335
255
+ And that's why there's also the research on
256
+ simultaneous translation, where the idea is
257
+
258
+ 0:08:42.335 --> 0:08:43.644
259
+ in parallel.
260
+
261
+ 0:08:43.964 --> 0:08:46.179
262
+ That Is the Dining for Human.
263
+
264
+ 0:08:46.126 --> 0:08:52.429
265
+ Interpretation like if you think of things
266
+ like the European Parliament where they of
267
+
268
+ 0:08:52.429 --> 0:08:59.099
269
+ course not only speak always one sentence but
270
+ are just giving their speech and in parallel
271
+
272
+ 0:08:59.099 --> 0:09:04.157
273
+ human interpreters are translating the speech
274
+ into another language.
275
+
276
+ 0:09:04.985 --> 0:09:12.733
277
+ The same thing is interesting for automatic
278
+ speech translation where we in parallel generate
279
+
280
+ 0:09:12.733 --> 0:09:13.817
281
+ translation.
282
+
283
+ 0:09:15.415 --> 0:09:32.271
284
+ The challenges then, of course, are that we
285
+ need to segment our speech into chunks somehow.
286
+
287
+ 0:09:32.152 --> 0:09:34.903
288
+ We just looked for the dots we saw.
289
+
290
+ 0:09:34.827 --> 0:09:38.619
291
+ There are some challenges that we have to
292
+ check.
293
+
294
+ 0:09:38.541 --> 0:09:41.020
295
+ The Doctor may not understand.
296
+
297
+ 0:09:41.201 --> 0:09:47.478
298
+ But in general, getting sentence boundaries
299
+ in text is not really a research question.
300
+
301
+ 0:09:47.647 --> 0:09:51.668
302
+ While in speech translation, this is not that
303
+ easy.
304
+
305
+ 0:09:51.952 --> 0:10:05.908
306
+ Either getting that in the audio is difficult
307
+ because it's not like we typically do breaks
308
+
309
+ 0:10:05.908 --> 0:10:09.742
310
+ when there's a sentence.
311
+
312
+ 0:10:10.150 --> 0:10:17.432
313
+ And even if you then see the transcript and
314
+ would have to add the punctuation, this is
315
+
316
+ 0:10:17.432 --> 0:10:18.101
317
+ not as.
318
+
319
+ 0:10:20.340 --> 0:10:25.942
320
+ Another question is how many speakers we have
321
+ here.
322
+
323
+ 0:10:25.834 --> 0:10:31.761
324
+ In presentations you have more like a single
325
+ speaker.
326
+
327
+ 0:10:31.931 --> 0:10:40.186
328
+ That is normally easier from the part of audio
329
+ processing, so in general in speech translation.
330
+
331
+ 0:10:40.460 --> 0:10:49.308
332
+ You can have different challenges and they
333
+ can be of different components.
334
+
335
+ 0:10:49.190 --> 0:10:56.039
336
+ In addition to translation, you have: And
337
+ if you're not going, for example, the magical
338
+
339
+ 0:10:56.039 --> 0:11:00.398
340
+ speaker, there are significantly additional
341
+ challenges.
342
+
343
+ 0:11:00.720 --> 0:11:10.313
344
+ So we as humans we are very good in filtering
345
+ out noises, or if two people speak in parallel
346
+
347
+ 0:11:10.313 --> 0:11:15.058
348
+ to like separate these two speakers and hear.
349
+
350
+ 0:11:15.495 --> 0:11:28.300
351
+ However, if you want to do that with automatic
352
+ systems that is very challenging so that you
353
+
354
+ 0:11:28.300 --> 0:11:33.172
355
+ can separate the speakers so that.
356
+
357
+ 0:11:33.453 --> 0:11:41.284
358
+ For the more of you have this multi-speaker
359
+ scenario, typically it's also less well prepared.
360
+
361
+ 0:11:41.721 --> 0:11:45.807
362
+ So you're getting very, we'll talk about the
363
+ spontaneous effects.
364
+
365
+ 0:11:46.186 --> 0:11:53.541
366
+ So people like will stop in the middle of
367
+ the sentence, they change their sentence, and
368
+
369
+ 0:11:53.541 --> 0:12:01.481
370
+ so on, and like filtering these disfluencies
371
+ out of the text and working with them is often
372
+
373
+ 0:12:01.481 --> 0:12:02.986
374
+ very challenging.
375
+
376
+ 0:12:05.565 --> 0:12:09.144
377
+ So these are all additional challenges when
378
+ you have multiple speakers.
379
+
380
+ 0:12:10.330 --> 0:12:19.995
381
+ Then there's a question of an online or offline
382
+ system, sometimes textbook station.
383
+
384
+ 0:12:19.880 --> 0:12:21.844
385
+ We also mainly.
386
+
387
+ 0:12:21.962 --> 0:12:36.507
388
+ That means you can take the whole text and
389
+ you can translate it in a batch.
390
+
391
+ 0:12:37.337 --> 0:12:44.344
392
+ However, for speech translation there's also
393
+ several scenarios where this is the case.
394
+
395
+ 0:12:44.264 --> 0:12:51.488
396
+ For example, when you're translating a movie,
397
+ it's not only that you don't have to do it
398
+
399
+ 0:12:51.488 --> 0:12:54.735
400
+ live, but you can take the whole movie.
401
+
402
+ 0:12:55.215 --> 0:13:05.473
403
+ However, there is also a lot of situations
404
+ where you don't have this opportunity like
405
+
406
+ 0:13:05.473 --> 0:13:06.785
407
+ or sports.
408
+
409
+ 0:13:07.247 --> 0:13:13.963
410
+ And you don't want to like first like let
411
+ around a sports event and then like show in
412
+
413
+ 0:13:13.963 --> 0:13:19.117
414
+ the game three hours later then there is not
415
+ really any interest.
416
+
417
+ 0:13:19.399 --> 0:13:31.118
418
+ So you have to do it live, and so we have
419
+ the additional challenge of translating the
420
+
421
+ 0:13:31.118 --> 0:13:32.208
422
+ system.
423
+
424
+ 0:13:32.412 --> 0:13:42.108
425
+ There are still things on the one end of course.
426
+
427
+ 0:13:41.910 --> 0:13:49.632
428
+ It needs to be real time translation.
429
+
430
+ 0:13:49.869 --> 0:13:54.153
431
+ It's taking longer, then you're getting more
432
+ and more and more delayed.
433
+
434
+ 0:13:55.495 --> 0:14:05.245
435
+ So it maybe seems simple, but there have been
436
+ research systems which are undertime slower
437
+
438
+ 0:14:05.245 --> 0:14:07.628
439
+ than real time or so.
440
+
441
+ 0:14:07.520 --> 0:14:15.104
442
+ If you want to show what is possible with
443
+ the best current systems,.
444
+
445
+ 0:14:16.596 --> 0:14:18.477
446
+ But that isn't even not enough.
447
+
448
+ 0:14:18.918 --> 0:14:29.593
449
+ The other question: You can have a system
450
+ which is even like several times real time.
451
+
452
+ 0:14:29.509 --> 0:14:33.382
453
+ In less than one second, it might still be
454
+ not useful.
455
+
456
+ 0:14:33.311 --> 0:14:39.646
457
+ Then the question is like the latency, so
458
+ how much time has passed since you can produce
459
+
460
+ 0:14:39.646 --> 0:14:39.931
461
+ an.
462
+
463
+ 0:14:40.120 --> 0:14:45.814
464
+ It might be that in average you can like concress
465
+ it, but you still can't do it directly.
466
+
467
+ 0:14:45.751 --> 0:14:51.547
468
+ You need to do it after, or you need to have
469
+ the full context of thirty seconds before you
470
+
471
+ 0:14:51.547 --> 0:14:55.178
472
+ can output something, and then you have a large
473
+ latency.
474
+
475
+ 0:14:55.335 --> 0:15:05.871
476
+ So it can be that do it as fast as it is produced,
477
+ but have to wait until the food.
478
+
479
+ 0:15:06.426 --> 0:15:13.772
480
+ So we'll look into that on Thursday how we
481
+ can then generate translations that are having
482
+
483
+ 0:15:13.772 --> 0:15:14.996
484
+ a low latency.
485
+
486
+ 0:15:15.155 --> 0:15:21.587
487
+ You can imagine, for example, in German that
488
+ it's maybe quite challenging since the verb
489
+
490
+ 0:15:21.587 --> 0:15:23.466
491
+ is often like at the end.
492
+
493
+ 0:15:23.394 --> 0:15:30.108
494
+ If you're using perfect tense, like with 'haben' and
495
+ so on, and then in English you have to directly
496
+
497
+ 0:15:30.108 --> 0:15:30.983
498
+ produce it.
499
+
500
+ 0:15:31.311 --> 0:15:38.757
501
+ So if you really want to have no context you
502
+ might need to wait until the end of the sentence.
503
+
504
+ 0:15:41.021 --> 0:15:45.920
505
+ Besides that, of course, offline and it gives
506
+ you more additional help.
507
+
508
+ 0:15:45.852 --> 0:15:51.399
509
+ Context Based SystemsI think last week you
510
+ talked about context based systems that typically
511
+
512
+ 0:15:51.399 --> 0:15:55.575
513
+ have context from maybe from the past but maybe
514
+ also from the future.
515
+
516
+ 0:15:55.595 --> 0:16:02.923
517
+ Then, of course, you cannot use anything from
518
+ the future in this case, but you can use the past.
519
+
520
+ 0:16:07.407 --> 0:16:24.813
521
+ Finally, there is a thing about how you want
522
+ to present it to the audience in automatic
523
+
524
+ 0:16:24.813 --> 0:16:27.384
525
+ translation.
526
+
527
+ 0:16:27.507 --> 0:16:31.361
528
+ There is also the thing that you want to do.
529
+
530
+ 0:16:31.275 --> 0:16:35.302
531
+ All your outfits are running like the system.
532
+
533
+ 0:16:35.996 --> 0:16:36.990
534
+ Top of it.
535
+
536
+ 0:16:36.900 --> 0:16:44.315
537
+ Then they answered questions: How should it
538
+ be spoken so you can do things like.
539
+
540
+ 0:16:46.586 --> 0:16:52.507
541
+ Voice cloning so that it's like even the same
542
+ voice as the original speaker.
543
+
544
+ 0:16:53.994 --> 0:16:59.081
545
+ And if you do text or dubbing then there might
546
+ be additional constraints.
547
+
548
+ 0:16:59.012 --> 0:17:05.614
549
+ So if you think about subtitles: And they
550
+ should be readable, and we are too big to speak
551
+
552
+ 0:17:05.614 --> 0:17:07.961
553
+ faster than you can maybe read.
554
+
555
+ 0:17:08.908 --> 0:17:14.239
556
+ So you might need to shorten your text.
557
+
558
+ 0:17:14.105 --> 0:17:20.170
559
+ People say that a subtitle can be two lines.
560
+
561
+ 0:17:20.035 --> 0:17:26.103
562
+ Each line can be this number of characters.
563
+
564
+ 0:17:26.346 --> 0:17:31.753
565
+ So you cannot like if you have too long text,
566
+ we might need to shorten that to do that.
567
+
568
+ 0:17:32.052 --> 0:17:48.272
569
+ Similarly, if you think about dubbing, if
570
+ you want to produce dubbing voice, then the
571
+
572
+ 0:17:48.272 --> 0:17:50.158
573
+ original.
574
+
575
+ 0:17:51.691 --> 0:17:59.294
576
+ Here is another problem that we have different
577
+ settings like a more formal setting and let's
578
+
579
+ 0:17:59.294 --> 0:18:00.602
580
+ have different.
581
+
582
+ 0:18:00.860 --> 0:18:09.775
583
+ If you think about the United Nations maybe
584
+ you want more former things and between friends
585
+
586
+ 0:18:09.775 --> 0:18:14.911
587
+ maybe that former and there are languages which
588
+ use.
589
+
590
+ 0:18:15.355 --> 0:18:21.867
591
+ That is sure that is an important research
592
+ question.
593
+
594
+ 0:18:21.744 --> 0:18:28.013
595
+ To do that would more think of it more generally.
596
+
597
+ 0:18:28.308 --> 0:18:32.902
598
+ That's important in text translation.
599
+
600
+ 0:18:32.781 --> 0:18:41.003
601
+ If you translate a letter to your boss, it
602
+ should sound different.
603
+
604
+ 0:18:42.202 --> 0:18:53.718
605
+ So there is a question of how you can do this
606
+ style work on how you can do that.
607
+
608
+ 0:18:53.576 --> 0:19:00.545
609
+ For example, if you can specify that you might.
610
+
611
+ 0:19:00.460 --> 0:19:10.954
612
+ So you can tag the sentence to generate a formal or informal
613
+ style because, as you correctly said, this
614
+
615
+ 0:19:10.954 --> 0:19:16.709
616
+ is especially challenging again in the situations.
617
+
618
+ 0:19:16.856 --> 0:19:20.111
619
+ Of course, there are ways of like being formal
620
+ or less formal.
621
+
622
+ 0:19:20.500 --> 0:19:24.940
623
+ But it's not like as clear as you do it, for
624
+ example, in German where you have the twin
625
+
626
+ 0:19:24.940 --> 0:19:25.091
627
+ C.
628
+
629
+ 0:19:25.091 --> 0:19:26.857
630
+ So there is no one-to-one mapping.
631
+
632
+ 0:19:27.287 --> 0:19:34.269
633
+ If you want to make that sure you can build
634
+ a system which generates different styles in
635
+
636
+ 0:19:34.269 --> 0:19:38.662
637
+ the output, so yeah that's definitely also
638
+ a challenge.
639
+
640
+ 0:19:38.584 --> 0:19:43.763
641
+ It just may be not mentioned here because
642
+ it's not specific now.
643
+
644
+ 0:19:44.524 --> 0:19:54.029
645
+ Generally, of course, these are all challenges
646
+ in how to customize and adapt systems to use
647
+
648
+ 0:19:54.029 --> 0:19:56.199
649
+ cases with specific requirements.
650
+
651
+ 0:20:00.360 --> 0:20:10.230
652
+ Cascading SystemsSpeech translation has been
653
+ done for quite a while and it's maybe not surprising
654
+
655
+ 0:20:10.230 --> 0:20:13.554
656
+ it started with more simple use.
657
+
658
+ 0:20:13.793 --> 0:20:24.557
659
+ So people first started to look into, for
660
+ example, limited domain translations.
661
+
662
+ 0:20:24.424 --> 0:20:33.728
663
+ The tourist was typically application if you're
664
+ going to a new city.
665
+
666
+ 0:20:34.834 --> 0:20:44.028
667
+ Then there are several open things of doing
668
+ open domain translation, especially people.
669
+
670
+ 0:20:44.204 --> 0:20:51.957
671
+ Like where there's a lot of data so you could
672
+ build systems which are more open domain,
673
+
674
+ 0:20:51.957 --> 0:20:55.790
675
+ but of course it's still a bit restrictive.
676
+
677
+ 0:20:55.703 --> 0:20:59.061
678
+ It's true in the European Parliament.
679
+
680
+ 0:20:58.973 --> 0:21:01.892
681
+ People talk about anything but.
682
+
683
+ 0:21:02.162 --> 0:21:04.820
684
+ And so it's not completely used for everything.
685
+
686
+ 0:21:05.165 --> 0:21:11.545
687
+ Nowadays we've seen this technology in a lot
688
+ of different situations guess you ought.
689
+
690
+ 0:21:11.731 --> 0:21:17.899
691
+ Use it so there is some basic technologies
692
+ where you can use them already.
693
+
694
+ 0:21:18.218 --> 0:21:33.599
695
+ There is still a lot of open questions going
696
+ from if you are going to really spontaneous
697
+
698
+ 0:21:33.599 --> 0:21:35.327
699
+ meetings.
700
+
701
+ 0:21:35.655 --> 0:21:41.437
702
+ Then these systems typically work good for
703
+ like some languages where we have a lot of
704
+
705
+ 0:21:41.437 --> 0:21:42.109
706
+ friendly.
707
+
708
+ 0:21:42.742 --> 0:21:48.475
709
+ But if we want to go for really low resource
710
+ data then things are often challenging.
711
+
712
+ 0:21:48.448 --> 0:22:02.294
713
+ Last week we had a workshop on spoken language
714
+ translation and there is a low-resource data
715
+
716
+ 0:22:02.294 --> 0:22:05.756
717
+ track which is dialed.
718
+
719
+ 0:22:05.986 --> 0:22:06.925
720
+ And so on.
721
+
722
+ 0:22:06.840 --> 0:22:14.700
723
+ All these languages can still then have significantly
724
+ lower performance than for high-resource languages.
725
+
726
+ 0:22:17.057 --> 0:22:20.126
727
+ So how does this work?
728
+
729
+ 0:22:19.993 --> 0:22:30.061
730
+ If we want to do speech translation, there's
731
+ like three basic technology: So on the one
732
+
733
+ 0:22:30.061 --> 0:22:40.815
734
+ hand, it's automatic speech recognition where
735
+ automatic speech recognition normally transcribes
736
+
737
+ 0:22:40.815 --> 0:22:41.615
738
+ audio.
739
+
740
+ 0:22:42.822 --> 0:22:58.289
741
+ Then what we talked about here is machine
742
+ translation, which takes input and translates
743
+
744
+ 0:22:58.289 --> 0:23:01.276
745
+ into the target.
746
+
747
+ 0:23:02.642 --> 0:23:11.244
748
+ And the very simple model now, if you think
749
+ about it, is of course the similar combination.
750
+
751
+ 0:23:11.451 --> 0:23:14.740
752
+ We have solved all these parts in a salt bedrock.
753
+
754
+ 0:23:14.975 --> 0:23:31.470
755
+ We are working on all these problems there,
756
+ so if we want to do speech translation, maybe
757
+
758
+ 0:23:31.331 --> 0:23:35.058
759
+ Such problems we just put all these combinations
760
+ together.
761
+
762
+ 0:23:35.335 --> 0:23:45.130
763
+ And then you get what is called a cascaded
764
+ system, where first you take your audio.
765
+
766
+ 0:23:45.045 --> 0:23:59.288
767
+ To take this as input and generate the output,
768
+ and then you take this text output, put it
769
+
770
+ 0:23:59.288 --> 0:24:00.238
771
+ into.
772
+
773
+ 0:24:00.640 --> 0:24:05.782
774
+ So in that way you
775
+
776
+ 0:24:08.008 --> 0:24:18.483
777
+ have now a solution for doing speech
778
+ translation with these types of systems, and
779
+
780
+ 0:24:18.483 --> 0:24:20.874
781
+ this type is called a cascaded system.
782
+
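A bare-bones sketch of the cascaded pipeline just described. The three stage functions are hypothetical placeholders, standing in for a real ASR model, a segmentation/re-punctuation step, and an MT model.

```python
def cascaded_speech_translation(audio, asr, segment, mt):
    transcript = asr(audio)                 # audio -> source-language text
    sentences = segment(transcript)         # add punctuation, split sentences
    return [mt(sentence) for sentence in sentences]

# toy usage with dummy components
asr = lambda a: "hello how are you i am fine"
segment = lambda t: ["Hello, how are you?", "I am fine."]
mt = lambda s: f"<translation of: {s}>"
print(cascaded_speech_translation(b"...", asr, segment, mt))
```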
783
+ 0:24:21.681 --> 0:24:28.303
784
+ It is still often reaching state of the art,
785
+ however it has benefits and disadvantages.
786
+
787
+ 0:24:28.668 --> 0:24:41.709
788
+ So the one big benefit is we have independent
789
+ components and some of that is nice.
790
+
791
+ 0:24:41.552 --> 0:24:48.469
792
+ So if there are great ideas put into your.
793
+
794
+ 0:24:48.788 --> 0:24:57.172
795
+ And then some other times people develop a
796
+ new good way of how to improve.
797
+
798
+ 0:24:57.060 --> 0:25:00.976
799
+ You can also take this model and.
800
+
801
+ 0:25:01.381 --> 0:25:07.639
802
+ So you can leverage improvements from all
803
+ the different communities in order to adapt.
804
+
805
+ 0:25:08.288 --> 0:25:18.391
806
+ Furthermore, we would like to see, since all
807
+ of them is learning, that the biggest advantage
808
+
809
+ 0:25:18.391 --> 0:25:23.932
810
+ is that we have training data for each individual.
811
+
812
+ 0:25:24.164 --> 0:25:34.045
813
+ So there's a lot less training data where
814
+ you have the English audio, so it's easy to
815
+
816
+ 0:25:34.045 --> 0:25:34.849
817
+ train.
818
+
819
+ 0:25:36.636 --> 0:25:48.595
820
+ Now am a one that we will focus on when talking
821
+ about the cascaded approach is that often it.
822
+
823
+ 0:25:48.928 --> 0:25:58.049
824
+ So you need to adapt each component a bit
825
+ so that it's adapting to its input and.
826
+
827
+ 0:25:58.278 --> 0:26:08.728
828
+ So we'll focus there especially on how to
829
+ combine and since said the main focus is: So
830
+
831
+ 0:26:08.728 --> 0:26:18.578
832
+ if you would directly use the ASR output, that might
833
+ not work as well as you would like.
834
+
835
+ 0:26:18.918 --> 0:26:33.467
836
+ So a major challenge when building a cascaded
837
+ speech translation system is how we can
838
+
839
+ 0:26:33.467 --> 0:26:38.862
840
+ adapt these components and how we can combine them.
841
+
842
+ 0:26:41.681 --> 0:26:43.918
843
+ So why, why is this the kick?
844
+
845
+ 0:26:44.164 --> 0:26:49.183
846
+ So it would look quite nice.
847
+
848
+ 0:26:49.010 --> 0:26:54.617
849
+ It seems to be very reasonable.
850
+
851
+ 0:26:54.442 --> 0:26:58.196
852
+ You have some audio.
853
+
854
+ 0:26:58.018 --> 0:27:03.388
855
+ You put it into your system.
856
+
857
+ 0:27:04.965 --> 0:27:23.759
858
+ However, this is a bit wishful thinking,
859
+ because if you speak, what you speak is more spontaneous.
860
+
861
+ 0:27:23.984 --> 0:27:29.513
862
+ And especially, you rarely have punctuation
863
+ in there, while the MT system.
864
+
865
+ 0:27:29.629 --> 0:27:43.247
866
+ They assume, of course, that it's a full sentence,
867
+ that you don't have there some.
868
+
869
+ 0:27:43.523 --> 0:27:55.087
870
+ So we see we want to get this bridge between
871
+ the ASR output and the MT input, and we might need
872
+
873
+ 0:27:55.087 --> 0:27:56.646
874
+ an additional component.
875
+
876
+ 0:27:58.778 --> 0:28:05.287
877
+ And that is typically what is referred to
878
+ as a re-casing and re-punctuation system.
879
+
880
+ 0:28:05.445 --> 0:28:15.045
881
+ So the idea is that you might be good to have
882
+ something like an adapter here in between,
883
+
884
+ 0:28:15.045 --> 0:28:20.007
885
+ which really tries to adapt the speech input.
886
+
887
+ 0:28:20.260 --> 0:28:28.809
888
+ That can be at different levels, but it might
889
+ be even more rephrasing.
890
+
891
+ 0:28:29.569 --> 0:28:40.620
892
+ If you think of the sentence, if you have
893
+ false starts, then when speaking you sometimes
894
+
895
+ 0:28:40.620 --> 0:28:41.986
896
+ assume oh.
897
+
898
+ 0:28:41.901 --> 0:28:52.224
899
+ You restart it, then you might want to delete
900
+ that because if you read it you don't want
901
+
902
+ 0:28:52.224 --> 0:28:52.688
903
+ to.
904
+
905
+ 0:28:56.096 --> 0:28:57.911
906
+ Why is this yeah?
907
+
908
+ 0:28:57.810 --> 0:29:01.445
909
+ The case in punctuation important.
910
+
911
+ 0:29:02.622 --> 0:29:17.875
912
+ One important thing is directly for the challenge
913
+ is when speak is just a continuous stream of
914
+
915
+ 0:29:17.875 --> 0:29:18.999
916
+ words.
917
+
918
+ 0:29:19.079 --> 0:29:27.422
919
+ When just speaking, punctuation marks, casing
920
+ and so on are not there naturally.
921
+
922
+ 0:29:27.507 --> 0:29:30.281
923
+ However, they are of course important.
924
+
925
+ 0:29:30.410 --> 0:29:33.877
926
+ They are first of all very important for readability.
927
+
928
+ 0:29:34.174 --> 0:29:41.296
929
+ If you have once read a text without punctuation
930
+ marks, you need more time to process it.
931
+
932
+ 0:29:41.861 --> 0:29:47.375
933
+ They're sometimes even semantically important.
934
+
935
+ 0:29:47.258 --> 0:29:52.892
936
+ There's a list for grandpa and big difference.
937
+
938
+ 0:29:53.553 --> 0:30:00.089
939
+ And so this, of course, with humans as well,
940
+ it'd be easy to distinguish by again doing
941
+
942
+ 0:30:00.089 --> 0:30:01.426
943
+ it automatically.
944
+
945
+ 0:30:01.352 --> 0:30:06.181
946
+ It's more typically and finally, in our case,
947
+ if we want to do.
948
+
949
+ 0:30:06.386 --> 0:30:13.672
950
+ We are assuming normally sentence wise, so
951
+ we always enter out system which is like one
952
+
953
+ 0:30:13.672 --> 0:30:16.238
954
+ sentence by the next sentence.
955
+
956
+ 0:30:16.736 --> 0:30:26.058
957
+ If you want to do speech translation of a
958
+ continuous stream, then of course what are
959
+
960
+ 0:30:26.058 --> 0:30:26.716
961
+ your.
962
+
963
+ 0:30:28.168 --> 0:30:39.095
964
+ And the easiest and most straightforward situation
965
+ is, of course, if you have a continuously.
966
+
967
+ 0:30:39.239 --> 0:30:51.686
968
+ And if it generates your punctuation marks,
969
+ it's easy to separate your text into sentences.
970
+
971
+ 0:30:52.032 --> 0:31:09.157
972
+ So we can again reuse our system and thereby
973
+ have a normal MT system run on this continuous stream.
974
+
975
+ 0:31:14.174 --> 0:31:21.708
976
+ These are a bit older numbers, but they show
977
+ you a bit also how important all that is.
978
+
979
+ 0:31:21.861 --> 0:31:31.719
980
+ So the best is if you use the reference
981
+ transcript: you get roughly a BLEU score of.
982
+
983
+ 0:31:32.112 --> 0:31:47.678
984
+ If you have as it is with some air based length
985
+ segmentation, then you get something like.
986
+
987
+ 0:31:47.907 --> 0:31:57.707
988
+ If you then use the segments correctly as
989
+ it's done from the reference, you get one BLEU
990
+
991
+ 0:31:57.707 --> 0:32:01.010
992
+ point and another BLEU point.
993
+
994
+ 0:32:01.201 --> 0:32:08.085
995
+ So you see that you have been total like nearly
996
+ two blue points just by having the correct
997
+
998
+ 0:32:08.085 --> 0:32:09.144
999
+ segmentation.
1000
+
1001
+ 0:32:10.050 --> 0:32:21.178
1002
+ This shows you that it's important to estimate
1003
+ as good a segmentation because even if you
1004
+
1005
+ 0:32:21.178 --> 0:32:25.629
1006
+ still have the same errors in your transcript.
1007
+
1008
+ 0:32:27.147 --> 0:32:35.718
1009
+ Is to be into this movement, which is also
1010
+ not as unusual as we do in translation.
1011
+
1012
+ 0:32:36.736 --> 0:32:40.495
1013
+ So this is done by looking at the reference.
1014
+
1015
+ 0:32:40.412 --> 0:32:48.055
1016
+ It should show you how much these scores are
1017
+ done to just analyze how important are these.
1018
+
1019
+ 0:32:47.971 --> 0:32:55.700
1020
+ So you take the ASR transcript and you look
1021
+ at the reference and it's only done for the.
1022
+
1023
+ 0:32:55.635 --> 0:33:05.843
1024
+ If we have optimal punctuations, if our model
1025
+ is as good and optimal, so as a reference we
1026
+
1027
+ 0:33:05.843 --> 0:33:15.939
1028
+ could: But of course this is not how we can
1029
+ do it in reality because we don't have access
1030
+
1031
+ 0:33:15.939 --> 0:33:16.948
1032
+ to that.
1033
+
1034
+ 0:33:17.657 --> 0:33:24.044
1035
+ Because one would invade you okay, why should
1036
+ we do that?
1037
+
1038
+ 0:33:23.933 --> 0:33:28.781
1039
+ If we have the optimal then it's possible.
1040
+
1041
+ 0:33:31.011 --> 0:33:40.060
1042
+ And yeah, that is why a typical system does
1043
+ not only yeah depend on if our key component.
1044
+
1045
+ 0:33:40.280 --> 0:33:56.468
1046
+ But in between you have this segmentation
1047
+ in there in order to have more input and.
1048
+
1049
+ 0:33:56.496 --> 0:34:01.595
1050
+ You can also prefer often this invariability
1051
+ over the average study.
1052
+
1053
+ 0:34:04.164 --> 0:34:17.896
1054
+ SegmentationSo the task of segmentation is
1055
+ to re-segment the text into what is called
1056
+
1057
+ 0:34:17.896 --> 0:34:24.283
1058
+ sentence-like units, and you also assign case and punctuation.
1059
+
1060
+ 0:34:24.444 --> 0:34:39.421
1061
+ That is more a traditional thing because for
1062
+ a long time case information was not provided.
1063
+
1064
+ 0:34:39.879 --> 0:34:50.355
1065
+ So there was any good ASR system which directly
1066
+ provides you with case information and this
1067
+
1068
+ 0:34:50.355 --> 0:34:52.746
1069
+ may not be any more.
1070
+
1071
+ 0:34:56.296 --> 0:35:12.060
1072
+ How that can be done is you can have three
1073
+ different approaches because that was some
1074
+
1075
+ 0:35:12.060 --> 0:35:16.459
1076
+ of the most common one.
1077
+
1078
+ 0:35:17.097 --> 0:35:23.579
1079
+ Course: That is not the only thing you can
1080
+ do.
1081
+
1082
+ 0:35:23.441 --> 0:35:30.891
1083
+ You can also try to train the data to generate
1084
+ that.
1085
+
1086
+ 0:35:31.891 --> 0:35:41.324
1087
+ On the other hand, that is of course more
1088
+ challenging.
1089
+
1090
+ 0:35:41.153 --> 0:35:47.503
1091
+ You need some type of segmentation.
1092
+
1093
+ 0:35:48.028 --> 0:35:59.382
1094
+ I mean, of course, you can easily remove case and
1095
+ punctuation information from your data and then
1096
+
1097
+ 0:35:59.382 --> 0:36:05.515
1098
+ train a system which goes from non-cased to cased text.
1099
+
1100
+ 0:36:05.945 --> 0:36:15.751
1101
+ You can also, of course, try to combine these
1102
+ two into one so that you directly translate
1103
+
1104
+ 0:36:15.751 --> 0:36:17.386
1105
+ from non-cased, non-punctuated input.
1106
+
1107
+ 0:36:17.817 --> 0:36:24.722
1108
+ What is more happening by now is that you
1109
+ also try to provide these to that you provide.
1110
+
1111
+ 0:36:24.704 --> 0:36:35.267
1112
+ The ASR is a segmentation directly get these
1113
+ information in there.
1114
+
1115
+ 0:36:35.110 --> 0:36:45.597
1116
+ The systems that combine the A's and A's are:
1117
+ Yes, there is a valid rule.
1118
+
1119
+ 0:36:45.455 --> 0:36:51.182
1120
+ What we come later to today is that you do
1121
+ audio to text in the target language.
1122
+
1123
+ 0:36:51.111 --> 0:36:54.880
1124
+ That is what is referred to as an end to end
1125
+ system.
1126
+
1127
+ 0:36:54.809 --> 0:36:59.686
1128
+ So it's directly and this is still more often
1129
+ done for text output.
1130
+
1131
+ 0:36:59.614 --> 0:37:03.416
1132
+ But there are also end-to-end systems which
1133
+ directly generate audio.
1134
+
1135
+ 0:37:03.683 --> 0:37:09.109
1136
+ There you have additional challenges by how
1137
+ to even measure if things are correct or not.
1138
+
1139
+ 0:37:09.089 --> 0:37:10.522
1140
+ Mean for text.
1141
+
1142
+ 0:37:10.427 --> 0:37:18.074
1143
+ You can mention, in other words, that for
1144
+ audio the audio signal is even more.
1145
+
1146
+ 0:37:18.318 --> 0:37:27.156
1147
+ That's why it's currently mostly speech to
1148
+ text, but that is one single system, but of
1149
+
1150
+ 0:37:27.156 --> 0:37:27.969
1151
+ course.
1152
+
1153
+ 0:37:32.492 --> 0:37:35.605
1154
+ Yeah, how can you do that?
1155
+
1156
+ 0:37:35.490 --> 0:37:45.161
1157
+ You can do adding this punctuation information:
1158
+ we will look into three approaches.
1159
+
1160
+ 0:37:45.039 --> 0:37:53.132
1161
+ You can do that as a sequence labeling problem
1162
+ or as a monolingual.
1163
+
1164
+ 0:37:54.534 --> 0:37:57.145
1165
+ Let's have a little bit of a series.
1166
+
1167
+ 0:37:57.075 --> 0:37:59.485
1168
+ This was some of the first ideas.
1169
+
1170
+ 0:37:59.414 --> 0:38:04.545
1171
+ There's the idea where you try to do it mainly
1172
+ based on language model.
1173
+
1174
+ 0:38:04.474 --> 0:38:11.446
1175
+ So how probable is that there is a punctuation
1176
+ that was done with, like, old-style n-gram language
1177
+
1178
+ 0:38:11.446 --> 0:38:12.884
1179
+ models to visually.
1180
+
1181
+ 0:38:13.073 --> 0:38:24.687
1182
+ So you can, for example, if you have an n-gram
1183
+ language model to calculate the score of Hello,
1184
+
1185
+ 0:38:24.687 --> 0:38:25.787
1186
+ how are?
1187
+
1188
+ 0:38:25.725 --> 0:38:33.615
1189
+ And then you compare this probability and
1190
+ take the one which has the highest probability.
1191
+
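A toy version of this language-model idea: insert each candidate punctuation mark (or nothing) after a word, score the variants with some language model, and keep the highest-scoring one. The `lm_logprob` callable is a placeholder, not a real library API.

```python
CANDIDATES = ["", ".", ",", "?"]

def best_punctuation(prefix, next_word, lm_logprob):
    variants = [f"{prefix}{p} {next_word}" for p in CANDIDATES]
    return max(variants, key=lm_logprob)

# toy usage with a dummy scorer that happens to prefer the question mark here
dummy_lm = lambda text: 1.0 if "?" in text else 0.0
print(best_punctuation("how are you", "I", dummy_lm))  # "how are you? I"
```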
1192
+ 0:38:33.527 --> 0:38:39.928
1193
+ You might have something like if you have
1194
+ very long pauses, you anyway.
1195
+
1196
+ 0:38:40.340 --> 0:38:49.345
1197
+ So this is a very easy model, which only calculates
1198
+ some language model probabilities, and however
1199
+
1200
+ 0:38:49.345 --> 0:38:57.440
1201
+ the advantages of course are: And then, of
1202
+ course, in general, so what we will look into
1203
+
1204
+ 0:38:57.440 --> 0:39:05.535
1205
+ here is that maybe interesting is that most
1206
+ of the systems, also the advanced ones, are really
1207
+
1208
+ 0:39:05.535 --> 0:39:08.719
1209
+ mainly focused purely on the text.
1210
+
1211
+ 0:39:09.289 --> 0:39:19.237
1212
+ If you think about how to insert punctuation
1213
+ marks, maybe your first idea would have been
1214
+
1215
+ 0:39:19.237 --> 0:39:22.553
1216
+ we can use pause information.
1217
+
1218
+ 0:39:23.964 --> 0:39:30.065
1219
+ However, interestingly, most systems that are
1220
+ used are really focusing on the text.
1221
+
1222
+ 0:39:31.151 --> 0:39:34.493
1223
+ There are several reasons.
1224
+
1225
+ 0:39:34.369 --> 0:39:44.149
1226
+ One is that it's easier to get training data
1227
+ so you only need pure text data.
1228
+
1229
+ 0:39:46.806 --> 0:40:03.221
1230
+ The next way you can do it is you can make
1231
+ it as a sequence labeling task or something like
1232
+
1233
+ 0:40:03.221 --> 0:40:04.328
1234
+ that.
1235
+
1236
+ 0:40:04.464 --> 0:40:11.734
1237
+ Then you have labels like there is nothing, there is a comma, a period,
1238
+ and there is a
1239
+
1240
+ 0:40:11.651 --> 0:40:15.015
1241
+ question mark.
1242
+
1243
+ 0:40:15.315 --> 0:40:31.443
1244
+ So you have the number of labels, the number
1245
+ of punctuation symbols you have for the basic
1246
+
1247
+ 0:40:31.443 --> 0:40:32.329
1248
+ one.
1249
+
1250
+ 0:40:32.892 --> 0:40:44.074
1251
+ Typically nowadays it would use something
1252
+ like BERT, and then you can train a system.
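A minimal sketch of that sequence-labeling setup, assuming a BERT token-classification model from Hugging Face transformers; the checkpoint name and label set are illustrative choices, and the head still needs fine-tuning on punctuated text:

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

LABELS = ["NONE", "COMMA", "PERIOD", "QUESTION"]   # punctuation predicted after each word

tok = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased",
                                                        num_labels=len(LABELS))

words = "hello how are you today".split()
enc = tok(words, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits        # (1, num_subwords, num_labels)
pred = logits.argmax(-1)[0]             # per-token label ids (untrained until fine-tuned)
```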
1253
+
1254
+ 0:40:48.168 --> 0:40:59.259
1255
+ Any questions to that then it would probably
1256
+ be no contrary, you know, or not.
1257
+
1258
+ 0:41:00.480 --> 0:41:03.221
1259
+ Yeah, you have definitely a label imbalance.
1260
+
1261
+ 0:41:04.304 --> 0:41:12.405
1262
+ Think that works relatively well and haven't
1263
+ seen that.
1264
+
1265
+ 0:41:12.260 --> 0:41:21.087
1266
+ It's not a completely crazy label, maybe twenty
1267
+ times more.
1268
+
1269
+ 0:41:21.561 --> 0:41:29.636
1270
+ It can and especially for the more rare things
1271
+ mean, the more rare things is question marks.
1272
+
1273
+ 0:41:30.670 --> 0:41:43.877
1274
+ At least for question marks you have typically
1275
+ very strong indicator words.
1276
+
1277
+ 0:41:47.627 --> 0:42:03.321
1278
+ And then what was done for quite a long time
1279
+ can we know how to do machine translation?
1280
+
1281
+ 0:42:04.504 --> 0:42:12.640
1282
+ So the idea is, can we just translate non
1283
+ punctuated English into punctuated English
1284
+
1285
+ 0:42:12.640 --> 0:42:14.650
1286
+ and do it correctly?
1287
+
1288
+ 0:42:15.855 --> 0:42:25.344
1289
+ So what you need is something like this type
1290
+ of data where the source doesn't have punctuation.
1291
+
1292
+ 0:42:25.845 --> 0:42:30.641
1293
+ Course: A year is already done.
1294
+
1295
+ 0:42:30.491 --> 0:42:36.490
1296
+ You have to make it a bit challenging.
1297
+
1298
+ 0:42:41.661 --> 0:42:44.550
1299
+ Yeah, that is true.
1300
+
1301
+ 0:42:44.405 --> 0:42:55.188
1302
+ If you think about the normal trained age,
1303
+ you have to do one thing more.
1304
+
1305
+ 0:42:55.043 --> 0:43:00.730
1306
+ Is it otherwise difficult to predict?
1307
+
1308
+ 0:43:05.745 --> 0:43:09.277
1309
+ Here it's already this already looks different
1310
+ than normal training data.
1311
+
1312
+ 0:43:09.229 --> 0:43:09.901
1313
+ What is the.
1314
+
1315
+ 0:43:10.350 --> 0:43:15.305
1316
+ People want to use this transcript of speech.
1317
+
1318
+ 0:43:15.198 --> 0:43:19.509
1319
+ We'll probably go to our text editors.
1320
+
1321
+ 0:43:19.419 --> 0:43:25.906
1322
+ Yes, that is all already quite too difficult.
1323
+
1324
+ 0:43:26.346 --> 0:43:33.528
1325
+ Mean, that's making things a lot better with
1326
+ the first and easiest thing is you have to
1327
+
1328
+ 0:43:33.528 --> 0:43:35.895
1329
+ randomly cut your sentences.
1330
+
1331
+ 0:43:35.813 --> 0:43:43.310
1332
+ So if you take just me normally we have one
1333
+ sentence per line and if you take this as your
1334
+
1335
+ 0:43:43.310 --> 0:43:44.546
1336
+ training data.
1337
+
1338
+ 0:43:44.924 --> 0:43:47.857
1339
+ And that is, of course, not very helpful.
1340
+
1341
+ 0:43:48.208 --> 0:44:01.169
1342
+ So in order to build the training corpus for
1343
+ doing punctuation you randomly cut your sentences
1344
+
1345
+ 0:44:01.169 --> 0:44:08.264
1346
+ and then you can remove all your punctuation
1347
+ marks.
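A small sketch of how such training pairs could be built; the segment lengths, the punctuation set, and the optional lowercasing are illustrative assumptions:

```python
import random
import re

def make_pairs(sentences, min_len=5, max_len=20):
    """Build (unpunctuated, punctuated) pairs from normal text via random segmentation."""
    words = " ".join(sentences).split()
    pairs, i = [], 0
    while i < len(words):
        n = random.randint(min_len, max_len)
        target = " ".join(words[i:i + n])                  # keeps the original punctuation
        source = re.sub(r"[.,;:!?]", "", target).lower()   # punctuation stripped, lowercased
        pairs.append((source, target))
        i += n
    return pairs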
1348
+
1349
+ 0:44:08.528 --> 0:44:21.598
1350
+ Because of course there is no longer to do
1351
+ when you have some random segments in your
1352
+
1353
+ 0:44:21.598 --> 0:44:22.814
1354
+ system.
1355
+
1356
+ 0:44:25.065 --> 0:44:37.984
1357
+ And then you can, for example, if you then
1358
+ have generated your punctuation marks before
1359
+
1360
+ 0:44:37.984 --> 0:44:41.067
1361
+ going to the system.
1362
+
1363
+ 0:44:41.221 --> 0:44:54.122
1364
+ And that is an important thing, which we like
1365
+ to see is more challenging for end systems.
1366
+
1367
+ 0:44:53.979 --> 0:45:00.146
1368
+ We can change the segmentation, so maybe.
1369
+
1370
+ 0:45:00.040 --> 0:45:06.417
1371
+ You can, then if you're combining these things
1372
+ you can change the segmentation here, so.
1373
+
1374
+ 0:45:06.406 --> 0:45:18.178
1375
+ While you have ten new ten segments in your,
1376
+ you might only have five ones in your anymore.
1377
+
1378
+ 0:45:18.050 --> 0:45:18.972
1379
+ Then.
1380
+
1381
+ 0:45:19.259 --> 0:45:33.172
1382
+ Which might be more useful or helpful in because
1383
+ you have to reorder things and so on.
1384
+
1385
+ 0:45:33.273 --> 0:45:43.994
1386
+ And if you think of the wrong segmentation
1387
+ then you cannot reorder things from the beginning
1388
+
1389
+ 0:45:43.994 --> 0:45:47.222
1390
+ to the end of the sentence.
1391
+
1392
+ 0:45:49.749 --> 0:45:57.998
1393
+ ErrorsOkay, so much about segmentation, do
1394
+ you have any more questions about that?
1395
+
1396
+ 0:46:02.522 --> 0:46:21.299
1397
+ Then there is one additional thing you can
1398
+ do, and that is when we refer to the idea.
1399
+
1400
+ 0:46:21.701 --> 0:46:29.356
1401
+ And when you get input there might be some
1402
+ arrows in there, so it might not be perfect.
1403
+
1404
+ 0:46:29.889 --> 0:46:36.322
1405
+ So the question is, can we adapt to that?
1406
+
1407
+ 0:46:36.169 --> 0:46:45.360
1408
+ And can the system be improved by saying that
1409
+ it can some.
1410
+
1411
+ 0:46:45.265 --> 0:46:50.591
1412
+ So that is as aware that before there is a.
1413
+
1414
+ 0:46:50.490 --> 0:46:55.449
1415
+ The ASR output might not be the best one.
1416
+
1417
+ 0:46:55.935 --> 0:47:01.961
1418
+ There are different ways of dealing with them.
1419
+
1420
+ 0:47:01.833 --> 0:47:08.118
1421
+ You can use not just the best hypothesis but an n-best list.
1422
+
1423
+ 0:47:08.408 --> 0:47:16.711
1424
+ So the idea is that you're not only telling
1425
+ the system this is the transcript, but here
1426
+
1427
+ 0:47:16.711 --> 0:47:18.692
1428
+ I'm not going to be.
1429
+
1430
+ 0:47:19.419 --> 0:47:30.748
1431
+ Or that you can try to make it more robust
1432
+ towards errors from an ASR system so that.
1433
+
1434
+ 0:47:32.612 --> 0:47:48.657
1435
+ Interesting what is often done is hope convince
1436
+ you it might be a good idea to deal.
1437
+
1438
+ 0:47:48.868 --> 0:47:57.777
1439
+ The interesting thing is if you're looking
1440
+ into a lot of systems, this is often ignored,
1441
+
1442
+ 0:47:57.777 --> 0:48:04.784
1443
+ so they are not adapting their MT system to
1444
+ this type of ASR system.
1445
+
1446
+ 0:48:05.345 --> 0:48:15.232
1447
+ So it's not really doing any handling of errors,
1448
+ and the interesting thing is often works as
1449
+
1450
+ 0:48:15.232 --> 0:48:15.884
1451
+ good.
1452
+
1453
+ 0:48:16.516 --> 0:48:23.836
1454
+ And one reason is, of course, one reason is
1455
+ if the ASR system does an error it is often like
1456
+
1457
+ 0:48:23.836 --> 0:48:31.654
1458
+ a challenging situation, and then it
1459
+ is really hard for the MT system to detect.
1460
+
1461
+ 0:48:31.931 --> 0:48:38.080
1462
+ If it would be easy for the system to detect
1463
+ the error you would integrate this information
1464
+
1465
+ 0:48:38.080 --> 0:48:44.296
1466
+ into: That is not always the case, but that
1467
+ of course makes it a bit challenging, and that's
1468
+
1469
+ 0:48:44.296 --> 0:48:49.776
1470
+ why there is a lot of systems where it's not
1471
+ explicitly handled how to deal with.
1472
+
1473
+ 0:48:52.912 --> 0:49:06.412
1474
+ But of course it might be good, so one thing
1475
+ is you can give it an n-best list and you can
1476
+
1477
+ 0:49:06.412 --> 0:49:09.901
1478
+ translate every entry.
1479
+
1480
+ 0:49:10.410 --> 0:49:17.705
1481
+ And then you have two scores like the MT probability
1482
+ and the ASR probability.
1483
+
1484
+ 0:49:18.058 --> 0:49:25.695
1485
+ Combine them and then generate or output the
1486
+ output from what has the best combined.
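A rough sketch of that combined rescoring; the score helpers and the interpolation weight are placeholders standing in for the real ASR and MT models:

```python
def rescore(nbest, asr_logprob, translate_with_score, lam=0.5):
    """nbest: list of ASR hypotheses; asr_logprob: dict hypothesis -> log P_ASR;
    translate_with_score: assumed helper returning (translation, log P_MT)."""
    best = None
    for hyp in nbest:
        translation, mt_score = translate_with_score(hyp)
        score = lam * asr_logprob[hyp] + (1 - lam) * mt_score   # log-linear combination
        if best is None or score > best[0]:
            best = (score, hyp, translation)
    return best   # (combined score, chosen transcript, its translation)
```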
1487
+
1488
+ 0:49:26.366 --> 0:49:29.891
1489
+ And then it might no longer be the best.
1490
+
1491
+ 0:49:29.805 --> 0:49:38.144
1492
+ It might be like we had in beam search, so this
1493
+ has the best score, but this has a better combined.
1494
+
1495
+ 0:49:39.059 --> 0:49:46.557
1496
+ The problem sometimes works, but the problem
1497
+ is that the MT system might then tend to
1498
+
1499
+ 0:49:46.557 --> 0:49:52.777
1500
+ just translate not the correct sentence but
1501
+ the one easier to translate.
1502
+
1503
+ 0:49:53.693 --> 0:50:03.639
1504
+ You can also generate a more compact representation
1505
+ of this n-best list by having this type of
1506
+
1507
+ 0:50:03.639 --> 0:50:04.467
1508
+ graphs.
1509
+
1510
+ 0:50:05.285 --> 0:50:22.952
1511
+ Lattices: So then you could like try to do
1512
+ a graph to text translation so you can translate.
1513
+
1514
+ 0:50:22.802 --> 0:50:26.582
1515
+ Where like all possibilities, by the way our
1516
+ systems are invented.
1517
+
1518
+ 0:50:26.906 --> 0:50:31.485
1519
+ So it can be like a hostage, a conference
1520
+ with some probabilities.
1521
+
1522
+ 0:50:31.591 --> 0:50:35.296
1523
+ So the highest probability is here.
1524
+
1525
+ 0:50:35.193 --> 0:50:41.986
1526
+ Conference is being recorded, but there are
1527
+ other possibilities.
1528
+
1529
+ 0:50:42.302 --> 0:50:53.054
1530
+ And you can take all of this information out
1531
+ there with your probabilities.
1532
+
1533
+ 0:50:59.980 --> 0:51:07.614
1534
+ But we'll see this type of error propagation
1535
+ that if you have an error that this might then
1536
+
1537
+ 0:51:07.614 --> 0:51:15.165
1538
+ propagate to MT errors, and this is one of the main
1539
+ reasons why people looked into other ways of
1540
+
1541
+ 0:51:15.165 --> 0:51:17.240
1542
+ doing it and not having.
1543
+
1544
+ 0:51:19.219 --> 0:51:28.939
1545
+ Advantages and DisadvantagesBy generally a
1546
+ cascaded combination, as we've seen it, it
1547
+
1548
+ 0:51:28.939 --> 0:51:39.224
1549
+ has several advantages: The biggest maybe is
1550
+ the data availability so we can train systems
1551
+
1552
+ 0:51:39.224 --> 0:51:42.615
1553
+ for the different components.
1554
+
1555
+ 0:51:42.822 --> 0:51:47.228
1556
+ So you can train your individual components
1557
+ on relatively large data sets.
1558
+
1559
+ 0:51:47.667 --> 0:51:58.207
1560
+ A modular system where you can improve each
1561
+ individual model and if there's new development
1562
+
1563
+ 0:51:58.207 --> 0:52:01.415
1564
+ and models you can improve.
1565
+
1566
+ 0:52:01.861 --> 0:52:11.603
1567
+ There are several advantages, but of course
1568
+ there are also some disadvantages: The most
1569
+
1570
+ 0:52:11.603 --> 0:52:19.574
1571
+ common thing is that there is what is referred
1572
+ to as error propagation.
1573
+
1574
+ 0:52:19.463 --> 0:52:28.223
1575
+ If the ASR makes an error, probably your output
1576
+ will then directly have an error.
1577
+
1578
+ 0:52:28.868 --> 0:52:41.740
1579
+ Typically it's like if there's an error in
1580
+ the ASR system, it's easier to like ignore in the
1581
+
1582
+ 0:52:41.740 --> 0:52:46.474
1583
+ transcript than in the MT output.
1584
+
1585
+ 0:52:46.967 --> 0:52:49.785
1586
+ What do that mean?
1587
+
1588
+ 0:52:49.637 --> 0:53:01.211
1589
+ It's complicated, so if you have German, the
1590
+ ASR does an error, and instead.
1591
+
1592
+ 0:53:01.101 --> 0:53:05.976
1593
+ Then most probably you'll ignore it or you'll
1594
+ still know what it was said.
1595
+
1596
+ 0:53:05.911 --> 0:53:11.817
1597
+ Maybe you even don't notice because you'll
1598
+ fastly read over it and don't see that there's
1599
+
1600
+ 0:53:11.817 --> 0:53:12.998
1601
+ one letter wrong.
1602
+
1603
+ 0:53:13.673 --> 0:53:25.291
1604
+ However, if you translate this one in an English
1605
+ sentence about speeches, there's something
1606
+
1607
+ 0:53:25.291 --> 0:53:26.933
1608
+ about wines.
1609
+
1610
+ 0:53:27.367 --> 0:53:37.238
1611
+ So it's a lot easier typically to read over
1612
+ like errors in the transcript than reading over them in
1613
+
1614
+ 0:53:37.238 --> 0:53:38.569
1615
+ the speech.
1616
+
1617
+ 0:53:40.120 --> 0:53:45.863
1618
+ But there are additional challenges in cascaded
1619
+ systems.
1620
+
1621
+ 0:53:46.066 --> 0:53:52.667
1622
+ So secondly we have seen that we optimize
1623
+ each component individually so you have a separate
1624
+
1625
+ 0:53:52.667 --> 0:53:59.055
1626
+ optimization and that doesn't mean that the
1627
+ overall performance is really the best at the
1628
+
1629
+ 0:53:59.055 --> 0:53:59.410
1630
+ end.
1631
+
1632
+ 0:53:59.899 --> 0:54:07.945
1633
+ And we have tried to do that by already saying
1634
+ yes.
1635
+
1636
+ 0:54:07.790 --> 0:54:17.694
1637
+ You need to adapt them a bit to work good
1638
+ together, but still.
1639
+
1640
+ 0:54:20.280 --> 0:54:24.185
1641
+ Secondly, like that, there's a computational
1642
+ complexity.
1643
+
1644
+ 0:54:24.116 --> 0:54:30.315
1645
+ You always need to run an ASR system and an
1646
+ MTT system, and especially if you think about
1647
+
1648
+ 0:54:30.315 --> 0:54:32.864
1649
+ it, it should be fast and real time.
1650
+
1651
+ 0:54:32.795 --> 0:54:37.067
1652
+ It's challenging to always run two systems
1653
+ and not a single.
1654
+
1655
+ 0:54:38.038 --> 0:54:45.245
1656
+ And one final thing which you might have not
1657
+ directly thought of, but most of the world's
1658
+
1659
+ 0:54:45.245 --> 0:54:47.407
1660
+ languages do not have any.
1661
+
1662
+ 0:54:48.108 --> 0:55:01.942
1663
+ So if you have a language which doesn't have
1664
+ any script, then of course if you want to translate
1665
+
1666
+ 0:55:01.942 --> 0:55:05.507
1667
+ it you cannot first use.
1668
+
1669
+ 0:55:05.905 --> 0:55:13.705
1670
+ So in order to do this, the approach, as mentioned
1671
+ before already, is to
1672
+
1673
+ 0:55:13.585 --> 0:55:24.265
1674
+ Build somehow a system which takes the audio
1675
+ and directly generates text in the target.
1676
+
1677
+ 0:55:26.006 --> 0:55:41.935
1678
+ And there is quite big opportunity for that
1679
+ because before that there was very different
1680
+
1681
+ 0:55:41.935 --> 0:55:44.082
1682
+ technology.
1683
+
1684
+ 0:55:44.644 --> 0:55:55.421
1685
+ However, since we are using neuromachine translation
1686
+ encoded decoder models, the interesting thing
1687
+
1688
+ 0:55:55.421 --> 0:56:00.429
1689
+ is that we are using very similar technology.
1690
+
1691
+ 0:56:00.360 --> 0:56:06.047
1692
+ It's like in both cases very similar architecture.
1693
+
1694
+ 0:56:05.935 --> 0:56:09.284
1695
+ The main difference is once.
1696
+
1697
+ 0:56:09.649 --> 0:56:17.143
1698
+ But generally how it's done is very similar,
1699
+ and therefore of course it might be put everything
1700
+
1701
+ 0:56:17.143 --> 0:56:22.140
1702
+ together, and that is what is referred to as
1703
+ end-to-end speech.
1704
+
1705
+ 0:56:22.502 --> 0:56:31.411
1706
+ So that means we're having one large neural
1707
+ network and decoded voice system, but we put
1708
+
1709
+ 0:56:31.411 --> 0:56:34.914
1710
+ an audio in one language and then.
1711
+
1712
+ 0:56:36.196 --> 0:56:43.106
1713
+ We can then have a system which directly does
1714
+ the full process.
1715
+
1716
+ 0:56:42.998 --> 0:56:46.457
1717
+ We don't have to care anymore.
1718
+
1719
+ 0:56:48.048 --> 0:57:02.615
1720
+ So if you think of it as before, so we have
1721
+ this decoder, and that's the two separate.
1722
+
1723
+ 0:57:02.447 --> 0:57:04.805
1724
+ We have the.
1725
+
1726
+ 0:57:05.085 --> 0:57:18.044
1727
+ And instead of going via the discrete text
1728
+ representation in the Suez language, we can
1729
+
1730
+ 0:57:18.044 --> 0:57:21.470
1731
+ go via the continuous.
1732
+
1733
+ 0:57:21.681 --> 0:57:26.027
1734
+ Of course, the hope is, by not doing this
1735
+ discretization in between.
1736
+
1737
+ 0:57:26.146 --> 0:57:30.275
1738
+ We don't have a problem at doing errors.
1739
+
1740
+ 0:57:30.174 --> 0:57:32.797
1741
+ We can only cover later.
1742
+
1743
+ 0:57:32.772 --> 0:57:47.849
1744
+ But we can encode here the variability or
1745
+ so that we have and then only define the decision.
1746
+
1747
+ 0:57:51.711 --> 0:57:54.525
1748
+ And so.
1749
+
1750
+ 0:57:54.274 --> 0:58:02.253
1751
+ What we're doing is we're having very similar
1752
+ technique.
1753
+
1754
+ 0:58:02.113 --> 0:58:12.194
1755
+ We're having still the decoder model where
1756
+ we're coming from the main.
1757
+
1758
+ 0:58:12.552 --> 0:58:24.098
1759
+ Instead of getting discrete tokens in there
1760
+ as we have subwords, we always encoded that
1761
+
1762
+ 0:58:24.098 --> 0:58:26.197
1763
+ in one pattern.
1764
+
1765
+ 0:58:26.846 --> 0:58:42.505
1766
+ The problem is that this is in continuous,
1767
+ so we have to check how we can work with continuous
1768
+
1769
+ 0:58:42.505 --> 0:58:43.988
1770
+ signals.
1771
+
1772
+ 0:58:47.627 --> 0:58:55.166
1773
+ Mean, the first thing in your system is when
1774
+ you do your disc freeze and code it.
1775
+
1776
+ 0:59:02.402 --> 0:59:03.888
1777
+ A newer machine translation.
1778
+
1779
+ 0:59:03.837 --> 0:59:05.041
1780
+ You're getting a word.
1781
+
1782
+ 0:59:04.989 --> 0:59:06.300
1783
+ It's one hot, some not.
1784
+
1785
+ 0:59:21.421 --> 0:59:24.678
1786
+ The first layer of the machine translation.
1787
+
1788
+ 0:59:27.287 --> 0:59:36.147
1789
+ Yes, you do the word embedding, so then you
1790
+ have a continuous thing.
1791
+
1792
+ 0:59:36.019 --> 0:59:40.132
1793
+ So if you know get continuous.
1794
+
1795
+ 0:59:40.961 --> 0:59:46.316
1796
+ Deal with it the same way, so we'll see not
1797
+ a big of a challenge.
1798
+
1799
+ 0:59:46.235 --> 0:59:48.672
1800
+ What is more challenging is.
1801
+
1802
+ 0:59:49.349 --> 1:00:04.498
1803
+ So the audio signal is ten times longer or
1804
+ so, like more time steps you have.
1805
+
1806
+ 1:00:04.764 --> 1:00:10.332
1807
+ And so that is, of course, any challenge how
1808
+ we can deal with this type of long sequence.
1809
+
1810
+ 1:00:11.171 --> 1:00:13.055
1811
+ The advantage is a bit.
1812
+
1813
+ 1:00:12.976 --> 1:00:17.867
1814
+ The long sequence is only at the input and
1815
+ not at the output.
1816
+
1817
+ 1:00:17.789 --> 1:00:24.938
1818
+ So when you remember for the efficiency, for
1819
+ example, like a long sequence are especially
1820
+
1821
+ 1:00:24.938 --> 1:00:29.228
1822
+ challenging in the decoder, but also for the
1823
+ encoder.
1824
+
1825
+ 1:00:31.371 --> 1:00:33.595
1826
+ So how it is this?
1827
+
1828
+ 1:00:33.478 --> 1:00:40.619
1829
+ How can we process audio into an speech translation
1830
+ system?
1831
+
1832
+ 1:00:41.501 --> 1:00:51.856
1833
+ And you can follow mainly what is done in
1834
+ an system, so you have the audio signal.
1835
+
1836
+ 1:00:52.172 --> 1:00:59.135
1837
+ Then you measure your amplitude at every time
1838
+ step.
1839
+
1840
+ 1:00:59.001 --> 1:01:04.361
1841
+ It's typically something like killing.
1842
+
1843
+ 1:01:04.384 --> 1:01:13.893
1844
+ And then you're doing this, this windowing,
1845
+ so that you get a signal of a length twenty
1846
+
1847
+ 1:01:13.893 --> 1:01:22.430
1848
+ to thirty milliseconds, and you have all these windows
1849
+ so that you measure them.
1850
+
1851
+ 1:01:22.342 --> 1:01:32.260
1852
+ A simple gear, and then you look at these
1853
+ time signals of seconds.
1854
+
1855
+ 1:01:32.432 --> 1:01:36.920
1856
+ So in the end then if it is ten seconds, that is ten
1857
+ thousand milliseconds.
1858
+
1859
+ 1:01:36.844 --> 1:01:39.737
1860
+ You have for every ten milliseconds.
1861
+
1862
+ 1:01:40.000 --> 1:01:48.309
1863
+ Some type of representation which type of
1864
+ representation you can generate from that,
1865
+
1866
+ 1:01:48.309 --> 1:01:49.286
1867
+ but that.
1868
+
1869
+ 1:01:49.649 --> 1:02:06.919
1870
+ So instead of having no letter or word, you
1871
+ have no representations for every 10mm of your
1872
+
1873
+ 1:02:06.919 --> 1:02:08.437
1874
+ system.
1875
+
1876
+ 1:02:08.688 --> 1:02:13.372
1877
+ How we record that now your thirty second
1878
+ window here there is different ways.
1879
+
1880
+ 1:02:16.176 --> 1:02:31.891
1881
+ Was a traditional way of how people have done
1882
+ that from an audio signal what frequencies
1883
+
1884
+ 1:02:31.891 --> 1:02:34.010
1885
+ are in the.
1886
+
1887
+ 1:02:34.114 --> 1:02:44.143
1888
+ So to do that you can compute these mel-frequency
1889
+ cepstral coefficients, so you can use Fourier transformations.
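A possible sketch of this classic feature extraction using torchaudio's MFCC transform; the file name is hypothetical and the window and hop sizes are common defaults, not values from the lecture:

```python
import torchaudio

wave, sr = torchaudio.load("audio.wav")                # hypothetical input file
mfcc = torchaudio.transforms.MFCC(
    sample_rate=sr, n_mfcc=40,
    melkwargs={"n_fft": 400, "hop_length": 160},       # ~25 ms window, 10 ms shift at 16 kHz
)(wave)
# mfcc has shape (channels, 40, num_frames): one 40-dim vector per ~10 ms frame
```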
1890
+
1891
+ 1:02:44.324 --> 1:02:47.031
1892
+ Which frequencies are there?
1893
+
1894
+ 1:02:46.938 --> 1:02:53.568
1895
+ You know that the letters are different by
1896
+ the different frequencies.
1897
+
1898
+ 1:02:53.813 --> 1:03:04.243
1899
+ And then if you're doing that, use the matte
1900
+ to covers for your window we have before.
1901
+
1902
+ 1:03:04.624 --> 1:03:15.086
1903
+ So for each of these windows: You will calculate
1904
+ what frequencies in there and then get features
1905
+
1906
+ 1:03:15.086 --> 1:03:20.047
1907
+ for this window and features for this window.
1908
+
1909
+ 1:03:19.980 --> 1:03:28.028
1910
+ These are the frequencies that occur there
1911
+ and that help you to model which letters are
1912
+
1913
+ 1:03:28.028 --> 1:03:28.760
1914
+ spoken.
1915
+
1916
+ 1:03:31.611 --> 1:03:43.544
1917
+ More recently, instead of doing the traditional
1918
+ signal processing, you can also replace that
1919
+
1920
+ 1:03:43.544 --> 1:03:45.853
1921
+ by deep learning.
1922
+
1923
+ 1:03:46.126 --> 1:03:56.406
1924
+ So that we are using a self-supervised approach
1925
+ from language model to generate features that
1926
+
1927
+ 1:03:56.406 --> 1:03:58.047
1928
+ describe what.
1929
+
1930
+ 1:03:58.358 --> 1:03:59.821
1931
+ So you have your.
1932
+
1933
+ 1:03:59.759 --> 1:04:07.392
1934
+ All your signal again, and then for each chunk
1935
+ to do your convolutional neural networks to
1936
+
1937
+ 1:04:07.392 --> 1:04:07.811
1938
+ get.
1939
+
1940
+ 1:04:07.807 --> 1:04:23.699
1941
+ First representation here is a transformer
1942
+ network here, and in the end it's similar to
1943
+
1944
+ 1:04:23.699 --> 1:04:25.866
1945
+ a language.
1946
+
1947
+ 1:04:25.705 --> 1:04:30.238
1948
+ And you try to predict what was masked
1949
+ here.
1950
+
1951
+ 1:04:30.670 --> 1:04:40.806
1952
+ So that is in a way similar that you also
1953
+ try to learn a good representation of all these
1954
+
1955
+ 1:04:40.806 --> 1:04:51.281
1956
+ audio signals by predicting: And then you don't
1957
+ do the signal processing base, but have this
1958
+
1959
+ 1:04:51.281 --> 1:04:52.745
1960
+ way to make.
1961
+
1962
+ 1:04:52.812 --> 1:04:59.430
1963
+ But in all the things that you have to remember
1964
+ what is most important for you, and to end
1965
+
1966
+ 1:04:59.430 --> 1:05:05.902
1967
+ system is, of course, that you in the end get
1968
+ for every minute ten milliseconds, you get
1969
+
1970
+ 1:05:05.902 --> 1:05:11.283
1971
+ a representation of this audio signal, which
1972
+ is again a vector, and that.
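A sketch of the self-supervised alternative, using a pretrained wav2vec 2.0 checkpoint as an example feature extractor; the model name and the silent dummy input are illustrative only:

```python
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

raw_speech = [0.0] * 16000                          # placeholder: 1 s of audio at 16 kHz
inputs = extractor(raw_speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    frames = model(**inputs).last_hidden_state      # (1, ~49 frames, 768): one vector per ~20 ms
```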
1973
+
1974
+ 1:05:11.331 --> 1:05:15.365
1975
+ And then you can use your normal encoder to
1976
+ code your model to do this research.
1977
+
1978
+ 1:05:21.861 --> 1:05:32.694
1979
+ So that is all which directly has to be changed,
1980
+ and then you can build your first base.
1981
+
1982
+ 1:05:33.213 --> 1:05:37.167
1983
+ You do the audio processing.
1984
+
1985
+ 1:05:37.031 --> 1:05:49.153
1986
+ You of course need data which is like Audio
1987
+ and English and Text in German and then you
1988
+
1989
+ 1:05:49.153 --> 1:05:50.668
1990
+ can train.
1991
+
1992
+ 1:05:53.333 --> 1:05:57.854
1993
+ And interestingly, it works at the beginning.
1994
+
1995
+ 1:05:57.756 --> 1:06:03.263
1996
+ The systems were maybe a bit worse, but we
1997
+ saw really.
1998
+
1999
+ 1:06:03.964 --> 1:06:11.803
2000
+ This is like from the biggest workshop where
2001
+ people like compared different systems.
2002
+
2003
+ 1:06:11.751 --> 1:06:17.795
2004
+ Special challenge on comparing Cascaded to
2005
+ end to end systems and you see two thousand
2006
+
2007
+ 1:06:17.795 --> 1:06:18.767
2008
+ and eighteen.
2009
+
2010
+ 1:06:18.698 --> 1:06:25.068
2011
+ We had quite a huge gap between the Cascaded
2012
+ and end to end systems and then it got nearer
2013
+
2014
+ 1:06:25.068 --> 1:06:27.937
2015
+ and earlier in starting in two thousand.
2016
+
2017
+ 1:06:27.907 --> 1:06:33.619
2018
+ Twenty the performance was mainly the same,
2019
+ so there was no clear difference anymore.
2020
+
2021
+ 1:06:34.014 --> 1:06:42.774
2022
+ So this is, of course, writing a bit of hope
2023
+ saying if we better learn how to build these
2024
+
2025
+ 1:06:42.774 --> 1:06:47.544
2026
+ end-to-end systems, they might really perform better.
2027
+
2028
+ 1:06:49.549 --> 1:06:52.346
2029
+ However, a bit.
2030
+
2031
+ 1:06:52.452 --> 1:06:59.018
2032
+ This satisfying this is how this all continues,
2033
+ and this is not only in two thousand and twenty
2034
+
2035
+ 1:06:59.018 --> 1:07:04.216
2036
+ one, but even nowadays we can say there is
2037
+ no clear performance difference.
2038
+
2039
+ 1:07:04.148 --> 1:07:10.919
2040
+ It's not like the one model is better than
2041
+ the other, but we are seeing very similar performance.
2042
+
2043
+ 1:07:11.391 --> 1:07:19.413
2044
+ So the question is what is the difference?
2045
+
2046
+ 1:07:19.227 --> 1:07:29.119
2047
+ Of course, this can only be achieved by new
2048
+ tricks.
2049
+
2050
+ 1:07:30.570 --> 1:07:35.658
2051
+ Yes and no, that's what we will mainly look
2052
+ into now.
2053
+
2054
+ 1:07:35.564 --> 1:07:39.335
2055
+ How can we make use of other types of.
2056
+
2057
+ 1:07:39.359 --> 1:07:53.236
2058
+ In that case you can achieve some performance
2059
+ by using different types of training so you
2060
+
2061
+ 1:07:53.236 --> 1:07:55.549
2062
+ can also make.
2063
+
2064
+ 1:07:55.855 --> 1:08:04.961
2065
+ So if you are training or preparing the systems
2066
+ only on very small corpora where you have as
2067
+
2068
+ 1:08:04.961 --> 1:08:10.248
2069
+ much data than you have for the individual
2070
+ ones then.
2071
+
2072
+ 1:08:10.550 --> 1:08:22.288
2073
+ So that is the biggest challenge of an end
2074
+ system that you have small corpora and therefore.
2075
+
2076
+ 1:08:24.404 --> 1:08:30.479
2077
+ Of course, there is several advantages so
2078
+ you can give access to the audio information.
2079
+
2080
+ 1:08:30.750 --> 1:08:42.046
2081
+ So that's, for example, interesting if you
2082
+ think about it, you might not have modeled
2083
+
2084
+ 1:08:42.046 --> 1:08:45.198
2085
+ everything in the text.
2086
+
2087
+ 1:08:45.067 --> 1:08:50.324
2088
+ So remember when we talk about biases.
2089
+
2090
+ 1:08:50.230 --> 1:08:55.448
2091
+ Male or female, and that of course is not
2092
+ in the text any more, but in the audio signal
2093
+
2094
+ 1:08:55.448 --> 1:08:56.515
2095
+ it's still there.
2096
+
2097
+ 1:08:58.078 --> 1:09:03.108
2098
+ It also allows you to talk about that on Thursday
2099
+ when you talk about latency.
2100
+
2101
+ 1:09:03.044 --> 1:09:08.871
2102
+ You have a bit better chance if you do an
2103
+ end to end system to get a lower latency because
2104
+
2105
+ 1:09:08.871 --> 1:09:14.378
2106
+ you only have one system and you don't have
2107
+ two systems which might have to wait for.
2108
+
2109
+ 1:09:14.934 --> 1:09:20.046
2110
+ And having one system might be also a bit
2111
+ easier management.
2112
+
2113
+ 1:09:19.962 --> 1:09:23.149
2114
+ See that two systems work and so on.
2115
+
2116
+ 1:09:26.346 --> 1:09:41.149
2117
+ The biggest challenge of end systems is the
2118
+ data, so as you correctly pointed out, typically
2119
+
2120
+ 1:09:41.149 --> 1:09:42.741
2121
+ there is.
2122
+
2123
+ 1:09:43.123 --> 1:09:45.829
2124
+ There is some data for Ted.
2125
+
2126
+ 1:09:45.733 --> 1:09:47.399
2127
+ People did that.
2128
+
2129
+ 1:09:47.301 --> 1:09:52.792
2130
+ They took the English audio with all the translations.
2131
+
2132
+ 1:09:53.273 --> 1:10:02.423
2133
+ But in general there is a lot less, so we'll
2134
+ look into how you can use other data sources.
2135
+
2136
+ 1:10:05.305 --> 1:10:10.934
2137
+ Audio TranslationAnd secondly, the second
2138
+ challenge is that we have to deal with audio.
2139
+
2140
+ 1:10:11.431 --> 1:10:22.163
2141
+ For example, in input length, and therefore
2142
+ it's also important to handle this in your
2143
+
2144
+ 1:10:22.163 --> 1:10:27.590
2145
+ network and maybe have dedicated solutions.
2146
+
2147
+ 1:10:31.831 --> 1:10:40.265
2148
+ So in general we have this challenge that
2149
+ we have a lot of text and translation and audio
2150
+
2151
+ 1:10:40.265 --> 1:10:43.076
2152
+ transcript data by quite few.
2153
+
2154
+ 1:10:43.643 --> 1:10:50.844
2155
+ So what can we do in one trick?
2156
+
2157
+ 1:10:50.619 --> 1:11:00.750
2158
+ You already know a bit from other research.
2159
+
2160
+ 1:11:02.302 --> 1:11:14.325
2161
+ Exactly so what you can do is you can, for
2162
+ example, use a text-to-speech system to generate
2163
+
2164
+ 1:11:14.325 --> 1:11:19.594
2165
+ an audio of the source language, and then.
2166
+
2167
+ 1:11:21.341 --> 1:11:33.780
2168
+ There has been a bit motivated by what we
2169
+ have seen in back translation, which was very
2170
+
2171
+ 1:11:33.780 --> 1:11:35.476
2172
+ successful.
2173
+
2174
+ 1:11:38.758 --> 1:11:54.080
2175
+ However, it's a bit more challenging because
2176
+ it is often very different from real audio.
2177
+
2178
+ 1:11:54.314 --> 1:12:07.131
2179
+ So often if you build a system only trained
2180
+ on synthetic audio, then generalizing to real audio data
2181
+
2182
+ 1:12:07.131 --> 1:12:10.335
2183
+ is quite challenging.
2184
+
2185
+ 1:12:10.910 --> 1:12:20.927
2186
+ And therefore here the synthetic data generation
2187
+ is significantly more challenging than when.
2188
+
2189
+ 1:12:20.981 --> 1:12:27.071
2190
+ Because if you read a text, it's maybe bad
2191
+ translation.
2192
+
2193
+ 1:12:26.962 --> 1:12:33.163
2194
+ It's hard, but it's a real text or a text
2195
+ generated by.
2196
+
2197
+ 1:12:35.835 --> 1:12:42.885
2198
+ But it's a valid solution, and for example
2199
+ we use that also for say current systems.
2200
+
2201
+ 1:12:43.923 --> 1:12:53.336
2202
+ Of course you can also do a bit of forward
2203
+ translation that is done so that you take data.
2204
+
2205
+ 1:12:53.773 --> 1:13:02.587
2206
+ But then the problem is that your reference
2207
+ is not always correct, and you remember when
2208
+
2209
+ 1:13:02.587 --> 1:13:08.727
2210
+ we talked about back translation, it's a bit
2211
+ of an advantage.
2212
+
2213
+ 1:13:09.229 --> 1:13:11.930
2214
+ But both can be done and both have been done.
2215
+
2216
+ 1:13:12.212 --> 1:13:20.277
2217
+ So you can think about this picture again.
2218
+
2219
+ 1:13:20.089 --> 1:13:30.221
2220
+ You can take this data and generate the audio
2221
+ to it.
2222
+
2223
+ 1:13:30.750 --> 1:13:39.007
2224
+ However, it is only synthetic of what can
2225
+ be used for the voice handling technology for:
2226
+
2227
+ 1:13:39.007 --> 1:13:47.078
2228
+ But you have not, I mean, yet you get text
2229
+ to speech, but the voice cloning would need
2230
+
2231
+ 1:13:47.078 --> 1:13:47.913
2232
+ a voice.
2233
+
2234
+ 1:13:47.821 --> 1:13:53.113
2235
+ You can use, of course, and then it's nothing
2236
+ else than a normal.
2237
+
2238
+ 1:13:54.594 --> 1:14:03.210
2239
+ But still think there are better than both,
2240
+ but there are some characteristics of that
2241
+
2242
+ 1:14:03.210 --> 1:14:05.784
2243
+ which is quite different.
2244
+
2245
+ 1:14:07.327 --> 1:14:09.341
2246
+ But yeah, it's getting better.
2247
+
2248
+ 1:14:09.276 --> 1:14:13.499
2249
+ That is definitely true, and then this might
2250
+ get more and more.
2251
+
2252
+ 1:14:16.596 --> 1:14:21.885
2253
+ Here make sure it's a good person and our
2254
+ own systems because we try to train and.
2255
+
2256
+ 1:14:21.881 --> 1:14:24.356
2257
+ And it's like a feedback mood.
2258
+
2259
+ 1:14:24.277 --> 1:14:28.669
2260
+ There's anything like the Dutch English model
2261
+ that's.
2262
+
2263
+ 1:14:28.648 --> 1:14:33.081
2264
+ Yeah, you of course need a decent amount of
2265
+ real data.
2266
+
2267
+ 1:14:33.001 --> 1:14:40.228
2268
+ But I mean, as I said, so there is always
2269
+ an advantage if you have this synthetics thing
2270
+
2271
+ 1:14:40.228 --> 1:14:44.045
2272
+ only on the input side and not on the outside.
2273
+
2274
+ 1:14:44.464 --> 1:14:47.444
2275
+ That you at least always generate correct
2276
+ outcomes.
2277
+
2278
+ 1:14:48.688 --> 1:14:54.599
2279
+ That's different in a language case because
2280
+ they have input and the output and it's not
2281
+
2282
+ 1:14:54.599 --> 1:14:55.002
2283
+ like.
2284
+
2285
+ 1:14:58.618 --> 1:15:15.815
2286
+ The other idea is to integrate additional
2287
+ sources so you can have more model sharing.
2288
+
2289
+ 1:15:16.376 --> 1:15:23.301
2290
+ But you can use these components also in the
2291
+ system.
2292
+
2293
+ 1:15:23.171 --> 1:15:28.662
2294
+ Typically the text decoder and the text.
2295
+
2296
+ 1:15:29.169 --> 1:15:41.845
2297
+ And so the other way of languaging is to join
2298
+ a train or somehow train all these tasks.
2299
+
2300
+ 1:15:43.403 --> 1:15:54.467
2301
+ The first and easy thing to do is multi task
2302
+ training so the idea is you take these components
2303
+
2304
+ 1:15:54.467 --> 1:16:02.038
2305
+ and train these two components and train the
2306
+ speech translation.
2307
+
2308
+ 1:16:02.362 --> 1:16:13.086
2309
+ So then, for example, all your encoders used
2310
+ by the speech translation system can also gain
2311
+
2312
+ 1:16:13.086 --> 1:16:14.951
2313
+ from the large.
2314
+
2315
+ 1:16:14.975 --> 1:16:24.048
2316
+ So everything can gain a bit of emphasis,
2317
+ but it can partly gain in there quite a bit.
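A schematic sketch of such multi-task training with a shared speech encoder and shared text decoder; all modules, batch fields, and the `.loss()` helpers are placeholders, not a specific framework's API:

```python
def multi_task_loss(batch, speech_enc, text_enc, asr_dec, mt_dec):
    """Each *_enc/*_dec is a placeholder module exposing a .loss(encoded, reference) helper."""
    loss = 0.0
    if "st" in batch:    # speech translation: audio -> target text (speech encoder + MT decoder)
        loss += mt_dec.loss(speech_enc(batch["st"]["audio"]), batch["st"]["tgt_text"])
    if "asr" in batch:   # ASR: audio -> source text (shares the speech encoder)
        loss += asr_dec.loss(speech_enc(batch["asr"]["audio"]), batch["asr"]["src_text"])
    if "mt" in batch:    # text MT: source text -> target text (shares the MT decoder)
        loss += mt_dec.loss(text_enc(batch["mt"]["src_text"]), batch["mt"]["tgt_text"])
    return loss
```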
2318
+
2319
+ 1:16:27.407 --> 1:16:39.920
2320
+ The other idea is to do it in a pre-training
2321
+ phase.
2322
+
2323
+ 1:16:40.080 --> 1:16:50.414
2324
+ And then you take the end coder and the text
2325
+ decoder and trade your model on that.
2326
+
2327
+ 1:16:54.774 --> 1:17:04.895
2328
+ Finally, there is also what is referred to
2329
+ as knowledge distillation, so there you have
2330
+
2331
+ 1:17:04.895 --> 1:17:11.566
2332
+ to remember if you learn from a probability
2333
+ distribution.
2334
+
2335
+ 1:17:11.771 --> 1:17:24.371
2336
+ So what you can do then is you have your system
2337
+ and if you then have your audio and text input
2338
+
2339
+ 1:17:24.371 --> 1:17:26.759
2340
+ you can use your.
2341
+
2342
+ 1:17:27.087 --> 1:17:32.699
2343
+ And then get a more rich signal that you'll
2344
+ not only know this is the word, but you have
2345
+
2346
+ 1:17:32.699 --> 1:17:33.456
2347
+ a complete.
2348
+
2349
+ 1:17:34.394 --> 1:17:41.979
2350
+ Example is typically also done because, of
2351
+ course, if you have speech translation data, it is often the case
2352
+
2353
+ 1:17:41.979 --> 1:17:49.735
2354
+ that you don't only have source language audio
2355
+ and target language text, but then you also
2356
+
2357
+ 1:17:49.735 --> 1:17:52.377
2358
+ have the source language text.
2359
+
2360
+ 1:17:53.833 --> 1:18:00.996
2361
+ Get a good idea of the text editor and the
2362
+ artist design.
2363
+
2364
+ 1:18:00.872 --> 1:18:16.051
2365
+ Now have to be aligned so that: Otherwise
2366
+ they wouldn't be able to determine which degree
2367
+
2368
+ 1:18:16.051 --> 1:18:17.906
2369
+ they'd be.
2370
+
2371
+ 1:18:18.178 --> 1:18:25.603
2372
+ What you're doing in knowledge distillation
2373
+ is you run your MT and then you get your probability
2374
+
2375
+ 1:18:25.603 --> 1:18:32.716
2376
+ distribution for all the words and you use
2377
+ that to train and that is not only more helpful
2378
+
2379
+ 1:18:32.716 --> 1:18:34.592
2380
+ than only getting back.
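A minimal sketch of such a distillation loss, where the teacher distribution comes from the text MT model run on the source transcript and the student is the end-to-end speech translation model:

```python
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, temperature=1.0):
    """Both tensors: (batch, seq_len, vocab). Teacher = MT output distribution per target word."""
    t_probs = F.softmax(teacher_logits / temperature, dim=-1)
    s_logp = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(s_logp, t_probs, reduction="batchmean")
```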
2381
+
2382
+ 1:18:35.915 --> 1:18:44.427
2383
+ You can, of course, use the same decoder to
2384
+ be even similar.
2385
+
2386
+ 1:18:44.287 --> 1:18:49.732
2387
+ Otherwise you don't have exactly the.
2388
+
2389
+ 1:18:52.832 --> 1:19:03.515
2390
+ Is a good point making these tools, and generally
2391
+ in all these cases it's good to have more similar
2392
+
2393
+ 1:19:03.515 --> 1:19:05.331
2394
+ representations.
2395
+
2396
+ 1:19:05.224 --> 1:19:07.260
2397
+ You can transfer.
2398
+
2399
+ 1:19:07.607 --> 1:19:23.743
2400
+ If you hear your representation to give from
2401
+ the audio encoder and the text encoder are
2402
+
2403
+ 1:19:23.743 --> 1:19:27.410
2404
+ more similar, then.
2405
+
2406
+ 1:19:30.130 --> 1:19:39.980
2407
+ So here you have your text encoder in the
2408
+ target language and you can train it on large
2409
+
2410
+ 1:19:39.980 --> 1:19:40.652
2411
+ data.
2412
+
2413
+ 1:19:41.341 --> 1:19:45.994
2414
+ But of course you want to benefit also for
2415
+ this task because that's what your most interested.
2416
+
2417
+ 1:19:46.846 --> 1:19:59.665
2418
+ Of course, the most benefit for this task
2419
+ is if these two representations you give are
2420
+
2421
+ 1:19:59.665 --> 1:20:01.728
2422
+ more similar.
2423
+
2424
+ 1:20:02.222 --> 1:20:11.631
2425
+ Therefore, it's interesting to look into how
2426
+ can we make these two representations as similar
2427
+
2428
+ 1:20:11.631 --> 1:20:21.141
2429
+ as: The hope is that in the end you can't even
2430
+ do something like zero shot transfer, but while
2431
+
2432
+ 1:20:21.141 --> 1:20:25.945
2433
+ you only learn this one you can also deal with.
2434
+
2435
+ 1:20:30.830 --> 1:20:40.257
2436
+ So what you can do is you can look at these
2437
+ two representations.
2438
+
2439
+ 1:20:40.112 --> 1:20:42.876
2440
+ So once the text.
2441
+
2442
+ 1:20:43.003 --> 1:20:51.184
2443
+ And you can either put them into the text
2444
+ decoder to the encoder.
2445
+
2446
+ 1:20:51.060 --> 1:20:53.487
2447
+ We have seen both.
2448
+
2449
+ 1:20:53.359 --> 1:21:03.617
2450
+ You can think: If you want to build an A's
2451
+ and to insist on you can either take the audio
2452
+
2453
+ 1:21:03.617 --> 1:21:06.580
2454
+ encoder and see how deep.
2455
+
2456
+ 1:21:08.748 --> 1:21:21.915
2457
+ However, you have these two representations
2458
+ and you want to make them more similar.
2459
+
2460
+ 1:21:21.759 --> 1:21:23.655
2461
+ One thing.
2462
+
2463
+ 1:21:23.863 --> 1:21:32.797
2464
+ Here we have, like you said, for every ten
2465
+ milliseconds we have a representation.
2466
+
2467
+ 1:21:35.335 --> 1:21:45.763
2468
+ So what people may have done, for example,
2469
+ is to remove redundant information so you can:
2470
+
2471
+ 1:21:45.763 --> 1:21:56.308
2472
+ So you can use your system to put India based
2473
+ on letter or words and then average over the
2474
+
2475
+ 1:21:56.308 --> 1:21:58.394
2476
+ words or letters.
2477
+
2478
+ 1:21:59.179 --> 1:22:07.965
2479
+ So that the number of representations from
2480
+ the encoder is the same as you would get from.
2481
+
2482
+ 1:22:12.692 --> 1:22:20.919
2483
+ Okay, that much to data do have any more questions
2484
+ first about that.
2485
+
2486
+ 1:22:27.207 --> 1:22:39.507
2487
+ Then we'll finish with the audio processing
2488
+ and highlight a bit why this is challenging,
2489
+
2490
+ 1:22:39.507 --> 1:22:52.864
2491
+ so here's: One test here has one thousand eight
2492
+ hundred sentences, so there are words or characters.
2493
+
2494
+ 1:22:53.954 --> 1:22:59.336
2495
+ If you look how many all your features, so
2496
+ how many samples there is like one point five
2497
+
2498
+ 1:22:59.336 --> 1:22:59.880
2499
+ million.
2500
+
2501
+ 1:23:00.200 --> 1:23:10.681
2502
+ So you have ten times more audio features than you
2503
+ have characters, and then again five times
2504
+
2505
+ 1:23:10.681 --> 1:23:11.413
2506
+ more.
2507
+
2508
+ 1:23:11.811 --> 1:23:23.934
2509
+ So you have the sequence length of the audio
2510
+ as long as you have for words, and that is
2511
+
2512
+ 1:23:23.934 --> 1:23:25.788
2513
+ a challenge.
2514
+
2515
+ 1:23:26.086 --> 1:23:34.935
2516
+ So the question is what can you do to make
2517
+ the sequence a bit shorter and not have this?
2518
+
2519
+ 1:23:38.458 --> 1:23:48.466
2520
+ The one thing is you can try to reduce the
2521
+ dimensionality in your encoder.
2522
+
2523
+ 1:23:48.343 --> 1:23:50.821
2524
+ There's different.
2525
+
2526
+ 1:23:50.991 --> 1:24:04.302
2527
+ So, for example, you can just sum up always
2528
+ over some frames or you can do a concatenation.
2529
+
2530
+ 1:24:04.804 --> 1:24:12.045
2531
+ Or you do a linear projection, or you even take
2532
+ not every feature but only every fifth or something?
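A minimal sketch of one such length reduction, averaging every k consecutive frames; k = 4 is an arbitrary choice for illustration:

```python
import torch

def downsample(features: torch.Tensor, k: int = 4) -> torch.Tensor:
    """Average every k consecutive frames: (batch, time, dim) -> (batch, time // k, dim)."""
    b, t, d = features.shape
    t = (t // k) * k                                    # drop a possible remainder
    return features[:, :t].reshape(b, t // k, k, d).mean(dim=2)
```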
2533
+
2534
+ 1:24:12.492 --> 1:24:23.660
2535
+ So this way you can very easily reduce your
2536
+ number of features in there, and there has
2537
+
2538
+ 1:24:23.660 --> 1:24:25.713
2539
+ been different.
2540
+
2541
+ 1:24:26.306 --> 1:24:38.310
2542
+ There's also what you can do with things like
2543
+ a convolutional layer.
2544
+
2545
+ 1:24:38.136 --> 1:24:43.883
2546
+ If you skip over what you can,.
2547
+
2548
+ 1:24:47.327 --> 1:24:55.539
2549
+ And then, in addition to the audio, the other
2550
+ problem is higher variability.
2551
+
2552
+ 1:24:55.432 --> 1:25:04.641
2553
+ So if you have a text you can: But there are
2554
+ very different ways of saying that you can
2555
+
2556
+ 1:25:04.641 --> 1:25:09.874
2557
+ distinguish whether say a sentence or your
2558
+ voice.
2559
+
2560
+ 1:25:10.510 --> 1:25:21.224
2561
+ That of course makes it more challenging because
2562
+ now you get different inputs and while they
2563
+
2564
+ 1:25:21.224 --> 1:25:22.837
2565
+ were in text.
2566
+
2567
+ 1:25:23.263 --> 1:25:32.360
2568
+ So that makes especially for limited data
2569
+ things more challenging and you want to somehow
2570
+
2571
+ 1:25:32.360 --> 1:25:35.796
2572
+ learn that this is not important.
2573
+
2574
+ 1:25:36.076 --> 1:25:39.944
2575
+ So there is the idea again okay.
2576
+
2577
+ 1:25:39.827 --> 1:25:47.566
2578
+ Can we doing some type of data augmentation
2579
+ to better deal with?
2580
+
2581
+ 1:25:48.908 --> 1:25:55.735
2582
+ And again people can mainly use what has been
2583
+ done in and try to do the same things.
2584
+
2585
+ 1:25:56.276 --> 1:26:02.937
2586
+ You can try to do a bit of noise and speech
2587
+ perturbation so playing the audio like slower
2588
+
2589
+ 1:26:02.937 --> 1:26:08.563
2590
+ and a bit faster to get more samples then and
2591
+ you can train on all of them.
2592
+
2593
+ 1:26:08.489 --> 1:26:14.929
2594
+ What is very important and very successful
2595
+ recently is what is called SpecAugment.
2596
+
2597
+ 1:26:15.235 --> 1:26:25.882
2598
+ The idea is that you directly work on all
2599
+ your features and you can try to mask them
2600
+
2601
+ 1:26:25.882 --> 1:26:29.014
2602
+ and that gives you more.
2603
+
2604
+ 1:26:29.469 --> 1:26:41.717
2605
+ What do they mean with masking so this is
2606
+ your audio feature and then there is different?
2607
+
2608
+ 1:26:41.962 --> 1:26:47.252
2609
+ You can do what is referred to as mask and
2610
+ a time masking.
2611
+
2612
+ 1:26:47.162 --> 1:26:50.482
2613
+ That means you just set some masks.
2614
+
2615
+ 1:26:50.730 --> 1:26:58.003
2616
+ And since then you should be still able to
2617
+ deal with it because you can normally.
2618
+
2619
+ 1:26:57.937 --> 1:27:05.840
2620
+ Also without that you are getting more robust
2621
+ and not and you can handle that because then
2622
+
2623
+ 1:27:05.840 --> 1:27:10.877
2624
+ many symbols which have different time look
2625
+ more similar.
2626
+
2627
+ 1:27:11.931 --> 1:27:22.719
2628
+ You are not only doing that for time masking
2629
+ but also for frequency masking so that if you
2630
+
2631
+ 1:27:22.719 --> 1:27:30.188
2632
+ have here the frequency channels you mask a
2633
+ frequency channel.
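A toy version of this time and frequency masking on a (time, frequency) feature matrix; the mask widths are illustrative, not the values from the SpecAugment paper or the lecture:

```python
import torch

def spec_augment(feats: torch.Tensor, max_t: int = 20, max_f: int = 8) -> torch.Tensor:
    """feats: (time, freq) matrix; zero out one random time block and one frequency block."""
    feats = feats.clone()
    t_len, f_len = feats.shape
    t0 = int(torch.randint(0, max(1, t_len - max_t), (1,)))
    f0 = int(torch.randint(0, max(1, f_len - max_f), (1,)))
    feats[t0:t0 + int(torch.randint(1, max_t + 1, (1,))), :] = 0.0   # time mask
    feats[:, f0:f0 + int(torch.randint(1, max_f + 1, (1,)))] = 0.0   # frequency mask
    return feats
```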
2634
+
2635
+ 1:27:30.090 --> 1:27:33.089
2636
+ Thereby being able to better recognize these
2637
+ things.
2638
+
2639
+ 1:27:35.695 --> 1:27:43.698
2640
+ This we have had an overview of the two main
2641
+ approaches for speech translation that is on
2642
+
2643
+ 1:27:43.698 --> 1:27:51.523
2644
+ the one hand cascaded speech translation and
2645
+ on the other hand we talked about end-to-end
2646
+
2647
+ 1:27:51.523 --> 1:27:53.302
2648
+ speech translation.
2649
+
2650
+ 1:27:53.273 --> 1:28:02.080
2651
+ It's like how to combine things and how they
2652
+ work together, and for end-to-end speech translation.
2653
+
2654
+ 1:28:02.362 --> 1:28:06.581
2655
+ Here was data challenges and a bit about long
2656
+ sequences.
2657
+
2658
+ 1:28:07.747 --> 1:28:09.304
2659
+ We have any more questions.
2660
+
2661
+ 1:28:11.451 --> 1:28:19.974
2662
+ Can you really describe the change in cascading
2663
+ from translation to text to speech because
2664
+
2665
+ 1:28:19.974 --> 1:28:22.315
2666
+ thought the translation.
2667
+
2668
+ 1:28:25.745 --> 1:28:30.201
2669
+ Yes, so mean that works again the easiest
2670
+ thing.
2671
+
2672
+ 1:28:30.111 --> 1:28:32.954
2673
+ What of course is challenging?
2674
+
2675
+ 1:28:32.863 --> 1:28:40.753
2676
+ What can be challenging is how to make that
2677
+ more lively and like that pronunciation?
2678
+
2679
+ 1:28:40.680 --> 1:28:47.369
2680
+ And yeah, which things are put more important,
2681
+ how to put things like that into.
2682
+
2683
+ 1:28:47.627 --> 1:28:53.866
2684
+ In the normal text, otherwise it would sound
2685
+ very monotone.
2686
+
2687
+ 1:28:53.762 --> 1:28:57.404
2688
+ You want to add this information.
2689
+
2690
+ 1:28:58.498 --> 1:29:02.656
2691
+ That is maybe one thing to make it a bit more
2692
+ emotional.
2693
+
2694
+ 1:29:02.583 --> 1:29:04.920
2695
+ That is maybe one thing which.
2696
+
2697
+ 1:29:05.305 --> 1:29:13.448
2698
+ But you are right there and out of the box.
2699
+
2700
+ 1:29:13.263 --> 1:29:20.670
2701
+ If you have everything works decently.
2702
+
2703
+ 1:29:20.800 --> 1:29:30.507
2704
+ Still, especially if you have a very monotone
2705
+ voice, so think these are quite some open challenges.
2706
+
2707
+ 1:29:30.750 --> 1:29:35.898
2708
+ Maybe another open challenge is that it's
2709
+ not so much for the end product, but for the
2710
+
2711
+ 1:29:35.898 --> 1:29:37.732
2712
+ development is very important.
2713
+
2714
+ 1:29:37.673 --> 1:29:40.100
2715
+ It's very hard to evaluate the quality.
2716
+
2717
+ 1:29:40.740 --> 1:29:48.143
2718
+ So you cannot doubt that there is a way about
2719
+ most systems are currently evaluated by human
2720
+
2721
+ 1:29:48.143 --> 1:29:49.109
2722
+ evaluation.
2723
+
2724
+ 1:29:49.589 --> 1:29:54.474
2725
+ So you cannot try hundreds of things and run
2726
+ your BLEU score and get this score.
2727
+
2728
+ 1:29:54.975 --> 1:30:00.609
2729
+ So therefore no means very important to have
2730
+ some type of evaluation metric and that is
2731
+
2732
+ 1:30:00.609 --> 1:30:01.825
2733
+ quite challenging.
2734
+
2735
+ 1:30:08.768 --> 1:30:15.550
2736
+ And thanks for listening, and we'll have the
2737
+ second part of speech translation on Thursday.
2738
+
demo_data/lectures/Lecture-18-18.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7158cf58687ceeb69cae55cb9786cecc77ea95e9afcc0b29251b8b9cfe54cdb5
3
+ size 125329284
demo_data/lectures/Lecture-19-21.07.2023/English.vtt ADDED
@@ -0,0 +1,2860 @@
1
+ WEBVTT
2
+
3
+ 0:00:01.121 --> 0:00:12.579
4
+ IntroductionOkay, so welcome to today's lecture,
5
+ on Tuesday we started to talk about speech
6
+
7
+ 0:00:12.579 --> 0:00:14.198
8
+ translation.
9
+
10
+ 0:00:14.634 --> 0:00:27.037
11
+ And the idea is hopefully an idea of the basic
12
+ ideas we have in speech translation, the two
13
+
14
+ 0:00:27.037 --> 0:00:29.464
15
+ major approaches.
16
+
17
+ 0:00:29.829 --> 0:00:41.459
18
+ And the other one is the end system where
19
+ we have one large system which is everything
20
+
21
+ 0:00:41.459 --> 0:00:42.796
22
+ together.
23
+
24
+ 0:00:43.643 --> 0:00:58.459
25
+ Until now we mainly focus on text output that
26
+ we'll see today, but you can extend these ideas
27
+
28
+ 0:00:58.459 --> 0:01:01.138
29
+ to other speech.
30
+
31
+ 0:01:01.441 --> 0:01:08.592
32
+ But since it's also like a machine translation
33
+ lecture, you of course mainly focus a bit on
34
+
35
+ 0:01:08.592 --> 0:01:10.768
36
+ the translation challenges.
37
+
38
+ 0:01:12.172 --> 0:01:25.045
39
+ And what is the main focus of today's lecture
40
+ is to look into why that is challenging speech
41
+
42
+ 0:01:25.045 --> 0:01:26.845
43
+ translation.
44
+
45
+ 0:01:27.627 --> 0:01:33.901
46
+ So a bit more focus on what is now really
47
+ the difference to all you and how we can address.
48
+
49
+ 0:01:34.254 --> 0:01:39.683
50
+ SegmentationWe'll start there by with the
51
+ segmentation problem.
52
+
53
+ 0:01:39.598 --> 0:01:45.992
54
+ We had that already of bits, but especially
55
+ for end-to-end.
56
+
57
+ 0:01:46.386 --> 0:01:57.253
58
+ So the problem is that until now it was easy
59
+ to segment the input into sentences and then
60
+
61
+ 0:01:57.253 --> 0:02:01.842
62
+ translate each sentence individually.
63
+
64
+ 0:02:02.442 --> 0:02:17.561
65
+ When you're now translating audio, the challenge
66
+ is that you have just a sequence of audio input
67
+
68
+ 0:02:17.561 --> 0:02:20.055
69
+ and there's no.
70
+
71
+ 0:02:21.401 --> 0:02:27.834
72
+ So you have this difference that your audio
73
+ is a continuous stream, but the text is typically
74
+
75
+ 0:02:27.834 --> 0:02:28.930
76
+ sentence based.
77
+
78
+ 0:02:28.861 --> 0:02:31.614
79
+ So how can you match this gap in there?
80
+
81
+ 0:02:31.545 --> 0:02:37.645
82
+ We'll see that is really essential, and if
83
+ you're not using a decent good system there,
84
+
85
+ 0:02:37.645 --> 0:02:41.250
86
+ then you can lose a lot of quality and performance.
87
+
88
+ 0:02:41.641 --> 0:02:44.267
89
+ That is what also meant before.
90
+
91
+ 0:02:44.185 --> 0:02:51.702
92
+ So if you have a more complex system out of
93
+ several units, it's really essential that they
94
+
95
+ 0:02:51.702 --> 0:02:56.659
96
+ all work together and it's very easy to lose
97
+ significantly.
98
+
99
+ 0:02:57.497 --> 0:03:13.029
100
+ The second challenge we'll talk about is disfluencies,
101
+ so the style of speaking is very different
102
+
103
+ 0:03:13.029 --> 0:03:14.773
104
+ from text.
105
+
106
+ 0:03:15.135 --> 0:03:24.727
107
+ So if you translate or TedTalks, that's normally
108
+ very good speakers.
109
+
110
+ 0:03:24.588 --> 0:03:30.152
111
+ They will give you a very fluent text.
112
+
113
+ 0:03:30.670 --> 0:03:36.692
114
+ When you want to translate a lecture, it might
115
+ be more difficult or rednested.
116
+
117
+ 0:03:37.097 --> 0:03:39.242
118
+ Mean people are not well that well.
119
+
120
+ 0:03:39.182 --> 0:03:42.282
121
+ They should be prepared in giving the lecture
122
+ and.
123
+
124
+ 0:03:42.362 --> 0:03:48.241
125
+ But it's not that I mean, typically a lecture
126
+ will have like rehearsal like five times before
127
+
128
+ 0:03:48.241 --> 0:03:52.682
129
+ he is giving this lecture, and then like will
130
+ it completely be fluent?
131
+
132
+ 0:03:52.619 --> 0:03:56.089
133
+ He might at some point notice all this is
134
+ not perfect.
135
+
136
+ 0:03:56.026 --> 0:04:00.064
137
+ I want to rephrase, and he'll have to think
138
+ during the lecture.
139
+
140
+ 0:04:00.300 --> 0:04:04.049
141
+ Might be also good that he's thinking, so
142
+ he's not going too fast and things like.
143
+
144
+ 0:04:05.305 --> 0:04:07.933
145
+ If you then go to the other extreme, it's
146
+ more meetings.
147
+
148
+ 0:04:08.208 --> 0:04:15.430
149
+ If you have a lively discussion, of course,
150
+ people will interrupt, they will restart, they
151
+
152
+ 0:04:15.430 --> 0:04:22.971
153
+ will think while they speak, and you know that
154
+ sometimes you tell people first think and speak
155
+
156
+ 0:04:22.971 --> 0:04:26.225
157
+ because they are changing their opinion.
158
+
159
+ 0:04:26.606 --> 0:04:31.346
160
+ So the question of how can you deal with this?
161
+
162
+ 0:04:31.245 --> 0:04:37.499
163
+ And there again it might be solutions for
164
+ that, or at least.
165
+
166
+ 0:04:39.759 --> 0:04:46.557
167
+ Then for the output we will look into simultaneous
168
+ translation that is at least not very important
169
+
170
+ 0:04:46.557 --> 0:04:47.175
171
+ in text.
172
+
173
+ 0:04:47.107 --> 0:04:53.696
174
+ There might be some cases but normally you
175
+ have all text available and then you're translating
176
+
177
+ 0:04:53.696 --> 0:04:54.043
178
+ and.
179
+
180
+ 0:04:54.394 --> 0:05:09.220
181
+ While for speech translation, since it's often
182
+ a life interaction, then of course it's important.
183
+
184
+ 0:05:09.149 --> 0:05:12.378
185
+ Otherwise it's hard to follow.
186
+
187
+ 0:05:12.274 --> 0:05:19.464
188
+ You see what said five minutes ago and the
189
+ slide is not as helpful.
190
+
191
+ 0:05:19.739 --> 0:05:35.627
192
+ You have to wait very long before you can
193
+ answer because you have to first wait for what
194
+
195
+ 0:05:35.627 --> 0:05:39.197
196
+ is happening there.
197
+
198
+ 0:05:40.660 --> 0:05:46.177
199
+ And finally, we can talk a bit about presentation.
200
+
201
+ 0:05:46.069 --> 0:05:54.724
202
+ For example, mentioned that if you're generating
203
+ subtitles, it's not possible.
204
+
205
+ 0:05:54.854 --> 0:06:01.110
206
+ So in professional subtitles there are clear
207
+ rules.
208
+
209
+ 0:06:00.989 --> 0:06:05.632
210
+ Subtitle has to be shown for seconds.
211
+
212
+ 0:06:05.510 --> 0:06:08.935
213
+ It's maximum of two lines.
214
+
215
+ 0:06:09.549 --> 0:06:13.156
216
+ Because otherwise it's getting too long, it's
217
+ not able to read it anymore, and so.
218
+
219
+ 0:06:13.613 --> 0:06:19.826
220
+ So if you want to achieve that, of course,
221
+ you might have to adjust and select what you
222
+
223
+ 0:06:19.826 --> 0:06:20.390
224
+ really.
225
+
226
+ 0:06:23.203 --> 0:06:28.393
227
+ The first date starts with the segmentation.
228
+
229
+ 0:06:28.277 --> 0:06:36.353
230
+ On the one end it's an issue while training,
231
+ on the other hand it's.
232
+
233
+ 0:06:38.678 --> 0:06:47.781
234
+ What is the problem so when we train it's
235
+ relatively easy to separate our data into sentence
236
+
237
+ 0:06:47.781 --> 0:06:48.466
238
+ level.
239
+
240
+ 0:06:48.808 --> 0:07:02.241
241
+ So if you have your example, you have the
242
+ audio and the text, then you typically know
243
+
244
+ 0:07:02.241 --> 0:07:07.083
245
+ that this sentence is aligned.
246
+
247
+ 0:07:07.627 --> 0:07:16.702
248
+ You can use these time information to cut
249
+ your audio and then you can train and then.
250
+
251
+ 0:07:18.018 --> 0:07:31.775
252
+ Because what we need for an end-to-end model
253
+ is to be an output chart, in this case an audio
254
+
255
+ 0:07:31.775 --> 0:07:32.822
256
+ chart.
257
+
258
+ 0:07:33.133 --> 0:07:38.551
259
+ And even if this is a long speech, it's easy
260
+ then since we have this time information to
261
+
262
+ 0:07:38.551 --> 0:07:39.159
263
+ separate.
264
+
265
+ 0:07:39.579 --> 0:07:43.866
266
+ But we are using therefore, of course, the
267
+ target side information.
268
+
269
+ 0:07:45.865 --> 0:07:47.949
270
+ The problem is now in runtime.
271
+
272
+ 0:07:47.881 --> 0:07:49.367
273
+ This is not possible.
274
+
275
+ 0:07:49.300 --> 0:07:55.273
276
+ Here we can do that based on the punctuation
277
+ marks and the sentence segmentation on the
278
+
279
+ 0:07:55.273 --> 0:07:57.921
280
+ target side because that is splitting.
281
+
282
+ 0:07:57.853 --> 0:08:02.131
283
+ But during transcript, during translation
284
+ it is not possible.
285
+
286
+ 0:08:02.442 --> 0:08:10.297
287
+ Because there is just a long audio signal,
288
+ and of course if you have your test data to
289
+
290
+ 0:08:10.297 --> 0:08:15.263
291
+ split it into: That has been done for some
292
+ experiments.
293
+
294
+ 0:08:15.173 --> 0:08:22.834
295
+ It's fine, but it's not a realistic scenario
296
+ because if you really apply it in real world,
297
+
298
+ 0:08:22.834 --> 0:08:25.949
299
+ we won't have a manual segmentation.
300
+
301
+ 0:08:26.266 --> 0:08:31.838
302
+ If a human has to do that then he can do the
303
+ translation so you want to have a full automatic
304
+
305
+ 0:08:31.838 --> 0:08:32.431
306
+ pipeline.
307
+
308
+ 0:08:32.993 --> 0:08:38.343
309
+ So the question is how can we deal with this
310
+ type of you know?
311
+
312
+ 0:09:09.309 --> 0:09:20.232
313
+ So the question is how can we deal with this
314
+ time of situation and how can we segment the
315
+
316
+ 0:09:20.232 --> 0:09:23.024
317
+ audio into some units?
318
+
319
+ 0:09:23.863 --> 0:09:32.079
320
+ And here is one further really big advantage
321
+ of a cascaded system: Because how is this done
322
+
323
+ 0:09:32.079 --> 0:09:34.336
324
+ in a cascade of systems?
325
+
326
+ 0:09:34.245 --> 0:09:38.484
327
+ We are splitting the audio with some features
328
+ we are doing.
329
+
330
+ 0:09:38.414 --> 0:09:42.042
331
+ We can use similar ones which we'll discuss
332
+ later.
333
+
334
+ 0:09:41.970 --> 0:09:43.840
335
+ Then we run the ASR.
336
+
337
+ 0:09:43.768 --> 0:09:48.801
338
+ We have the transcript and then we can do
339
+ what we talked last about.
340
+
341
+ 0:09:49.069 --> 0:10:02.260
342
+ So if you have this is an audio signal and
343
+ the training data it was good.
344
+
345
+ 0:10:02.822 --> 0:10:07.951
346
+ So here we have a big advantage.
347
+
348
+ 0:10:07.795 --> 0:10:16.758
349
+ We can use a different segmentation for the
350
+ and for the.
351
+
352
+ 0:10:16.601 --> 0:10:21.323
353
+ Why is that a big advantage?
354
+
355
+ 0:10:23.303 --> 0:10:34.067
356
+ Will say for the MT task it is more important
357
+ because we can then do the sentence transformation.
358
+
359
+ 0:10:34.955 --> 0:10:37.603
360
+ See and Yeah, We Can Do the Same Thing.
361
+
362
+ 0:10:37.717 --> 0:10:40.226
363
+ To save us, why is it not as important for
364
+ us?
365
+
366
+ 0:10:40.173 --> 0:10:40.819
367
+ Are maybe.
368
+
369
+ 0:10:43.363 --> 0:10:48.589
370
+ We don't need that much context.
371
+
372
+ 0:10:48.430 --> 0:11:01.101
373
+ We only try to restrict the word, but the
374
+ context to consider is mainly small.
375
+
376
+ 0:11:03.283 --> 0:11:11.419
377
+ Would agree with it in more context, but there
378
+ is one more important: its.
379
+
380
+ 0:11:11.651 --> 0:11:16.764
381
+ The ASR is monotone, so there's no reordering.
382
+
383
+ 0:11:16.645 --> 0:11:22.455
384
+ The second part of the signal is no reordering.
385
+
386
+ 0:11:22.334 --> 0:11:23.559
387
+ We have.
388
+
389
+ 0:11:23.683 --> 0:11:29.147
390
+ And of course if we are doing that we cannot
391
+ really reorder across boundaries between segments.
392
+
393
+ 0:11:29.549 --> 0:11:37.491
394
+ It might be challenging if we split the words
395
+ so that it's not perfect for so that.
396
+
397
+ 0:11:37.637 --> 0:11:40.846
398
+ But we need to do quite long range reordering.
399
+
400
+ 0:11:40.777 --> 0:11:47.035
401
+ If you think about the German where the verb
402
+ has moved, and now the English verb is in one
403
+
404
+ 0:11:47.035 --> 0:11:50.198
405
+ part, but the end of the sentence is another.
406
+
407
+ 0:11:50.670 --> 0:11:59.427
408
+ And of course this advantage we have now here
409
+ that if we have a segment we have.
410
+
411
+ 0:12:01.441 --> 0:12:08.817
412
+ And that this segmentation is important.
413
+
414
+ 0:12:08.638 --> 0:12:15.300
415
+ Here are some motivations for that.
416
+
417
+ 0:12:15.675 --> 0:12:25.325
418
+ What you are doing is you are taking the reference
419
+ text and you are segmenting.
420
+
421
+ 0:12:26.326 --> 0:12:30.991
422
+ And then, of course, your segments are exactly
423
+ yeah cute.
424
+
425
+ 0:12:31.471 --> 0:12:42.980
426
+ If you're now using different segmentation
427
+ strategies, you're losing significantly in blue
428
+
429
+ 0:12:42.980 --> 0:12:44.004
430
+ points.
431
+
432
+ 0:12:43.876 --> 0:12:50.400
433
+ If the segmentation is bad, you have a lot
434
+ worse.
435
+
436
+ 0:12:52.312 --> 0:13:10.323
437
+ And interesting, here you ought to see how
438
+ it was a human, but people have in a competition.
439
+
440
+ 0:13:10.450 --> 0:13:22.996
441
+ You can see that by working on the segmentation
442
+ and using better segmentation you can improve
443
+
444
+ 0:13:22.996 --> 0:13:25.398
445
+ your performance.
446
+
447
+ 0:13:26.006 --> 0:13:29.932
448
+ So it's really essential.
449
+
450
+ 0:13:29.781 --> 0:13:41.714
451
+ One other interesting thing is if you're looking
452
+ into the difference between.
453
+
454
+ 0:13:42.082 --> 0:13:49.145
455
+ So it really seems to be more important to
456
+ have a good segmentation for an end-to-end system.
457
+
458
+ 0:13:49.109 --> 0:13:56.248
459
+ For an end-to-end system because there you
460
+ can't re-segment while it is less important
461
+
462
+ 0:13:56.248 --> 0:13:58.157
463
+ for a cascaded system.
464
+
465
+ 0:13:58.074 --> 0:14:05.049
466
+ Of course, it's still important, but the difference
467
+ between the two segmentations.
468
+
469
+ 0:14:06.466 --> 0:14:18.391
470
+ It was a shared task some years ago like it's
471
+ just one system from different.
472
+
473
+ 0:14:22.122 --> 0:14:31.934
474
+ So the question is how can we deal with this
475
+ in speech translation and what people look
476
+
477
+ 0:14:31.934 --> 0:14:32.604
478
+ into?
479
+
480
+ 0:14:32.752 --> 0:14:48.360
481
+ Now we want to use different techniques to
482
+ split the audio signal into segments.
483
+
484
+ 0:14:48.848 --> 0:14:54.413
485
+ You have the disadvantage that you can't change
486
+ it.
487
+
488
+ 0:14:54.306 --> 0:15:00.409
489
+ Therefore, some of the quality might be more
490
+ important.
491
+
492
+ 0:15:00.660 --> 0:15:15.678
493
+ But in both cases, of course, the A's are
494
+ better if you have a good segmentation.
495
+
496
+ 0:15:17.197 --> 0:15:23.149
497
+ So any idea, how would you have this task
498
+ now split this audio?
499
+
500
+ 0:15:23.056 --> 0:15:26.221
501
+ What type of tool would you use?
502
+
503
+ 0:15:28.648 --> 0:15:41.513
504
+ The fuse was a new network to segment half
505
+ for instance supervise.
506
+
507
+ 0:15:41.962 --> 0:15:44.693
508
+ Yes, that's exactly already the better system.
509
+
510
+ 0:15:44.635 --> 0:15:50.376
511
+ So for long time people have done more simple
512
+ things because we'll come to that a bit challenging
513
+
514
+ 0:15:50.376 --> 0:15:52.250
515
+ as creating or having the data.
516
+
517
+ 0:15:53.193 --> 0:16:00.438
518
+ The first thing is you use some tool out of
519
+ the box like voice activity detection which
520
+
521
+ 0:16:00.438 --> 0:16:07.189
522
+ has been there as a whole research field so
523
+ people find when somebody's speaking.
524
+
525
+ 0:16:07.647 --> 0:16:14.952
526
+ And then you use that in this different threshold
527
+ you always have the probability that somebody's
528
+
529
+ 0:16:14.952 --> 0:16:16.273
530
+ speaking or not.
531
+
532
+ 0:16:17.217 --> 0:16:19.889
533
+ Then you split your signal.
534
+
535
+ 0:16:19.794 --> 0:16:26.763
536
+ It will not be perfect, but you transcribe
537
+ or translate each component.
538
+
539
+ 0:16:28.508 --> 0:16:39.337
540
+ But as you see, a supervised classification
541
+ task is even better, and that is now the most
542
+
543
+ 0:16:39.337 --> 0:16:40.781
544
+ common use.
545
+
546
+ 0:16:41.441 --> 0:16:49.909
547
+ The supervisor is doing that as a supervisor
548
+ classification and then you'll try to use this
549
+
550
+ 0:16:49.909 --> 0:16:50.462
551
+ type.
552
+
553
+ 0:16:50.810 --> 0:16:53.217
554
+ We're going into a bit more detail on how
555
+ to do that.
556
+
557
+ 0:16:53.633 --> 0:17:01.354
558
+ So what you need to do first is, of course,
559
+ you have to have some labels whether this is
560
+
561
+ 0:17:01.354 --> 0:17:03.089
562
+ an end of sentence.
563
+
564
+ 0:17:03.363 --> 0:17:10.588
565
+ You do that by using the alignment between
566
+ the segments and the audio.
567
+
568
+ 0:17:10.487 --> 0:17:12.021
569
+ You have the.
570
+
571
+ 0:17:12.212 --> 0:17:15.365
572
+ The two people have not for each word, so
573
+ these time stamps.
574
+
575
+ 0:17:15.312 --> 0:17:16.891
576
+ This word is said this time.
577
+
578
+ 0:17:17.157 --> 0:17:27.935
579
+ This word is said by what you typically have
580
+ from this time to time to time.
581
+
582
+ 0:17:27.795 --> 0:17:34.657
583
+ We have the second segment, the second segment.
584
+
585
+ 0:17:35.195 --> 0:17:39.051
586
+ Which is also used to train for example your
587
+ ASR system and everything.
588
+
589
+ 0:17:41.661 --> 0:17:53.715
590
+ Based on that you can label each frame in
591
+ there so if you have a green or blue that is
592
+
593
+ 0:17:53.715 --> 0:17:57.455
594
+ our speech segment so you.
595
+
596
+ 0:17:58.618 --> 0:18:05.690
597
+ And these labels will then later help you,
598
+ but you extract exactly these types of.
599
+
600
+ 0:18:07.067 --> 0:18:08.917
601
+ There's one big challenge.
602
+
603
+ 0:18:08.848 --> 0:18:15.113
604
+ If you have two sentences which are directly
605
+ connected to each other, then if you're doing
606
+
607
+ 0:18:15.113 --> 0:18:18.693
608
+ this labeling, you would not have a break in
609
+ later.
610
+
611
+ 0:18:18.624 --> 0:18:23.513
612
+ If you tried to extract that, there should
613
+ be something great or not.
614
+
615
+ 0:18:23.943 --> 0:18:31.955
616
+ So what you typically do is in the last frame.
617
+
618
+ 0:18:31.785 --> 0:18:41.334
619
+ You mark as outside, although it's not really
620
+ outside.
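A minimal sketch of the frame-labelling step just described, assuming segments come as (start, end) times in seconds and labels are produced per fixed 10 ms frame; marking the final frame of every segment as outside is the trick mentioned above for keeping a boundary between directly adjacent segments. Illustrative only, not the lecture's actual tooling.

```python
# Illustrative sketch: build per-frame speech/non-speech training labels
# from time-aligned segments (assumed format: (start_sec, end_sec)).
FRAME_SEC = 0.01  # 10 ms frames (an assumption for this sketch)

def frame_labels(segments, total_sec):
    """One label per frame: 1 = inside a speech segment, 0 = outside."""
    n_frames = int(total_sec / FRAME_SEC)
    labels = [0] * n_frames
    last_frames = []
    for start, end in segments:
        first = int(start / FRAME_SEC)
        last = min(int(end / FRAME_SEC), n_frames - 1)
        for i in range(first, last + 1):
            labels[i] = 1
        last_frames.append(last)
    # Second pass: mark the final frame of every segment as "outside" so
    # that two directly adjacent segments still produce a visible break.
    for last in last_frames:
        labels[last] = 0
    return labels

if __name__ == "__main__":
    labs = frame_labels([(0.0, 2.5), (2.5, 4.0)], total_sec=5.0)
    print(labs[248:253])  # -> [1, 1, 0, 1, 1]: break between adjacent segments
```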
621
+
622
+ 0:18:43.463 --> 0:18:46.882
623
+ Yes, I guess you could also do that in more
624
+ of a below check.
625
+
626
+ 0:18:46.827 --> 0:18:48.653
627
+ I mean, this is the most simple.
628
+
629
+ 0:18:48.598 --> 0:18:51.431
630
+ It's like inside outside, so it's related
631
+ to that.
632
+
633
+ 0:18:51.376 --> 0:18:54.894
634
+ Of course, you could have an extra startup
635
+ segment, and so on.
636
+
637
+ 0:18:54.838 --> 0:18:57.370
638
+ I guess this is just to make it more simple.
639
+
640
+ 0:18:57.314 --> 0:19:00.159
641
+ You only have two labels, not a three-class problem.
642
+
643
+ 0:19:00.103 --> 0:19:02.380
644
+ But yeah, you could do similar things.
645
+
646
+ 0:19:12.432 --> 0:19:20.460
647
+ Has caused down the roads to problems because
648
+ it could be an important part of a segment
649
+
650
+ 0:19:20.460 --> 0:19:24.429
651
+ which has some meaning and we do something.
652
+
653
+ 0:19:24.339 --> 0:19:28.400
654
+ The good thing is frames are normally very.
655
+
656
+ 0:19:28.688 --> 0:19:37.586
657
+ Like some milliseconds, so normally if you
658
+ remove some milliseconds you can still understand
659
+
660
+ 0:19:37.586 --> 0:19:38.734
661
+ everything.
662
+
663
+ 0:19:38.918 --> 0:19:46.999
664
+ Mean the speech signal is very repetitive,
665
+ and so you have information a lot of times.
666
+
667
+ 0:19:47.387 --> 0:19:50.730
668
+ That's why we talked along there last time
669
+ they could try to shrink the steak and.
670
+
671
+ 0:19:51.031 --> 0:20:00.995
672
+ If you now have a short sequence where there
673
+ is like which would be removed and that's not
674
+
675
+ 0:20:00.995 --> 0:20:01.871
676
+ really.
677
+
678
+ 0:20:02.162 --> 0:20:06.585
679
+ Yeah, but it's not a full letter is missing.
680
+
681
+ 0:20:06.487 --> 0:20:11.011
682
+ It's like only the last ending of the vocal.
683
+
684
+ 0:20:11.751 --> 0:20:15.369
685
+ Think it doesn't really happen.
686
+
687
+ 0:20:15.256 --> 0:20:23.057
688
+ We have our audio signal and we have these
689
+ gags that are not above.
690
+
691
+ 0:20:23.883 --> 0:20:29.288
692
+ With this blue rectangulars the inside speech
693
+ segment and with the guess it's all set yes.
694
+
695
+ 0:20:29.669 --> 0:20:35.736
696
+ So then you have the full signal and you're
697
+ meaning now labeling your task as a blue or
698
+
699
+ 0:20:35.736 --> 0:20:36.977
700
+ white prediction.
701
+
702
+ 0:20:36.908 --> 0:20:39.202
703
+ So that is your prediction task.
704
+
705
+ 0:20:39.133 --> 0:20:44.975
706
+ You have the audio signal only and your prediction
707
+ task is like label one or zero.
708
+
709
+ 0:20:45.305 --> 0:20:55.585
710
+ Once you do that then based on this labeling
711
+ you can extract each segment again like each
712
+
713
+ 0:20:55.585 --> 0:20:58.212
714
+ consecutive blue area.
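A small sketch of the extraction step just described: turning the per-frame speech/non-speech decisions back into (start, end) segments, i.e. each consecutive "blue" area becomes one segment and the non-speech parts in between are dropped before translation. The 10 ms frame size is an assumption for the sketch.

```python
# Illustrative sketch: per-frame speech decisions -> (start_sec, end_sec) segments.
FRAME_SEC = 0.01

def frames_to_segments(is_speech):
    segments, start = [], None
    for i, speech in enumerate(is_speech):
        if speech and start is None:
            start = i                      # a new speech area begins
        elif not speech and start is not None:
            segments.append((start * FRAME_SEC, i * FRAME_SEC))
            start = None
    if start is not None:                  # speech running until the end
        segments.append((start * FRAME_SEC, len(is_speech) * FRAME_SEC))
    return segments

print(frames_to_segments([0, 1, 1, 1, 0, 0, 1, 1]))
# -> [(0.01, 0.04), (0.06, 0.08)]
```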
715
+
716
+ 0:20:58.798 --> 0:21:05.198
717
+ See then removed maybe the non-speaking part
718
+ already and duo speech translation only on
719
+
720
+ 0:21:05.198 --> 0:21:05.998
721
+ the parts.
722
+
723
+ 0:21:06.786 --> 0:21:19.768
724
+ Which is good because the training would have
725
+ done similarly.
726
+
727
+ 0:21:20.120 --> 0:21:26.842
728
+ So on the noise in between you never saw in
729
+ the training, so it's good to throw it away.
730
+
731
+ 0:21:29.649 --> 0:21:34.930
732
+ One challenge, of course, is now if you're
733
+ doing that, what is your input?
734
+
735
+ 0:21:34.860 --> 0:21:40.664
736
+ You cannot do the sequence labeling normally
737
+ on the whole talk, so it's too long.
738
+
739
+ 0:21:40.593 --> 0:21:46.738
740
+ So if you're doing this prediction of the
741
+ label, you also have a window for which you
742
+
743
+ 0:21:46.738 --> 0:21:48.239
744
+ do the segmentation.
745
+
746
+ 0:21:48.788 --> 0:21:54.515
747
+ And that's the baseline we have in the punctuation
748
+ prediction.
749
+
750
+ 0:21:54.422 --> 0:22:00.392
751
+ If we don't have good borders, random splits
752
+ are normally good.
753
+
754
+ 0:22:00.299 --> 0:22:03.939
755
+ So what we do now is split the audio.
756
+
757
+ 0:22:04.344 --> 0:22:09.134
758
+ So that would be our input, and then the part
759
+ three would be our labels.
760
+
761
+ 0:22:09.269 --> 0:22:15.606
762
+ This green would be the input and here we
763
+ want, for example, blue labels and then white.
764
+
765
+ 0:22:16.036 --> 0:22:20.360
766
+ Here only do labors and here at the beginning
767
+ why maybe at the end why.
768
+
769
+ 0:22:21.401 --> 0:22:28.924
770
+ So thereby you have now a fixed window always
771
+ for which you're doing than this task of predicting.
772
+
773
+ 0:22:33.954 --> 0:22:43.914
774
+ How you build your classifier that is based
775
+ again.
776
+
777
+ 0:22:43.719 --> 0:22:52.512
778
+ We had this wave to be mentioned last week.
779
+
780
+ 0:22:52.752 --> 0:23:00.599
781
+ So in training you use labels to say whether
782
+ it's in speech or outside speech.
783
+
784
+ 0:23:01.681 --> 0:23:17.740
785
+ Inference: You give them always the chance
786
+ and then predict whether this part like each
787
+
788
+ 0:23:17.740 --> 0:23:20.843
789
+ label is afraid.
790
+
791
+ 0:23:23.143 --> 0:23:29.511
792
+ Bit more complicated, so one challenge is
793
+ if you randomly split off cognition, losing
794
+
795
+ 0:23:29.511 --> 0:23:32.028
796
+ your context for the first brain.
797
+
798
+ 0:23:31.954 --> 0:23:38.693
799
+ It might be very hard to predict whether this
800
+ is now in or out of, and also for the last.
801
+
802
+ 0:23:39.980 --> 0:23:48.449
803
+ You often need a bit of context whether this
804
+ is audio or not, and at the beginning.
805
+
806
+ 0:23:49.249 --> 0:23:59.563
807
+ So what you do is you put the audio in twice.
808
+
809
+ 0:23:59.339 --> 0:24:08.538
810
+ You want to do it with splits and then.
811
+
812
+ 0:24:08.788 --> 0:24:15.996
813
+ It is shown you have shifted the two offsets,
814
+ so one is predicted with the other offset.
815
+
816
+ 0:24:16.416 --> 0:24:23.647
817
+ And then averaging the probabilities so that
818
+ at each time you have, at least for one of
819
+
820
+ 0:24:23.647 --> 0:24:25.127
821
+ the predictions,.
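A sketch of the two-pass inference just described: the frame classifier is run on fixed windows twice, the second time shifted by half a window, and the probabilities are averaged so that every frame is far from a window border in at least one pass. `speech_prob` is an assumed stand-in for the real classifier and is expected to return one probability per frame of its input window.

```python
# Illustrative sketch of windowed frame classification with two offsets.
WINDOW = 2000           # frames per window (assumed)
OFFSET = WINDOW // 2    # shift of the second pass

def windowed_probs(frames, speech_prob, shift=0):
    probs = [0.0] * len(frames)
    for start in range(-shift, len(frames), WINDOW):
        lo, hi = max(start, 0), min(start + WINDOW, len(frames))
        if lo < hi:
            probs[lo:hi] = speech_prob(frames[lo:hi])
    return probs

def averaged_probs(frames, speech_prob):
    first = windowed_probs(frames, speech_prob, shift=0)
    second = windowed_probs(frames, speech_prob, shift=OFFSET)
    # every frame is far from a window border in at least one of the passes
    return [(a + b) / 2 for a, b in zip(first, second)]

# toy check with a dummy classifier
print(averaged_probs(list(range(4500)), lambda w: [0.9] * len(w))[:3])
```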
822
+
823
+ 0:24:25.265 --> 0:24:36.326
824
+ Because at the end of the second it might
825
+ be very hard to predict whether this is now
826
+
827
+ 0:24:36.326 --> 0:24:39.027
828
+ speech or nonspeech.
829
+
830
+ 0:24:39.939 --> 0:24:47.956
831
+ Think it is a high parameter, but you are
832
+ not optimizing it, so you just take two shifts.
833
+
834
+ 0:24:48.328 --> 0:24:54.636
835
+ Of course try a lot of different shifts and
836
+ so on.
837
+
838
+ 0:24:54.512 --> 0:24:59.649
839
+ The thing is it's mainly a problem here.
840
+
841
+ 0:24:59.523 --> 0:25:04.412
842
+ If you don't do two outsets you have.
843
+
844
+ 0:25:05.105 --> 0:25:14.761
845
+ You could get better by doing that, but would
846
+ be skeptical if it really matters, and also
847
+
848
+ 0:25:14.761 --> 0:25:18.946
849
+ have not seen any experience in doing.
850
+
851
+ 0:25:19.159 --> 0:25:27.629
852
+ Guess you're already good, you have maybe
853
+ some errors in there and you're getting.
854
+
855
+ 0:25:31.191 --> 0:25:37.824
856
+ So with this you have your segmentation.
857
+
858
+ 0:25:37.663 --> 0:25:44.228
859
+ However, there is a problem in between.
860
+
861
+ 0:25:44.064 --> 0:25:49.158
862
+ Once the model is wrong then.
863
+
864
+ 0:25:49.789 --> 0:26:01.755
865
+ The normal thing would be the first thing
866
+ that you take some threshold and that you always
867
+
868
+ 0:26:01.755 --> 0:26:05.436
869
+ label everything in speech.
870
+
871
+ 0:26:06.006 --> 0:26:19.368
872
+ The problem is when you are just doing this
873
+ one threshold that you might have.
874
+
875
+ 0:26:19.339 --> 0:26:23.954
876
+ Those are the challenges.
877
+
878
+ 0:26:23.777 --> 0:26:31.168
879
+ Short segments mean you have no context.
880
+
881
+ 0:26:30.988 --> 0:26:35.503
882
+ The policy will be bad.
883
+
884
+ 0:26:37.077 --> 0:26:48.954
885
+ Therefore, people use this probabilistic divide and
886
+ conquer algorithm, so the main idea is to start
887
+
888
+ 0:26:48.954 --> 0:26:56.744
889
+ with the whole segment, and now you split the
890
+ whole segment.
891
+
892
+ 0:26:57.397 --> 0:27:09.842
893
+ Then you split there and then you continue
894
+ until each segment is smaller than the maximum
895
+
896
+ 0:27:09.842 --> 0:27:10.949
897
+ length.
898
+
899
+ 0:27:11.431 --> 0:27:23.161
900
+ But you can ignore some splits, and if you
901
+ split one segment into two parts you first
902
+
903
+ 0:27:23.161 --> 0:27:23.980
904
+ trim.
905
+
906
+ 0:27:24.064 --> 0:27:40.197
907
+ So normally it's not only one signal position,
908
+ it's a longer area of non-voice, so you try
909
+
910
+ 0:27:40.197 --> 0:27:43.921
911
+ to find this longer.
912
+
913
+ 0:27:43.943 --> 0:27:51.403
914
+ Now your large segment is split into two smaller
915
+ segments.
916
+
917
+ 0:27:51.277 --> 0:27:56.085
918
+ Now you are checking these segments.
919
+
920
+ 0:27:56.296 --> 0:28:04.683
921
+ So if they are very, very short, it might
922
+ be good not to spin at this point because you're
923
+
924
+ 0:28:04.683 --> 0:28:05.697
925
+ ending up.
926
+
927
+ 0:28:06.006 --> 0:28:09.631
928
+ And this way you continue all the time, and
929
+ then hopefully you'll have a good stretch.
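A sketch of the divide-and-conquer splitting just described: the whole recording is recursively split at the most pause-like position until every piece is below a maximum length, and splits that would create very short pieces are avoided. As noted below, this needs the full audio, so it is an offline procedure; the thresholds here are assumptions for the sketch.

```python
# Illustrative sketch: recursive splitting on per-frame non-speech scores.
MAX_LEN = 2000   # maximum segment length in frames (assumed)
MIN_LEN = 200    # never create segments shorter than this (assumed)

def _split(pause_score, lo, hi, out):
    if hi - lo <= MAX_LEN:
        out.append((lo, hi))
        return
    # candidate split points keep both halves at least MIN_LEN long
    best = max(range(lo + MIN_LEN, hi - MIN_LEN), key=lambda i: pause_score[i])
    _split(pause_score, lo, best, out)
    _split(pause_score, best, hi, out)

def divide_and_conquer(pause_score):
    out = []
    _split(pause_score, 0, len(pause_score), out)
    return out

import random
random.seed(0)
print(divide_and_conquer([random.random() for _ in range(7000)]))
```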
930
+
931
+ 0:28:10.090 --> 0:28:19.225
932
+ So, of course, there's one challenge with
933
+ this approach: if you think about it later,
934
+
935
+ 0:28:19.225 --> 0:28:20.606
936
+ low latency.
937
+
938
+ 0:28:25.405 --> 0:28:31.555
939
+ So in this case you have to have the full
940
+ audio available.
941
+
942
+ 0:28:32.132 --> 0:28:38.112
943
+ So you cannot continuously do that mean if
944
+ you would do it just always.
945
+
946
+ 0:28:38.029 --> 0:28:45.589
947
+ If the probability is higher you split but
948
+ in this case you try to find a global optimal.
949
+
950
+ 0:28:46.706 --> 0:28:49.134
951
+ A heuristic body.
952
+
953
+ 0:28:48.999 --> 0:28:58.130
954
+ You find a global solution for your whole
955
+ tar and not a local one.
956
+
957
+ 0:28:57.993 --> 0:29:02.223
958
+ Where's the system most sure?
959
+
960
+ 0:29:02.802 --> 0:29:12.467
961
+ So that's a bit of a challenge here, but the
962
+ advantage of course is that in the end you
963
+
964
+ 0:29:12.467 --> 0:29:14.444
965
+ have no segments.
966
+
967
+ 0:29:17.817 --> 0:29:23.716
968
+ Any more questions like this.
969
+
970
+ 0:29:23.519 --> 0:29:36.696
971
+ Then the next thing is we also need to evaluate
972
+ in this scenario.
973
+
974
+ 0:29:37.097 --> 0:29:44.349
975
+ So know machine translation is quite a long
976
+ way.
977
+
978
+ 0:29:44.201 --> 0:29:55.305
979
+ History now was the beginning of the semester,
980
+ but hope you can remember.
981
+
982
+ 0:29:55.675 --> 0:30:09.214
983
+ Might be with blue score, might be with comment
984
+ or similar, but you need to have.
985
+
986
+ 0:30:10.310 --> 0:30:22.335
987
+ But this assumes that you have this one-to-one
988
+ match, so you always have an output and machine
989
+
990
+ 0:30:22.335 --> 0:30:26.132
991
+ translation, which is nicely.
992
+
993
+ 0:30:26.506 --> 0:30:34.845
994
+ So then it might be that our output has four
995
+ segments, while our reference output has only
996
+
997
+ 0:30:34.845 --> 0:30:35.487
998
+ three.
999
+
1000
+ 0:30:36.756 --> 0:30:40.649
1001
+ And now is, of course, questionable like what
1002
+ should we compare in our metric.
1003
+
1004
+ 0:30:44.704 --> 0:30:53.087
1005
+ So it's no longer directly possible to directly
1006
+ do that because what should you compare?
1007
+
1008
+ 0:30:53.413 --> 0:31:00.214
1009
+ Just have four segments there and three segments
1010
+ there, and of course it seems to be that.
1011
+
1012
+ 0:31:00.920 --> 0:31:06.373
1013
+ The first one it likes to the first one when
1014
+ you see I can't speak Spanish, but you're an
1015
+
1016
+ 0:31:06.373 --> 0:31:09.099
1017
+ audience of the guests who is already there.
1018
+
1019
+ 0:31:09.039 --> 0:31:14.472
1020
+ So even like just a woman, the blue comparing
1021
+ wouldn't work, so you need to do something
1022
+
1023
+ 0:31:14.472 --> 0:31:17.158
1024
+ about that to take this type of evaluation.
1025
+
1026
+ 0:31:19.019 --> 0:31:21.727
1027
+ Still any suggestions what you could do.
1028
+
1029
+ 0:31:25.925 --> 0:31:44.702
1030
+ How can you calculate a blue score because
1031
+ you don't have one you want to see?
1032
+
1033
+ 0:31:45.925 --> 0:31:49.365
1034
+ Here you put another layer which spies to
1035
+ add in the second.
1036
+
1037
+ 0:31:51.491 --> 0:31:56.979
1038
+ It's even not aligning only, but that's one
1039
+ solution, so you need to align and re-segment.
1040
+
1041
+ 0:31:57.177 --> 0:32:06.886
1042
+ Because even if you have no alignment so this
1043
+ to this and this to that you see that it's
1044
+
1045
+ 0:32:06.886 --> 0:32:12.341
1046
+ not good because the audio would compare to
1047
+ that.
1048
+
1049
+ 0:32:13.453 --> 0:32:16.967
1050
+ That we'll discuss is even one simpler solution.
1051
+
1052
+ 0:32:16.896 --> 0:32:19.065
1053
+ Yes, it's a simpler solution.
1054
+
1055
+ 0:32:18.993 --> 0:32:23.086
1056
+ It's called document based blue or something
1057
+ like that.
1058
+
1059
+ 0:32:23.013 --> 0:32:25.720
1060
+ So you just take the full document.
1061
+
1062
+ 0:32:26.566 --> 0:32:32.630
1063
+ For some matrix it's good and it's not clear
1064
+ how good it is to the other, but there might
1065
+
1066
+ 0:32:32.630 --> 0:32:32.900
1067
+ be.
1068
+
1069
+ 0:32:33.393 --> 0:32:36.454
1070
+ Think of more simple metrics like blue.
1071
+
1072
+ 0:32:36.377 --> 0:32:40.358
1073
+ Do you have any idea what could be a disadvantage?
1074
+
1075
+ 0:32:49.249 --> 0:32:56.616
1076
+ Blue is matching n-grams so you start with
1077
+ the original.
1078
+
1079
+ 0:32:56.487 --> 0:33:01.274
1080
+ You check how many n-grams are in here.
1081
+
1082
+ 0:33:01.901 --> 0:33:11.233
1083
+ If you're not doing that on the full document,
1084
+ you can also match n-grams from here to here.
1085
+
1086
+ 0:33:11.751 --> 0:33:15.680
1087
+ So you can match things very far away.
1088
+
1089
+ 0:33:15.579 --> 0:33:21.323
1090
+ Start doing translation and you just randomly
1091
+ randomly.
1092
+
1093
+ 0:33:22.142 --> 0:33:27.938
1094
+ And that, of course, could be a bit of a disadvantage
1095
+ or like is a problem, and therefore people
1096
+
1097
+ 0:33:27.938 --> 0:33:29.910
1098
+ also look into the segmentation.
1099
+
1100
+ 0:33:29.850 --> 0:33:34.655
1101
+ But I've recently seen some things, so document
1102
+ level scores are also normally.
1103
+
1104
+ 0:33:34.594 --> 0:33:39.924
1105
+ If you have a relatively high quality system
1106
+ or state of the art, then they also have a
1107
+
1108
+ 0:33:39.924 --> 0:33:41.802
1109
+ good correlation of the human.
1110
+
1111
+ 0:33:46.546 --> 0:33:59.241
1112
+ So how are we doing that so we are putting
1113
+ end of sentence boundaries in there and then.
1114
+
1115
+ 0:33:59.179 --> 0:34:07.486
1116
+ Alignment based on a similar Levenshtein distance,
1117
+ so at a distance between our output and the
1118
+
1119
+ 0:34:07.486 --> 0:34:09.077
1120
+ reference output.
1121
+
1122
+ 0:34:09.449 --> 0:34:13.061
1123
+ And here is our boundary.
1124
+
1125
+ 0:34:12.922 --> 0:34:23.484
1126
+ We map the boundary based on the alignment,
1127
+ so in the Levenshtein alignment you only have.
1128
+
1129
+ 0:34:23.803 --> 0:34:36.036
1130
+ And then, like all the words that are before,
1131
+ it might be since there is not a random.
1132
+
1133
+ 0:34:36.336 --> 0:34:44.890
1134
+ Mean it should be, but it can happen things
1135
+ like that, and it's not clear where.
1136
+
1137
+ 0:34:44.965 --> 0:34:49.727
1138
+ At the break, however, they are typically
1139
+ not that bad because they are words which are
1140
+
1141
+ 0:34:49.727 --> 0:34:52.270
1142
+ not matching between reference and hypothesis.
1143
+
1144
+ 0:34:52.216 --> 0:34:56.871
1145
+ So normally it doesn't really matter that
1146
+ much because they are anyway not matching.
1147
+
1148
+ 0:34:57.657 --> 0:35:05.888
1149
+ And then you take the new MT output and
1150
+ use that to calculate your metric.
1151
+
1152
+ 0:35:05.785 --> 0:35:12.576
1153
+ Then it's again a perfect alignment for which
1154
+ you can calculate.
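A sketch of the evaluation re-segmentation just described: the hypothesis words are aligned to the reference words and the reference segment boundaries are projected onto the hypothesis, so sentence-level metrics can be computed again. `difflib` stands in here for the edit-distance alignment; dedicated tools (e.g. mwerSegmenter) implement this more carefully.

```python
import difflib

def resegment(hyp_words, ref_segments):
    """Split hyp_words into len(ref_segments) pieces following the reference boundaries."""
    ref_words = [w for seg in ref_segments for w in seg]
    sm = difflib.SequenceMatcher(a=ref_words, b=hyp_words, autojunk=False)
    ref2hyp = {0: 0, len(ref_words): len(hyp_words)}
    for i, j, size in sm.get_matching_blocks():
        for k in range(size):
            ref2hyp[i + k] = j + k
    cuts, pos, last = [0], 0, 0
    for seg in ref_segments[:-1]:
        pos += len(seg)
        anchor = max(p for p in ref2hyp if p <= pos)   # aligned word before the boundary
        cut = min(max(ref2hyp[anchor] + (pos - anchor), last), len(hyp_words))
        cuts.append(cut)
        last = cut
    cuts.append(len(hyp_words))
    return [hyp_words[a:b] for a, b in zip(cuts, cuts[1:])]

ref = [s.split() for s in ["we compare against the reference",
                           "which has three segments here"]]
hyp = "we compare against reference which has got three segments here".split()
print(resegment(hyp, ref))
```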
1155
+
1156
+ 0:35:14.714 --> 0:35:19.229
1157
+ Any idea you could do it the other way around.
1158
+
1159
+ 0:35:19.133 --> 0:35:23.361
1160
+ You could re-segment your reference to the.
1161
+
1162
+ 0:35:29.309 --> 0:35:30.368
1163
+ Which one would you select?
1164
+
1165
+ 0:35:34.214 --> 0:35:43.979
1166
+ I think segmenting the assertive also is much
1167
+ more natural because the reference sentence
1168
+
1169
+ 0:35:43.979 --> 0:35:46.474
1170
+ is the fixed solution.
1171
+
1172
+ 0:35:47.007 --> 0:35:52.947
1173
+ Yes, that's the right motivation if you do
1174
+ think about blue or so.
1175
+
1176
+ 0:35:52.858 --> 0:35:57.647
1177
+ Additionally important if you change your
1178
+ reference.
1179
+
1180
+ 0:35:57.857 --> 0:36:07.175
1181
+ You might have a different number of diagrams
1182
+ or diagrams because the sentences are different
1183
+
1184
+ 0:36:07.175 --> 0:36:08.067
1185
+ lengths.
1186
+
1187
+ 0:36:08.068 --> 0:36:15.347
1188
+ Here your five system, you're always comparing
1189
+ it to the same system, and you don't compare
1190
+
1191
+ 0:36:15.347 --> 0:36:16.455
1192
+ to different.
1193
+
1194
+ 0:36:16.736 --> 0:36:22.317
1195
+ The only different base of segmentation, but
1196
+ still it could make some do.
1197
+
1198
+ 0:36:25.645 --> 0:36:37.129
1199
+ DisfluenciesGood, that's all about sentence
1200
+ segmentation, then a bit about disfluencies
1201
+
1202
+ 0:36:37.129 --> 0:36:40.130
1203
+ and what there really.
1204
+
1205
+ 0:36:42.182 --> 0:36:51.138
1206
+ So as said in daily life, you're not speaking
1207
+ like very nice full sentences every.
1208
+
1209
+ 0:36:51.471 --> 0:36:53.420
1210
+ He was speaking powerful sentences.
1211
+
1212
+ 0:36:53.365 --> 0:36:54.451
1213
+ We do repetitions.
1214
+
1215
+ 0:36:54.834 --> 0:37:00.915
1216
+ It's especially if it's more interactive,
1217
+ so in meetings, phone calls and so on.
1218
+
1219
+ 0:37:00.840 --> 0:37:04.521
1220
+ If you have multiple speakers, they also break.
1221
+
1222
+ 0:37:04.724 --> 0:37:16.651
1223
+ Each other, and then if you keep them, they
1224
+ are harder to translate because most of your
1225
+
1226
+ 0:37:16.651 --> 0:37:17.991
1227
+ training.
1228
+
1229
+ 0:37:18.278 --> 0:37:30.449
1230
+ It's also very difficult to read, so we'll
1231
+ have some examples there to transcribe everything
1232
+
1233
+ 0:37:30.449 --> 0:37:32.543
1234
+ as it was said.
1235
+
1236
+ 0:37:33.473 --> 0:37:36.555
1237
+ What type of things are there?
1238
+
1239
+ 0:37:37.717 --> 0:37:42.942
1240
+ So you have all these filler words.
1241
+
1242
+ 0:37:42.797 --> 0:37:47.363
1243
+ These are very easy to remove.
1244
+
1245
+ 0:37:47.216 --> 0:37:52.964
1246
+ You can just use regular expressions.
1247
+
1248
+ 0:37:53.433 --> 0:38:00.139
1249
+ Is getting more difficult with some other
1250
+ type of filler words.
1251
+
1252
+ 0:38:00.034 --> 0:38:03.391
1253
+ In German you have this or in.
1254
+
1255
+ 0:38:04.024 --> 0:38:08.473
1256
+ And these ones you cannot just remove by regular
1257
+ expression.
1258
+
1259
+ 0:38:08.400 --> 0:38:15.032
1260
+ You shouldn't remove all yacht from a text
1261
+ because it might be very important information
1262
+
1263
+ 0:38:15.032 --> 0:38:15.769
1264
+ for well.
1265
+
1266
+ 0:38:15.715 --> 0:38:19.995
1267
+ It may be not as important as you are, but
1268
+ still it might be very important.
1269
+
1270
+ 0:38:20.300 --> 0:38:24.215
1271
+ So just removing them is there already more
1272
+ difficult.
1273
+
1274
+ 0:38:26.586 --> 0:38:29.162
1275
+ Then you have these repetitions.
1276
+
1277
+ 0:38:29.084 --> 0:38:32.580
1278
+ You have something like mean saw him there.
1279
+
1280
+ 0:38:32.500 --> 0:38:33.619
1281
+ There was a.
1282
+
1283
+ 0:38:34.334 --> 0:38:41.001
1284
+ And while for the first one that might be
1285
+ very easy to remove because you just look for
1286
+
1287
+ 0:38:41.001 --> 0:38:47.821
1288
+ double, the thing is that the repetition might
1289
+ not be exactly the same, so there is there
1290
+
1291
+ 0:38:47.821 --> 0:38:48.199
1292
+ was.
1293
+
1294
+ 0:38:48.124 --> 0:38:54.110
1295
+ So there is already getting a bit more complicated,
1296
+ of course still possible.
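A small sketch of the rule-based cleanup mentioned above: simple filler tokens can be stripped with regular expressions, and exact immediate repetitions can be collapsed. As the lecture warns, such rules have to stay conservative, and restarts that are not exact copies already need more than this; the token list and patterns here are assumptions.

```python
import re

# assumed filler inventory; real systems would use a language-specific list
FILLERS = re.compile(r"\b(?:uh|uhm|um|er|ah)\b[,.]?\s*", re.IGNORECASE)
# exact immediate repetition of one or two words, e.g. "a ticket a ticket"
REPEAT = re.compile(r"\b(\w+(?:\s+\w+)?)\s+\1\b", re.IGNORECASE)

def clean(text):
    text = FILLERS.sub("", text)
    prev = None
    while prev != text:          # collapse repetitions until nothing changes
        prev = text
        text = REPEAT.sub(r"\1", text)
    return re.sub(r"\s+", " ", text).strip()

print(clean("I uh want a ticket a ticket to Houston"))
# -> "I want a ticket to Houston"
```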
1297
+
1298
+ 0:38:54.614 --> 0:39:01.929
1299
+ You can remove Denver so the real sense would
1300
+ be like to have a ticket to Houston.
1301
+
1302
+ 0:39:02.882 --> 0:39:13.327
1303
+ But there the detection, of course, is getting
1304
+ more challenging as you want to get rid of.
1305
+
1306
+ 0:39:13.893 --> 0:39:21.699
1307
+ You don't have the data, of course, which
1308
+ makes all the tasks harder, but you probably
1309
+
1310
+ 0:39:21.699 --> 0:39:22.507
1311
+ want to.
1312
+
1313
+ 0:39:22.417 --> 0:39:24.774
1314
+ That's really meaningful.
1315
+
1316
+ 0:39:24.684 --> 0:39:26.063
1317
+ Current isn't.
1318
+
1319
+ 0:39:25.972 --> 0:39:31.124
1320
+ That is now a really good point and it's really
1321
+ there.
1322
+
1323
+ 0:39:31.051 --> 0:39:34.785
1324
+ The thing about what is your final task?
1325
+
1326
+ 0:39:35.155 --> 0:39:45.526
1327
+ If you want to have a transcript reading it,
1328
+ I'm not sure if we have another example.
1329
+
1330
+ 0:39:45.845 --> 0:39:54.171
1331
+ So there it's nicer if you have a clean transcript
1332
+ and if you see subtitles in, they're also not
1333
+
1334
+ 0:39:54.171 --> 0:39:56.625
1335
+ having all the repetitions.
1336
+
1337
+ 0:39:56.537 --> 0:40:03.812
1338
+ It's the nice way to shorten but also getting
1339
+ the structure you cannot even make.
1340
+
1341
+ 0:40:04.064 --> 0:40:11.407
1342
+ So in this situation, of course, they might
1343
+ give you information.
1344
+
1345
+ 0:40:11.296 --> 0:40:14.749
1346
+ There is a lot of stuttering.
1347
+
1348
+ 0:40:15.015 --> 0:40:22.835
1349
+ So in this case agree it might be helpful
1350
+ in some way, but meaning reading all the disfluencies
1351
+
1352
+ 0:40:22.835 --> 0:40:25.198
1353
+ is getting really difficult.
1354
+
1355
+ 0:40:25.116 --> 0:40:28.051
1356
+ If you have the next one, we have.
1357
+
1358
+ 0:40:28.308 --> 0:40:31.630
1359
+ That's a very long text.
1360
+
1361
+ 0:40:31.497 --> 0:40:35.824
1362
+ You need a bit of time to parse it.
1363
+
1364
+ 0:40:35.689 --> 0:40:39.479
1365
+ This one is not important.
1366
+
1367
+ 0:40:40.480 --> 0:40:48.461
1368
+ It might be nice if you can start reading
1369
+ from here.
1370
+
1371
+ 0:40:48.310 --> 0:40:52.012
1372
+ Let's have a look here.
1373
+
1374
+ 0:40:51.858 --> 0:40:54.798
1375
+ Try to read this.
1376
+
1377
+ 0:40:57.297 --> 0:41:02.725
1378
+ You can understand it, but think you need
1379
+ a bit of time to really understand what was.
1380
+
1381
+ 0:41:11.711 --> 0:41:21.480
1382
+ And now we have the same text, but you have
1383
+ highlighted in bold, and not only read the
1384
+
1385
+ 0:41:21.480 --> 0:41:22.154
1386
+ bold.
1387
+
1388
+ 0:41:23.984 --> 0:41:25.995
1389
+ And ignore everything which is not bold.
1390
+
1391
+ 0:41:30.250 --> 0:41:49.121
1392
+ Would assume it's easier to read just the
1393
+ bold part more faster and more faster.
1394
+
1395
+ 0:41:50.750 --> 0:41:57.626
1396
+ Yeah, it might be, but I'm not sure we have
1397
+ a master thesis of that.
1398
+
1399
+ 0:41:57.526 --> 0:41:59.624
1400
+ If seen my videos,.
1401
+
1402
+ 0:42:00.000 --> 0:42:09.875
1403
+ Of the recordings, I also have it more likely
1404
+ that it's like a fluent speak and I'm not like
1405
+
1406
+ 0:42:09.875 --> 0:42:12.318
1407
+ doing the hesitations.
1408
+
1409
+ 0:42:12.652 --> 0:42:23.764
1410
+ Don't know if somebody else has looked into
1411
+ the Cusera video, but notice that.
1412
+
1413
+ 0:42:25.005 --> 0:42:31.879
1414
+ For these videos spoke every minute, three
1415
+ times or something, and then people were there
1416
+
1417
+ 0:42:31.879 --> 0:42:35.011
1418
+ and cutting things and making hopefully.
1419
+
1420
+ 0:42:35.635 --> 0:42:42.445
1421
+ And therefore if you want to more achieve
1422
+ that, of course, no longer exactly what was
1423
+
1424
+ 0:42:42.445 --> 0:42:50.206
1425
+ happening, but if it more looks like a professional
1426
+ video, then you would have to do that and cut
1427
+
1428
+ 0:42:50.206 --> 0:42:50.998
1429
+ that out.
1430
+
1431
+ 0:42:50.919 --> 0:42:53.535
1432
+ But yeah, there are definitely.
1433
+
1434
+ 0:42:55.996 --> 0:42:59.008
1435
+ We're also going to do this thing again.
1436
+
1437
+ 0:42:58.935 --> 0:43:02.317
1438
+ First turn is like I'm going to have a very.
1439
+
1440
+ 0:43:02.422 --> 0:43:07.449
1441
+ Which in the end they start to slow down just
1442
+ without feeling as though they're.
1443
+
1444
+ 0:43:07.407 --> 0:43:10.212
1445
+ It's a good point for the next.
1446
+
1447
+ 0:43:10.124 --> 0:43:13.561
1448
+ There is not the one perfect solution.
1449
+
1450
+ 0:43:13.473 --> 0:43:20.656
1451
+ There's some work on disfluency removal,
1452
+ but of course there's also disability.
1453
+
1454
+ 0:43:20.567 --> 0:43:27.397
1455
+ Removal is not that easy, so do you just remove
1456
+ that's in order everywhere.
1457
+
1458
+ 0:43:27.607 --> 0:43:29.708
1459
+ But how much like cleaning do you do?
1460
+
1461
+ 0:43:29.652 --> 0:43:31.368
1462
+ It's more a continuous thing.
1463
+
1464
+ 0:43:31.811 --> 0:43:38.211
1465
+ Is it more really you only remove stuff or
1466
+ are you also into rephrasing and here is only
1467
+
1468
+ 0:43:38.211 --> 0:43:38.930
1469
+ removing?
1470
+
1471
+ 0:43:39.279 --> 0:43:41.664
1472
+ But maybe you want to rephrase it.
1473
+
1474
+ 0:43:41.596 --> 0:43:43.234
1475
+ That's hearing better.
1476
+
1477
+ 0:43:43.503 --> 0:43:49.185
1478
+ So then it's going into what people are doing
1479
+ in style transfer.
1480
+
1481
+ 0:43:49.097 --> 0:43:52.422
1482
+ We are going from a speech style to.
1483
+
1484
+ 0:43:52.872 --> 0:44:07.632
1485
+ So there is more continuum, and of course
1486
+ Airconditioner is not the perfect solution,
1487
+
1488
+ 0:44:07.632 --> 0:44:10.722
1489
+ but exactly what.
1490
+
1491
+ 0:44:15.615 --> 0:44:19.005
1492
+ Yeah, we're challenging.
1493
+
1494
+ 0:44:18.869 --> 0:44:30.216
1495
+ You have examples where the direct copy is
1496
+ not as hard or is not exactly the same.
1497
+
1498
+ 0:44:30.080 --> 0:44:35.415
1499
+ That is, of course, more challenging.
1500
+
1501
+ 0:44:41.861 --> 0:44:49.889
1502
+ If it's getting really mean why it's so challenging,
1503
+ if it's really spontaneous even for the speaker,
1504
+
1505
+ 0:44:49.889 --> 0:44:55.634
1506
+ you need maybe even the video to really get
1507
+ that and at least the audio.
1508
+
1509
+ 0:45:01.841 --> 0:45:06.025
1510
+ Yeah what it also depends on.
1511
+
1512
+ 0:45:06.626 --> 0:45:15.253
1513
+ The purpose, of course, and very important
1514
+ thing is the easiest tasks just to removing.
1515
+
1516
+ 0:45:15.675 --> 0:45:25.841
1517
+ Of course you have to be very careful because
1518
+ if you remove some of the not, it's normally
1519
+
1520
+ 0:45:25.841 --> 0:45:26.958
1521
+ not much.
1522
+
1523
+ 0:45:27.227 --> 0:45:33.176
1524
+ But if you remove too much, of course, that's
1525
+ very, very bad because you're losing important.
1526
+
1527
+ 0:45:33.653 --> 0:45:46.176
1528
+ And this might be even more challenging if
1529
+ you think about rarer and unseen words.
1530
+
1531
+ 0:45:46.226 --> 0:45:56.532
1532
+ So when doing this removal, it's important
1533
+ to be careful and normally more conservative.
1534
+
1535
+ 0:46:03.083 --> 0:46:15.096
1536
+ Of course, also you have to again see if you're
1537
+ doing that now in a two step approach, not
1538
+
1539
+ 0:46:15.096 --> 0:46:17.076
1540
+ an end to end.
1541
+
1542
+ 0:46:16.944 --> 0:46:20.777
1543
+ So first you need a remote.
1544
+
1545
+ 0:46:21.501 --> 0:46:30.230
1546
+ But you have to somehow think of it in the whole
1547
+ pipeline.
1548
+
1549
+ 0:46:30.074 --> 0:46:36.936
1550
+ If you learn text or remove disfluencies,.
1551
+
1552
+ 0:46:36.796 --> 0:46:44.070
1553
+ But it might be that the ASR system is outputing
1554
+ something else or that it's more of an ASR
1555
+
1556
+ 0:46:44.070 --> 0:46:44.623
1557
+ error.
1558
+
1559
+ 0:46:44.864 --> 0:46:46.756
1560
+ So um.
1561
+
1562
+ 0:46:46.506 --> 0:46:52.248
1563
+ Just for example, if you do it based on language
1564
+ modeling scores, it might be that you're just
1565
+
1566
+ 0:46:52.248 --> 0:46:57.568
1567
+ the language modeling score because the ASR has
1568
+ done some errors, so you really have to see
1569
+
1570
+ 0:46:57.568 --> 0:46:59.079
1571
+ the combination of that.
1572
+
1573
+ 0:46:59.419 --> 0:47:04.285
1574
+ And for example, we had like partial words.
1575
+
1576
+ 0:47:04.174 --> 0:47:06.441
1577
+ They are like some.
1578
+
1579
+ 0:47:06.328 --> 0:47:08.827
1580
+ We didn't have that.
1581
+
1582
+ 0:47:08.908 --> 0:47:18.248
1583
+ So these feelings cannot be that you start
1584
+ in the middle of the word and then you switch
1585
+
1586
+ 0:47:18.248 --> 0:47:19.182
1587
+ because.
1588
+
1589
+ 0:47:19.499 --> 0:47:23.214
1590
+ And of course, in text in perfect transcript,
1591
+ that's very easy to recognize.
1592
+
1593
+ 0:47:23.166 --> 0:47:24.374
1594
+ That's not a real word.
1595
+
1596
+ 0:47:24.904 --> 0:47:37.198
1597
+ However, when you really do it into an system,
1598
+ he will normally detect some type of word because
1599
+
1600
+ 0:47:37.198 --> 0:47:40.747
1601
+ he only can help the words.
1602
+
1603
+ 0:47:50.050 --> 0:48:03.450
1604
+ Example: We should think so if you have this
1605
+ in the transcript it's easy to detect as a
1606
+
1607
+ 0:48:03.450 --> 0:48:05.277
1608
+ disfluency.
1609
+
1610
+ 0:48:05.986 --> 0:48:11.619
1611
+ And then, of course, it's more challenging
1612
+ in a real world example where you have.
1613
+
1614
+ 0:48:12.492 --> 0:48:27.834
1615
+ Style TransferNow to the approaches one thing
1616
+ is to really put it in between so you put your
1617
+
1618
+ 0:48:27.834 --> 0:48:29.814
1619
+ A's system.
1620
+
1621
+ 0:48:31.391 --> 0:48:45.139
1622
+ So what your task is like, so you have this
1623
+ text and the outputs in this text.
1624
+
1625
+ 0:48:45.565 --> 0:48:49.605
1626
+ There is different formulations of that.
1627
+
1628
+ 0:48:49.507 --> 0:48:54.535
1629
+ You might not be able to do everything like
1630
+ that.
1631
+
1632
+ 0:48:55.195 --> 0:49:10.852
1633
+ Or do you also allow, for example, rephrasing
1634
+ for reordering so in text you might have the
1635
+
1636
+ 0:49:10.852 --> 0:49:13.605
1637
+ word correctly.
1638
+
1639
+ 0:49:13.513 --> 0:49:24.201
1640
+ But the easiest thing is you only do it more
1641
+ like removing, so some things can be removed.
1642
+
1643
+ 0:49:29.049 --> 0:49:34.508
1644
+ Any ideas how to do that this is output.
1645
+
1646
+ 0:49:34.375 --> 0:49:41.036
1647
+ You have training data so we have training
1648
+ data.
1649
+
1650
+ 0:49:47.507 --> 0:49:55.869
1651
+ To put in with the spoon you can eat it even
1652
+ after it is out, but after the machine has.
1653
+
1654
+ 0:50:00.000 --> 0:50:05.511
1655
+ Was wearing rocks, so you have not just the
1656
+ shoes you remove but wearing them as input,
1657
+
1658
+ 0:50:05.511 --> 0:50:07.578
1659
+ as disfluent text and as output.
1660
+
1661
+ 0:50:07.515 --> 0:50:09.152
1662
+ It should be fueled text.
1663
+
1664
+ 0:50:09.089 --> 0:50:15.168
1665
+ It can be before or after recycling as you
1666
+ said, but you have this type of task, so technically
1667
+
1668
+ 0:50:15.168 --> 0:50:20.043
1669
+ how would you address this type of task when
1670
+ you have to solve this type of.
1671
+
1672
+ 0:50:24.364 --> 0:50:26.181
1673
+ That's exactly so.
1674
+
1675
+ 0:50:26.086 --> 0:50:28.802
1676
+ That's one way of doing it.
1677
+
1678
+ 0:50:28.705 --> 0:50:33.072
1679
+ It's a translation task and you train your.
1680
+
1681
+ 0:50:33.913 --> 0:50:34.683
1682
+ Can do.
1683
+
1684
+ 0:50:34.587 --> 0:50:42.859
1685
+ Then, of course, the bit of the challenge
1686
+ is that you automatically allow rephrasing
1687
+
1688
+ 0:50:42.859 --> 0:50:43.540
1689
+ stuff.
1690
+
1691
+ 0:50:43.943 --> 0:50:52.240
1692
+ Which of the one end is good so you have more
1693
+ opportunities but it might be also a bad thing
1694
+
1695
+ 0:50:52.240 --> 0:50:58.307
1696
+ because if you have more opportunities you
1697
+ have more opportunities.
1698
+
1699
+ 0:51:01.041 --> 0:51:08.300
1700
+ If you want to prevent that, it can also do
1701
+ more simple labeling, so for each word your
1702
+
1703
+ 0:51:08.300 --> 0:51:10.693
1704
+ label should not be removed.
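A sketch of the labelling formulation just described: instead of free generation, every input token gets a keep/delete label. Training labels can be derived from parallel disfluent/cleaned text with a word alignment; `difflib` is used here as a simple stand-in for that alignment, so this is illustrative rather than the lecture's actual setup.

```python
import difflib

def keep_delete_labels(disfluent, fluent):
    """Return one 'keep'/'delete' label per disfluent token."""
    labels = ["delete"] * len(disfluent)
    sm = difflib.SequenceMatcher(a=disfluent, b=fluent, autojunk=False)
    for i, _, size in sm.get_matching_blocks():
        for k in range(size):
            labels[i + k] = "keep"
    return labels

disfluent = "I want a ticket to Denver uh to Houston".split()
fluent = "I want a ticket to Houston".split()
print(list(zip(disfluent, keep_delete_labels(disfluent, fluent))))
```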
1705
+
1706
+ 0:51:12.132 --> 0:51:17.658
1707
+ People have also looked into parsing.
1708
+
1709
+ 0:51:17.530 --> 0:51:29.098
1710
+ You remember maybe the parse trees at the beginning
1711
+ like the structure because the ideas.
1712
+
1713
+ 0:51:29.649 --> 0:51:45.779
1714
+ There's also more unsupervised approaches
1715
+ where you then phrase it as a style transfer
1716
+
1717
+ 0:51:45.779 --> 0:51:46.892
1718
+ task.
1719
+
1720
+ 0:51:50.310 --> 0:51:58.601
1721
+ At the last point since we have that yes,
1722
+ it has also been done in an end-to-end fashion
1723
+
1724
+ 0:51:58.601 --> 0:52:06.519
1725
+ so that it's really you have as input the audio
1726
+ signal and output you have than the.
1727
+
1728
+ 0:52:06.446 --> 0:52:10.750
1729
+ The text, without influence, is a clearly
1730
+ clear text.
1731
+
1732
+ 0:52:11.131 --> 0:52:19.069
1733
+ You model every single total, which of course
1734
+ has a big advantage.
1735
+
1736
+ 0:52:18.950 --> 0:52:25.706
1737
+ You can use these paralinguistic features,
1738
+ pauses, and.
1739
+
1740
+ 0:52:25.705 --> 0:52:34.091
1741
+ If you switch so you start something then
1742
+ oh it doesn't work continue differently so.
1743
+
1744
+ 0:52:34.374 --> 0:52:42.689
1745
+ So you can easily use in a fashion while in
1746
+ a cascade approach.
1747
+
1748
+ 0:52:42.559 --> 0:52:47.500
1749
+ As we saw there you have text input.
1750
+
1751
+ 0:52:49.990 --> 0:53:02.389
1752
+ But on the one end we have again, and in the
1753
+ more extreme case the problem before was endless.
1754
+
1755
+ 0:53:02.258 --> 0:53:06.961
1756
+ Of course there is even less data.
1757
+
1758
+ 0:53:11.611 --> 0:53:12.837
1759
+ Good.
1760
+
1761
+ 0:53:12.633 --> 0:53:30.817
1762
+ This is all about the input to a very more
1763
+ person, or maybe if you think about YouTube.
1764
+
1765
+ 0:53:32.752 --> 0:53:34.989
1766
+ Talk so this could use be very exciting.
1767
+
1768
+ 0:53:36.296 --> 0:53:42.016
1769
+ Is more viewed as style transferred.
1770
+
1771
+ 0:53:41.861 --> 0:53:53.149
1772
+ You can use ideas from machine translation
1773
+ where you have one language.
1774
+
1775
+ 0:53:53.713 --> 0:53:57.193
1776
+ So there is ways of trying to do this type
1777
+ of style transfer.
1778
+
1779
+ 0:53:57.637 --> 0:54:02.478
1780
+ Think is definitely also very promising to
1781
+ make it more and more fluent in a business.
1782
+
1783
+ 0:54:03.223 --> 0:54:17.974
1784
+ Because one major issue about all the previous
1785
+ ones is that you need training data and then
1786
+
1787
+ 0:54:17.974 --> 0:54:21.021
1788
+ you need training.
1789
+
1790
+ 0:54:21.381 --> 0:54:32.966
1791
+ So I mean, think that we are only really of
1792
+ data that we have for English.
1793
+
1794
+ 0:54:32.811 --> 0:54:39.457
1795
+ Maybe there is a very few data in German.
1796
+
1797
+ 0:54:42.382 --> 0:54:49.680
1798
+ Low Latency SpeechOkay, then let's talk about
1799
+ low latency speech.
1800
+
1801
+ 0:54:50.270 --> 0:55:05.158
1802
+ So the idea is if we are doing live translation
1803
+ of a talker, so we want to start out.
1804
+
1805
+ 0:55:05.325 --> 0:55:23.010
1806
+ This is possible because there is typically
1807
+ some kind of monotony in many languages.
1808
+
1809
+ 0:55:24.504 --> 0:55:29.765
1810
+ And this is also what, for example, human
1811
+ interpreters are doing to have a really low
1812
+
1813
+ 0:55:29.765 --> 0:55:30.071
1814
+ lag.
1815
+
1816
+ 0:55:30.750 --> 0:55:34.393
1817
+ They are even going further.
1818
+
1819
+ 0:55:34.268 --> 0:55:40.928
1820
+ They guess what will be the ending of the
1821
+ sentence.
1822
+
1823
+ 0:55:41.421 --> 0:55:51.120
1824
+ Then they can already continue, although it's
1825
+ not sad it might be needed, but that is even
1826
+
1827
+ 0:55:51.120 --> 0:55:53.039
1828
+ more challenging.
1829
+
1830
+ 0:55:54.714 --> 0:55:58.014
1831
+ Why is it so difficult?
1832
+
1833
+ 0:55:57.876 --> 0:56:09.799
1834
+ There is this trade-off: on the one end for
1835
+ a good quality you want to have more context because
1836
+
1837
+ 0:56:09.799 --> 0:56:14.513
1838
+ we learn if we have more context.
1839
+
1840
+ 0:56:15.015 --> 0:56:24.033
1841
+ And therefore to have more context you have
1842
+ to wait as long as possible.
1843
+
1844
+ 0:56:23.911 --> 0:56:27.693
1845
+ The best is to have the full.
1846
+
1847
+ 0:56:28.168 --> 0:56:35.244
1848
+ On the other hand, you want to have a low
1849
+ latency for the user to wait to generate as
1850
+
1851
+ 0:56:35.244 --> 0:56:35.737
1852
+ soon.
1853
+
1854
+ 0:56:36.356 --> 0:56:47.149
1855
+ So if you're doing no situation you have to
1856
+ find the best way to start in order to have
1857
+
1858
+ 0:56:47.149 --> 0:56:48.130
1859
+ a good.
1860
+
1861
+ 0:56:48.728 --> 0:56:52.296
1862
+ There's no longer the perfect solution.
1863
+
1864
+ 0:56:52.207 --> 0:56:56.847
1865
+ People will also evaluate what is the translation.
1866
+
1867
+ 0:56:57.657 --> 0:57:09.942
1868
+ While it's challenging in German to English,
1869
+ German has this very nice thing where the prefix
1870
+
1871
+ 0:57:09.942 --> 0:57:16.607
1872
+ of the word can be put at the end of the sentence.
1873
+
1874
+ 0:57:17.137 --> 0:57:24.201
1875
+ And you only know if the person registers
1876
+ or cancels his station at the end of the sentence.
1877
+
1878
+ 0:57:24.985 --> 0:57:33.690
1879
+ So if you want to start the translation in
1880
+ English you need to know at this point is the.
1881
+
1882
+ 0:57:35.275 --> 0:57:39.993
1883
+ So you would have to wait until the end of
1884
+ the year.
1885
+
1886
+ 0:57:39.904 --> 0:57:42.934
1887
+ That's not really what you want.
1888
+
1889
+ 0:57:43.843 --> 0:57:45.795
1890
+ What happened.
1891
+
1892
+ 0:57:47.207 --> 0:58:09.887
1893
+ Other solutions of doing that are: Have been
1894
+ motivating like how we can do that subject
1895
+
1896
+ 0:58:09.887 --> 0:58:16.073
1897
+ object or subject work.
1898
+
1899
+ 0:58:16.496 --> 0:58:24.582
1900
+ In German it's not always subject, but there
1901
+ are relative sentence where you have that,
1902
+
1903
+ 0:58:24.582 --> 0:58:25.777
1904
+ so it needs.
1905
+
1906
+ 0:58:28.808 --> 0:58:41.858
1907
+ How we can do that is, we'll look today into
1908
+ three ways of doing that.
1909
+
1910
+ 0:58:41.674 --> 0:58:46.277
1911
+ The one is to mitigate.
1912
+
1913
+ 0:58:46.766 --> 0:58:54.824
1914
+ And then the other idea is to do retranslating,
1915
+ and there you can now use the text output.
1916
+
1917
+ 0:58:54.934 --> 0:59:02.302
1918
+ So the idea is you translate, and if you later
1919
+ notice it was wrong then you can retranslate
1920
+
1921
+ 0:59:02.302 --> 0:59:03.343
1922
+ and correct.
1923
+
1924
+ 0:59:03.803 --> 0:59:14.383
1925
+ Or you can do what is called extremely coding,
1926
+ so you can generically.
1927
+
1928
+ 0:59:17.237 --> 0:59:30.382
1929
+ Let's start with the optimization, so if you
1930
+ have a sentence, it may reach a conference,
1931
+
1932
+ 0:59:30.382 --> 0:59:33.040
1933
+ and in this time.
1934
+
1935
+ 0:59:32.993 --> 0:59:39.592
1936
+ So you have a good translation quality while
1937
+ still having low latency.
1938
+
1939
+ 0:59:39.699 --> 0:59:50.513
1940
+ You have an extra model which does your segmentation
1941
+ before, but your aim is not to have a segmentation.
1942
+
1943
+ 0:59:50.470 --> 0:59:53.624
1944
+ But you can somehow measure in training data.
1945
+
1946
+ 0:59:53.555 --> 0:59:59.841
1947
+ If do these types of segment lengths, that's
1948
+ my latency and that's my translation quality,
1949
+
1950
+ 0:59:59.841 --> 1:00:02.811
1951
+ and then you can try to search a good way.
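A toy sketch of the tuning loop just described: on held-out data, try several maximum segment lengths, measure translation quality and latency for each, and pick a good operating point. `run_system` is a hypothetical stand-in that segments the dev audio with the given maximum length, runs the fixed speech-translation system, and returns (quality, average latency); the numbers are made up.

```python
def pick_segment_length(run_system, candidates=(5, 10, 15, 20, 30), max_latency=5.0):
    best_len, best_quality = None, float("-inf")
    for max_len in candidates:
        quality, latency = run_system(max_len)
        print(f"max_len={max_len:>2}s  quality={quality:.1f}  latency={latency:.1f}s")
        if latency <= max_latency and quality > best_quality:
            best_len, best_quality = max_len, quality
    return best_len

# toy stand-in: longer segments help quality but cost latency
print(pick_segment_length(lambda L: (20 + 0.3 * L, 0.4 * L)))
```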
1952
+
1953
+ 1:00:03.443 --> 1:00:20.188
1954
+ If you're doing that one, it's an extra component,
1955
+ so you can use your system as it was.
1956
+
1957
+ 1:00:22.002 --> 1:00:28.373
1958
+ The other idea is to directly output the first
1959
+ hypothesis always, so always when you have
1960
+
1961
+ 1:00:28.373 --> 1:00:34.201
1962
+ text or audio we translate, and if we then
1963
+ have more context available we can update.
1964
+
1965
+ 1:00:35.015 --> 1:00:50.195
1966
+ So imagine before, if get an eye register
1967
+ and there's a sentence continued, then.
1968
+
1969
+ 1:00:50.670 --> 1:00:54.298
1970
+ So you change the output.
1971
+
1972
+ 1:00:54.159 --> 1:01:07.398
1973
+ Of course, that might be also leading to bad
1974
+ user experience if you always flicker and change
1975
+
1976
+ 1:01:07.398 --> 1:01:09.229
1977
+ your output.
1978
+
1979
+ 1:01:09.669 --> 1:01:15.329
1980
+ It's a bit like human interpreters, who also are able
1981
+ to correct, so they're doing a more long text.
1982
+
1983
+ 1:01:15.268 --> 1:01:20.829
1984
+ If they are guessing how to continue to say
1985
+ and then he's saying something different, they
1986
+
1987
+ 1:01:20.829 --> 1:01:22.480
1988
+ also have to correct themselves.
1989
+
1990
+ 1:01:22.418 --> 1:01:26.795
1991
+ So here, since it's not audio output, we can even
1992
+ change what we have said.
1993
+
1994
+ 1:01:26.733 --> 1:01:29.632
1995
+ Yes, that's exactly what we have implemented.
1996
+
1997
+ 1:01:31.431 --> 1:01:49.217
1998
+ So how that works is, we are aware, and then
1999
+ we translate it, and if we get more input like
2000
+
2001
+ 1:01:49.217 --> 1:01:51.344
2002
+ you, then.
2003
+
2004
+ 1:01:51.711 --> 1:02:00.223
2005
+ And so we can always continue to do that and
2006
+ improve the transcript that we have.
2007
+
2008
+ 1:02:00.480 --> 1:02:07.729
2009
+ So in the end we have the lowest possible
2010
+ latency because we always output what is possible.
2011
+
2012
+ 1:02:07.651 --> 1:02:14.368
2013
+ On the other hand, introducing a bit of a
2014
+ new problem is: There's another challenge when
2015
+
2016
+ 1:02:14.368 --> 1:02:20.104
2017
+ we first used it: this approach was first used
2018
+ for the older statistical systems, and it worked fine.
2019
+
2020
+ 1:02:20.029 --> 1:02:21.353
2021
+ When you switch to NMT,
2022
+
2023
+ 1:02:21.283 --> 1:02:25.573
2024
+ you see one problem: it is generating even
2025
+ more flickering.
2026
+
2027
+ 1:02:25.503 --> 1:02:28.880
2028
+ The problem is the normal machine translation.
2029
+
2030
+ 1:02:29.669 --> 1:02:35.414
2031
+ It has implicitly learned that the output always
2032
+ ends with a dot, and it's always a full sentence.
2033
+
2034
+ 1:02:36.696 --> 1:02:42.466
2035
+ And this was even more important somewhere
2036
+ in the model than really what is in the input.
2037
+
2038
+ 1:02:42.983 --> 1:02:55.910
2039
+ So if you give him a partial sentence, it
2040
+ will still generate a full sentence.
2041
+
2042
+ 1:02:55.747 --> 1:02:58.214
2043
+ So encourage.
2044
+
2045
+ 1:02:58.298 --> 1:03:05.821
2046
+ It's like trying to just continue it somehow
2047
+ to a full sentence and if it's doing better
2048
+
2049
+ 1:03:05.821 --> 1:03:10.555
2050
+ guessing stuff then you have to even have more
2051
+ changes.
2052
+
2053
+ 1:03:10.890 --> 1:03:23.944
2054
+ So here we have a trained mismatch and that's
2055
+ maybe more a general important thing that the
2056
+
2057
+ 1:03:23.944 --> 1:03:28.910
2058
+ model might learn something a bit different.
2059
+
2060
+ 1:03:29.289 --> 1:03:32.636
2061
+ It's always ending with a dot, so you don't
2062
+ just guess something in general.
2063
+
2064
+ 1:03:33.053 --> 1:03:35.415
2065
+ So we have here a train-test mismatch.
2066
+
2067
+ 1:03:38.918 --> 1:03:41.248
2068
+ Given we have a train-test mismatch:
2069
+
2070
+ 1:03:41.184 --> 1:03:43.710
2071
+ What is the best way to address that?
2072
+
2073
+ 1:03:46.526 --> 1:03:51.934
2074
+ That's exactly right, so we have to, like,
2075
+ train also on that.
2076
+
2077
+ 1:03:52.692 --> 1:03:55.503
2078
+ The problem is for partial sentences:
2079
+
2080
+ 1:03:55.431 --> 1:03:59.612
2081
+ there's no training data, so it's hard to
2082
+ find examples of them.
2083
+
2084
+ 1:04:00.580 --> 1:04:06.531
2085
+ However, it's quite easy to generate artificial
2086
+ partial sentences, at least for the source.
2087
+
2088
+ 1:04:06.926 --> 1:04:15.367
2089
+ So you just take, you take all the prefixes
2090
+ of the source data.
2091
+
2092
+ 1:04:17.017 --> 1:04:22.794
2093
+ The problem of course is then a bit: what
2094
+ is the corresponding target?
2095
+
2096
+ 1:04:22.699 --> 1:04:30.846
2097
+ If you have a sentence prefix like "I encourage all of",
2098
+ what should be the right target for that?
2099
+
2100
+ 1:04:31.491 --> 1:04:45.381
2101
+ And the constraints on the one hand, it should
2102
+ be as long as possible, so you don't always have
2103
+
2104
+ 1:04:45.381 --> 1:04:47.541
2105
+ a long delay.
2106
+
2107
+ 1:04:47.687 --> 1:04:55.556
2108
+ On the other hand, it should also be a prefix
2109
+ of the full translation, and it should be not
2110
+
2111
+ 1:04:55.556 --> 1:04:57.304
2112
+ too much inventing.
2113
+
2114
+ 1:04:58.758 --> 1:05:02.170
2115
+ A very easy solution works fine.
2116
+
2117
+ 1:05:02.066 --> 1:05:05.421
2118
+ You can just do it length-based.
2119
+
2120
+ 1:05:05.316 --> 1:05:09.617
2121
+ For two thirds of the source, you also take two thirds of the target.
2122
+
2123
+ 1:05:10.070 --> 1:05:19.626
2124
+ It is then implicitly learning to guess a bit,
2125
+ if you think about the beginning of example.
2126
+
2127
+ 1:05:20.000 --> 1:05:30.287
2128
+ For this one, if you do two thirds, or like half in
2129
+ this case, the target would be "I register".
2130
+
2131
+ 1:05:30.510 --> 1:05:39.289
2132
+ So you're doing a bit of implicit guessing,
2133
+ and if it's getting wrong you have rewriting,
2134
+
2135
+ 1:05:39.289 --> 1:05:43.581
2136
+ but you're doing a good amount of guessing.
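The length-ratio heuristic described in the last few segments can be sketched roughly as follows; this is only an illustration of the idea, and the function name and the strictly proportional split are assumptions, not the lecture's exact recipe.

```python
# Rough sketch of building artificial partial-sentence training data:
# for every source prefix, keep a proportional share of the target,
# and keep the full sentence pair so the model still learns to finish.
def make_prefix_pairs(src_words, tgt_words):
    pairs = [(src_words, tgt_words)]              # the full sentence pair
    for k in range(1, len(src_words)):            # every source prefix
        frac = k / len(src_words)
        j = round(frac * len(tgt_words))          # proportional target length
        pairs.append((src_words[:k], tgt_words[:j]))
    return pairs
```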
2137
+
2138
+ 1:05:49.849 --> 1:05:53.950
2139
+ In addition, this would be like how it looks
2140
+ like if it was like.
2141
+
2142
+ 1:05:53.888 --> 1:05:58.301
2143
+ If it wasn't a housing game, then the target
2144
+ could be something like.
2145
+
2146
+ 1:05:58.979 --> 1:06:02.513
2147
+ One problem is, if you just do that this
2148
+ way.
2149
+
2150
+ 1:06:02.438 --> 1:06:04.622
2151
+ Then most of your training data are partial sentences.
2152
+
2153
+ 1:06:05.245 --> 1:06:11.983
2154
+ And in the end you're interested in the overall
2155
+ translation quality, so for full sentence.
2156
+
2157
+ 1:06:11.909 --> 1:06:18.998
2158
+ So if you train on that, it will mainly learn
2159
+ how to translate prefixes because ninety percent
2160
+
2161
+ 1:06:18.998 --> 1:06:21.536
2162
+ or more of your data is prefixed.
2163
+
2164
+ 1:06:22.202 --> 1:06:31.636
2165
+ That's why we'll see that it's better to do
2166
+ like a ratio.
2167
+
2168
+ 1:06:31.473 --> 1:06:39.285
2169
+ So half your training data are full sentences.
2170
+
2171
+ 1:06:39.759 --> 1:06:47.693
2172
+ Because if you're doing this, well, you see
2173
+ that you get a prefix for every word and only one full sentence.
2174
+
2175
+ 1:06:48.048 --> 1:06:52.252
2176
+ You also see that nicely here here are both.
2177
+
2178
+ 1:06:52.158 --> 1:06:56.551
2179
+ These are the BLEU scores and you see the baseline.
2180
+
2181
+ 1:06:58.518 --> 1:06:59.618
2182
+ Is this one?
2183
+
2184
+ 1:06:59.534 --> 1:07:03.284
2185
+ It has a good quality because it's trained.
2186
+
2187
+ 1:07:03.198 --> 1:07:11.371
2188
+ If you now train with all the partial sentences,
2189
+ it focuses more on how to translate partial
2190
+
2191
+ 1:07:11.371 --> 1:07:12.318
2192
+ sentences.
2193
+
2194
+ 1:07:12.752 --> 1:07:17.840
2195
+ Because all the partial sentences will at
2196
+ some point be removed, because at the end you
2197
+
2198
+ 1:07:17.840 --> 1:07:18.996
2199
+ translate the full.
2200
+
2201
+ 1:07:20.520 --> 1:07:24.079
2202
+ There's many tasks to read, but you have the
2203
+ same performances.
2204
+
2205
+ 1:07:24.504 --> 1:07:26.938
2206
+ On the other hand, you see here the other
2207
+ problem.
2208
+
2209
+ 1:07:26.890 --> 1:07:28.657
2210
+ This is how many words got updated.
2211
+
2212
+ 1:07:29.009 --> 1:07:31.579
2213
+ You want to have as few updates as possible.
2214
+
2215
+ 1:07:31.522 --> 1:07:34.892
2216
+ Updates mean you need to remove things which have once
2217
+ been shown.
2218
+
2219
+ 1:07:35.255 --> 1:07:40.538
2220
+ This is quite high for the baseline.
2221
+
2222
+ 1:07:40.395 --> 1:07:50.535
2223
+ If you know the partials that are going down,
2224
+ they should be removed.
2225
+
2226
+ 1:07:51.151 --> 1:07:58.648
2227
+ And then for moody tasks you have a bit like
2228
+ the best note of swim.
2229
+
2230
+ 1:08:02.722 --> 1:08:05.296
2231
+ Any more questions to this type of approach?
2232
+
2233
+ 1:08:09.309 --> 1:08:20.760
2234
+ The last thing is what to do if you want to do streaming decoding.
2235
+
2236
+ 1:08:21.541 --> 1:08:23.345
2237
+ Again, it's a bit the application
2238
+
2239
+ 1:08:23.287 --> 1:08:25.271
2240
+ scenario that decides what you really want.
2241
+
2242
+ 1:08:25.213 --> 1:08:30.135
2243
+ As you said, we sometimes use this updating,
2244
+ and for text output it'd be very nice.
2245
+
2246
+ 1:08:30.077 --> 1:08:35.203
2247
+ But imagine if you want to audio output, of
2248
+ course you can't change it anymore because
2249
+
2250
+ 1:08:35.203 --> 1:08:37.854
2251
+ on one side you cannot change what was said.
2252
+
2253
+ 1:08:37.795 --> 1:08:40.860
2254
+ So in this time you more need like a fixed
2255
+ output.
2256
+
2257
+ 1:08:41.121 --> 1:08:47.440
2258
+ And then this style of streaming decoding is interesting.
2259
+
2260
+ 1:08:47.323 --> 1:08:55.586
2261
+ Where you, for example, get sourced, the seagullins
2262
+ are so stoked in.
2263
+
2264
+ 1:08:55.468 --> 1:09:00.901
2265
+ Then you decide oh, now it's better to wait.
2266
+
2267
+ 1:09:01.041 --> 1:09:14.643
2268
+ So you somehow need to have this type of additional
2269
+ information.
2270
+
2271
+ 1:09:15.295 --> 1:09:23.074
2272
+ Here you have to decide: should I now output
2273
+ a token, or should I wait for more input?
2274
+
2275
+ 1:09:26.546 --> 1:09:32.649
2276
+ So you have to do these additional labels like
2277
+ wait, wait, output, output, wait and so
2278
+
2279
+ 1:09:32.649 --> 1:09:32.920
2280
+ on.
2281
+
2282
+ 1:09:33.453 --> 1:09:38.481
2283
+ There are different ways of doing that.
2284
+
2285
+ 1:09:38.355 --> 1:09:45.773
2286
+ You can have an additional model that does
2287
+ this decision.
2288
+
2289
+ 1:09:46.166 --> 1:09:53.669
2290
+ And then have a higher quality or better to
2291
+ continue and then have a lower latency in this
2292
+
2293
+ 1:09:53.669 --> 1:09:54.576
2294
+ different.
2295
+
2296
+ 1:09:55.215 --> 1:09:59.241
2297
+ Surprisingly, a very easy strategy also works
2298
+ sometimes quite well.
2299
+
2300
+ 1:10:03.043 --> 1:10:10.981
2301
+ And that is the so-called wait-k policy,
2302
+ and the idea is there at least for text to
2303
+
2304
+ 1:10:10.981 --> 1:10:14.623
2305
+ text translation that is working well.
2306
+
2307
+ 1:10:14.530 --> 1:10:22.376
2308
+ It's like you wait for k words and then you
2309
+ always output one word for each new input word.
2310
+
2311
+ 1:10:22.682 --> 1:10:28.908
2312
+ So you wait only at the beginning
2313
+ of the sentence, and every time a new word
2314
+
2315
+ 1:10:28.908 --> 1:10:29.981
2316
+ is coming in, you output one word.
2317
+
2318
+ 1:10:31.091 --> 1:10:39.459
2319
+ So you have the same times to beat as input,
2320
+ so you're not lagging more and more, but you
2321
+
2322
+ 1:10:39.459 --> 1:10:41.456
2323
+ have enough context.
2324
+
2325
+ 1:10:43.103 --> 1:10:49.283
2326
+ Of course this for example for the unmarried
2327
+ will not solve it perfectly but if you have
2328
+
2329
+ 1:10:49.283 --> 1:10:55.395
2330
+ a bit of local reordering inside your window of k tokens
2331
+ that you can manage very well and then it's
2332
+
2333
+ 1:10:55.395 --> 1:10:57.687
2334
+ a very simple solution but it's.
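As a rough illustration of this fixed schedule, a wait-k loop for the text-to-text case might look like the sketch below; `translate_step` is a hypothetical decoder call and the flushing at the end is simplified.

```python
# Minimal wait-k schedule: wait for k source tokens, then alternate
# READ one source token / WRITE one target token; flush at the end.
def wait_k(source_stream, k, translate_step, max_extra=50):
    read, written = [], []
    for token in source_stream:
        read.append(token)                                 # READ one source token
        if len(read) >= k:
            written.append(translate_step(read, written))  # WRITE one target token
    for _ in range(max_extra):                             # simplified flush after source ends
        nxt = translate_step(read, written)
        if nxt == "</s>":
            break
        written.append(nxt)
    return written
```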
2335
+
2336
+ 1:10:57.877 --> 1:11:00.481
2337
+ The other one was dynamic.
2338
+
2339
+ 1:11:00.385 --> 1:11:06.944
2340
+ Depending on the context you can decide how
2341
+ long you want to wait.
2342
+
2343
+ 1:11:07.687 --> 1:11:21.506
2344
+ It also only works if you have a similar amount
2345
+ of tokens, so if your target is very short
2346
+
2347
+ 1:11:21.506 --> 1:11:22.113
2348
+ of.
2349
+
2350
+ 1:11:22.722 --> 1:11:28.791
2351
+ That's why it's also more challenging for
2352
+ audio input because the speaking rate is changing
2353
+
2354
+ 1:11:28.791 --> 1:11:29.517
2355
+ and so on.
2356
+
2357
+ 1:11:29.451 --> 1:11:35.582
2358
+ You would have to do something like I'll output
2359
+ a word for every second of audio or something
2360
+
2361
+ 1:11:35.582 --> 1:11:35.982
2362
+ like.
2363
+
2364
+ 1:11:36.636 --> 1:11:45.459
2365
+ The problem is that the audio speaking speed
2366
+ is not fixed but varies quite a lot, and therefore.
2367
+
2368
+ 1:11:50.170 --> 1:11:58.278
2369
+ Therefore, what you can also do is you can
2370
+ use a similar solution than we had before with
2371
+
2372
+ 1:11:58.278 --> 1:11:59.809
2373
+ the retranslation.
2374
+
2375
+ 1:12:00.080 --> 1:12:02.904
2376
+ You remember we were re-decoded all the time.
2377
+
2378
+ 1:12:03.423 --> 1:12:12.253
2379
+ And you can do something similar in this case
2380
+ except that you add something in that you're
2381
+
2382
+ 1:12:12.253 --> 1:12:16.813
2383
+ saying: oh, if I re-decode, I'm not always free.
2384
+
2385
+ 1:12:16.736 --> 1:12:22.065
2386
+ Can decode as I want, but you can do this
2387
+ target prefix decoding, so what you say is
2388
+
2389
+ 1:12:22.065 --> 1:12:23.883
2390
+ in your achievement section.
2391
+
2392
+ 1:12:23.820 --> 1:12:26.830
2393
+ You can easily say generate a translation
2394
+ but:
2395
+
2396
+ 1:12:27.007 --> 1:12:29.810
2397
+ The translation has to start with the prefix.
2398
+
2399
+ 1:12:31.251 --> 1:12:35.350
2400
+ How can you do that?
2401
+
2402
+ 1:12:39.839 --> 1:12:49.105
2403
+ In the decoder exactly you start, so if you
2404
+ do beam search you select always the most probable.
2405
+
2406
+ 1:12:49.349 --> 1:12:57.867
2407
+ And now you say oh, I'm not selecting the
2408
+ most probable one, but this one is forced, so in
2409
+
2410
+ 1:12:57.867 --> 1:13:04.603
2411
+ the first step have to take this one, in the
2412
+ second start decoding.
2413
+
2414
+ 1:13:04.884 --> 1:13:09.387
2415
+ And then you're making sure that your output
2416
+ always starts with this prefix.
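A greedy version of this prefix-constrained decoding could look like the following sketch; `next_token_scores` is a hypothetical model call returning token log-probabilities, not part of any specific toolkit.

```python
# Force the committed prefix during the first decoding steps,
# then continue with normal (here: greedy) decoding.
def decode_with_prefix(source, prefix, next_token_scores, eos="</s>", max_len=200):
    output = []
    for step in range(max_len):
        if step < len(prefix):
            token = prefix[step]                     # forced: reproduce the prefix
        else:
            scores = next_token_scores(source, output)
            token = max(scores, key=scores.get)      # free: most probable next token
        if token == eos:
            break
        output.append(token)
    return output
```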
2417
+
2418
+ 1:13:10.350 --> 1:13:18.627
2419
+ And then you can use your immediate retranslation,
2420
+ but you're no longer changing the output.
2421
+
2422
+ 1:13:19.099 --> 1:13:31.595
2423
+ Out as it works, so it may get a speech signal
2424
+ and input, and it is not outputing any.
2425
+
2426
+ 1:13:32.212 --> 1:13:45.980
2427
+ So then if you got you get a translation maybe
2428
+ and then you decide yes output.
2429
+
2430
+ 1:13:46.766 --> 1:13:54.250
2431
+ And then you're translating as one as two
2432
+ as sweet as four, but now you say generate
2433
+
2434
+ 1:13:54.250 --> 1:13:55.483
2435
+ only outputs.
2436
+
2437
+ 1:13:55.935 --> 1:14:07.163
2438
+ And then you're translating and maybe you're
2439
+ deciding on and now a good translation.
2440
+
2441
+ 1:14:07.031 --> 1:14:08.891
2442
+ Then you're.
2443
+
2444
+ 1:14:09.749 --> 1:14:29.984
2445
+ Yes, but don't get to worry about what the
2446
+ effect is.
2447
+
2448
+ 1:14:30.050 --> 1:14:31.842
2449
+ We're generating your target text.
2450
+
2451
+ 1:14:32.892 --> 1:14:36.930
2452
+ But we're not always outputing the full target
2453
+ text now.
2454
+
2455
+ 1:14:36.859 --> 1:14:43.693
2456
+ What we are having is we have here some strategy
2457
+ to decide: Oh, is a system already sure enough
2458
+
2459
+ 1:14:43.693 --> 1:14:44.405
2460
+ about it?
2461
+
2462
+ 1:14:44.334 --> 1:14:49.374
2463
+ If it's sure enough and it has all the information,
2464
+ we can output it.
2465
+
2466
+ 1:14:49.302 --> 1:14:50.746
2467
+ And then the next.
2468
+
2469
+ 1:14:51.291 --> 1:14:55.931
2470
+ If we say here it is sometimes better not to
2471
+ output yet, we won't output it already.
2472
+
2473
+ 1:14:57.777 --> 1:15:06.369
2474
+ And thereby the hope is that in the example the model
2475
+ should not yet output "I register" because it
2476
+
2477
+ 1:15:06.369 --> 1:15:10.568
2478
+ does not yet know if that is the case or not.
2479
+
2480
+ 1:15:13.193 --> 1:15:18.039
2481
+ Output strategies: So what we have to discuss
2482
+ is what is a good output strategy.
2483
+
2484
+ 1:15:18.658 --> 1:15:20.070
2485
+ So you could do.
2486
+
2487
+ 1:15:19.987 --> 1:15:23.808
2488
+ The output strategy could be something like.
2489
+
2490
+ 1:15:23.743 --> 1:15:39.871
2491
+ If you think of wait-k, this is an output
2492
+ strategy where you always output one word per input word.
2493
+
2494
+ 1:15:40.220 --> 1:15:44.990
2495
+ Good, and you can view your weight in a similar
2496
+ way as.
2497
+
2498
+ 1:15:45.265 --> 1:15:55.194
2499
+ But now, of course, we can also look at other
2500
+ output strategies where it's more generic and
2501
+
2502
+ 1:15:55.194 --> 1:15:59.727
2503
+ it's deciding whether in some situations.
2504
+
2505
+ 1:16:01.121 --> 1:16:12.739
2506
+ And one thing that works quite well is referred
2507
+ to as local agreement, and that means you're
2508
+
2509
+ 1:16:12.739 --> 1:16:13.738
2510
+ always.
2511
+
2512
+ 1:16:14.234 --> 1:16:26.978
2513
+ Then you're looking what is the same thing
2514
+ between my current translation and the one
2515
+
2516
+ 1:16:26.978 --> 1:16:28.756
2517
+ I did before.
2518
+
2519
+ 1:16:29.349 --> 1:16:31.201
2520
+ So let's do that again in six hours.
2521
+
2522
+ 1:16:31.891 --> 1:16:45.900
2523
+ So your input is a first audio segment and
2524
+ your title text is all model trains.
2525
+
2526
+ 1:16:46.346 --> 1:16:53.231
2527
+ Then you're getting six opposites, one and
2528
+ two, and this time the output is all models.
2529
+
2530
+ 1:16:54.694 --> 1:17:08.407
2531
+ You see trains are different, but both of
2532
+ them agree that it's all so in those cases.
2533
+
2534
+ 1:17:09.209 --> 1:17:13.806
2535
+ So we can hopefully be quite sure that it really
2536
+ starts with all.
2537
+
2538
+ 1:17:15.155 --> 1:17:22.604
2539
+ So now we say we're output all, so at this
2540
+ time instead we'll output all, although before.
2541
+
2542
+ 1:17:23.543 --> 1:17:27.422
2543
+ We are getting one, two, three as input.
2544
+
2545
+ 1:17:27.327 --> 1:17:35.703
2546
+ This time we have a prefix, so now we are
2547
+ only allowing translations to start with all.
2548
+
2549
+ 1:17:35.608 --> 1:17:42.939
2550
+ We cannot change that anymore, so we now need
2551
+ to generate some translation.
2552
+
2553
+ 1:17:43.363 --> 1:17:46.323
2554
+ And then it can be that it's now: all models
2555
+ are run.
2556
+
2557
+ 1:17:47.927 --> 1:18:01.908
2558
+ Then we compare here and see this agrees on
2559
+ all models so we can output all models.
2560
+
2561
+ 1:18:02.882 --> 1:18:07.356
2562
+ So thereby we can dynamically decide: if the
2563
+ model is very unsure,
2564
+
2565
+ 1:18:07.288 --> 1:18:10.180
2566
+ it always outputs something different.
2567
+
2568
+ 1:18:11.231 --> 1:18:24.872
2569
+ Then we'll wait longer; if it's more sure of
2570
+ the same thing, we hopefully don't need to wait as long.
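Put together, the local agreement strategy sketched in this example can be written roughly as below; `retranslate` is a hypothetical function that re-decodes all audio received so far, constrained to start with the already committed words.

```python
# Commit only the longest common prefix of two consecutive hypotheses
# (and only the part that extends what was already shown).
def longest_common_prefix(a, b):
    out = []
    for x, y in zip(a, b):
        if x != y:
            break
        out.append(x)
    return out

def local_agreement(audio_chunks, retranslate):
    committed, previous = [], []
    for i in range(1, len(audio_chunks) + 1):
        hypothesis = retranslate(audio_chunks[:i], prefix=committed)
        agreed = longest_common_prefix(previous, hypothesis)
        if len(agreed) > len(committed):
            committed = agreed                       # stable words can be output
        previous = hypothesis
    return committed
```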
2571
+
2572
+ 1:18:30.430 --> 1:18:40.238
2573
+ Is it clear again that the signal wouldn't
2574
+ be able to detect?
2575
+
2576
+ 1:18:43.203 --> 1:18:50.553
2577
+ The hope it is because if it's not sure of,
2578
+ of course, it in this kind would have to switch
2579
+
2580
+ 1:18:50.553 --> 1:18:51.671
2581
+ all the time.
2582
+
2583
+ 1:18:56.176 --> 1:19:01.375
2584
+ So if it would be the first step to register
2585
+ and the second time to cancel and they may
2586
+
2587
+ 1:19:01.375 --> 1:19:03.561
2588
+ register again, they wouldn't do it.
2589
+
2590
+ 1:19:03.502 --> 1:19:08.348
2591
+ Of course, it is very short because in register
2592
+ a long time, then it can't deal.
2593
+
2594
+ 1:19:08.568 --> 1:19:23.410
2595
+ That's why there's two parameters that you
2596
+ can use and which might be important: how often you re-decode.
2597
+
2598
+ 1:19:23.763 --> 1:19:27.920
2599
+ So you do it like every one second, every
2600
+ five seconds or something like that.
2601
+
2602
+ 1:19:28.648 --> 1:19:37.695
2603
+ The more often you do it, the lower your latency will be
2604
+ because your wait is less long, but also
2605
+
2606
+ 1:19:37.695 --> 1:19:39.185
2607
+ you might do more changes.
2608
+
2609
+ 1:19:40.400 --> 1:19:50.004
2610
+ So that is the one thing and the other thing
2611
+ is for words you might do everywhere, but if
2612
+
2613
+ 1:19:50.004 --> 1:19:52.779
2614
+ you think about audio it.
2615
+
2616
+ 1:19:53.493 --> 1:20:04.287
2617
+ And the other question you can do like the
2618
+ agreement, so the model is sure.
2619
+
2620
+ 1:20:04.145 --> 1:20:10.255
2621
+ If you say have to agree, then hopefully.
2622
+
2623
+ 1:20:10.650 --> 1:20:21.369
2624
+ What we saw is think there has been a really
2625
+ normally good performance and otherwise your
2626
+
2627
+ 1:20:21.369 --> 1:20:22.441
2628
+ latency.
2629
+
2630
+ 1:20:22.963 --> 1:20:42.085
2631
+ Okay, we'll just make more tests and we'll
2632
+ get the confidence.
2633
+
2634
+ 1:20:44.884 --> 1:20:47.596
2635
+ Have to completely agree with that.
2636
+
2637
+ 1:20:47.520 --> 1:20:52.968
2638
+ So when this was done, that was our first
2639
+ idea of using the confidence.
2640
+
2641
+ 1:20:52.892 --> 1:21:00.206
2642
+ The problem, and that's my assumption, is
2643
+ that modeling the model confidence is
2644
+
2645
+ 1:21:00.206 --> 1:21:03.940
2646
+ not that easy, and they are often overconfident.
2647
+
2648
+ 1:21:04.324 --> 1:21:17.121
2649
+ In the paper there is this type also where
2650
+ you try to use the confidence in some way to
2651
+
2652
+ 1:21:17.121 --> 1:21:20.465
2653
+ decide the confidence.
2654
+
2655
+ 1:21:21.701 --> 1:21:26.825
2656
+ But that gave worse results, and that's why
2657
+ we looked into that.
2658
+
2659
+ 1:21:27.087 --> 1:21:38.067
2660
+ So it's a very good idea, I think, but it seems
2661
+ not to work, at least how it was implemented.
2662
+
2663
+ 1:21:38.959 --> 1:21:55.670
2664
+ There is one approach that maybe goes more in this direction,
2665
+ which is very new.
2666
+
2667
+ 1:21:55.455 --> 1:22:02.743
2668
+ In this one, if the last word is attending mainly
2669
+ to the end of the audio.
2670
+
2671
+ 1:22:02.942 --> 1:22:04.934
2672
+ Then you maybe should not output it yet.
2673
+
2674
+ 1:22:05.485 --> 1:22:15.539
2675
+ Because they might think there is something
2676
+ more missing that you need to know, so they
2677
+
2678
+ 1:22:15.539 --> 1:22:24.678
2679
+ look at the attention and only output parts
2680
+ which do not attend to the end of the audio signal.
2681
+
2682
+ 1:22:25.045 --> 1:22:40.175
2683
+ So there is, of course, a lot of ways how
2684
+ you can do it better or easier in some way.
2685
+
2686
+ 1:22:41.901 --> 1:22:53.388
2687
+ Instead tries to predict the next word with
2688
+ a large language model, and then for text translation
2689
+
2690
+ 1:22:53.388 --> 1:22:54.911
2691
+ you predict.
2692
+
2693
+ 1:22:55.215 --> 1:23:01.177
2694
+ Then you translate all of them and decide
2695
+ if there is a change so you can even earlier
2696
+
2697
+ 1:23:01.177 --> 1:23:02.410
2698
+ do your decision.
2699
+
2700
+ 1:23:02.362 --> 1:23:08.714
2701
+ The idea is that if we continue and then this
2702
+ will be to a change in the translation, then
2703
+
2704
+ 1:23:08.714 --> 1:23:10.320
2705
+ we should have opened.
2706
+
2707
+ 1:23:10.890 --> 1:23:18.302
2708
+ So it's more doing your estimate about possible
2709
+ continuations of the source instead of looking
2710
+
2711
+ 1:23:18.302 --> 1:23:19.317
2712
+ at previous.
2713
+
2714
+ 1:23:23.783 --> 1:23:31.388
2715
+ All that works is a bit here like one example.
2716
+
2717
+ 1:23:31.227 --> 1:23:39.644
2718
+ It has a legacy baselines and you are not
2719
+ putting.
2720
+
2721
+ 1:23:40.040 --> 1:23:47.041
2722
+ And you see in this case you have worse BLEU
2723
+ scores here.
2724
+
2725
+ 1:23:46.923 --> 1:23:51.673
2726
+ For equal one you have better latency.
2727
+
2728
+ 1:23:52.032 --> 1:24:01.123
2729
+ Does anybody have an idea
2730
+ of what could be challenging there or when?
2731
+
2732
+ 1:24:05.825 --> 1:24:20.132
2733
+ One problem of these models is hallucinations,
2734
+ and often very long output has a negative impact.
2735
+
2736
+ 1:24:24.884 --> 1:24:30.869
2737
+ If you don't remove the last four words but
2738
+ your model now starts to hallucinate and invent
2739
+
2740
+ 1:24:30.869 --> 1:24:37.438
2741
+ just a lot of new stuff then yeah you're removing
2742
+ the last four words of that but if it has invented
2743
+
2744
+ 1:24:37.438 --> 1:24:41.406
2745
+ ten words and you're still outputting six of
2746
+ these invented.
2747
+
2748
+ 1:24:41.982 --> 1:24:48.672
2749
+ Typically once it starts hallucination generating
2750
+ some output, it's quite long, so then it's
2751
+
2752
+ 1:24:48.672 --> 1:24:50.902
2753
+ no longer enough to just hold.
2754
+
2755
+ 1:24:51.511 --> 1:24:57.695
2756
+ And then, of course, a bit better if you compare
2757
+ to the previous ones.
2758
+
2759
+ 1:24:57.608 --> 1:25:01.530
2760
+ Their destinations are typically different.
2761
+
2762
+ 1:25:07.567 --> 1:25:25.939
2763
+ Yes, so we don't talk about the details, but
2764
+ for outputs, for presentations, there's different
2765
+
2766
+ 1:25:25.939 --> 1:25:27.100
2767
+ ways.
2768
+
2769
+ 1:25:27.347 --> 1:25:36.047
2770
+ So you want to have maximum two lines, maximum
2771
+ forty-two characters per line, and the reading
2772
+
2773
+ 1:25:36.047 --> 1:25:40.212
2774
+ speed is a maximum of twenty-one characters per second.
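As a toy illustration of these constraints (the exact limits are the ones just mentioned and may differ between guidelines), a subtitle candidate could be checked like this:

```python
# Check one subtitle block against the constraints mentioned above:
# at most 2 lines, 42 characters per line, 21 characters per second.
def subtitle_ok(lines, duration_s, max_lines=2, max_chars=42, max_cps=21):
    if len(lines) > max_lines:
        return False
    if any(len(line) > max_chars for line in lines):
        return False
    total_chars = sum(len(line) for line in lines)
    return total_chars <= max_cps * max(duration_s, 0.1)
```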
2775
+
2776
+ 1:25:40.981 --> 1:25:43.513
2777
+ How to Do That We Can Skip.
2778
+
2779
+ 1:25:43.463 --> 1:25:46.804
2780
+ Then you can generate something like that.
2781
+
2782
+ 1:25:46.886 --> 1:25:53.250
2783
+ Another challenge is, of course, that you
2784
+ not only need to generate the translation,
2785
+
2786
+ 1:25:53.250 --> 1:25:59.614
2787
+ but for subtlyning you also want to generate
2788
+ when to put breaks and what to display.
2789
+
2790
+ 1:25:59.619 --> 1:26:06.234
2791
+ Because it cannot be full sentences, as said
2792
+ here, if you have like maximum twenty four
2793
+
2794
+ 1:26:06.234 --> 1:26:10.443
2795
+ characters per line, that's not always a full
2796
+ sentence.
2797
+
2798
+ 1:26:10.368 --> 1:26:12.250
2799
+ So how can you make it?
2800
+
2801
+ 1:26:13.093 --> 1:26:16.253
2802
+ And then for speech there's not even a hint
2803
+ of wisdom.
2804
+
2805
+ 1:26:18.398 --> 1:26:27.633
2806
+ So what we have done today is yeah, we looked
2807
+ into maybe three challenges: We have this segmentation,
2808
+
2809
+ 1:26:27.633 --> 1:26:33.065
2810
+ which is a challenge both in evaluation and
2811
+ in the decoder.
2812
+
2813
+ 1:26:32.974 --> 1:26:40.604
2814
+ We talked about disfluencies and we talked
2815
+ about simultaneous translations and how to
2816
+
2817
+ 1:26:40.604 --> 1:26:42.911
2818
+ address these challenges.
2819
+
2820
+ 1:26:43.463 --> 1:26:45.507
2821
+ Any more questions.
2822
+
2823
+ 1:26:48.408 --> 1:26:52.578
2824
+ Good, then with new content
2825
+
2826
+ 1:26:52.396 --> 1:26:58.100
2827
+ We are done for this semester.
2828
+
2829
+ 1:26:57.916 --> 1:27:04.913
2830
+ You can keep your knowledge in that.
2831
+
2832
+ 1:27:04.744 --> 1:27:09.405
2833
+ Repetition where we can try to repeat a bit
2834
+ what we've done all over the semester.
2835
+
2836
+ 1:27:10.010 --> 1:27:13.776
2837
+ I will prepare a bit of repetition of what I think
2838
+ is important.
2839
+
2840
+ 1:27:14.634 --> 1:27:21.441
2841
+ But of course it is also the chance for you to
2842
+ ask specific questions.
2843
+
2844
+ 1:27:21.341 --> 1:27:25.447
2845
+ It's not clear to me how things relate.
2846
+
2847
+ 1:27:25.745 --> 1:27:34.906
2848
+ So if you have any specific questions, please
2849
+ come to me or send me an email or so, then
2850
+
2851
+ 1:27:34.906 --> 1:27:36.038
2852
+ I'm happy.
2853
+
2854
+ 1:27:36.396 --> 1:27:46.665
2855
+ If I should focus on something really in depth, it
2856
+ might be good not to come and send me an email
2857
+
2858
+ 1:27:46.665 --> 1:27:49.204
2859
+ on Wednesday evening.
2860
+
demo_data/lectures/Lecture-19-21.07.2023/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627fd6a73ed6853821cd58c2fc9e938a7844998ed51c4163f2d0a4771dc5c156
3
+ size 130103518
demo_data/nips-2021/25953/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Sliced Mutual Information: A Scalable Measure of Statistical Dependence"
3
+ }
demo_data/nips-2021/25953/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,581 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:13.140
4
+ Hi everyone, my name is Ziv Goldfeld and this is a joint work with Kristjan Greenewald about
5
+
6
+ 00:13.140 --> 00:18.200
7
+ sliced mutual information, which is a new measure of statistical dependence that has
8
+
9
+ 00:18.200 --> 00:22.520
10
+ some nice scalability properties to high dimensional settings.
11
+
12
+ 00:22.520 --> 00:26.540
13
+ And to get started, I think we're all familiar with classic mutual information that is defined
14
+
15
+ 00:26.540 --> 00:30.920
16
+ between let's say continuous high dimensional random variables, which is the regime that
17
+
18
+ 00:30.920 --> 00:36.240
19
+ we'll mostly be interested in, like so, basically the KL divergence between their joint distributions
20
+
21
+ 00:36.240 --> 00:39.040
22
+ and the product of their marginals.
23
+
24
+ 00:39.040 --> 00:44.520
25
+ And mutual information is indeed this fundamental measure of dependence that enjoys many good
26
+
27
+ 00:44.520 --> 00:50.060
28
+ properties such that the fact that it nullifies if and only if our random variables are independent,
29
+
30
+ 00:50.060 --> 00:55.200
31
+ it is invariant to bijections and it admits several useful representations, decompositions,
32
+
33
+ 00:55.200 --> 00:56.600
34
+ variational forms, etc.
35
+
36
+ 00:56.600 --> 01:02.440
37
+ And in fact, it can be even obtained axiomatically as the unique functional of the joint distribution
38
+
39
+ 01:02.440 --> 01:07.760
40
+ that satisfies some natural informativeness conditions.
41
+
42
+ 01:07.760 --> 01:11.120
43
+ And as such, mutual information has seen a variety of applications in information theory
44
+
45
+ 01:11.120 --> 01:13.680
46
+ and statistics more recently in machine learning.
47
+
48
+ 01:13.680 --> 01:18.920
49
+ But the problem is that all this nice structure comes with a hefty price, since computing
50
+
51
+ 01:18.920 --> 01:24.520
52
+ mutual information in high dimensions or estimating it from samples is very, very hard, effectively
53
+
54
+ 01:24.520 --> 01:25.520
55
+ infeasible.
56
+
57
+ 01:25.520 --> 01:30.240
58
+ And this is the so-called curse of dimensionality and sort of the problem that we try to tackle
59
+
60
+ 01:30.240 --> 01:31.400
61
+ in this work.
62
+
63
+ 01:31.400 --> 01:37.040
64
+ And to address this difficulty, what we propose is sliced mutual information, which is, like
65
+
66
+ 01:37.040 --> 01:42.520
67
+ I said, a new measure of statistical dependence, not necessarily a proxy of mutual information
68
+
69
+ 01:42.520 --> 01:48.820
70
+ as such, but rather an alternative notion, which is defined as this average of scalar
71
+
72
+ 01:48.820 --> 01:54.640
73
+ mutual information terms between projections of our high dimensional variables onto randomly
74
+
75
+ 01:54.640 --> 01:58.520
76
+ chosen directions from the corresponding unit spheres.
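In symbols (notation assumed here, with σ denoting the uniform measure on the unit sphere), the quantity just described reads:

```latex
\mathrm{SI}(X;Y)
  = \mathbb{E}_{\theta,\phi}\!\left[ I\!\left(\theta^{\top}X;\,\phi^{\top}Y\right) \right]
  = \int_{\mathbb{S}^{d_x-1}}\!\int_{\mathbb{S}^{d_y-1}}
      I\!\left(\theta^{\top}X;\,\phi^{\top}Y\right)\, d\sigma(\theta)\, d\sigma(\phi).
```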
77
+
78
+ 01:58.520 --> 02:03.520
79
+ And it's of course inspired by the recent popularization of slicing techniques for statistical
80
+
81
+ 02:03.520 --> 02:07.480
82
+ divergences, in particular the Wasserstein, the sliced Wasserstein distance is a great
83
+
84
+ 02:07.480 --> 02:08.480
85
+ example.
86
+
87
+ 02:08.480 --> 02:14.440
88
+ But the way it works for sliced mutual information is roughly so, well, let's say that this is
89
+
90
+ 02:14.440 --> 02:19.120
91
+ our first high dimensional variable X and this is its distribution.
92
+
93
+ 02:19.120 --> 02:22.480
94
+ What you do is draw a projection direction uniformly from the sphere.
95
+
96
+ 02:22.480 --> 02:26.960
97
+ You then project this random variable onto that direction, do the same for your other
98
+
99
+ 02:26.960 --> 02:28.200
100
+ random variable.
101
+
102
+ 02:28.200 --> 02:34.360
103
+ And now for these two projected scalar new variables, we just compute the mutual information
104
+
105
+ 02:34.360 --> 02:38.560
106
+ between them and average everything over the choice of direction.
107
+
108
+ 02:38.560 --> 02:40.600
109
+ So that's basically the definition.
110
+
111
+ 02:40.600 --> 02:45.880
112
+ And with that, the goal of this work is effectively to show that sliced mutual information is
113
+
114
+ 02:45.880 --> 02:50.080
115
+ both a meaningful and a scalable mutual information alternative.
116
+
117
+ 02:50.080 --> 02:56.200
118
+ Meaningful, well, in the sense that it preserves many of the desired properties that make mutual
119
+
120
+ 02:56.200 --> 03:00.240
121
+ information appealing to begin with and scalable in the sense that it alleviates the set of
122
+
123
+ 03:00.240 --> 03:03.800
124
+ computational and statistical difficulties.
125
+
126
+ 03:03.800 --> 03:04.800
127
+ All right.
128
+
129
+ 03:04.800 --> 03:11.080
130
+ Yeah, and to address this first point, let me show you that, well, despite those one
131
+
132
+ 03:11.080 --> 03:15.800
133
+ dimensional projections, sliced mutual information indeed inherits many of the properties of
134
+
135
+ 03:15.800 --> 03:17.700
136
+ classic mutual information.
137
+
138
+ 03:17.700 --> 03:23.740
139
+ So we have, well, of course, non-negativity, but furthermore, identification of independence.
140
+
141
+ 03:23.740 --> 03:28.960
142
+ We have an entropy decomposition for an appropriate definition of sliced entropy.
143
+
144
+ 03:28.960 --> 03:31.840
145
+ We can represent it as a KL divergence, a sliced KL divergence.
146
+
147
+ 03:31.840 --> 03:38.920
148
+ To be more precise, we have a chain rule tensorization for independent copies, as well as a Donsker-Varadhan-like
149
+
150
+ 03:38.920 --> 03:44.840
151
+ variational form that can be readily used for neural estimation of sliced mutual information.
152
+
153
+ 03:44.840 --> 03:49.720
154
+ We actually make use of that in some of our empirical results.
155
+
156
+ 03:49.720 --> 03:53.400
157
+ And well, I mean, you are more than welcome to check the paper or visit us as a poster
158
+
159
+ 03:53.400 --> 03:55.280
160
+ if you want to know more about any of these.
161
+
162
+ 03:55.280 --> 04:00.480
163
+ But really, the upshot here is that much of the classic structure is still there after
164
+
165
+ 04:00.480 --> 04:02.360
166
+ the slicing.
167
+
168
+ 04:02.360 --> 04:06.240
169
+ Now another interesting feature of sliced mutual information comes to light when you
170
+
171
+ 04:06.240 --> 04:10.400
172
+ think of it in the context of the famous data processing inequality.
173
+
174
+ 04:10.400 --> 04:15.560
175
+ And for starters, recall that classic mutual information satisfies the DPI, which in particular
176
+
177
+ 04:15.560 --> 04:21.440
178
+ means that if you process either of your random variables with a deterministic function, say
179
+
180
+ 04:21.440 --> 04:27.400
181
+ this f over here, you can only lose the informativeness in the classic sense.
182
+
183
+ 04:27.400 --> 04:33.360
184
+ Now sliced mutual information plays differently with processing and can in some sense benefit
185
+
186
+ 04:33.360 --> 04:39.280
187
+ from nice transformations that, let's say, give rise to some nicer manifold for your
188
+
189
+ 04:39.280 --> 04:40.280
190
+ random variable.
191
+
192
+ 04:40.280 --> 04:43.880
193
+ And to understand this, keep in mind that, well, first of all, sliced mutual information
194
+
195
+ 04:43.880 --> 04:47.320
196
+ only looks at projections of random variables.
197
+
198
+ 04:47.320 --> 04:52.720
199
+ And it may very well be the case that some transformations of x, let's say, have more
200
+
201
+ 04:52.720 --> 04:58.480
202
+ informative projections about y than x itself.
203
+
204
+ 04:58.480 --> 05:01.080
205
+ And here's a simple example to that effect.
206
+
207
+ 05:01.080 --> 05:06.120
208
+ So consider a two-dimensional isotropic Gaussian x, so two coordinates, x1 and x2.
209
+
210
+ 05:06.120 --> 05:10.440
211
+ And let's take y to be, for example, its first coordinate.
212
+
213
+ 05:10.440 --> 05:15.440
214
+ Now if you look at the mutual information between two fixed projections of x and y,
215
+
216
+ 05:15.440 --> 05:18.600
217
+ well, projection does nothing to y, right, because it's a scalar.
218
+
219
+ 05:18.600 --> 05:20.400
220
+ But it does affect x.
221
+
222
+ 05:20.400 --> 05:24.520
223
+ And if you look at the mutual information between two projections of x and y, you quickly
224
+
225
+ 05:24.520 --> 05:31.120
226
+ realize that x1 really plays the role of the signal here, whereas x2 behaves like noise.
227
+
228
+ 05:31.120 --> 05:36.120
229
+ And therefore, any transformation that will effectively improve your signal-to-noise ratio,
230
+
231
+ 05:36.120 --> 05:42.520
232
+ for example, like this g sub a over here, where a is less than 1, will indeed give rise
233
+
234
+ 05:42.520 --> 05:45.880
235
+ to a higher sliced mutual information value.
236
+
237
+ 05:45.880 --> 05:50.300
238
+ So all in all, sliced mutual information can be increased from processing, which means
239
+
240
+ 05:50.300 --> 05:54.440
241
+ that, well, in particular, it validates the data processing inequality and is different
242
+
243
+ 05:54.440 --> 05:56.840
244
+ from classic mutual information in that sense.
245
+
246
+ 05:56.840 --> 06:03.120
247
+ But interestingly, and as I will show you shortly, this is actually a quite useful thing
248
+
249
+ 06:03.120 --> 06:08.400
250
+ to have, for example, for feature extraction tasks, because we can use sliced mutual information
251
+
252
+ 06:08.400 --> 06:14.240
253
+ effectively to maximize it in order to extract informative features and land on those nicer
254
+
255
+ 06:14.240 --> 06:17.660
256
+ manifolds that I mentioned a moment ago.
257
+
258
+ 06:17.660 --> 06:22.280
259
+ And here's an example theorem that kind of makes this statement precise or formal, where
260
+
261
+ 06:22.280 --> 06:28.120
262
+ we consider the maximization of sliced mutual information over linear transformations of
263
+
264
+ 06:28.120 --> 06:29.920
265
+ our random variables.
266
+
267
+ 06:29.920 --> 06:34.200
268
+ And this would, of course, not affect classic mutual information at all.
269
+
270
+ 06:34.200 --> 06:39.160
271
+ But what we can show is that for sliced mutual information, this maximization ends up extracting
272
+
273
+ 06:39.160 --> 06:44.960
274
+ the two most informative projection directions for you, which in particular will be encoded
275
+
276
+ 06:44.960 --> 06:52.200
277
+ in the optimizing matrices, these A sub x star and A sub y star.
278
+
279
+ 06:52.200 --> 06:55.240
280
+ And of course, there's nothing special about this particular setup.
281
+
282
+ 06:55.240 --> 07:00.720
283
+ And we can establish similar results for, well, first of all, rank-constrained matrices
284
+
285
+ 07:00.720 --> 07:06.720
286
+ that as opposed to what's shown here would extract the, let's say, our most informative
287
+
288
+ 07:06.720 --> 07:08.840
289
+ features or projection directions.
290
+
291
+ 07:08.840 --> 07:11.120
292
+ In the paper, we also extend this result to shallow neural networks.
293
+
294
+ 07:11.120 --> 07:17.840
295
+ And in fact, our argument can be easily extended to cover additional nonlinear cases as well.
296
+
297
+ 07:17.840 --> 07:21.440
298
+ OK, so that's pretty much for structural properties.
299
+
300
+ 07:21.440 --> 07:25.400
301
+ But like I said at the beginning, the real premise of this framework is overcoming the
302
+
303
+ 07:25.400 --> 07:26.400
304
+ curse of dimensionality.
305
+
306
+ 07:26.400 --> 07:32.640
307
+ And let me show you that this is indeed the case, that sliced mutual information is or
308
+
309
+ 07:32.640 --> 07:38.640
310
+ can be estimated in a scalable manner, effectively by combining your favorite scalar mutual information
311
+
312
+ 07:38.640 --> 07:42.200
313
+ estimator with a simple Monte Carlo average step.
314
+
315
+ 07:42.200 --> 07:43.480
316
+ And this is how it works.
317
+
318
+ 07:43.480 --> 07:48.260
319
+ So let's say we're giving n IID samples from our high-dimensional random variables.
320
+
321
+ 07:48.260 --> 07:53.400
322
+ And we're further given a scalar mutual information estimator that achieves, say, error delta
323
+
324
+ 07:53.400 --> 08:00.240
325
+ of n when applied to n IID samples of some pair of one-dimensional variables, a and b.
326
+
327
+ 08:00.240 --> 08:02.040
328
+ OK, so let's say we have these.
329
+
330
+ 08:02.040 --> 08:08.760
331
+ Now, to estimate sliced mutual information, first thing to do is sample, let's say, m
332
+
333
+ 08:08.760 --> 08:14.680
334
+ random projections from the corresponding spheres in an IID fashion, at which point
335
+
336
+ 08:14.680 --> 08:22.400
337
+ we will take our high-dimensional n samples and project them onto each of these m random
338
+
339
+ 08:22.400 --> 08:24.960
340
+ projections that we've generated.
341
+
342
+ 08:24.960 --> 08:30.780
343
+ And the thing to observe here is that the resulting n times n data set of these projections
344
+
345
+ 08:30.780 --> 08:35.220
346
+ is nothing but IID samples from the corresponding projected distribution, which is the right
347
+
348
+ 08:35.220 --> 08:39.400
349
+ thing to have here if what you're trying to estimate is sliced mutual information.
350
+
351
+ 08:39.400 --> 08:43.860
352
+ So having that, I mean, at this point, per projection direction, we can apply the scalar
353
+
354
+ 08:43.860 --> 08:49.400
355
+ mutual information estimator and then just take one big, happy Monte Carlo average of
356
+
357
+ 08:49.400 --> 08:52.040
358
+ the entire thing over the different projection directions.
359
+
360
+ 08:52.040 --> 08:55.600
361
+ And this would give rise to the proposed sliced mutual information estimator.
362
+
363
+ 08:55.600 --> 08:59.780
364
+ Now, you can compute this thing very easily, because at the end of the day, it's an average
365
+
366
+ 08:59.780 --> 09:03.000
367
+ of scalar mutual information estimates.
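A minimal sketch of this Monte Carlo slicing estimator is given below; `scalar_mi_estimate` stands in for any off-the-shelf one-dimensional mutual information estimator and is an assumption, not the paper's code.

```python
import numpy as np

# Average a 1-D mutual information estimate over m random projection directions.
def sliced_mi_estimate(x, y, m, scalar_mi_estimate, rng=None):
    rng = rng or np.random.default_rng(0)
    n, dx = x.shape
    _, dy = y.shape
    total = 0.0
    for _ in range(m):
        theta = rng.normal(size=dx); theta /= np.linalg.norm(theta)  # uniform on the sphere
        phi = rng.normal(size=dy); phi /= np.linalg.norm(phi)
        total += scalar_mi_estimate(x @ theta, y @ phi)              # 1-D MI of the projections
    return total / m
```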
368
+
369
+ 09:03.000 --> 09:09.120
370
+ And as far as performance guarantees, we can show that so long that the per-sliced mutual
371
+
372
+ 09:09.120 --> 09:15.840
373
+ information is bounded, the uniform absolute error of this estimator scales like 1 over
374
+
375
+ 09:15.840 --> 09:22.240
376
+ the root of m, the number of our Monte Carlo samples, plus the error of the scalar mutual
377
+
378
+ 09:22.240 --> 09:23.240
379
+ information estimator.
380
+
381
+ 09:23.240 --> 09:26.520
382
+ And I'm just restating this informally over here.
383
+
384
+ 09:26.520 --> 09:31.240
385
+ And what this all in all shows is that sliced mutual information can therefore be estimated
386
+
387
+ 09:31.240 --> 09:37.760
388
+ at the rate of the scalar mutual information estimation problem plus this m to the minus half Monte
389
+
390
+ 09:37.760 --> 09:38.760
391
+ Carlo penalty.
392
+
393
+ 09:38.760 --> 09:43.440
394
+ And the thing is that under appropriate smoothness assumptions, the one-dimensional rate is in
395
+
396
+ 09:43.440 --> 09:45.200
397
+ fact parametric.
398
+
399
+ 09:45.200 --> 09:49.720
400
+ And therefore, if you just match the size of your data set and the number of Monte Carlo
401
+
402
+ 09:49.720 --> 09:54.640
403
+ samples, just equate n and m, the sliced mutual information between high-dimensional variables
404
+
405
+ 09:54.640 --> 09:59.360
406
+ can be estimated at the parametric n to the minus half rate, perhaps up to some logarithmic
407
+
408
+ 09:59.360 --> 10:00.360
409
+ factors.
410
+
411
+ 10:00.360 --> 10:06.360
412
+ And this is, of course, a significant speed up and stands in sharp contrast to the slow,
413
+
414
+ 10:06.360 --> 10:12.040
415
+ exponentially bad in dimension, curse of dimensionality rate for classic mutual information.
416
+
417
+ 10:12.040 --> 10:17.200
418
+ Yeah, now this scalability makes, in fact, running empirical experiments with sliced
419
+
420
+ 10:17.200 --> 10:18.720
421
+ mutual information quite a breeze.
422
+
423
+ 10:18.720 --> 10:24.160
424
+ So let me quickly show you some sort of proof of concept experiments, let's say.
425
+
426
+ 10:24.160 --> 10:28.280
427
+ And the first one just relies on the fact that, well, SMI, sliced mutual information
428
+
429
+ 10:28.280 --> 10:29.840
430
+ can identify independence.
431
+
432
+ 10:29.840 --> 10:34.440
433
+ And therefore, we examine it as a figure of merit for independence testing, basically
434
+
435
+ 10:34.440 --> 10:38.640
436
+ by thresholding the computed sliced mutual information value.
437
+
438
+ 10:38.640 --> 10:42.000
439
+ And the results that we have obtained, of course, we've compared them with the same
440
+
441
+ 10:42.000 --> 10:45.360
442
+ test, but based on classic mutual information.
443
+
444
+ 10:45.360 --> 10:50.320
445
+ And this figure over here shows that for a bunch of different settings, well, it presents
446
+
447
+ 10:50.320 --> 10:55.040
448
+ the area under the ROC curve as a function of the number of samples, the standard way
449
+
450
+ 10:55.040 --> 10:59.160
451
+ to represent the quality of an independence test.
452
+
453
+ 10:59.160 --> 11:02.920
454
+ And you basically want this number to be 1, which corresponds to an omniscient test.
455
+
456
+ 11:02.920 --> 11:07.520
457
+ And what we observe is that sliced mutual information performs consistently well across
458
+
459
+ 11:07.520 --> 11:13.080
460
+ different setups and across different dimensions, whereas the performance of the mutual information,
461
+
462
+ 11:13.080 --> 11:18.280
463
+ the classic mutual information-based test, quickly degrades as dimension grows.
464
+
465
+ 11:18.280 --> 11:23.280
466
+ Now, on top of that, let me also demonstrate how sliced mutual information can be used
467
+
468
+ 11:23.280 --> 11:24.680
469
+ for feature extraction.
470
+
471
+ 11:24.680 --> 11:29.780
472
+ And here, what we want to do is maximize the sliced mutual information between linear transformations
473
+
474
+ 11:29.780 --> 11:37.160
475
+ of x and y that are now chosen to be IID samples from the same MNIST class, which we restrict
476
+
477
+ 11:37.160 --> 11:39.240
478
+ to be either 0 or 1.
479
+
480
+ 11:39.240 --> 11:42.840
481
+ And the choice of class is also random, so basically just a fair coin flip.
482
+
483
+ 11:42.840 --> 11:47.280
484
+ And by observing that sliced mutual information between x and y is at most 1 bit, I mean,
485
+
486
+ 11:47.280 --> 11:52.560
487
+ it's always upper bounded by mutual information, which equals a single bit in this case, basically
488
+
489
+ 11:52.560 --> 11:57.320
490
+ the class label, the way to understand what we're doing here is that we're looking for
491
+
492
+ 11:57.320 --> 12:03.400
493
+ the linear feature that is most informative for classifying or determining this class
494
+
495
+ 12:03.400 --> 12:04.760
496
+ label.
497
+
498
+ 12:04.760 --> 12:08.200
499
+ And interestingly enough, this is what this procedure ends up learning, where the figure
500
+
501
+ 12:08.200 --> 12:15.040
502
+ shows basically the first two rows of the optimal A matrix that we obtained, rearranged
503
+
504
+ 12:15.040 --> 12:17.480
505
+ in the dimension of an MNIST image.
506
+
507
+ 12:17.480 --> 12:22.720
508
+ And this really looks like a match filter, if you're familiar, which, when applied to
509
+
510
+ 12:22.720 --> 12:27.480
511
+ the samples, would indeed be able to tell you whether the sample came from the 0 class
512
+
513
+ 12:27.480 --> 12:28.640
514
+ or not.
515
+
516
+ 12:28.640 --> 12:33.680
517
+ And as far as for the value itself, well, the maximized sliced mutual information value
518
+
519
+ 12:33.680 --> 12:39.800
520
+ ends up being roughly 0.7, which is quite close to the 1 bit upper bound, and is much,
521
+
522
+ 12:39.800 --> 12:44.400
523
+ much larger than what you would get if you would not learn A, and let's say just instantiate
524
+
525
+ 12:44.400 --> 12:49.480
526
+ it as a matrix with IID entries drawn according to some distribution.
527
+
528
+ 12:49.480 --> 12:53.640
529
+ And this is just to say that something meaningful indeed being learned here, and something meaningful
530
+
531
+ 12:53.640 --> 13:00.160
532
+ indeed happens when you maximize the sliced mutual information as your optimization objective.
533
+
534
+ 13:00.160 --> 13:03.400
535
+ OK, so yeah, that's basically it.
536
+
537
+ 13:03.400 --> 13:09.160
538
+ And just to recap, we introduced sliced mutual information, which is this average of scalar
539
+
540
+ 13:09.160 --> 13:12.160
541
+ mutual information terms between one-dimensional projections.
542
+
543
+ 13:12.160 --> 13:15.880
544
+ We've seen that it preserves much of the structure of classic mutual information.
545
+
546
+ 13:15.880 --> 13:22.280
547
+ It can be efficiently computed and estimated from samples, and can also be, in fact, increased
548
+
549
+ 13:22.280 --> 13:28.280
550
+ by our processing if, indeed, your processing gives rise to more informative projections.
551
+
552
+ 13:28.280 --> 13:32.960
553
+ And we've presented some proof of concept applications to independence testing, to feature
554
+
555
+ 13:32.960 --> 13:33.960
556
+ extraction.
557
+
558
+ 13:33.960 --> 13:35.800
559
+ We have a couple of more in the paper.
560
+
561
+ 13:35.800 --> 13:36.960
562
+ But let me say this.
563
+
564
+ 13:36.960 --> 13:41.480
565
+ While this is mostly theoretical work, and a large-scale empirical exploration is sort
566
+
567
+ 13:41.480 --> 13:46.640
568
+ of beyond its scope, we firmly believe that sliced mutual information will be extremely
569
+
570
+ 13:46.640 --> 13:51.360
571
+ useful for various such tasks, and are very excited to look into this in the future.
572
+
573
+ 13:51.360 --> 13:52.680
574
+ And yeah, with that, I'll stop.
575
+
576
+ 13:52.680 --> 13:57.220
577
+ Thank you guys for listening, and do visit us at the poster, and check out the paper
578
+
579
+ 13:57.220 --> 14:12.560
580
+ if you would like to know more.
581
+
demo_data/nips-2021/25953/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f4968133dc8ada5fd9bf717fcd61a91049cd3c3034553cb6c2490f292c8a42
3
+ size 90905227
demo_data/nips-2021/25957/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Shared Independent Component Analysis for Multi-Subject Neuroimaging"
3
+ }
demo_data/nips-2021/25957/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,539 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:14.000
4
+ Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay.
5
+
6
+ 00:14.000 --> 00:18.480
7
+ I'm in the Inria Parietal team and my supervisor is Bertrand Thirion.
8
+
9
+ 00:18.480 --> 00:24.600
10
+ Today I'll talk about shared independent component analysis for multi-subject neuroimaging.
11
+
12
+ 00:24.600 --> 00:31.400
13
+ This is a joint work with Pierre Ablin, Alexandre Gramfort, Bertrand Thirion and Aapo Hyvärinen.
14
+
15
+ 00:31.400 --> 00:36.360
16
+ First let us consider two sources that are emitting a signal that is recorded by two
17
+
18
+ 00:36.360 --> 00:37.360
19
+ sensors.
20
+
21
+ 00:37.360 --> 00:43.120
22
+ This can be seen as a simplified model of magnetoencephalography where brain sources
23
+
24
+ 00:43.120 --> 00:46.000
25
+ are recorded by magnetometers.
26
+
27
+ 00:46.000 --> 00:50.200
28
+ Because propagation time can be neglected, the signal recorded by the sensors can be
29
+
30
+ 00:50.200 --> 00:55.840
31
+ seen as a linear mixture of the signal emitted by the sources.
32
+
33
+ 00:55.840 --> 00:59.600
34
+ S is a set of sources that are assumed to be independent.
35
+
36
+ 00:59.600 --> 01:06.400
37
+ X are the recordings and A describes how the sources are mixed to produce the recordings.
38
+
39
+ 01:06.400 --> 01:12.120
40
+ At first sight this model may seem ill-defined because if we permute two columns in A and
41
+
42
+ 01:12.120 --> 01:19.600
43
+ permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing
44
+
45
+ 01:19.600 --> 01:25.360
46
+ matrix A' that describes X just as well as A and S.
47
+
48
+ 01:25.360 --> 01:30.360
49
+ And similarly if we scale the column of A by some constant, one column of A by some
50
+
51
+ 01:30.360 --> 01:34.920
52
+ constant and the corresponding source by the same constant, we'll also get an equivalent
53
+
54
+ 01:34.920 --> 01:35.920
55
+ description of X.
56
+
57
+ 01:35.920 --> 01:44.840
58
+ However, these scale and permutation indeterminacies are the only one if the sources contain at
59
+
60
+ 01:44.840 --> 01:46.840
61
+ most one Gaussian component.
62
+
63
+ 01:46.840 --> 01:52.040
64
+ Let us consider the more general problem where you have multiple subjects that are exposed
65
+
66
+ 01:52.040 --> 01:54.560
67
+ to the same stimuli.
68
+
69
+ 01:54.560 --> 02:00.640
70
+ We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2,
71
+
72
+ 02:00.640 --> 02:04.560
73
+ and different noise levels, N1 and N2.
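Written out (notation assumed), the group ICA model just described is:

```latex
x_i = A_i s + n_i, \qquad i = 1,\dots,m,
```

with s the shared independent sources, A_i the subject-specific mixing matrix and n_i the subject-specific noise.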
74
+
75
+ 02:04.560 --> 02:08.720
76
+ The interpretation is that they have shared sources because they have shared connective
77
+
78
+ 02:08.720 --> 02:09.720
79
+ processes.
80
+
81
+ 02:09.720 --> 02:15.120
82
+ They have different mixing matrices because they have different spatial topography.
83
+
84
+ 02:15.120 --> 02:20.600
85
+ And they have different noises because we want to model inter-subject variability.
86
+
87
+ 02:20.600 --> 02:22.480
88
+ This model is called group ICA.
89
+
90
+ 02:22.480 --> 02:27.840
91
+ There are many methods to provide a solution for the group ICA problem.
92
+
93
+ 02:27.840 --> 02:34.560
94
+ A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects
95
+
96
+ 02:34.560 --> 02:42.520
97
+ feature-wise and then perform a PCA, a principal component analysis, on the stacked data.
98
+
99
+ 02:42.520 --> 02:47.520
100
+ And therefore you obtain reduced data and apply independent component analysis on the
101
+
102
+ 02:47.520 --> 02:50.520
103
+ reduced data to obtain a set of sources.
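A small sketch of this stack-PCA-ICA pipeline with scikit-learn is shown below; it is an illustration of the described recipe, not the authors' implementation.

```python
import numpy as np
from sklearn.decomposition import PCA, FastICA

# Stack subjects feature-wise, reduce with PCA, then run ICA on the reduced data.
def concat_ica(subject_data, n_components):
    # subject_data: list of arrays of shape (n_features_i, n_samples)
    stacked = np.vstack(subject_data)                              # (sum n_features_i, n_samples)
    reduced = PCA(n_components=n_components).fit_transform(stacked.T).T
    sources = FastICA(n_components=n_components).fit_transform(reduced.T).T
    return sources                                                 # (n_components, n_samples)
```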
104
+
105
+ 02:50.520 --> 02:55.960
106
+ Another formulation is introduced by Varoquaux in 2010 and is called CanICA.
107
+
108
+ 02:55.960 --> 03:01.320
109
+ You just replace the principal component analysis with a multiset CCA, so a multiset canonical
110
+
111
+ 03:01.320 --> 03:06.120
112
+ correlation analysis, where you have to solve a generalized eigenvalue problem.
113
+
114
+ 03:06.120 --> 03:12.800
115
+ There are many different formulations of multiset CCA, but this one with a generalized eigenvalue
116
+
117
+ 03:12.800 --> 03:15.560
118
+ problem is the fastest to solve.
119
+
120
+ 03:15.560 --> 03:17.840
121
+ CanICA and ConcatICA have a lot of advantages.
122
+
123
+ 03:17.840 --> 03:21.000
124
+ First, they are very fast to fit.
125
+
126
+ 03:21.000 --> 03:23.320
127
+ And second, they are simple to implement.
128
+
129
+ 03:23.320 --> 03:26.920
130
+ These are the two reasons why they are so popular in neuroimaging.
131
+
132
+ 03:26.920 --> 03:30.160
133
+ However, they do not optimize the proper likelihood.
134
+
135
+ 03:30.160 --> 03:35.680
136
+ Therefore they do not benefit from the advantages of such estimators, such as asymptotic efficiency.
137
+
138
+ 03:35.680 --> 03:41.480
139
+ There is a lot of other related work that does optimize the proper likelihood.
140
+
141
+ 03:41.480 --> 03:46.240
142
+ I want to mention the independent vector analysis, which is a very powerful framework introduced
143
+
144
+ 03:46.240 --> 03:48.760
145
+ by Li in 2008.
146
+
147
+ 03:48.760 --> 03:54.560
148
+ The unified approach of Guo in 2008, which we will also talk about later.
149
+
150
+ 03:54.560 --> 04:01.040
151
+ The approach of Shen in 2015, which also allows one to perform dimension reduction.
152
+
153
+ 04:01.040 --> 04:08.320
154
+ And the multi-view ICA that was introduced by our team last year.
155
+
156
+ 04:08.320 --> 04:15.200
157
+ I want to quickly say that it's not obvious to design a likelihood-based approach that
158
+
159
+ 04:15.200 --> 04:17.400
160
+ is tractable.
161
+
162
+ 04:17.400 --> 04:23.680
163
+ And with this example of the Gaussian mixture noisy ICA by Bermond and Cardoso, we'll see
164
+
165
+ 04:23.680 --> 04:31.400
166
+ that the standard approach leads to intractable algorithms.
167
+
168
+ 04:31.400 --> 04:37.080
169
+ The model we take here is the same as the group ICA, but we assume that the noise is
170
+
171
+ 04:37.080 --> 04:40.120
172
+ Gaussian with the same variance for all subjects.
173
+
174
+ 04:40.120 --> 04:47.600
175
+ We'll also assume that the sources follow a Gaussian mixture model.
176
+
177
+ 04:47.600 --> 04:53.040
178
+ And we further assume that the weights of the Gaussian mixtures are known.
179
+
180
+ 04:53.040 --> 04:56.360
181
+ We can solve such model via expectation maximization.
182
+
183
+ 04:56.360 --> 05:01.400
184
+ And if we write the E-step, we'll get a closed form that involves a large sum.
185
+
186
+ 05:01.400 --> 05:09.040
187
+ Because of its large size, this sum, and therefore the EM algorithm, is intractable whenever
188
+
189
+ 05:09.040 --> 05:11.600
190
+ Q and K are large.
191
+
192
+ 05:11.600 --> 05:17.520
193
+ Our contribution is Shared ICA, which we call ShICA for short, where the data of subject
194
+
195
+ 05:17.520 --> 05:23.080
196
+ i are modeled as a linear mixture of noisy sources, and the noise here is not on the
197
+
198
+ 05:23.080 --> 05:24.080
199
+ sensor, but on the sources.
200
+
201
+ 05:24.080 --> 05:30.000
202
+ The noise is Gaussian with a variance that can be different for each subject and different
203
+
204
+ 05:30.000 --> 05:31.000
205
+ for each component.
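In equation form (a sketch based only on the description above, not a quote from the paper), the model for subject i reads:

    x_i = A_i\,(s + n_i), \qquad n_i \sim \mathcal{N}(0, \Sigma_i), \quad \Sigma_i \ \text{diagonal},

so the Gaussian noise n_i sits on the sources, with a variance that may differ across subjects and components.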
206
+
207
+ 05:31.000 --> 05:37.800
208
+ S are assumed to be independent, but in contrast to almost all existing work, some components
209
+
210
+ 05:37.800 --> 05:38.800
211
+ can be Gaussian.
212
+
213
+ 05:38.800 --> 05:41.600
214
+ We have a few blanket assumptions.
215
+
216
+ 05:41.600 --> 05:45.840
217
+ We assume that the data are centered, that the mixing matrices are invertible, that the
218
+
219
+ 05:45.840 --> 05:50.680
220
+ sources have identical variance, and that the number of subjects is greater than 3.
221
+
222
+ 05:50.680 --> 05:54.000
223
+ We have two algorithms to solve the ShICA model.
224
+
225
+ 05:54.000 --> 06:01.520
226
+ We have ShICA-J, which is a fast algorithm based on multiset CCA, and ShICA-ML, a
227
+
228
+ 06:01.520 --> 06:04.000
229
+ maximum likelihood approach.
230
+
231
+ 06:04.000 --> 06:07.600
232
+ In ShICA, there are two ways to recover the parameters.
233
+
234
+ 06:07.600 --> 06:12.880
235
+ Either the sources are non-Gaussian, in which case we can use classical ICA results to recover
236
+
237
+ 06:12.880 --> 06:15.720
238
+ the unmixing matrices.
239
+
240
+ 06:15.720 --> 06:20.120
241
+ When the components are Gaussian, then we need something else, and what we use here
242
+
243
+ 06:20.120 --> 06:22.480
244
+ is noise diversity.
245
+
246
+ 06:22.480 --> 06:28.320
247
+ When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix
248
+
249
+ 06:28.320 --> 06:34.120
250
+ and the noise covariance up to a permutation and sign indeterminacy.
251
+
252
+ 06:34.120 --> 06:38.240
253
+ Note that the noise diversity in Gaussian components is also a necessary condition.
254
+
255
+ 06:38.240 --> 06:42.680
256
+ If it does not hold, then ShICA is not identifiable.
257
+
258
+ 06:42.680 --> 06:48.520
259
+ Let us now focus on this theorem that is at the core of the ShICA-J algorithm.
260
+
261
+ 06:48.520 --> 06:53.520
262
+ Namely it shows that we can solve group ICA with multiset CCA.
263
+
264
+ 06:53.520 --> 06:58.880
265
+ So assume the data follow the ShICA model, and consider the multiset CCA framed as a
266
+
267
+ 06:58.880 --> 07:00.920
268
+ generalized eigenvalue problem.
269
+
270
+ 07:00.920 --> 07:08.080
271
+ This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by
272
+
273
+ 07:08.080 --> 07:13.560
274
+ second-order statistics, and D is formed by the diagonal blocks in C.
275
+
276
+ 07:13.560 --> 07:19.880
277
+ And so if we solve this eigenvalue problem and take the first k leading eigenvectors,
278
+
279
+ 07:19.880 --> 07:26.520
280
+ we can recover the correct unmixing matrix from them, up to a permutation and a scaling.
281
+
282
+ 07:26.520 --> 07:32.000
283
+ And this can only be done if the k first eigenvalues are distinct.
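A minimal numerical sketch of this step, assuming the block matrices C and D have already been built from the second-order statistics (SciPy's generalized symmetric eigensolver is used here purely for illustration):

    import numpy as np
    from scipy.linalg import eigh

    def multiset_cca_leading(C, D, k):
        # Solve the generalized eigenvalue problem C v = lambda D v.
        eigvals, eigvecs = eigh(C, D)
        order = np.argsort(eigvals)[::-1]   # eigh returns ascending eigenvalues; take the k largest
        return eigvals[order[:k]], eigvecs[:, order[:k]]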
284
+
285
+ 07:32.000 --> 07:34.320
286
+ Note that the distinct eigenvalue condition is also necessary.
287
+
288
+ 07:34.320 --> 07:40.480
289
+ If two eigenvalues are the same, then this adds an indeterminacy, and therefore
290
+
291
+ 07:40.480 --> 07:42.280
292
+ we cannot solve group ICA.
293
+
294
+ 07:42.280 --> 07:48.640
295
+ Note also that the condition that some eigenvalues need to be distinct is stronger than the noise
296
+
297
+ 07:48.640 --> 07:54.080
298
+ diversity condition we have in the identifiability theorem.
299
+
300
+ 07:54.080 --> 07:59.360
301
+ And therefore we can exhibit an example which is identifiable, but on which multiset CCA
302
+
303
+ 07:59.360 --> 08:00.360
304
+ will fail.
305
+
306
+ 08:00.360 --> 08:04.800
307
+ And I refer you to the paper for more details on this.
308
+
309
+ 08:04.800 --> 08:10.160
310
+ So in our theorem, in order to recover the correct unmixing matrix, we need to have access
311
+
312
+ 08:10.160 --> 08:12.480
313
+ to the second-order statistics.
314
+
315
+ 08:12.480 --> 08:18.860
316
+ However, in practice, we only have access to them, up to some sampling noise.
317
+
318
+ 08:18.860 --> 08:24.520
319
+ And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in
320
+
321
+ 08:24.520 --> 08:31.160
322
+ the second-order statistics can lead to a high deviation of the recovered unmixing matrix.
323
+
324
+ 08:31.160 --> 08:38.080
325
+ Now to show this in practice, we take three subjects, two components, and noise covariance
326
+
327
+ 08:38.080 --> 08:47.440
328
+ matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon.
329
+
330
+ 08:47.440 --> 08:52.440
331
+ And we compare the solution of multiset CCA on the true covariance matrices and on the
332
+
333
+ 08:52.440 --> 08:59.520
334
+ perturbed covariance matrix, where the perturbation scale is given by delta.
335
+
336
+ 08:59.520 --> 09:07.240
337
+ And for different values of epsilon, 10^-4, 10^-3, 10^-2, 10^-1, we show how the performance
338
+
339
+ 09:07.240 --> 09:14.720
340
+ of the algorithm, so the Amari distance between the true unmixing matrix and the estimated
341
+
342
+ 09:14.720 --> 09:20.880
343
+ unmixing matrix, varies when the perturbation scale increases.
344
+
345
+ 09:20.880 --> 09:26.600
346
+ And we see that when the eigengap is very small, so 10^-4, the violet curve, then even
347
+
348
+ 09:26.600 --> 09:31.440
349
+ with a very small perturbation, you can get to a very bad Amari distance.
350
+
351
+ 09:31.440 --> 09:35.720
352
+ The black dashed curve is the chance-level performance.
353
+
354
+ 09:35.720 --> 09:41.200
355
+ Luckily, there is a large gap between the k-th eigenvalue and the (k+1)-th one.
356
+
357
+ 09:41.200 --> 09:46.120
358
+ This means that in practice, the span of the k leading eigenvectors is approximately preserved.
359
+
360
+ 09:46.120 --> 09:53.600
361
+ We can recover the true unmixing matrix from the unmixing matrix estimated by multiset
362
+
363
+ 09:53.600 --> 09:56.520
364
+ CCA, just by multiplying by a matrix Q.
365
+
366
+ 09:56.520 --> 10:02.640
367
+ And in order to estimate Q, we make use of the fact that the unmixed data should have
368
+
369
+ 10:02.640 --> 10:03.640
370
+ a diagonal covariance.
371
+
372
+ 10:03.640 --> 10:09.680
373
+ This leads us to a joint diagonalization problem that we can solve efficiently.
374
+
375
+ 10:09.680 --> 10:14.480
376
+ So if we take the experiments we've done on the previous slide, the results are still
377
+
378
+ 10:14.480 --> 10:15.480
379
+ shown here.
380
+
381
+ 10:15.480 --> 10:21.640
382
+ You can see the violet curves, which are very sensitive to perturbations.
383
+
384
+ 10:21.640 --> 10:29.360
385
+ And so if we apply joint diagonalization, all these curves move, and they join the dashed
386
+
387
+ 10:29.360 --> 10:30.360
388
+ curve on the bottom.
389
+
390
+ 10:30.360 --> 10:34.720
391
+ And therefore, it's much better, because now the new curves that are represented by the
392
+
393
+ 10:34.720 --> 10:42.920
394
+ dashed line are less sensitive to perturbations.
395
+
396
+ 10:42.920 --> 10:47.920
397
+ So now we've obtained the correct unmixing matrix, but up to a scaling.
398
+
399
+ 10:47.920 --> 10:55.040
400
+ And so we need an additional step to find the correct scaling, and another one to find
401
+
402
+ 10:55.040 --> 11:00.680
403
+ the other parameters that are still unestimated, which are the noise covariances.
404
+
405
+ 11:00.680 --> 11:04.000
406
+ And luckily, it's very easy to find the noise covariance.
407
+
408
+ 11:04.000 --> 11:06.280
409
+ We can do this via an EM algorithm.
410
+
411
+ 11:06.280 --> 11:11.920
412
+ The E-step and the M-step are in closed form, and this yields a very fast algorithm.
413
+
414
+ 11:11.920 --> 11:15.200
415
+ But ShICA-J is not a maximum likelihood estimator.
416
+
417
+ 11:15.200 --> 11:22.600
418
+ So now we will focus on ShICA-ML, which is our maximum likelihood estimator.
419
+
420
+ 11:22.600 --> 11:31.240
421
+ So I won't go too much into details on this, but we optimize this via an EM using a Gaussian
422
+
423
+ 11:31.240 --> 11:33.480
424
+ mixture assumption on the sources.
425
+
426
+ 11:33.480 --> 11:35.960
427
+ We assume that the weights are known.
428
+
429
+ 11:35.960 --> 11:41.480
430
+ What I just want to showcase here is that the E-step of the algorithm, the one that
431
+
432
+ 11:41.480 --> 11:46.000
433
+ gives you the expectation of the sources given the data, and the variance of the sources
434
+
435
+ 11:46.000 --> 11:50.760
436
+ given the data, only involves the sum of size 2.
437
+
438
+ 11:50.760 --> 11:57.320
439
+ So previously we had a sum that had an exponential number of terms, and here we don't have that
440
+
441
+ 11:57.320 --> 11:58.320
442
+ anymore.
443
+
444
+ 11:58.320 --> 12:02.920
445
+ So the E-step is much faster than what we had before, and therefore the EM algorithm
446
+
447
+ 12:02.920 --> 12:07.200
448
+ here is tractable, whereas it was not the case before.
449
+
450
+ 12:07.200 --> 12:11.440
451
+ I first want to present our synthetic experiment where we generate data according to the ShICA-ML
452
+
453
+ 12:11.440 --> 12:13.200
454
+ and ShICA-J models.
455
+
456
+ 12:13.200 --> 12:18.560
457
+ In case A, we have only Gaussian components, but we have noise diversity, and therefore
458
+
459
+ 12:18.560 --> 12:24.240
460
+ methods that use noise diversity to recover the sources, such as ShICA-ML and ShICA-J,
461
+
462
+ 12:24.240 --> 12:25.240
463
+ perform best.
464
+
465
+ 12:25.240 --> 12:34.000
466
+ In the second case, we have only non-Gaussian components and no noise diversity, so methods
467
+
468
+ 12:34.000 --> 12:41.520
469
+ that use non-Gaussianity perform well, such as CanICA, ShICA-ML, or MultiViewICA.
470
+
471
+ 12:41.520 --> 12:45.200
472
+ And in the last case, half of the components are Gaussian with noise diversity, and the
473
+
474
+ 12:45.200 --> 12:49.000
475
+ other half are non-Gaussian but without noise diversity.
476
+
477
+ 12:49.000 --> 12:53.000
478
+ And in this case, only ShICA-ML is able to correctly recover the sources.
479
+
480
+ 12:53.000 --> 12:57.960
481
+ MultiViewICA does not fail completely, but it is not as good as ShICA-ML.
482
+
483
+ 12:57.960 --> 13:00.400
484
+ Let us now talk about our experiments on real data.
485
+
486
+ 13:00.400 --> 13:05.080
487
+ We have this reconstruction experiment on fMRI data where subjects are exposed to a
488
+
489
+ 13:05.080 --> 13:07.920
490
+ naturalistic stimuli such as movie watching.
491
+
492
+ 13:07.920 --> 13:15.320
493
+ We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the
494
+
495
+ 13:15.320 --> 13:22.320
496
+ 20% left of the movie, we compute the common sources, and from these common sources computed
497
+
498
+ 13:22.320 --> 13:28.800
499
+ using 80% of the subjects, we try to reconstruct the data of the remaining 20% of the subjects.
500
+
501
+ 13:28.800 --> 13:33.880
502
+ We compute the R2 score within regions of interest between the reconstructed data and
503
+
504
+ 13:33.880 --> 13:39.480
505
+ the true data, and plot them as a function of the number of components used.
506
+
507
+ 13:39.480 --> 13:43.000
508
+ As we see, ShICA-ML outperforms all of the other methods.
509
+
510
+ 13:43.000 --> 13:47.400
511
+ As a take-home message, ShICA is a powerful framework to extract shared sources.
512
+
513
+ 13:47.400 --> 13:52.840
514
+ ShICA-J is a fast approach to fit the model, but it only uses second-order information.
515
+
516
+ 13:52.840 --> 13:58.800
517
+ In contrast, ShICA-ML is a bit slower, but it is able to use non-Gaussianity in addition
518
+
519
+ 13:58.800 --> 14:00.960
520
+ to second-order information.
521
+
522
+ 14:00.960 --> 14:03.840
523
+ In practice, ShICA-ML yields the best results.
524
+
525
+ 14:03.840 --> 14:05.960
526
+ The methods we've introduced work on reduced data.
527
+
528
+ 14:05.960 --> 14:11.160
529
+ It would be interesting to know how to reduce the data so that they perform optimally.
530
+
531
+ 14:11.160 --> 14:15.400
532
+ Another way to improve our results would be to learn the density of the shared sources
533
+
534
+ 14:15.400 --> 14:19.480
535
+ in ShICA-ML instead of having it fixed.
536
+
537
+ 14:19.480 --> 14:23.400
538
+ Thanks for listening, and have a good day!
539
+
demo_data/nips-2021/25957/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0539c1b965a157ce62df522fef5ea03cdec6198f5995fefa04cfddf947861fd
3
+ size 93633719
demo_data/nips-2021/25958/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "ParK: Sound and Efficient Kernel Ridge Regression by Feature Space Partitions"
3
+ }
demo_data/nips-2021/25958/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,374 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:07.000
4
+ Hello everyone, I'm Luigi Carratino, and this is a joint work with Stefano Vigogna,
5
+
6
+ 00:07.000 --> 00:10.000
7
+ Daniele Calandriello, and Lorenzo Rosasco.
8
+
9
+ 00:10.000 --> 00:16.000
10
+ The problem that we study in this work is a standard regression problem, where we want
11
+
12
+ 00:16.000 --> 00:24.000
13
+ to estimate an unknown function f star,
14
+
15
+ 00:24.000 --> 00:34.000
16
+ given n pairs of points, x's and y's, where the y's are noisy evaluations of the function
17
+
18
+ 00:34.000 --> 00:38.000
19
+ f star at the input points x's.
20
+
21
+ 00:41.000 --> 00:46.000
22
+ A well-established method to learn nonlinear functions is kernel ridge regression.
23
+
24
+ 00:46.000 --> 00:53.000
25
+ The basic idea is to map the input points into a higher dimensional space, where linear
26
+
27
+ 00:53.000 --> 00:59.000
28
+ relationships can be learned that then translate in nonlinear ones in the input space.
29
+
30
+ 01:01.000 --> 01:07.000
31
+ To formalize this, we can think about solving a standard empirical risk minimization problem
32
+
33
+ 01:07.000 --> 01:12.000
34
+ regularized over a spatial function which is a reproducing kernel Hilbert space.
35
+
36
+ 01:14.000 --> 01:20.000
37
+ Numerically speaking, the solution of this type of problem boils down to solving a linear
38
+
39
+ 01:20.000 --> 01:26.000
40
+ system. Particularly, we can see here that the linear system is going to be Kc equal
41
+
42
+ 01:26.000 --> 01:33.000
43
+ y, where K is the kernel matrix evaluated in all the pairs of points of the training
44
+
45
+ 01:33.000 --> 01:39.000
46
+ sets, c are the weights that we aim to learn, and y's are the output points.
47
+
48
+ 01:40.000 --> 01:45.000
49
+ We know that this method is optimal from a statistical point of view, but a drawback
50
+
51
+ 01:45.000 --> 01:52.000
52
+ is that it suffers from computational scalability. In fact, in terms of time complexity, if we
53
+
54
+ 01:52.000 --> 01:57.000
55
+ have n training points and we want to solve the linear system directly, we'll have to
56
+
57
+ 01:57.000 --> 02:03.000
58
+ invert the matrix K, and this will cost us n cubed in time.
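As a minimal sketch of this linear system (illustrative only; the RBF kernel, the ridge term n*lam*I and the parameter names are assumptions, while the talk writes the system simply as K c = y):

    import numpy as np

    def krr_fit(X, y, lam=1e-3, gamma=1.0):
        # Gaussian (RBF) kernel matrix evaluated on all pairs of training points
        sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
        K = np.exp(-gamma * sq_dists)
        # Direct solve of (K + n*lam*I) c = y, which costs O(n^3) in time
        n = X.shape[0]
        return np.linalg.solve(K + n * lam * np.eye(n), y)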
59
+
60
+ 02:06.000 --> 02:11.000
61
+ Multiple ways of accelerating this process have been proposed over time.
62
+
63
+ 02:11.000 --> 02:17.000
64
+ The first one is to solve the methods iteratively instead of inverting directly the matrix K.
65
+
66
+ 02:18.000 --> 02:25.000
67
+ This allows us to only have matrix vector multiplications, and so the overall cost of
68
+
69
+ 02:25.000 --> 02:30.000
70
+ an iterative method to solve this linear system is going to be Tn squared.
71
+
72
+ 02:31.000 --> 02:39.000
73
+ Another method is the one known as sketching, where we can see this as subsampling the linear
74
+
75
+ 02:39.000 --> 02:46.000
76
+ system, in particular subsampling columns of this linear system, where we can take m
77
+
78
+ 02:46.000 --> 02:52.000
79
+ columns of the linear system uniformly at random to get a smaller one, and the cost
80
+
81
+ 02:52.000 --> 02:55.000
82
+ of this will be m squared n.
83
+
84
+ 02:57.000 --> 03:04.000
85
+ Another method instead is splitting. This allows us to divide the main problem into
86
+
87
+ 03:04.000 --> 03:12.000
88
+ many, in this case Q, subproblems, each of which can be solved independently and so
89
+
90
+ 03:12.000 --> 03:20.000
91
+ potentially can be distributed. So we can have a cost which boils down to n over Q to
92
+
93
+ 03:20.000 --> 03:22.000
94
+ the power of 3.
95
+
96
+ 03:25.000 --> 03:30.000
97
+ Combinations of these methods have been proposed in the literature. In particular, if
98
+
99
+ 03:30.000 --> 03:35.000
100
+ we combine iterating and sketching, we can get a solver that can solve the problem in
101
+
102
+ 03:35.000 --> 03:38.000
103
+ a time complexity of Tmn.
104
+
105
+ 03:40.000 --> 03:47.000
106
+ If instead we combine sketching and splitting, we can get a solver that can be computed
107
+
108
+ 03:47.000 --> 03:51.000
109
+ in m squared times n over Q.
110
+
111
+ 03:51.000 --> 03:59.000
112
+ And in this work, we try to blend all these techniques to derive a new algorithm, which
113
+
114
+ 03:59.000 --> 04:09.000
115
+ we will call ParK, that can achieve a time complexity of T m times n over Q to the power
116
+
117
+ 04:09.000 --> 04:10.000
118
+ of 2.
119
+
120
+ 04:12.000 --> 04:18.000
121
+ So as we just said, in this work, we propose a new large-scale kernel regression solver
122
+
123
+ 04:18.000 --> 04:22.000
124
+ that combines the computational benefits of iteration, sketching, and splitting.
125
+
126
+ 04:23.000 --> 04:27.000
127
+ Notice, though, that these are approximation techniques and they may come at the cost of
128
+
129
+ 04:27.000 --> 04:35.000
130
+ accuracy. But we are able to show that this new algorithm is able to preserve generalization
131
+
132
+ 04:35.000 --> 04:37.000
133
+ under suitable partitions.
134
+
135
+ 04:38.000 --> 04:44.000
136
+ Now also notice that instead of general splitting, we are going to need to focus on a
137
+
138
+ 04:44.000 --> 04:48.000
139
+ particular type, which is partitions.
140
+
141
+ 04:48.000 --> 04:53.000
142
+ So we introduce a new principled partition scheme for kernel methods.
143
+
144
+ 04:56.000 --> 05:01.000
145
+ We now look at the difference between data splitting and space partitioning.
146
+
147
+ 05:01.000 --> 05:08.000
148
+ Given a set of points, the procedure of splitting takes groups of points at random and assign
149
+
150
+ 05:08.000 --> 05:10.000
151
+ them to different splits or clusters.
152
+
153
+ 05:10.000 --> 05:14.000
154
+ In this picture, for example, we divide the points in four splits.
155
+
156
+ 05:15.000 --> 05:21.000
157
+ Partitioning instead divides the space in different cells, and then the points are implicitly
158
+
159
+ 05:21.000 --> 05:25.000
160
+ assigned to a particular cluster based on which cell they belong to.
161
+
162
+ 05:27.000 --> 05:32.000
163
+ Notice that with the splitting methods, we don't consider local information while we
164
+
165
+ 05:32.000 --> 05:37.000
166
+ perform the splitting, but we do when we perform partitioning.
167
+
168
+ 05:37.000 --> 05:42.000
169
+ Now, from this picture, the concept of partitioning a space seems pretty straightforward.
170
+
171
+ 05:43.000 --> 05:48.000
172
+ However, when you start considering high dimensional feature space, subtle problems can
173
+
174
+ 05:48.000 --> 05:49.000
175
+ appear.
176
+
177
+ 05:50.000 --> 05:55.000
178
+ So first, as a recap, remember that there are two important spaces to consider in our
179
+
180
+ 05:55.000 --> 05:56.000
181
+ regression problem.
182
+
183
+ 05:57.000 --> 06:04.000
184
+ The input space X with its input features, and the kernel space H,
185
+
186
+ 06:04.000 --> 06:10.000
187
+ which potentially has many more implicit features.
188
+
189
+ 06:13.000 --> 06:17.000
190
+ Traditionally, partition methods are applied directly to the input space.
191
+
192
+ 06:18.000 --> 06:24.000
193
+ For example, a classical approach is to select a subset of points as centroids and then
194
+
195
+ 06:24.000 --> 06:30.000
196
+ partition the space in cells by assigning each portion of the space to the closest centroid,
197
+
198
+ 06:30.000 --> 06:32.000
199
+ which is called a Voronoi partition.
200
+
201
+ 06:32.000 --> 06:38.000
202
+ Since we are in the input space, closest here is defined according to a simple Euclidean
203
+
204
+ 06:38.000 --> 06:39.000
205
+ distance.
206
+
207
+ 06:40.000 --> 06:45.000
208
+ However, remember that our target function and our whole regression does not happen
209
+
210
+ 06:45.000 --> 06:51.000
211
+ directly on the input data space, but rather on the data mapped in the feature space.
212
+
213
+ 06:52.000 --> 06:58.000
214
+ And after we apply our feature map to the data, the concept of closest and the partition
215
+
216
+ 06:58.000 --> 06:59.000
217
+ can radically change.
218
+
219
+ 06:59.000 --> 07:05.000
220
+ For example, here on the right, we choose a kernel space associated with a cosine similarity
221
+
222
+ 07:06.000 --> 07:12.000
223
+ and again plot how the centroids partition the input space, but this time we chose closest
224
+
225
+ 07:12.000 --> 07:14.000
226
+ according to the new cosine distance.
227
+
228
+ 07:15.000 --> 07:20.000
229
+ The resulting partition is very different from the Euclidean one as it captures the
230
+
231
+ 07:20.000 --> 07:22.000
232
+ non-linearity of the kernel function.
233
+
234
+ 07:22.000 --> 07:28.000
235
+ In the paper, we discuss how this difference can impact the regression and we identified
236
+
237
+ 07:28.000 --> 07:34.000
238
+ sufficient conditions that the partition should satisfy in order to guarantee good generalization
239
+
240
+ 07:34.000 --> 07:35.000
241
+ of the learning process.
242
+
243
+ 07:37.000 --> 07:43.000
244
+ Crucially, we will see that these guarantees depend not on how the input space is partitioned,
245
+
246
+ 07:43.000 --> 07:45.000
247
+ but rather how the feature space is partitioned.
248
+
249
+ 07:45.000 --> 07:51.000
250
+ As a consequence, for our ParK method, we focus on choosing centroids solely using the
251
+
252
+ 07:51.000 --> 07:53.000
253
+ kernel version of the distance.
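A small sketch of what "closest in the kernel space" means in practice (illustrative; the kernel function k and the data layout are assumptions). The squared feature-space distance is computed with the kernel trick:

    import numpy as np

    def kernel_sq_distance(k, x, c):
        # || phi(x) - phi(c) ||^2 = k(x, x) - 2 k(x, c) + k(c, c)
        return k(x, x) - 2.0 * k(x, c) + k(c, c)

    def assign_to_cell(k, x, centroids):
        # Index of the feature-space Voronoi cell that x falls into
        return int(np.argmin([kernel_sq_distance(k, x, c) for c in centroids]))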
254
+
255
+ 07:57.000 --> 08:00.000
256
+ We are now ready to present in more detail how the ParK algorithm works.
257
+
258
+ 08:01.000 --> 08:07.000
259
+ First of all, ParK partitions the feature space into Q Voronoi cells, and the first thing
260
+
261
+ 08:07.000 --> 08:16.000
262
+ to do is to identify the centroids in the feature space that allows us to describe the
263
+
264
+ 08:16.000 --> 08:17.000
265
+ Voronoi cells.
266
+
267
+ 08:19.000 --> 08:25.000
268
+ Then inside each Voronoi cell, we learn a local estimator using an iterated and sketched
269
+
270
+ 08:25.000 --> 08:27.000
271
+ version of kernel ridge regression.
272
+
273
+ 08:30.000 --> 08:36.000
274
+ And then at prediction time, when a new sample arrives, we can use the Q Voronoi cells
275
+
276
+ 08:36.000 --> 08:38.000
277
+ to identify the cell that the new sample belongs to.
278
+
279
+ 08:40.000 --> 08:47.000
280
+ We use the local estimator corresponding to the Voronoi cell that the new point falls
281
+
282
+ 08:47.000 --> 08:48.000
283
+ on.
284
+
285
+ 08:52.000 --> 08:57.000
286
+ The generalization error of standard kernel ridge regression without partitioning can
287
+
288
+ 08:57.000 --> 09:02.000
289
+ be upper bounded by two terms, a bias term and a variance term.
290
+
291
+ 09:02.000 --> 09:10.000
292
+ In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
293
+
294
+ 09:10.000 --> 09:11.000
295
+ term and a variance term.
296
+
297
+ 09:11.000 --> 09:16.000
298
+ But this time, these two terms are weighted and they are weighted by a certain quantity
299
+
300
+ 09:16.000 --> 09:25.000
301
+ that depends on an angle theta, which is the minimum angle between all the subspaces of
302
+
303
+ 09:25.000 --> 09:26.000
304
+ the partitions.
305
+
306
+ 09:26.000 --> 09:33.000
307
+ For example, when all the subspaces are orthogonal between each other, we recover the exact same
308
+
309
+ 09:33.000 --> 09:36.000
310
+ generalization error of standard kernel ridge regression.
311
+
312
+ 09:38.000 --> 09:45.000
313
+ But we are also able to show that for angles which are small enough, we are able to obtain
314
+
315
+ 09:45.000 --> 09:50.000
316
+ a generalization error which is of the same order of standard kernel ridge regression.
317
+
318
+ 09:50.000 --> 09:54.000
319
+ These theoretical results suggest us how to construct a good partition.
320
+
321
+ 09:54.000 --> 10:00.000
322
+ So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
323
+
324
+ 10:00.000 --> 10:01.000
325
+ between the Voronoi cells.
326
+
327
+ 10:01.000 --> 10:06.000
328
+ And in particular, we use the Schur complement to measure the orthogonality.
329
+
330
+ 10:10.000 --> 10:16.000
331
+ We also use the Schur complement to measure the orthogonality of the Voronoi centroids.
332
+
333
+ 10:16.000 --> 10:20.000
334
+ And in particular, we use the Schur complement to measure the orthogonality.
335
+
336
+ 10:24.000 --> 10:28.000
337
+ Given all these ingredients, we are now able to measure the computational complexity of
338
+
339
+ 10:28.000 --> 10:32.000
340
+ ParK, which has a time complexity that is the sum of two terms.
341
+
342
+ 10:33.000 --> 10:40.000
343
+ A first term, q squared n log n, which is the cost of computing the centroids with the
344
+
345
+ 10:40.000 --> 10:41.000
346
+ just mentioned procedure.
347
+
348
+ 10:41.000 --> 10:46.000
349
+ And a second term, T m times n over Q squared, which is the cost of computing the most expensive
350
+
351
+ 10:46.000 --> 10:47.000
352
+ local estimator.
353
+
354
+ 10:51.000 --> 10:57.000
355
+ Empirically, we performed experiments on data sets of millions and billions of points,
356
+
357
+ 10:57.000 --> 11:01.000
358
+ and we compared with the currently fastest global kernel methods and with some other
359
+
360
+ 11:01.000 --> 11:02.000
361
+ splitting kernel methods.
362
+
363
+ 11:03.000 --> 11:08.000
364
+ We can see that ParK is the only method that manages to match the accuracy of the global
365
+
366
+ 11:08.000 --> 11:11.000
367
+ estimator.
368
+
369
+ 11:11.000 --> 11:13.000
370
+ Thank you all for your attention.
371
+
372
+ 11:13.000 --> 11:40.000
373
+ And please come to the poster with all your questions and for more details.
374
+
demo_data/nips-2021/25958/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefd926545331be9df0497e824634fa23129d26c9c9e7fdbe67c0382b98b4556
3
+ size 22931245
demo_data/nips-2021/25959/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Adversarial Feature Desensitization"
3
+ }
demo_data/nips-2021/25959/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,353 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:13.120
4
+ Hello, my name is Pouya Bashivan and I'm going to tell you about our paper titled
5
+
6
+ 00:13.120 --> 00:18.720
7
+ Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators
8
+
9
+ 00:18.720 --> 00:24.400
10
+ at Mila, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim,
11
+
12
+ 00:24.400 --> 00:32.160
13
+ Kartik Ahuja, Mojtaba Faramarzi, Touraj Laleh, Blake Richards and Irina Rish. A common assumption in
14
+
15
+ 00:32.160 --> 00:36.560
16
+ machine learning is that the train and test samples come from the same distribution.
17
+
18
+ 00:37.200 --> 00:42.960
19
+ While this is a reasonable assumption under most circumstances, it is intentionally violated in the
20
+
21
+ 00:42.960 --> 00:49.600
22
+ regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
23
+
24
+ 00:49.600 --> 00:55.600
25
+ perturbations that cause the input to be misclassified. In the case of white box attacks,
26
+
27
+ 00:55.600 --> 01:01.600
28
+ the model itself is transparent to the attacker and the attacker uses it to identify the possible
29
+
30
+ 01:01.600 --> 01:07.760
31
+ inputs that would lead to misclassifications. A famous example of this is the image of a panda
32
+
33
+ 01:07.760 --> 01:13.360
34
+ that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
35
+
36
+ 01:13.360 --> 01:19.840
37
+ gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
38
+
39
+ 01:19.840 --> 01:25.280
40
+ and unless the classifier is specifically trained to be robust against these attacks,
41
+
42
+ 01:25.280 --> 01:28.720
43
+ the attacks could completely break down the classifier's performance.
44
+
45
+ 01:30.240 --> 01:35.600
46
+ This issue becomes even more critical when we consider the vast usage of these machine learning
47
+
48
+ 01:35.600 --> 01:41.040
49
+ systems in our societies. For example, the possible security concerns that rise in face
50
+
51
+ 01:41.040 --> 01:46.720
52
+ recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
53
+
54
+ 01:48.080 --> 01:54.000
55
+ So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
56
+
57
+ 01:54.000 --> 02:00.080
58
+ feature learning function f that projects inputs x to latent space with feature space z
59
+
60
+ 02:01.600 --> 02:08.720
61
+ and a classifier that uses the latent code z to predict the correct class label y hat.
62
+
63
+ 02:08.720 --> 02:14.480
64
+ The perturbation function or the attack generates a perturbed sample x prime
65
+
66
+ 02:14.480 --> 02:21.520
67
+ within the epsilon neighborhood of the input x, which we're showing here as b of x and epsilon.
68
+
69
+ 02:22.160 --> 02:28.880
70
+ By maximizing the classification objective, the opposite of how we normally optimize the classifier's
71
+
72
+ 02:28.880 --> 02:36.720
73
+ parameter. Many methods have been proposed to defend the models against adversarial attacks.
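A minimal PyTorch-style sketch of such an inner maximization (a generic one-step FGSM attack for illustration, not the specific attacks used in the paper; model, loss_fn and epsilon are assumed to be given):

    import torch

    def fgsm_perturb(model, loss_fn, x, y, epsilon):
        # Move x in the direction that increases the classification loss,
        # staying inside the L-infinity ball of radius epsilon around x.
        x_adv = x.clone().detach().requires_grad_(True)
        loss_fn(model(x_adv), y).backward()
        return (x_adv + epsilon * x_adv.grad.sign()).detach()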
74
+
75
+ 02:36.720 --> 02:42.640
76
+ Two of these methods that have withstood the test of time so far are the adversarial training
77
+
78
+ 02:43.200 --> 02:50.160
79
+ by Madry et al., which proposes a defense method by solving a minimax optimization problem
80
+
81
+ 02:50.160 --> 02:56.000
82
+ that involves finding an adversarial input by maximizing the classification loss in the inner
83
+
84
+ 02:56.000 --> 03:03.840
85
+ loop, followed by training the classifier to minimize the classification loss on these adversarial inputs.
86
+
87
+ 03:03.840 --> 03:09.920
88
+ This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
89
+
90
+ 03:10.560 --> 03:15.440
91
+ The adversarial training method essentially learns to separate the distributions of adversarial
92
+
93
+ 03:15.440 --> 03:22.400
94
+ examples belonging to different classes. The second method is the trades method by Zhang et al,
95
+
96
+ 03:22.400 --> 03:27.440
97
+ which proposes to push the decision boundary of the classifier away from the data.
98
+
99
+ 03:27.440 --> 03:32.480
100
+ Trades achieves this by introducing a regularization term to the original learning
101
+
102
+ 03:32.480 --> 03:38.320
103
+ objective for classification that penalizes the mismatch between the predicted label
104
+
105
+ 03:38.320 --> 03:44.400
106
+ for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
107
+
108
+ 03:44.400 --> 03:50.000
109
+ this procedure, where now the defense method learns to separate the distributions of clean examples
110
+
111
+ 03:50.000 --> 03:54.400
112
+ belonging to different classes while minimizing the loss of the classifier.
113
+
114
+ 03:54.400 --> 03:59.920
115
+ The third method is the trade method by Wang et al, which proposes to push the decision boundary
116
+
117
+ 03:59.920 --> 04:06.880
118
+ of the classifier to the inner loop followed by a classifier training to minimizing the
119
+
120
+ 04:06.880 --> 04:13.120
121
+ classification loss on these adversarial inputs. The third method is the trade method by Zhang et al,
122
+
123
+ 04:13.120 --> 04:18.720
124
+ which proposes to push the decision boundary of the classifier to the inner loop followed by a
125
+
126
+ 04:18.720 --> 04:27.840
127
+ classifier training to minimizing the classification loss on these adversarial inputs to the inner
128
+
129
+ 04:27.840 --> 04:34.640
130
+ loop. The third method is the trade method by Wang et al, which proposes to push the decision
131
+
132
+ 04:34.640 --> 04:39.920
133
+ boundary of the classifier to minimizing the classification loss. The fourth method is the
134
+
135
+ 04:39.920 --> 04:45.600
136
+ trade method by Wang et al, which proposes to push the decision boundary of the classifier
137
+
138
+ 04:45.600 --> 04:52.160
139
+ for a source domain, but we want the classifier to also perform the same task on a related target
140
+
141
+ 04:52.160 --> 05:00.960
142
+ domain that we might not have enough data for or that the generating procedure for sampling
143
+
144
+ 05:00.960 --> 05:09.440
145
+ domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the
146
+
147
+ 05:09.440 --> 05:15.840
148
+ question of under what conditions can we adapt a classifier trained on the source domain for use
149
+
150
+ 05:15.840 --> 05:23.920
151
+ in the target domain. Here we consider the original clean distributions as the source domain and the
152
+
153
+ 05:23.920 --> 05:31.280
154
+ distribution of adversarial images generated from those images as the target domain. Although here
155
+
156
+ 05:31.280 --> 05:38.240
157
+ the target domain continuously evolves because the adversarial examples are based on the current
158
+
159
+ 05:38.240 --> 05:46.000
160
+ state of the model at each time step. And similar to the domain adaptation theory, our goal here
161
+
162
+ 05:46.000 --> 05:52.960
163
+ is to learn how to perform well on both source and target domains, meaning the natural and
164
+
165
+ 05:52.960 --> 06:02.240
166
+ adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
167
+
168
+ 06:02.240 --> 06:08.960
169
+ what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
170
+
171
+ 06:08.960 --> 06:14.880
172
+ feature learning function f that projects inputs x to latent space or feature space z and the
173
+
174
+ 06:14.880 --> 06:23.040
175
+ classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
176
+
177
+ 06:23.040 --> 06:31.440
178
+ and adversarial examples as input domains dx and d' x and their induced feature distributions
179
+
180
+ 06:31.440 --> 06:42.560
181
+ which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
182
+
183
+ 06:42.560 --> 06:50.320
184
+ as the classification error over the domains dz and d' z, what we are going to refer to as the
185
+
186
+ 06:50.320 --> 06:58.880
187
+ clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bound
188
+
189
+ 06:58.880 --> 07:04.320
190
+ on the adversarial error in terms of the natural error and the distance between the two domains.
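As a worked equation, the bound referred to here has the Ben-David et al. form (written from the general theory, so the exact constants in the paper may differ):

    \epsilon'_Z(h) \;\le\; \epsilon_Z(h) \;+\; \tfrac{1}{2}\, d_{\mathcal{H}\Delta\mathcal{H}}(D_Z, D'_Z) \;+\; \lambda,

where \lambda is the error of the best joint hypothesis on the two domains.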
191
+
192
+ 07:05.120 --> 07:11.680
193
+ Fortunately, from the prior work, we know that h delta h distance, which measures the distance
194
+
195
+ 07:11.680 --> 07:17.440
196
+ between two domains, can be estimated using the classifier trained to discriminate between the
197
+
198
+ 07:17.440 --> 07:26.080
199
+ two domains. Now our defense method called adversarial feature desensitization essentially
200
+
201
+ 07:26.080 --> 07:34.720
202
+ minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
203
+
204
+ 07:34.720 --> 07:40.560
205
+ has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
206
+
207
+ 07:42.240 --> 07:49.280
208
+ For this, we first update the parameters theta and phi in the feature learning function f and
209
+
210
+ 07:49.280 --> 07:56.320
211
+ task classifier c to minimize the classification loss on the natural domain. This is shown with
212
+
213
+ 07:56.320 --> 08:01.920
214
+ green arrows and green boxes marked 1 on both the equation and on the diagram.
215
+
216
+ 08:04.000 --> 08:10.400
217
+ Secondly, we estimate the h delta h distance using an additional domain discriminator
218
+
219
+ 08:10.960 --> 08:17.600
220
+ network that predicts the domain identity from the latent code z. We update the domain
221
+
222
+ 08:17.600 --> 08:24.720
223
+ discriminator parameters psi to minimize the domain classification loss. And finally,
224
+
225
+ 08:24.720 --> 08:31.680
226
+ in the third step, we update the feature learning network parameters theta to maximize the domain
227
+
228
+ 08:31.680 --> 08:39.600
229
+ classification loss in an adversarial way. These two steps are marked with red arrows in the figure
230
+
231
+ 08:39.600 --> 08:48.960
232
+ and red boxes on the equation. Similar to previous two methods, adversarial training and trades that
233
+
234
+ 08:48.960 --> 08:55.760
235
+ I showed you, we here we can also graphically demonstrate this procedure. In our method AFD,
236
+
237
+ 08:55.760 --> 09:01.040
238
+ we learn to separate the classes from the distributions of clean examples while at the
239
+
240
+ 09:01.040 --> 09:07.840
241
+ same time we optimize a domain classifier that learns the boundary between the clean and adversarial
242
+
243
+ 09:07.840 --> 09:14.560
244
+ examples for each class. And finally, we push the adversarial examples to the opposite side of that
245
+
246
+ 09:14.560 --> 09:22.400
247
+ boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations
248
+
249
+ 09:22.400 --> 09:30.480
250
+ and hence the name adversarial feature desensitization. We tested our method on four
251
+
252
+ 09:30.480 --> 09:35.840
253
+ data sets and compared them with a number of other baselines including with adversarial training and
254
+
255
+ 09:35.840 --> 09:43.760
256
+ trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
257
+
258
+ 09:43.760 --> 09:50.880
259
+ Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovsky et al.
260
+
261
+ 09:52.000 --> 09:57.840
262
+ In the table, we evaluated all methods on several white box and black box attacks with
263
+
264
+ 09:57.840 --> 10:07.360
265
+ nominal strengths for each data set. Overall, our method AFD and especially AFDWGAN showed superior
266
+
267
+ 10:07.360 --> 10:15.200
268
+ performance against most attacks in most data sets. However, AFD was behind trades on several attacks
269
+
270
+ 10:15.200 --> 10:20.720
271
+ especially on CIFAR-100 and TinyImageNet data set that had more classes in it.
272
+
273
+ 10:20.720 --> 10:26.080
274
+ We also looked into robustness across attack methods and attack strengths, which we controlled with the parameter
275
+
276
+ 10:26.080 --> 10:32.800
277
+ epsilon. The diagrams on the right show the robust accuracy for each defense method across
278
+
279
+ 10:32.800 --> 10:41.200
280
+ eight attack methods and various epsilon values for each of them. Overall, our results in these
281
+
282
+ 10:41.200 --> 10:48.240
283
+ diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
284
+
285
+ 10:48.240 --> 10:55.200
286
+ across attack strengths. To quantify these differences, we also computed the area under
287
+
288
+ 10:55.200 --> 11:00.000
289
+ the curve for each method for each attack and summarized them in a table on the left.
290
+
291
+ 11:00.880 --> 11:06.800
292
+ As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
293
+
294
+ 11:06.800 --> 11:15.680
295
+ compared to other baselines. If you remember from previous slides, the domain adaptation theory
296
+
297
+ 11:15.680 --> 11:22.400
298
+ predicted a bound on the adversarial error which can also be turned into a bound on the generalization
299
+
300
+ 11:22.400 --> 11:30.320
301
+ gap between natural and adversarial attacks. We empirically tested this prediction in our experiments
302
+
303
+ 11:30.320 --> 11:37.600
304
+ under two settings. Under the first setting, we varied the epsilon value for the PGD L-infinity
305
+
306
+ 11:37.600 --> 11:45.600
307
+ attack which was used during the training. And
308
+
309
+ 11:45.600 --> 11:51.120
310
+ under the second setting, we used a diverse set of attacks and various attack strengths for each of them.
311
+
312
+ 11:52.000 --> 11:58.480
313
+ And under both scenarios, we found that the domain discriminator, which was originally trained on a
314
+
315
+ 11:58.480 --> 12:05.280
316
+ particular attack and attack strength, in our case the PGD L-infinity attack with a fixed epsilon
317
+
318
+ 12:05.280 --> 12:10.960
319
+ for each data set, could well predict the generalization gap to unseen attacks and
320
+
321
+ 12:10.960 --> 12:18.000
322
+ different attack magnitudes. This suggests that the adversarial training against a domain classifier
323
+
324
+ 12:18.000 --> 12:24.000
325
+ like that used in our proposed method could potentially lead to robust models with better
326
+
327
+ 12:24.000 --> 12:33.520
328
+ generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
329
+
330
+ 12:33.520 --> 12:39.200
331
+ and attack strengths, it occasionally was worse compared to other baselines, especially in data
332
+
333
+ 12:39.200 --> 12:45.760
334
+ sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
335
+
336
+ 12:46.320 --> 12:51.680
337
+ domain classifiers in these data sets and leaves much space for future work on
338
+
339
+ 12:51.680 --> 12:57.120
340
+ investigating the effect of domain classifiers on the robustness of feature learning functions.
341
+
342
+ 12:58.080 --> 13:04.400
343
+ Also, AFD required more backward computations compared to some of the other baselines
344
+
345
+ 13:04.400 --> 13:11.120
346
+ such as adversarial training, and as a result, its training time was on average about 31%
347
+
348
+ 13:11.120 --> 13:17.680
349
+ longer than adversarial training. We invite you to read our paper for more details and please
350
+
351
+ 13:17.680 --> 13:34.720
352
+ get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it.
353
+
demo_data/nips-2021/25959/video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76fac80c58c0fd077be83cb3d4b052aaf70c0128d8884b24f83a34a9f9c72fe3
3
+ size 86886949
demo_data/nips-2021/25962/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "title": "Locally differentially private estimation of functionals of discrete distributions"
3
+ }
demo_data/nips-2021/25962/transcript_whisper_large-v2.vtt ADDED
@@ -0,0 +1,155 @@
1
+ WEBVTT
2
+
3
+ 00:00.000 --> 00:14.000
4
+ Hello everyone, I am Yann Issartel and I am going to present to you a work on the estimation
5
+
6
+ 00:14.000 --> 00:18.000
7
+ of functionals under some particular privacy constraints.
8
+
9
+ 00:18.000 --> 00:24.000
10
+ This is joint work with my postdoc advisor, Professor Cristina Butucea.
11
+
12
+ 00:24.000 --> 00:30.000
13
+ We are interested in the power sum functional, which is the sum of the probabilities associated
14
+
15
+ 00:30.000 --> 00:37.000
16
+ with a discrete distribution, raised to the power gamma, where gamma is a positive real number.
17
+
18
+ 00:37.000 --> 00:46.000
19
+ So, this power sum functional is an example of an information measure that arises in different fields
20
+
21
+ 00:46.000 --> 00:54.000
22
+ such as statistics, machine learning, information theory, neuroscience, and so on.
23
+
24
+ 00:54.000 --> 01:00.000
25
+ So here is the standard statistical problem, where the objective is to estimate the power sum functional
26
+
27
+ 01:00.000 --> 01:10.000
28
+ based on n i.i.d. samples, X1, X2 up to Xn, drawn from a discrete distribution P with alphabet size K.
29
+
30
+ 01:10.000 --> 01:19.000
31
+ A widely used approach is the plug-in estimator, where one uses an estimator of the parameter P
32
+
33
+ 01:19.000 --> 01:25.000
34
+ to build an estimator of the functional, through the plug-in principle.
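A minimal sketch of this plug-in estimator in the non-private setting (illustrative only; variable names are not from the talk):

    import numpy as np

    def plugin_power_sum(samples, K, gamma):
        # samples: integer array with values in {0, ..., K-1}
        counts = np.bincount(samples, minlength=K)
        p_hat = counts / counts.sum()          # empirical frequencies
        return np.sum(p_hat ** gamma)          # plug-in estimate of sum_k p_k^gamma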
35
+
36
+ 01:25.000 --> 01:32.000
37
+ This approach is not only simple and intuitive, but it is also theoretically sound,
38
+
39
+ 01:32.000 --> 01:38.000
40
+ since it satisfies asymptotic efficiency and non-asymptotic near-optimality.
41
+
42
+ 01:38.000 --> 01:45.000
43
+ The interesting question in our paper is whether this plug-in approach
44
+
45
+ 01:45.000 --> 01:50.000
46
+ still works in a non-standard setting, where a privacy constraint is imposed,
47
+
48
+ 01:50.000 --> 01:55.000
49
+ and more precisely, the local differential privacy setup.
50
+
51
+ 01:55.000 --> 02:06.000
52
+ This means that we impose a strong notion of privacy, where we do not have access to the original, sensitive data, the Xi.
53
+
54
+ 02:06.000 --> 02:12.000
55
+ Instead, we only have access to a privatized version of Xi.
56
+
57
+ 02:12.000 --> 02:22.000
58
+ Here is the representation of a simple mechanism that is not interactive.
59
+
60
+ 02:22.000 --> 02:30.000
61
+ The term local here reflects the fact that the mechanism Qi only sees the data point Xi.
62
+
63
+ 02:30.000 --> 02:38.000
64
+ In other words, there is no trusted third party that has access to all the sensitive data.
65
+
66
+ 02:38.000 --> 02:48.000
67
+ This is a simple non-interactive privacy mechanism, but of course, we are also interested in more sophisticated mechanisms,
68
+
69
+ 02:48.000 --> 02:55.000
70
+ in particular the sequentially interactive mechanism, where each Qi sees the previously released private data,
71
+
72
+ 02:55.000 --> 03:00.000
73
+ in addition to the data point Xi.
74
+
75
+ 03:00.000 --> 03:10.000
76
+ In this non-standard setting, we return to the original problem of estimating the power sum functional,
77
+
78
+ 03:10.000 --> 03:15.000
79
+ where we only have access to privatized versions of X1 up to Xn.
80
+
81
+ 03:15.000 --> 03:26.000
82
+ Our first contribution is to give a tight and non-asymptotic characterization of the error of the plug-in estimator of the power sum.
83
+
84
+ 03:26.000 --> 03:33.000
85
+ This result shows that the plug-in estimator of the power sum is not optimal.
86
+
87
+ 03:33.000 --> 03:41.000
88
+ This contrasts with the performance of the power sum plug-in estimator in the standard statistical problem.
89
+
90
+ 03:41.000 --> 03:50.000
91
+ The message here is that good estimators in the standard setup are not always good estimators in the local privacy setup.
92
+
93
+ 03:50.000 --> 04:00.000
94
+ Our second contribution is a correction of the plug-in estimator through a careful truncation of the small probabilities.
95
+
96
+ 04:00.000 --> 04:06.000
97
+ This correction leads to a significant reduction of the error risk.
98
+
99
+ 04:06.000 --> 04:13.000
100
+ In particular, the risk becomes independent of the alphabet size K when K is large.
101
+
102
+ 04:13.000 --> 04:22.000
103
+ This second contribution, however, relies on a simple non-interactive privacy mechanism.
104
+
105
+ 04:22.000 --> 04:29.000
106
+ In the second part of the paper, we consider a more sophisticated sequentially interactive mechanism,
107
+
108
+ 04:29.000 --> 04:40.000
109
+ for which we build a two-step procedure that allows us to reduce the risk by a logarithmic factor.
110
+
111
+ 04:40.000 --> 04:45.000
112
+ Finally, at the end of the paper, we provide a universal lower bound on the error risk
113
+
114
+ 04:45.000 --> 04:51.000
115
+ with respect to all estimators and all non-interactive and sequentially interactive mechanisms.
116
+
117
+ 04:51.000 --> 04:56.000
118
+ Unfortunately, this lower bound is matching only in certain cases,
119
+
120
+ 04:56.000 --> 05:02.000
121
+ which leaves us with some very important open questions about this problem.
122
+
123
+ 05:02.000 --> 05:10.000
124
+ I think that this first work on functional estimation in the context of local privacy
125
+
126
+ 05:10.000 --> 05:14.000
127
+ gives you at least three key takeaways.
128
+
129
+ 05:14.000 --> 05:23.000
130
+ The first takeaway is the need to design statistical procedures carefully for the local privacy setting,
131
+
132
+ 05:23.000 --> 05:31.000
133
+ since this is a setup where a good estimator in the standard framework does not necessarily work.
134
+
135
+ 05:31.000 --> 05:38.000
136
+ The second takeaway is that the plug-in type approach analyzed in this paper
137
+
138
+ 05:38.000 --> 05:43.000
139
+ serves as a benchmark for future work and more sophisticated procedures.
140
+
141
+ 05:43.000 --> 05:51.000
142
+ And the last takeaway is that our analysis of the plug-in type approach and of non-interactive mechanisms
143
+
144
+ 05:51.000 --> 05:56.000
145
+ reveals regimes where the estimation problem is hard,
146
+
147
+ 05:56.000 --> 06:01.000
148
+ and we hope this encourages people to bring new developments here.
149
+
150
+ 06:01.000 --> 06:08.000
151
+ Thank you all, and for more details, please check our paper online.
152
+
153
+ 06:08.000 --> 06:22.000
154
+ Bye!
155
+