ScientiaEtVeritas committed on
Commit e4b6f2e
1 Parent(s): 317a988

txt transcripts

app.py CHANGED
@@ -1,322 +1,322 @@
import itertools
import json
import re
from functools import partial
from pathlib import Path

import pandas as pd
import requests
import streamlit as st
import webvtt
from transformers import AutoTokenizer

from generate_text_api import TextGenerator
from model_inferences.utils.chunking import Truncater
from model_inferences.utils.files import get_captions_from_vtt, get_transcript

USE_PARAGRAPHING_MODEL = True

def get_sublist_by_flattened_index(A, i):
    """Return the sublist of A that contains flat index i, together with its position in A."""
    current_index = 0
    # enumerate() instead of A.index(sublist): index() returns the first equal
    # sublist, which is wrong when two segments happen to have identical contents.
    for sublist_index, sublist in enumerate(A):
        sublist_length = len(sublist)
        if current_index <= i < current_index + sublist_length:
            return sublist, sublist_index
        current_index += sublist_length
    return None, None

def get_talk_metadata(video_id):
    url = "https://www.ted.com/graphql"

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "x-operation-name": "Transcript",
    }

    data = {
        "query": """
            query GetTalk($videoId: ID!) {
                video(id: $videoId) {
                    title,
                    presenterDisplayName,
                    nativeDownloads {medium}
                }
            }
        """,
        "variables": {
            "videoId": video_id,
        },
    }

    response = requests.post(url, json=data, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")

class OfflineTextSegmenterClient:
    def __init__(self, host_url):
        self.host_url = host_url.rstrip("/") + "/segment"

    def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
        payload = {
            "text": text,
            "captions": captions,
            "generate_titles": generate_titles,
            "prefix_titles": True,
            "threshold": threshold,
        }

        headers = {"Content-Type": "application/json"}

        response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
        # segments = response["annotated_segments"] if "annotated_segments" in response else response["segments"]
        return {"segments": response["segments"], "titles": response["titles"], "sentences": response["sentences"]}

class Toc:

    def __init__(self):
        self._items = []
        self._placeholder = None

    def title(self, text):
        self._markdown(text, "h1")

    def header(self, text):
        self._markdown(text, "h2", " " * 2)

    def subheader(self, text):
        self._markdown(text, "h3", " " * 4)

    def placeholder(self, sidebar=False):
        self._placeholder = st.sidebar.empty() if sidebar else st.empty()

    def generate(self):
        if self._placeholder:
            self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)

    def _markdown(self, text, level, space=""):
        key = re.sub(r"[^\w-]", "", text.replace(" ", "-").replace("'", "-").lower())
        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
        self._items.append(f"{space}* <a href='#{key}'>{text}</a>")

# custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
# st.write(custom_css, unsafe_allow_html=True)

def concat_prompt(prompt_text, text, model_name):
    if "flan" in model_name:
        input_ = prompt_text + "\n\n" + text
    elif "galactica" in model_name:
        input_ = text + "\n\n" + prompt_text
    return input_

endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf"}

client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
if USE_PARAGRAPHING_MODEL:
    paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
summarizer = TextGenerator(endpoint)

tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)

# TLDR prompt (Llama-2 chat format)

SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."

TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""

TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"

TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)

# Bullet-point prompt

BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""

BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"

# Fixed: this previously reused TLDR_USER_PROMPT, so the bullet-point prompt
# asked the model for a one-sentence summary.
BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=BP_USER_PROMPT)
BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)

CONTEXT_LENGTH = 3072
MAX_SUMMARY_LENGTH = 1024
TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1

temperature = 0.7

def replace_newlines(text):
    return re.sub(r"\n+", r"\n\n", text)

def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
    all_generated_text = prefix
    truncater = Truncater(tokenizer, max_length=max_input_length)
    input_ = truncater(input_)
    input_ = prompt.format(input=input_)
    for generated_text in summarizer.generate_text_stream(input_, max_new_tokens=MAX_SUMMARY_LENGTH, do_sample=True, temperature=temperature):
        all_generated_text += replace_newlines(generated_text)
        generated_text_box.info(all_generated_text)
    print(all_generated_text)
    return all_generated_text.strip()

st.header("Demo: Intelligent Recap")

# Load all demo documents once per process; st.global_state survives Streamlit reruns.
if not hasattr(st, "global_state"):
    st.global_state = {"NIPS 2021 Talks": None, "TED Talks": None}

    # NIPS 2021 talks: titles come from metadata.json next to each transcript.
    transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
    transcripts_map = {}
    for transcript_file in transcript_files:
        base_path = transcript_file.parent
        metadata = base_path / "metadata.json"
        txt_file = base_path / "transcript_whisper_large-v2.txt"
        with open(metadata) as f:
            metadata = json.load(f)
        title = metadata["title"]
        transcript = get_transcript(txt_file)
        captions = get_captions_from_vtt(transcript_file)
        transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
    st.global_state["NIPS 2021 Talks"] = transcripts_map

    # TED talks: metadata and video URLs come from the TED GraphQL API.
    data = pd.read_json("demo_data/ted_talks.json")
    video_ids = data.talk_id.tolist()
    transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
    transcripts_map = {}
    for video_id, transcript in zip(video_ids, transcripts):
        metadata = get_talk_metadata(video_id)
        title = metadata["data"]["video"]["title"]
        presenter = metadata["data"]["video"]["presenterDisplayName"]
        if metadata["data"]["video"]["nativeDownloads"] is None:
            continue
        video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
        transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
    st.global_state["TED Talks"] = transcripts_map

    # KIT lectures, sorted by lecture id.
    def get_lecture_id(path):
        return int(path.parts[-2].split("-")[1])

    transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
    sorted_path_list = sorted(transcript_files, key=get_lecture_id)

    transcripts_map = {}
    for transcript_file in sorted_path_list:
        base_path = transcript_file.parent
        lecture_id = base_path.parts[-1]
        transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
        video_path = Path(base_path, "video.mp4")
        transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
    st.global_state["KIT Lectures"] = transcripts_map

type_of_document = st.selectbox("What kind of document do you want to test it on?", list(st.global_state.keys()))

transcripts_map = st.global_state[type_of_document]

selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))

st.video(str(transcripts_map[selected_talk]["video"]), format="video/mp4", start_time=0)

input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]["transcript"], height=300)

toc = Toc()

summarization_todos = []

with st.expander("Adjust Thresholds"):
    threshold = st.slider("Chapter Segmentation Threshold", 0.00, 1.00, value=0.4, step=0.05)
    paragraphing_threshold = st.slider("Paragraphing Threshold", 0.00, 1.00, value=0.5, step=0.05)

if st.button("Process Transcript"):
    with st.sidebar:
        st.header("Table of Contents")
        toc.placeholder()

    st.header(selected_talk, divider="rainbow")
    # if 'presenter' in transcripts_map[selected_talk]:
    #     st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")

    captions = transcripts_map[selected_talk].get("captions")
    result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
    if USE_PARAGRAPHING_MODEL:
        presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
        paragraphs = presult["segments"]
    segments, titles, sentences = result["segments"], result["titles"], result["sentences"]

    if USE_PARAGRAPHING_MODEL:
        prev_chapter_idx = 0
        prev_paragraph_idx = 0
        segment = []
        for i, sentence in enumerate(sentences):
            chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
            paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)

            # Chapter boundary (with or without a simultaneous paragraph break):
            # flush the buffered text, emit its heading, and queue a summary.
            if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
                segment_text = " ".join(segment)
                toc.subheader(titles[prev_chapter_idx])
                if len(segment_text) > 1200:
                    generated_text_box = st.info("")
                    summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
                elif len(segment_text) > 450:
                    generated_text_box = st.info("")
                    summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
                st.write(segment_text)
                segment = []
            # Paragraph boundary within the same chapter: just insert a blank line.
            elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
                segment.append("\n\n")

            segment.append(sentence)

            prev_chapter_idx = chapter_idx
            prev_paragraph_idx = paragraph_idx

        # Flush the final chapter.
        segment_text = " ".join(segment)
        toc.subheader(titles[prev_chapter_idx])
        if len(segment_text) > 1200:
            generated_text_box = st.info("")
            summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
        elif len(segment_text) > 450:
            generated_text_box = st.info("")
            summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
        st.write(segment_text)

    else:
        segments = [" ".join(segment) for segment in segments]
        for title, segment in zip(titles, segments):
            toc.subheader(title)
            if len(segment) > 1200:
                generated_text_box = st.info("")
                summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
            elif len(segment) > 450:
                generated_text_box = st.info("")
                summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
            st.write(segment)
    toc.generate()

    # Run the queued summarizations only after the full page has rendered.
    for summarization_todo in summarization_todos:
        summarization_todo()
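As a quick sanity check of the flattened-index helper defined at the top of app.py (the segment values here are made up for illustration):

# Three "chapters" holding five sentences in total.
segments = [["s0", "s1"], ["s2"], ["s3", "s4"]]
# Flat sentence index 2 falls in the second sublist, at chapter position 1.
assert get_sublist_by_flattened_index(segments, 2) == (["s2"], 1)
assert get_sublist_by_flattened_index(segments, 4) == (["s3", "s4"], 2)
# Out-of-range indices fall through to (None, None).
assert get_sublist_by_flattened_index(segments, 9) == (None, None)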
demo_data/nips-2021/25953/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,193 @@
Hi everyone, my name is Ziv Goldfeld, and this is joint work with Kristjan Greenewald about sliced mutual information, which is a new measure of statistical dependence that has some nice scalability properties in high-dimensional settings.

And to get started, I think we're all familiar with classic mutual information, which is defined between, let's say, continuous high-dimensional random variables (the regime that we'll mostly be interested in) like so: basically, the KL divergence between the joint distribution and the product of the marginals.
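In symbols, for joint law $P_{XY}$ and marginals $P_X$, $P_Y$ (a standard statement of what was just said):

\[
I(X;Y) \;=\; D_{\mathrm{KL}}\big(P_{XY} \,\big\|\, P_X \otimes P_Y\big).
\]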
And mutual information is indeed this fundamental measure of dependence that enjoys many good properties, such as the fact that it vanishes if and only if our random variables are independent, it is invariant to bijections, and it admits several useful representations, decompositions, variational forms, etc. In fact, it can even be obtained axiomatically as the unique functional of the joint distribution that satisfies some natural informativeness conditions. As such, mutual information has seen a variety of applications in information theory and statistics, and more recently in machine learning.

But the problem is that all this nice structure comes with a hefty price, since computing mutual information in high dimensions, or estimating it from samples, is very, very hard, effectively infeasible. This is the so-called curse of dimensionality, and it is the problem that we try to tackle in this work.

To address this difficulty, what we propose is sliced mutual information, which is, like I said, a new measure of statistical dependence, not necessarily a proxy for mutual information as such, but rather an alternative notion, which is defined as an average of scalar mutual information terms between projections of our high-dimensional variables onto randomly chosen directions from the corresponding unit spheres. It is, of course, inspired by the recent popularization of slicing techniques for statistical divergences; the sliced Wasserstein distance is a great example.

The way it works for sliced mutual information is roughly so. Let's say that this is our first high-dimensional variable X and this is its distribution. What you do is draw a projection direction uniformly from the sphere. You then project this random variable onto that direction, and do the same for your other random variable. Now, for these two projected scalar variables, we just compute the mutual information between them and average everything over the choice of direction. So that's basically the definition.
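Written out, the definition just described is (with $d_x$, $d_y$ the ambient dimensions):

\[
\mathrm{SI}(X;Y) \;=\; \mathbb{E}_{\theta,\phi}\big[\, I(\theta^{\top}X;\ \phi^{\top}Y) \,\big],
\qquad \theta \sim \mathrm{Unif}(\mathbb{S}^{d_x-1}),\ \ \phi \sim \mathrm{Unif}(\mathbb{S}^{d_y-1}).
\]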
And with that, the goal of this work is effectively to show that sliced mutual information is both a meaningful and a scalable mutual information alternative: meaningful in the sense that it preserves many of the desired properties that make mutual information appealing to begin with, and scalable in the sense that it alleviates this set of computational and statistical difficulties.

All right. To address the first point, let me show you that, despite those one-dimensional projections, sliced mutual information indeed inherits many of the properties of classic mutual information. So we have, of course, non-negativity, but furthermore, identification of independence. We have an entropy decomposition for an appropriate definition of sliced entropy. We can represent it as a KL divergence, a sliced KL divergence to be more precise. We have a chain rule, tensorization for independent copies, as well as a Donsker-Varadhan-like variational form that can be readily used for neural estimation of sliced mutual information; we actually make use of that in some of our empirical results. You are more than welcome to check the paper or visit us at the poster if you want to know more about any of these. But really, the upshot here is that much of the classic structure is still there after the slicing.

Now, another interesting feature of sliced mutual information comes to light when you think of it in the context of the famous data processing inequality. For starters, recall that classic mutual information satisfies the DPI, which in particular means that if you process either of your random variables with a deterministic function, say this f over here, you can only lose informativeness in the classic sense. Sliced mutual information plays differently with processing, and can in some sense benefit from nice transformations that, let's say, give rise to a nicer manifold for your random variable. To understand this, keep in mind that sliced mutual information only looks at projections of the random variables, and it may very well be the case that some transformation of x has more informative projections about y than x itself.

Here's a simple example to that effect. Consider a two-dimensional isotropic Gaussian x, so two coordinates, x1 and x2, and let's take y to be, for example, its first coordinate. Now, if you look at the mutual information between fixed projections of x and y, well, projection does nothing to y, right, because it's a scalar, but it does affect x. And if you look at the mutual information between the projections of x and y, you quickly realize that x1 really plays the role of the signal here, whereas x2 behaves like noise. Therefore, any transformation that effectively improves your signal-to-noise ratio, for example this g sub a over here, where a is less than 1, will indeed give rise to a higher sliced mutual information value.

So, all in all, sliced mutual information can be increased by processing, which means, in particular, that it violates the data processing inequality and is different from classic mutual information in that sense. But interestingly, and as I will show you shortly, this is actually quite a useful thing to have, for example for feature extraction tasks, because we can effectively maximize sliced mutual information in order to extract informative features and land on those nicer manifolds that I mentioned a moment ago.

Here's an example theorem that makes this statement precise, where we consider the maximization of sliced mutual information over linear transformations of our random variables. This would, of course, not affect classic mutual information at all. But what we can show is that for sliced mutual information, this maximization ends up extracting the two most informative projection directions for you, which in particular will be encoded in the optimizing matrices, these A sub x star and A sub y star. Of course, there's nothing special about this particular setup, and we can establish similar results for, first of all, rank-constrained matrices, which, as opposed to what's shown here, would extract, let's say, the r most informative features or projection directions. In the paper, we also extend this result to shallow neural networks, and in fact our argument can be easily extended to cover additional nonlinear cases as well.

OK, so that's pretty much it for structural properties. But like I said at the beginning, the real promise of this framework is overcoming the curse of dimensionality. Let me show you that this is indeed the case: sliced mutual information can be estimated in a scalable manner, effectively by combining your favorite scalar mutual information estimator with a simple Monte Carlo averaging step.

And this is how it works. Let's say we're given n IID samples from our high-dimensional random variables, and we're further given a scalar mutual information estimator that achieves, say, error delta of n when applied to n IID samples of some pair of one-dimensional variables a and b. OK, so let's say we have these. Now, to estimate sliced mutual information, the first thing to do is sample, let's say, m random projections from the corresponding spheres in an IID fashion, at which point we take our n high-dimensional samples and project them onto each of the m random directions that we've generated. The thing to observe here is that the resulting m-times-n data set of projections is nothing but IID samples from the corresponding projected distributions, which is the right thing to have here if what you're trying to estimate is sliced mutual information. Having that, per projection direction, we can apply the scalar mutual information estimator and then just take one big, happy Monte Carlo average of the entire thing over the different projection directions. And this gives rise to the proposed sliced mutual information estimator.
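A sketch in code may help make the estimator concrete. The following is a minimal illustration (not the authors' implementation): it assumes numpy and scikit-learn are available, and it uses scikit-learn's kNN-based mutual_info_regression as a stand-in for "your favorite scalar mutual information estimator".

import numpy as np
from sklearn.feature_selection import mutual_info_regression

def sliced_mutual_information(x, y, num_projections=100, seed=0):
    """Monte Carlo estimate of SMI between samples x of shape (n, dx) and y of shape (n, dy)."""
    rng = np.random.default_rng(seed)
    _, dx = x.shape
    _, dy = y.shape
    estimates = []
    for _ in range(num_projections):
        # Draw one direction uniformly from each unit sphere.
        theta = rng.standard_normal(dx)
        theta /= np.linalg.norm(theta)
        phi = rng.standard_normal(dy)
        phi /= np.linalg.norm(phi)
        # Project both samples to scalars and apply the 1D MI estimator.
        x_proj = (x @ theta).reshape(-1, 1)
        y_proj = y @ phi
        estimates.append(mutual_info_regression(x_proj, y_proj)[0])
    # Average over the m projection directions.
    return float(np.mean(estimates))

# Toy usage: x isotropic Gaussian in 10 dimensions, y its first coordinate.
x = np.random.default_rng(1).standard_normal((2000, 10))
y = x[:, [0]]
print(sliced_mutual_information(x, y, num_projections=50))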
Now, you can compute this thing very easily, because at the end of the day it's an average of scalar mutual information estimates. As for performance guarantees, we can show that, so long as the per-slice mutual information is bounded, the uniform absolute error of this estimator scales like 1 over the root of m, the number of our Monte Carlo samples, plus the error of the scalar mutual information estimator. I'm just restating this informally over here.
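Informally, with $m$ Monte Carlo directions and a scalar estimator with error $\delta(n)$, the guarantee just stated reads:

\[
\big|\widehat{\mathrm{SI}}_{m,n} - \mathrm{SI}(X;Y)\big| \;=\; O\!\big(m^{-1/2} + \delta(n)\big),
\]

uniformly over distributions whose per-slice mutual information is bounded.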
What this shows, all in all, is that sliced mutual information can be estimated at the rate of the scalar mutual information estimation problem, plus this m-to-the-minus-half Monte Carlo penalty. And the thing is that, under appropriate smoothness assumptions, the one-dimensional rate is in fact parametric. Therefore, if you just match the size of your data set and the number of Monte Carlo samples, that is, equate n and m, the sliced mutual information between high-dimensional variables can be estimated at the parametric n-to-the-minus-half rate, perhaps up to some logarithmic factors. This is, of course, a significant speedup, and it stands in sharp contrast to the slow, exponentially-bad-in-dimension, curse-of-dimensionality rate for classic mutual information.

Now, this scalability makes running empirical experiments with sliced mutual information quite a breeze, so let me quickly show you some proof-of-concept experiments. The first one just relies on the fact that SMI, sliced mutual information, can identify independence, and therefore we examine it as a figure of merit for independence testing, basically by thresholding the computed sliced mutual information value.
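As a sketch (reusing the sliced_mutual_information function from the code block earlier; the threshold value is a hypothetical tuning parameter, not one from the talk):

# Declare x and y dependent when the estimated SMI clears a threshold tau;
# tau would be calibrated, e.g., on permuted (independent) copies of the data.
def smi_independence_test(x, y, tau=0.05):
    return sliced_mutual_information(x, y) > tau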
As for the results we have obtained, of course, we've compared them with the same test but based on classic mutual information. The figure over here shows, for a bunch of different settings, the area under the ROC curve as a function of the number of samples, the standard way to represent the quality of an independence test; you basically want this number to be 1, which corresponds to an omniscient test. What we observe is that sliced mutual information performs consistently well across different setups and across different dimensions, whereas the performance of the classic mutual-information-based test quickly degrades as dimension grows.

On top of that, let me also demonstrate how sliced mutual information can be used for feature extraction. Here, what we want to do is maximize the sliced mutual information between linear transformations of x and y, which are now chosen to be IID samples from the same MNIST class, which we restrict to be either 0 or 1; the choice of class is also random, basically just a fair coin flip. By observing that the sliced mutual information between x and y is at most 1 bit (it is always upper bounded by mutual information, which equals a single bit in this case, basically the class label), the way to understand what we're doing here is that we're looking for the linear feature that is most informative for classifying, or determining, this class label.

And interestingly enough, this is what this procedure ends up learning. The figure shows basically the first two rows of the optimal A matrix that we obtained, rearranged in the dimensions of an MNIST image. This really looks like a matched filter, if you're familiar, which, when applied to the samples, would indeed be able to tell you whether the sample came from the 0 class or not. As for the value itself, the maximized sliced mutual information ends up being roughly 0.7, which is quite close to the 1-bit upper bound, and is much, much larger than what you would get if you did not learn A and, let's say, just instantiated it as a matrix with IID entries drawn according to some distribution. This is just to say that something meaningful is indeed being learned here, and something meaningful indeed happens when you maximize sliced mutual information as your optimization objective.

OK, so that's basically it. Just to recap: we introduced sliced mutual information, which is this average of scalar mutual information terms between one-dimensional projections. We've seen that it preserves much of the structure of classic mutual information, that it can be efficiently computed and estimated from samples, and that it can, in fact, be increased by processing, if your processing gives rise to more informative projections. We've presented some proof-of-concept applications to independence testing and feature extraction, and we have a couple more in the paper. But let me say this: while this is mostly theoretical work, and a large-scale empirical exploration is beyond its scope, we firmly believe that sliced mutual information will be extremely useful for various such tasks, and we are very excited to look into this in the future. And with that, I'll stop. Thank you for listening, and do visit us at the poster, and check out the paper if you would like to know more.
demo_data/nips-2021/25957/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,179 @@
Hi, I'm Hugo Richard, a third-year PhD student at Université Paris-Saclay. I'm in the Inria Parietal team, and my supervisor is Bertrand Thirion. Today I'll talk about shared independent component analysis for multi-subject neuroimaging. This is joint work with Pierre Ablin, Alexandre Gramfort, Bertrand Thirion, and Aapo Hyvärinen.

First, let us consider two sources that are emitting a signal that is recorded by two sensors. This can be seen as a simplified model of magnetoencephalography, where brain sources are recorded by magnetometers. Because propagation time can be neglected, the signal recorded by the sensors can be seen as a linear mixture of the signals emitted by the sources: S is a set of sources that are assumed to be independent, X are the recordings, and A describes how the sources are mixed to produce the recordings. At first sight this model may seem ill-defined, because if we permute two columns in A and permute the corresponding sources in S, we get a new set of sources S' and a new mixing matrix A' that describe X just as well as A and S. Similarly, if we scale one column of A by some constant and the corresponding source by the inverse of that constant, we also get an equivalent description of X. However, these scale and permutation indeterminacies are the only ones, provided the sources contain at most one Gaussian component.

Let us consider the more general problem where you have multiple subjects that are exposed to the same stimuli. We have two subjects, X1 and X2; they have different mixing matrices, A1 and A2, and different noise levels, N1 and N2. The interpretation is that they have shared sources because they share cognitive processes, different mixing matrices because they have different spatial topographies, and different noises because we want to model inter-subject variability. This model is called group ICA.
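In symbols, the group ICA model just described reads, for subject $i$:

\[
x_i = A_i s + n_i,
\]

with shared independent sources $s$, subject-specific mixing matrices $A_i$, and subject-specific noise $n_i$.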
There are many methods to provide a solution to the group ICA problem. A very popular one, introduced by Calhoun in 2001, is to stack the data of all subjects feature-wise and then perform a PCA, a principal component analysis, on the stacked data; you thereby obtain reduced data, and you apply independent component analysis on the reduced data to obtain a set of sources. Another formulation, introduced by Varoquaux in 2010, is called CanICA: you just replace the principal component analysis with a multiset CCA, a multiset canonical correlation analysis, where you have to solve a generalized eigenvalue problem. There are many different formulations of multiset CCA, but this one, with a generalized eigenvalue problem, is the fastest to solve. CanICA and ConcatICA have a lot of advantages: first, they are very fast to fit, and second, they are simple to implement. These are the two reasons why they are so popular in neuroimaging. However, they do not optimize a proper likelihood, and therefore they do not benefit from the advantages of such estimators, such as asymptotic efficiency.

There is a lot of other related work that does optimize a proper likelihood. I want to mention independent vector analysis, a very powerful framework introduced by Li in 2008; the unified approach of Guo in 2008, which we will also mention and talk about later; the approach of Shen in 2015, which also allows one to perform dimension reduction; and multi-view ICA, which was introduced by our team last year.

I want to quickly point out that it's not obvious how to design a likelihood-based approach that is tractable. With this example of the Gaussian mixture noisy ICA of Bermond and Cardoso, we'll see that the standard approach leads to intractable algorithms. The model we take here is the same as group ICA, but we assume that the noise is Gaussian with the same variance for all subjects. We also assume that the sources follow a Gaussian mixture model, and we further assume that the weights of the Gaussian mixtures are known. We can solve such a model via expectation maximization, but if we write the E-step, we get a closed form that involves a very large sum. Because of the size of this sum, the EM algorithm is intractable whenever Q and K are large.

Our contribution is Shared ICA, ShICA for short, where the data of subject i are modeled as a linear mixture of noisy sources; the noise here is not on the sensors but on the sources. The noise is Gaussian, with a variance that can be different for each subject and different for each component. The sources S are assumed to be independent but, in contrast to almost all existing work, some components can be Gaussian.
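A sketch of the ShICA model in symbols (with the diagonal matrix $\Sigma_i$ collecting the per-subject, per-component noise variances):

\[
x_i = A_i\,(s + n_i), \qquad n_i \sim \mathcal{N}(0, \Sigma_i).
\]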
We have a few blanket assumptions: the data are centered, the mixing matrices are invertible, the sources have identical variance, and the number of subjects is greater than 3.

We have two algorithms to solve the ShICA model: ShICA-J, a fast algorithm based on multiset CCA, and ShICA-ML, a maximum likelihood approach. In ShICA, there are two ways to recover the parameters. Either the sources are non-Gaussian, in which case we can use classical ICA results to recover the unmixing matrices; or, when the components are Gaussian, we need something else, and what we use here is noise diversity: when the noise is sufficiently diverse, it is possible to recover the unmixing matrix and the noise covariance up to permutation and sign indeterminacies. Note that noise diversity in the Gaussian components is also a necessary condition: if it does not hold, then ShICA cannot be identified.

Let us now focus on the theorem at the core of the ShICA-J algorithm; namely, it shows that we can solve group ICA with multiset CCA. Assume the data follow the ShICA model, and consider multiset CCA framed as a generalized eigenvalue problem. This generalized eigenvalue problem relies on two matrices, C and D: C is formed by second-order statistics, and D is formed by the diagonal blocks of C. If we solve this eigenvalue problem and take the k first leading eigenvectors, we can recover the correct unmixing matrix from them, up to a permutation and a scaling. And this can only be done if the k first eigenvalues are distinct.
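As a rough code illustration of this construction (a minimal sketch under the description above, not the authors' implementation; it assumes numpy and scipy, per-subject data arrays of shape (p, n), empirical cross-covariances for the blocks of C, and D positive definite):

import numpy as np
from scipy.linalg import eigh

def multiset_cca(xs, k):
    m, (p, n) = len(xs), xs[0].shape
    covs = [[xi @ xj.T / n for xj in xs] for xi in xs]
    C = np.block(covs)                 # all second-order statistics
    D = np.zeros_like(C)
    for i in range(m):                 # D keeps only the diagonal blocks of C
        D[i * p:(i + 1) * p, i * p:(i + 1) * p] = covs[i][i]
    # Solve C u = lambda D u; eigh returns eigenvalues in ascending order.
    _, eigvecs = eigh(C, D)
    leading = eigvecs[:, ::-1][:, :k]  # the k leading generalized eigenvectors
    # Block i, transposed, gives subject i's unmixing matrix,
    # up to the permutation/scaling indeterminacies discussed above.
    return [leading[i * p:(i + 1) * p].T for i in range(m)]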
Note that the distinct-eigenvalue condition is also necessary: if two eigenvalues are the same, an additional indeterminacy appears, and we cannot solve group ICA. Note also that the condition that some eigenvalues need to be distinct is stronger than the noise diversity condition in the identifiability theorem, and we can therefore exhibit an example that is identifiable but on which multiset CCA fails; I refer you to the paper for more details on this.

In our theorem, in order to recover the correct unmixing matrix, we need access to the second-order statistics. In practice, however, we only have access to them up to some sampling noise, and because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in the second-order statistics can lead to a large deviation in the recovered unmixing matrix. To show this in practice, we take three subjects, two components, and noise covariance matrices with two values, lambda1 and lambda2, separated by an eigengap epsilon. We compare the solution of multiset CCA on the true covariance matrices and on perturbed covariance matrices, where the perturbation scale is given by delta. For different values of epsilon (10^-4, 10^-3, 10^-2, 10^-1), we show how the performance of the algorithm, measured by the Amari distance between the true unmixing matrix and the estimated unmixing matrix, varies as the perturbation scale increases. We see that when the eigengap is very small, 10^-4, the violet curve, then even a very small perturbation yields a very bad Amari distance; the black dashed curve is the performance of chance.

Luckily, there is a large gap between the k-th eigenvalue and the (k+1)-th. This means that, in practice, the span of the k leading eigenvectors is approximately preserved, and we can recover the true unmixing matrix from the one estimated by multiset CCA just by multiplying by a matrix Q. In order to estimate Q, we make use of the fact that the unmixed data should have a diagonal covariance; this leads to a joint diagonalization problem that we can solve efficiently. If we take the experiments from the previous slide, whose results are shown again here, you can see the violet curve that is very sensitive to perturbations; when we apply joint diagonalization, all these curves move and join the dashed curve at the bottom. This is much better, because the new curves, represented by the dashed lines, are far less sensitive to perturbations.

At this point, we've obtained the correct unmixing matrix, but only up to a scaling, so we need an additional step to find the correct scaling, and another one to find the remaining unestimated parameters, the noise covariances. Luckily, the noise covariances are very easy to find: we can do this via an EM algorithm whose E-step and M-step are in closed form, and this yields a very fast algorithm.

But ShICA-J is not a maximum likelihood estimator, so let us now focus on ShICA-ML, our maximum likelihood estimator. I won't go into too much detail on this, but we optimize the likelihood via an EM algorithm, using a Gaussian mixture assumption on the sources, where we assume that the weights are known. What I want to showcase here is that the E-step of the algorithm, the one that gives you the expectation and the variance of the sources given the data, only involves a sum of size two. Previously we had a sum with an exponential number of terms; here we don't have that anymore, so the E-step is much faster, and the EM algorithm is now tractable, whereas it was not before.

I first want to present our synthetic experiments, where we generate data according to the ShICA-ML and ShICA-J models. In case A, we have only Gaussian components but we have noise diversity, and therefore methods that use noise diversity to recover the sources, such as ShICA-ML and ShICA-J, perform best. In the second case, we have only non-Gaussian components and no noise diversity, so methods that use non-Gaussianity, such as CanICA, ShICA-ML, or MultiView ICA, perform well. In the last case, half of the components are Gaussian with noise diversity, and the other half are non-Gaussian but without noise diversity. In this case, only ShICA-ML is able to correctly recover the sources; MultiView ICA comes closest, but it's not as good as ShICA-ML.

Let us now talk about our experiments on real data. We have a reconstruction experiment on fMRI data, where subjects are exposed to naturalistic stimuli such as movie watching. We use 80% of the movie to learn the unmixing matrices of all subjects; then, on the remaining 20% of the movie, we compute the common sources, and from these common sources, computed using 80% of the subjects, we try to reconstruct the data of the remaining 20% of the subjects. We compute the R2 score within regions of interest between the reconstructed data and the true data, and plot it as a function of the number of components used. As we see, ShICA-ML outperforms all the other methods.

As a take-home message, ShICA is a powerful framework to extract shared sources. ShICA-J is a fast approach to fit the model, but it only uses second-order information. In contrast, ShICA-ML is a bit slower but is able to use non-Gaussianity in addition to second-order information; in practice, ShICA-ML yields the best results. The methods we've introduced work on reduced data, so it would be interesting to know how to reduce the data so that they perform optimally. Another way to improve our results would be to learn the density of the shared sources in ShICA-ML instead of keeping it fixed. Thanks for listening, and have a good day!
demo_data/nips-2021/25958/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,124 @@
Hello everyone, I'm Luigi Carratino, and this is joint work with Stefano Vigogna, Daniele Calandriello, and Lorenzo Rosasco. The problem that we study in this work is a standard regression problem, where we want to estimate an unknown function f* given n pairs of points, x's and y's, where the y's are noisy evaluations of the function f* at the input points x's.

A well-established method to learn nonlinear functions is kernel ridge regression. The basic idea is to map the input points into a higher-dimensional space, where linear relationships can be learned that then translate into nonlinear ones in the input space. To formalize this, we can think about solving a standard empirical risk minimization problem, regularized over a space of functions which is a reproducing kernel Hilbert space. Numerically speaking, the solution of this type of problem boils down to solving a linear system. In particular, we can see here that the linear system is going to be Kc = y, where K is the kernel matrix evaluated at all pairs of points of the training set, c are the weights that we aim to learn, and y are the output points.
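For reference, the regularized problem and its linear system with the ridge term written explicitly (the slide's $Kc = y$ absorbs the regularization into $K$):

\[
\hat f = \operatorname*{arg\,min}_{f \in \mathcal{H}} \frac{1}{n}\sum_{i=1}^{n}\big(f(x_i)-y_i\big)^2 + \lambda\,\|f\|_{\mathcal{H}}^2
\quad\Longrightarrow\quad
(K + \lambda n I)\,c = y, \qquad \hat f(x) = \sum_{i=1}^{n} c_i\,K(x, x_i).
\]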
16
+ We know that this method is optimal from a statistical point of view, but a drawback
17
+ is that it suffers from computational scalability. In fact, in terms of time complexity, if we
18
+ have n training points and we want to solve the linear system directly, we'll have to
19
+ invert the matrix K, and this will cost us n cubed in time.
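For concreteness, here is a minimal sketch of kernel ridge regression solved by direct inversion; the Gaussian kernel and the explicit ridge term lam are illustrative choices (the talk folds regularization into the system Kc = y).

import numpy as np

def krr_fit(X, y, lam=1e-3, gamma=1.0):
    # Gaussian kernel matrix evaluated on all pairs of training points
    sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    K = np.exp(-gamma * sq_dists)
    # direct solve of the regularized linear system: O(n^3) time
    c = np.linalg.solve(K + lam * np.eye(len(X)), y)
    return c, K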
20
+ Multiple ways of accelerating this process have been proposed over time.
21
+ The first one is to solve the methods iteratively instead of inverting directly the matrix K.
22
+ This allows us to only have matrix vector multiplications, and so the overall cost of
23
+ an iterative method to solve this linear system is going to be Tn squared.
24
+ Another method is the one known as sketching, where we can see this as subsampling the linear
25
+ system, in particular subsampling columns of this linear system, where we can take m
26
+ columns of the linear system uniformly at random to get a smaller one, and the cost
27
+ of this will be m squared n.
28
+ Another method instead is splitting. This allows us to divide the main problem into
29
+ many, in this case Q, subproblems, each one that can be solved independently and so
30
+ potentially can be distributed. So we can have a cost which boils down to n over Q to
31
+ the power of 3.
32
+ Combinations of these methods have been proposed in the literature. In particular, if
33
+ we combine iterating and sketching, we can get a solver that can solve the problem in
34
+ a time complexity of Tmn.
35
+ If instead we combine sketching and splitting, we can get a solver that can be computed
36
+ in m squared times n over Q.
37
+ And in this work, we try to blend all these techniques to derive a new algorithm, which
38
+ we will call PARC, that can achieve a time complexity of Tm times n over Q to the power
39
+ of 2.
40
+ So as we just said, in this work, we propose a new large-scale kernel regression solver
41
+ that combines the computational benefits of iteration, sketching, and splitting.
42
+ Notice, though, that these are approximation techniques and they may come at the cost of
43
+ accuracy. But we are able to show that this new algorithm is able to preserve generalization
44
+ under suitable partitions.
45
+ Now also notice that instead of general splitting, we are going to need to focus on a
46
+ particular type, which is the partitions.
47
+ So we introduce a new principal partition scheme for kernel methods.
48
+ We now look at the difference between data splitting and space partitioning.
49
+ Given a set of points, the procedure of splitting takes groups of points at random and assigns
50
+ them to different splits or clusters.
51
+ In this picture, for example, we divide the points in four splits.
52
+ Partitioning instead divides the space in different cells, and then the points are implicitly
53
+ assigned to a particular cluster based on which cell they belong to.
54
+ Notice that with the splitting methods, we don't consider local information while we
55
+ perform the splitting, but we do when we perform partitioning.
56
+ Now, from this picture, the concept of partitioning a space seems pretty straightforward.
57
+ However, when you start considering high dimensional feature space, subtle problems can
58
+ appear.
59
+ So first, as a recap, remember that there are two important spaces to consider in our
60
+ regression problem.
61
+ The input space X with its input features, and the kernel space H,
62
+ which potentially has many more implicit features.
63
+ Traditionally, partition methods are applied directly to the input space.
64
+ For example, a classical approach is to select a subset of points as centroids and then
65
+ partition the space in cells by assigning each portion of the space to the closest centroid,
66
+ which is called a Voronoi partition.
67
+ Since we are in the input space, closest here is defined according to a simple Euclidean
68
+ distance.
69
+ However, remember that our target function and our whole regression does not happen
70
+ directly on the input data space, but rather on the data mapped in the feature space.
71
+ And after we apply our feature map to the data, the concept of closest and the partition
72
+ can radically change.
73
+ For example, here on the right, we choose a kernel space associated with a cosine similarity
74
+ and again plot how the centroids partition the input space, but this time we chose closest
75
+ according to the new cosine distance.
76
+ The resulting partition is very different from the Euclidean one as it captures the
77
+ non-linearity of the kernel function.
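A minimal sketch of the contrast just described, assuming the kernel k is given as a Python callable; the feature-space distance follows from ||phi(x) - phi(c)||^2 = k(x,x) - 2 k(x,c) + k(c,c).

import numpy as np

def assign_euclidean(X, C):
    # closest centroid in the input space
    d = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1)
    return d.argmin(axis=1)

def assign_kernel(X, C, k):
    # closest centroid in the feature space induced by the kernel k
    d = np.array([[k(x, x) - 2 * k(x, c) + k(c, c) for c in C] for x in X])
    return d.argmin(axis=1)

cosine_k = lambda a, b: a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)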
78
+ In the paper, we discuss how this difference can impact the regression and we identified
79
+ sufficient conditions that the partition should satisfy in order to guarantee good generalization
80
+ of the learning process.
81
+ Crucially, we will see that these guarantees depend not on how the input space is partitioned,
82
+ but rather how the feature space is partitioned.
83
+ As a consequence, for our PARC methods, we focus on choosing centroids solely using the
84
+ kernel version of the distance.
85
+ We are now ready to present in more detail how the PARC algorithm works.
86
+ First of all, PARC partitions the feature space into Q Voronoi cells and the first thing
87
+ to do is to identify the centroids in the feature space that allows us to describe the
88
+ Voronoi cells.
89
+ Then inside each Voronoi cell, we learn a local estimator using an iterated and sketched
90
+ version of kernel ridge regression.
91
+ And then at prediction time, when a new sample arrives, we can use the Q Voronoi centroids
92
+ to identify the cell into which the new sample falls.
93
+ We then use the local estimator corresponding to that Voronoi cell
94
+ to make the prediction.
95
+ The generalization error of standard kernel ridge regression without partitioning can
96
+ be upper bounded by two terms, a bias term and a variance term.
97
+ In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
98
+ term and a variance term.
99
+ But this time, these two terms are weighted and they are weighted by a certain quantity
100
+ that depends on an angle theta, which is the minimum angle between all the subspaces of
101
+ the partitions.
102
+ For example, when all the subspaces are orthogonal between each other, we recover the exact same
103
+ generalization error of standard kernel ridge regression.
104
+ But we are also able to show that for angles which are small enough, we are able to obtain
105
+ a generalization error which is of the same order of standard kernel ridge regression.
106
+ These theoretical results suggest how to construct a good partition.
107
+ So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
108
+ between the Voronoi cells.
109
+ In particular, we use the Schur complement to measure this orthogonality.
112
+ Given all these ingredients, we are now able to measure the computational complexity of
113
+ PARC, which has a time complexity that is the sum of two terms.
114
+ A first term, q squared n log n, which is the cost of computing the centroids with the
115
+ just mentioned procedure.
116
+ And a second term, Tm times n over Q to the power of 2, which is the cost of computing the most expensive
117
+ local estimator.
118
+ Empirically, we performed experiments on datasets of millions and billions of points,
119
+ and we compared with the currently fastest global kernel methods and with some other
120
+ splitting kernel methods.
121
+ We can see that PARC is the only method that manages to match the accuracy of the global
122
+ estimator.
123
+ Thank you all for your attention.
124
+ And please come to the poster with all your questions and for more details.
demo_data/nips-2021/25959/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,117 @@
1
+ Hello, my name is Pouya Bahshiban and I'm going to tell you about our paper titled
2
+ Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators
3
+ at MIWA, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim,
4
+ Kartika Hoja, Mojtaba Farmazi, Tourez Dale, Lake Richards and Erin Oji. A common assumption in
5
+ machine learning is that the train and test samples come from the same distribution.
6
+ While this is a reasonable assumption under most circumstances, it is intentionally violated in the
7
+ regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
8
+ perturbations that cause the input to be misclassified. In the case of white box attacks,
9
+ the model itself is transparent to the attacker and the attacker uses it to identify the possible
10
+ inputs that would lead to misclassifications. A famous example of this is the image of a panda
11
+ that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
12
+ gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
13
+ and unless the classifier is specifically trained to be robust against these attacks,
14
+ the attacks could completely break down the classifier's performance.
15
+ This issue becomes even more critical when we consider the vast usage of these machine learning
16
+ systems in our societies. For example, the possible security concerns that arise in face
17
+ recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
18
+ So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
19
+ feature learning function f that projects inputs x to latent space with feature space z
20
+ and a classifier that uses the latent code z to predict the correct class label y hat.
21
+ The perturbation function or the attack generates a perturbed sample x prime
22
+ within the epsilon neighborhood of the input x, which we're showing here as B of x and epsilon,
23
+ by maximizing the classification objective, the opposite of how we normally optimize the classifier's
24
+ parameters. Many methods have been proposed to defend the models against adversarial attacks.
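As a concrete instance of such a perturbation function, here is a minimal sketch of the classic one-step FGSM attack (illustrative only; the paper evaluates a range of stronger attacks).

import torch
import torch.nn.functional as F

def fgsm(model, x, y, eps):
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)  # the classification objective
    loss.backward()
    # step in the direction that increases the loss, staying in B(x, eps)
    return (x + eps * x.grad.sign()).detach()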
25
+ Two of these methods that have withstood the test of time so far are the adversarial training
26
+ by Alexander Modrianov, which proposes a defense method by solving a minimax optimization problem
27
+ that involves finding an adversarial input by maximizing the classification loss in the inner
28
+ loop followed by a classifier training to minimizing the classifier loss on these adversarial inputs.
29
+ This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
30
+ The adversarial training method essentially learns to separate the distributions of adversarial
31
+ examples belonging to different classes. The second method is the trades method by Zhang et al,
32
+ which proposes to push the decision boundary of the classifier away from the data.
33
+ Trades achieves this by introducing a regularization term to the original learning
34
+ objective for classification that penalizes the mismatch between the predicted label
35
+ for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
36
+ this procedure, where now the defense method learns to separate the distributions of clean examples
37
+ belonging to different classes while minimizing the loss of the classifier.
38
+ Our method is instead motivated by the theory of domain adaptation.
39
+ In domain adaptation, we have a classifier trained
46
+ for a source domain, but we want the classifier to also perform the same task on a related target
47
+ domain that we might not have enough data for, or for which the procedure for sampling from the
48
+ domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the
49
+ question of under what conditions can we adapt a classifier trained on the source domain for use
50
+ in the target domain. Here we consider the original clean distributions as the source domain and the
51
+ distribution of adversarial images generated from those images as the target domain. Although here
52
+ the target domain continuously evolves because the adversarial examples are based on the current
53
+ state of the model at each time step. And similar to the domain adaptation theory, our goal here
54
+ is to learn how to perform well on both source and target domains, meaning the natural and
55
+ adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
56
+ what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
57
+ feature learning function f that projects inputs x to latent space or feature space z and the
58
+ classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
59
+ and adversarial examples as input domains dx and d' x and their induced feature distributions
60
+ which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
61
+ as the classification error over the domains dz and d' z, what we are going to refer to as the
62
+ clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bond
63
+ on the adversarial error in terms of the natural error and the distance between the two domains.
64
+ Fortunately, from the prior work, we know that h delta h distance, which measures the distance
65
+ between two domains, can be estimated using the classifier trained to discriminate between the
66
+ two domains. Now our defense method called adversarial feature desensitization essentially
67
+ minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
68
+ has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
69
+ For this, we first update the parameters theta and phi in the feature learning function f and
70
+ task classifier c to minimize the classification loss on the natural domain. This is shown with
71
+ green arrows and green boxes marked 1 on both the equation and on the diagram.
72
+ Secondly, we estimate the h delta h distance using an additional domain discriminator
73
+ network that predicts the domain identity from the latent code z. We update the domain
74
+ discriminator parameters psi to minimize the domain classification loss. And finally,
75
+ in the third step, we update the feature learning network parameters theta to maximize the domain
76
+ classification loss in an adversarial way. These two steps are marked with red arrows in the figure
77
+ and red boxes on the equation. Similar to previous two methods, adversarial training and trades that
78
+ I showed you, here we can also graphically demonstrate this procedure. In our method AFD,
79
+ we learn to separate the classes from the distributions of clean examples while at the
80
+ same time we optimize a domain classifier that learns the boundary between the clean and adversarial
81
+ examples for each class. And finally, we push the adversarial examples to the opposite side of that
82
+ boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations,
83
+ and hence the name adversarial feature desensitization.
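A minimal sketch of the three-step update just described, with hypothetical module and optimizer names (f: feature extractor, c: task classifier, d: domain discriminator); an illustration, not the authors' code.

import torch
import torch.nn.functional as F

def afd_step(f, c, d, opt_fc, opt_d, opt_f, x_clean, x_adv, y):
    # 1) natural classification loss -> update f and c
    opt_fc.zero_grad()
    F.cross_entropy(c(f(x_clean)), y).backward()
    opt_fc.step()
    # 2) domain discriminator: clean = 0, adversarial = 1 -> update d
    opt_d.zero_grad()
    z = torch.cat([f(x_clean), f(x_adv)]).detach()
    dom = torch.cat([torch.zeros(len(x_clean)), torch.ones(len(x_adv))]).long()
    F.cross_entropy(d(z), dom).backward()
    opt_d.step()
    # 3) update f to maximize the domain loss (the adversarial step)
    opt_f.zero_grad()
    z = torch.cat([f(x_clean), f(x_adv)])
    (-F.cross_entropy(d(z), dom)).backward()
    opt_f.step()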
84
+ We tested our method on four data sets and compared it with a number of other baselines, including adversarial training and
85
+ trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
86
+ Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovsky et al.
87
+ In the table, we evaluated all methods on several white box and black box attacks with
88
+ nominal strengths for each data set. Overall, our method AFD and especially AFDWGAN showed superior
89
+ performance against most attacks in most data sets. However, AFD was behind trades on several attacks
90
+ especially on CIFAR-100 and TinyImageNet, data sets that have more classes in them.
91
+ We also looked into unseen attack methods and attack strengths, which we controlled with the parameter
92
+ epsilon. The diagrams on the right show the robust accuracy for each defense method across
93
+ eight attack methods and various epsilon values for each of them. Overall, our results in these
94
+ diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
95
+ across attack strengths. To quantify these differences, we also computed the area under
96
+ the curve for each method for each attack and summarized them in a table on the left.
97
+ As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
98
+ compared to other baselines. If you remember from previous slides, the domain adaptation theory
99
+ predicted a bound on the adversarial error which can also be turned into a bound on the generalization
100
+ gap between the natural and adversarial domains. We empirically tested this prediction in our experiments
101
+ under two settings. Under the first setting, we varied the epsilon value for the PGDL-infinity
102
+ attack which was used during the training. And under the second setting, we used a diverse
103
+ set of attacks and various attack strengths for each of them.
104
+ And under both scenarios, we found that the domain discriminator, which was originally trained on a
105
+ particular attack and attack strength, in our case it was PGDL-infinity attack with a fixed epsilon
106
+ for each data set, could well predict the generalization gap to unseen attacks and
107
+ different attack magnitudes. This suggests that the adversarial training against a domain classifier
108
+ like that used in our proposed method could potentially lead to robust models with better
109
+ generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
110
+ and attack strengths, it occasionally was worse compared to other baselines, especially in data
111
+ sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
112
+ domain classifiers in these data sets and leaves much space for future work on
113
+ investigating the effect of domain classifiers on the robustness of feature learning functions.
114
+ Also, AFD required more backward computations compared to some of the other baselines
115
+ such as adversarial training, and as a result, its training time was on average about 31%
116
+ longer than adversarial training. We invite you to read our paper for more details and please
117
+ get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it.
demo_data/nips-2021/25962/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,51 @@
+ Hello everyone, I am Yannis Hartel, and I am going to present work on functional estimation under certain specific privacy constraints.
+ This is joint work with my postdoctoral advisor, Professor Cristina Gutucia.
+ We are interested in the power sum functional, which is the sum of the probabilities associated
+ with a discrete distribution, raised to the power gamma, where gamma is a positive real number.
+ This power sum functional is an example of an information measure that arises in various fields
+ such as statistics, machine learning, information theory, neuroscience, and so on.
+ Here is the standard statistical problem, where the goal is to estimate the power sum functional
+ based on n i.i.d. samples, X1, X2 up to Xn, which follow a discrete distribution P with alphabet size K.
+ A widely used approach is the plug-in estimator, where one uses an estimator of the parameter P
+ to construct an estimator of the functional, through the plug-in principle.
+ This approach is not only simple and intuitive, but it is also theoretically sound,
+ since it satisfies asymptotic efficiency and non-asymptotic near-optimality.
+ The interesting question of our paper is whether this plug-in approach
+ works in a non-standard setting, where we impose a privacy constraint,
+ and more precisely, the setup of local differential privacy.
+ This means that we impose a strong notion of privacy, where we do not have access to the original, sensitive data, the Xi.
+ Instead, we only have access to a private version of the Xi.
+ Here is the representation of a simple mechanism that is not interactive.
+ The term local here reflects the fact that the mechanism Qi only sees the data Xi.
+ In other words, there is no trusted third party that has access to all the sensitive data.
+ This is a simple non-interactive privacy mechanism, but of course we are also interested in more sophisticated mechanisms,
+ notably sequentially interactive mechanisms, where each Qi sees the previously released private data
+ in addition to the data Xi.
+ In this non-standard setting, we return to the original problem of power sum functional estimation,
+ where we only have access to the private versions of X1 up to Xn.
+ Our first contribution is to give a tight, non-asymptotic characterization of the estimation error of the plug-in estimator of the power sum.
+ This result shows that the plug-in estimator is not optimal.
+ This contrasts with the performance of the plug-in estimator in the standard statistical problem.
+ The message here is that good estimators in the standard setup are not always good estimators in the local privacy setup.
+ Our second contribution is a correction of the plug-in estimator through a careful truncation of the small estimated probabilities.
+ This correction leads to a significant reduction of the risk.
+ In particular, the risk becomes independent of the alphabet size K when K is large.
+ This second contribution, however, relies on a simple non-interactive privacy mechanism.
+ In the second part of the paper, we consider a more sophisticated sequentially interactive mechanism,
+ for which we construct a two-step procedure that allows us to reduce the risk by a logarithmic factor.
+ Finally, at the end of the paper, we provide a universal lower bound on the risk
+ with respect to all estimators and all non-interactive and sequentially interactive mechanisms.
+ Unfortunately, this lower bound matches our upper bounds only in certain cases,
+ which leaves us with some very important open questions about this problem.
+ I think this first work on functional estimation in the local privacy context gives you at least three key takeaways.
+ The first takeaway is the need to construct statistical procedures carefully for the local privacy setting,
+ since it is a setup where a good estimator in the standard framework does not necessarily work.
+ The second takeaway is that the plug-in type approach analyzed in this paper
+ serves as a benchmark for future work and more sophisticated procedures.
+ And the last takeaway is that our analysis of the plug-in approach and of non-interactive mechanisms
+ reveals regimes where the estimation problem is hard,
+ and we hope that this encourages people to bring new developments here.
+ Thank you all, and for more details, please check our paper online.
+ Bye!
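As an illustration of the plug-in principle discussed in the talk, here is a minimal sketch of the non-private plug-in estimator of the power sum functional; the privatization mechanisms are deliberately not reproduced.

import numpy as np

def plug_in_power_sum(samples, K, gamma):
    # samples: integer observations in {0, ..., K-1}
    counts = np.bincount(samples, minlength=K)
    p_hat = counts / counts.sum()      # empirical distribution
    return float(np.sum(p_hat ** gamma))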
demo_data/nips-2021/25963/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,178 @@
1
+ Hello, I'm Hassam Murtaghi. I'm a PhD student at Georgia Tech. Along with my collaborator
2
+ Jay Mundra, we will present our work on reusing combinatorial structure, faster projections
3
+ over submodular base polytopes. This is joint work with Swati Gupta.
4
+ In this talk, we consider a sequence of similar structured optimization problems, a setup often
5
+ encountered in practice. We first start with our main problem of minimizing a convex function
6
+ over a decision set P. At the next time step, this problem sees some perturbation and we
7
+ obtain another similar problem, and so on. An example of this setup is the case of iterative
8
+ projections where at each time step, we are computing the projection of a new point y
9
+ t that is close to previously projected points y i. These iterative projections form a key
10
+ step in many optimal learning algorithms and they are currently solved from scratch every
11
+ iteration. They are not viewed in the context of an iterative environment where previously
12
+ computed projections can be exploited to speed up subsequent ones.
13
+ Thus, in this talk, we ask, is it possible to speed up similar iterative optimization
14
+ problems by reusing structural information from previous minimizers?
15
+ Let me now give you some more details about our setup. Here is a table that summarizes
16
+ various widespread first-order optimization algorithms. The first two algorithms are conditional
17
+ gradient variants and they only solve linear optimization every iteration. Their convergence
18
+ rates depend on the dimension of the problem and on geometric constants for the underlying
19
+ decision set, such as the pyramidal width for the away-step Frank-Wolfe variant given in
20
+ the second row. On the other hand, the remaining three algorithms
21
+ are projection-based algorithms that compute the projection every iteration, and their
22
+ convergence rates, however, are optimal in the sense that they only rely on the condition
23
+ number of the function and they are dimension-independent. Further, to capture a wide range of combinatorial
24
+ sets, we consider the case where decision set P is given by a submodular polytope, and
25
+ the challenge is that these polytopes have an exponential number of constraints. Thus,
26
+ computing a projection over those polytopes is a big computational bottleneck in projection-based
27
+ algorithms. Motivated by this trade-off in convergence rates versus runtime, we further
28
+ ask, is it possible to speed up iterative projections over submodular polytopes by reusing
29
+ structural information from previous minimizers? I'm now going to give more introduction on
30
+ the problem and submodularity and review of first-order methods. So, as mentioned, we
31
+ assume that the combinatorial structure in a problem is given by a submodular function.
32
+ A set function F, defined over a ground set E of n elements, is submodular if it satisfies
33
+ the following property. Furthermore, the base polytope associated with F is defined as the
34
+ following system of linear inequalities, and here we see that B of F is modeled using an
35
+ exponential number of constraints because we have a constraint for each subset of the
36
+ ground set. An example is the permutahedron, a polytope whose vertices are permutations
37
+ of 1 through n. And here we have an example in the slide for when n is equal to 3. These
38
+ polytopes are extensively used in online learning over rankings of items. A special class of
39
+ submodular functions is known as cardinality-based functions, and a cardinality-based function
40
+ F is defined as F of S equal to g of the cardinality of S, where g is a concave function. And here
41
+ we have another table that summarizes various machine and online learning problems and
42
+ the submodular set functions that give rise to them. We see the permutahedron in the second
43
+ row of this table, and it is in fact a Cardinality-based polytope. Other non-Cardinality-based examples
44
+ include spanning trees and independent sets of matroids.
45
+ So let's go back to our main problem of minimizing a convex function over the base polytope.
46
+ So there typically exist three main paradigms to solve this problem. The first is a class
47
+ of methods, known as conditional gradient methods, and as I mentioned before, those
48
+ assume access to B of F via linear optimization oracle. And these methods are specifically
49
+ advantageous for base polytopes because linear optimization over base polytopes could be
50
+ done very efficiently using Edmunds' greedy algorithm.
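A minimal sketch of this greedy linear optimization oracle, assuming the submodular function F is given as a Python callable on frozensets (illustrative only).

def greedy_lo(w, F):
    # maximize <w, x> over the base polytope B(F)
    order = sorted(range(len(w)), key=lambda i: -w[i])  # sort by weight
    x, S = [0.0] * len(w), set()
    prev = F(frozenset())
    for i in order:
        S.add(i)
        cur = F(frozenset(S))
        x[i] = cur - prev  # marginal gain of adding element i
        prev = cur
    return x

For the probability simplex used in the example later in the talk, F(S) = min(|S|, 1), and the greedy oracle returns the vertex selecting the single largest weight.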
51
+ The second class of methods are mirror descent variants, and those compute a projection every iteration to ensure feasibility.
52
+ And again, as I also previously mentioned, although those methods have optimal convergence
53
+ rates and are robust, they have remained of a theoretical nature due to being computationally
54
+ expensive. The third class of methods are combinatorial algorithms specifically tailored
55
+ for convex optimization over submodular base polytopes. Those algorithms require instead
56
+ solving a submodular function minimization problem every iteration, which again can be
57
+ very expensive. However, those algorithms enjoy the nice property of returning an exact
58
+ optimal solution. In this talk, we will focus on bridging the efficiency of CG methods and
59
+ the structural properties and exactness of combinatorial algorithms to speed up iterative
60
+ projections appearing in mirror descent and beyond. So first, let's consider the simpler
61
+ case when our polytope is cardinality-based. So here we have a cardinality-based submodular
62
+ function F, and for notation we define this vector c to be the vector of discrete derivatives
63
+ of the concave function g. We now give the following duality result, which states that
64
+ the problem of computing a Bregman projection over a cardinality-based polytope is dual
65
+ to isotonic optimization. Although our results hold for general Bregman projections, we will
66
+ focus on the case of Euclidean projections for simplicity. To that end, consider a vector
67
+ y that we're trying to compute its Euclidean projection over a cardinality-based polytope,
68
+ and let e1 through en be an ordering of the ground set such that y is decreasing. In this
69
+ case, we have the following primal problem, and the dual to that is the following isotonic
70
+ regression problem. And further, we can map between the two problems using the following identity here.
71
+ So just to give you some historical context, previously the best known running time for
72
+ projections was O n squared using a primal algorithm by Gupta et al. Later on in that
73
+ year, Lim and Wright used the same duality approach to compute projections over the permutahedron,
74
+ and we extended their approach to general cardinality-based polytopes. Now the dual
75
+ isotonic regression problem could be solved in O n time using a simple algorithm called
76
+ pool-adjacent violators algorithm, and this basically gives us an O n log n algorithm by
77
+ solving the problem in the dual space and mapping it back to the primal space. And this is currently
78
+ the fastest known algorithm. And the key takeaway is that solving projections over these polytopes
79
+ can be very efficiently done. In fact, computing a projection and solving linear optimization
80
+ have the same running time. Now let's demonstrate our result with an example. So here we are going
81
+ to project this vector y onto the probability simplex, and the probability simplex is modeled
82
+ by this cardinality-based submodular function here given on the slide. And we see that y is already
83
+ ordered for simplicity and c is the vector of discrete derivatives. Now the algorithm will
84
+ proceed as follows. It initializes the dual iterates by the vector that we're trying to
85
+ compute the isotonic regression for, c minus y, and here we have an adjacent violation because the
86
+ second coordinate is strictly smaller than the first coordinate. Now the algorithm will basically
87
+ average those two coordinates to obtain the following solution z star, and here we see that
88
+ the ordering constraints are satisfied and z star is in fact the dual optimal. Next it will map it
89
+ back to a primal optimal. And let's go back to this figure from the previous slide that just compares
90
+ a basic linear regression fit with an isotonic regression fit. Here in the red stepwise curve,
91
+ the points at which the curve remains flat are where a block of consecutive adjacent violated
92
+ points is averaged, similar to our example.
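A minimal sketch of the pool adjacent violators algorithm in its unweighted, non-decreasing form (illustrative only).

import numpy as np

def pav(v):
    # isotonic (non-decreasing) least-squares fit of v
    blocks = []  # pairs of (block mean, block size)
    for x in v:
        mean, size = float(x), 1
        # merge backwards while the new block violates monotonicity
        while blocks and blocks[-1][0] > mean:
            m, s = blocks.pop()
            mean = (m * s + mean * size) / (s + size)
            size += s
        blocks.append((mean, size))
    return np.concatenate([[m] * s for m, s in blocks])

For instance, pav([3, 1, 2]) pools the violating block (3, 1) to its mean and returns [2, 2, 2].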
93
+ This very efficient algorithm for computing Bregman projections over cardinality-based polytopes unfortunately does not extend to
94
+ general submodular base polytopes. And now my collaborator Jay will present different combinatorial
95
+ strategies for dealing with those polytopes. We now describe our toolkit for speeding up
96
+ projections on general submodular based polytopes. There are two basic objects that we can learn from.
97
+ First, given projections of previous points, can we do better than computing a new projection from
98
+ scratch? Second, given an iterative algorithm to compute a projection, can we use the combinatorial
99
+ structure present in the sequence of iterates to speed up the algorithm and terminate it early?
100
+ We have the well-known first-order optimality condition on the left. It helps us verify if a
101
+ point is indeed optimal. This check is reduced to a linear optimization over the base polytope,
102
+ which can be done using Edmunds-Greedy algorithm. We have an example. Suppose we know the gradient
103
+ at a point x star and want to check if x star is indeed optimal. We look at the distinct values
104
+ of the partial derivatives at x star and arrange them in an increasing order. Each time we see a
105
+ gap in this order, we want that the point x star on the prefix set equal the submodular function
106
+ value on that set. In the figure, the first such gap is after we have seen even an E5. Therefore,
107
+ x star of S1 must equal f of S1. Similarly, x star of S2 must equal f of S2. Finally, x star of E must equal f of
108
+ E. These sets S1, S2, and E are called tight sets at x star and they define the face containing the point x
109
+ star. This leads us to two interesting observations that we use later. One, that if we know precisely
110
+ what the tight sets are at the optimal points, we can also calculate the optimal point for all
111
+ suitable functions h. Two, that knowing the gradient at the optimal point gives us these
112
+ tight sets. We give an example using our combinatorial idea. Suppose we know a point
113
+ zk that is close to our optimal x star. If the function is smooth, this implies gradient at zk
114
+ and x star are close. This gives us a way to learn some tight sets defining the optimal face.
115
+ In the example, for each coordinate, the blue line in the middle represents the partial derivative
116
+ value at zk and the blue shade represents the possible variation in that value for the optimal
117
+ point x star. That is, the corresponding partial derivative for x star lies in the shaded interval.
118
+ The largest values in these intervals for E1 and E5 are lower than the lowest values in these
119
+ intervals for every other element. This helps us conclude that the set E1 and E5, that is S1,
120
+ is a tight set at x star. Similarly, we infer that S2 is also a tight set at x star.
121
+ We now use that idea to give our first two tools. These apply more generally, but we demonstrate
122
+ them using Euclidean projections. Suppose we already know the projection xi of a point yi,
123
+ and we wish to find the projection xt of point yt, given that yt is close to yi.
124
+ The non-expansiveness of projection implies that the gradients at xi and xt are also close,
125
+ and therefore we can infer some tight sets at xt even before solving.
126
+ Suppose we start computing the projection of yt using an iterative algorithm.
127
+ We now use the iterates zi that converge to xt. An iterate zt that is close to xt also has a
128
+ gradient that is close to the gradient at xt, and once again we can infer some tight sets at xt
129
+ as we approach the optimal. We also conducted an experiment to show that tool T1 can recover
130
+ most tight sets from previous projections. We now give two tools that help us round an
131
+ approximate solution exactly to the projection. First is our tool T3 called Relax.
132
+ We give a heuristic to check if we have already found all the tight sets at the optimal.
133
+ We also show that we can round combinatorially when we know the function f to be integral,
134
+ and an iterate zt is close enough to the optimal xt. This is our tool T4.
135
+ We can reuse previously known vertices of the polytope. Suppose that our optimal is xt,
136
+ and we are given a close by point xi as a convex combination of some vertices in the polytope.
137
+ We can use those vertices to warm start the search for xt. Now our sixth tool, Restrict.
138
+ Once we know a few tight sets for xt using our inferred tools T1 and T2,
139
+ we needn't search over the optimal or the whole base polytope. We can restrict ourselves to the
140
+ face of the polytope that satisfies these constraints. We show that a simple extension
141
+ of Edmunds' greedy algorithm provides an LO (linear optimization) oracle for each face of the polytope.
142
+ We now bring together these tools and apply them to the away-step Frank-Wolfe algorithm,
143
+ giving the algorithm we dub adaptive away-step Frank-Wolfe, or A2FW for short.
144
+ First, warm start A2FW using tight sets for the optimal inferred from previous projected points,
145
+ and active sets from previous projected points. While the algorithm runs and generates new
146
+ iterates, it keeps inferring new tight sets for the optimal point using these iterates.
147
+ In each iteration, if a new set has been found, the algorithm checks if all tight sets have been
148
+ found. If indeed so, then stop and output the exact solution. Otherwise, simply restrict the
149
+ problem to a low-dimensional face and keep going on. Note that the linear optimization is over a
150
+ restricted face of the polytope. Let's see an example. Suppose we are optimizing over the
151
+ polytope P. We look for the best Frank-Wolfe vertex and the best away vertex. We find that
152
+ the best Frank-Wolfe vertex is the best away vertex. Since the direction opposite to the away
153
+ vertex is the better direction to move in, we find the next iterate ZT plus 1. Now, ZT plus 1 is
154
+ close enough to X star that it allows us to detect another tight set and round to the face F new.
155
+ One way to do that is to round to an arbitrary vertex in F new using our LO oracle. Another
156
+ option is to relax to F new and see if the solution obtained is feasible. If feasibility
157
+ check is uncertain, return to the previous strategy. Eventually, we reach the optimal
158
+ X star either way. We give this theorem about the primal gap for the modified algorithm.
159
+ The function h is l-smooth and mu strongly convex and d refers to the diameter of BF.
160
+ Notice how this compares to the AFW algorithm. When we restrict to a face F of BF, our guarantee
161
+ depends only on the pyramidal width of F instead of the pyramidal width of BF. This pyramidal width
162
+ can be much lower for the restricted face. For instance, it depends on the dimension of the face
163
+ for the probability simplex. Therefore, A2FW leads to a faster convergence. We now show the
164
+ effectiveness of our toolkit and the A2FW algorithm using experiments. For our computations,
165
+ we simulate an online recommendation system where we are learning over rankings of items
166
+ displayed to users. Our loss functions are stochastic model click-through rates. This
167
+ can be seen as optimization over the permutahedron. We use online mirror descent which performs
168
+ iterative projections and uses away-step Frank-Wolfe for these projections. We benchmark the
169
+ original AFW algorithm against variants modified by our tools. We report significant improvement
170
+ in both runtime and the number of AFW iterations. The green line stands for OMD with the original
171
+ unoptimized AFW. The yellow line stands for OMD with A2FW algorithm. We do note that both OMDPAV,
172
+ that is OMD with projections using the pool adjacent violators algorithm, and OFW were
173
+ significantly faster than OMD with any AFW variant. However, OFW does not lead to optimum
174
+ regret rates while OMDPAV works only for cardinality-based submodular polytopes. To
175
+ conclude, we studied iterative projections for prevalent submodular-based polytopes. We presented
176
+ an algorithm for cardinality-based polytopes. For general polytopes, we developed a combinatorial
177
+ toolkit to speed up iterative projections and applied it to the AFW algorithm and computationally
178
+ showed that our algorithm is orders of magnitude faster than the original AFW variant.
demo_data/nips-2021/25964/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,366 @@
+ Hello everyone, I'm Matteo Papini, and this is joint work with Andrea Tirinzoni, Aldo Pacchiano,
+ Marcello Restelli, Alessandro Lazzarici and Matteo Pirotta.
+ Our work is motivated by the effectiveness of deep reinforcement learning algorithms
+ in solving complex tasks, such as video games.
+ A fundamental feature of these methods is the use of neural networks to extract
+ complex representations of the task, which makes it possible to represent and learn
+ optimal policies effectively.
+ Understanding what makes a representation good, and how to find one,
+ is fundamental for designing better reinforcement learning algorithms.
+ In this work, for the first time, we present formal characterizations
+ of good representations for reinforcement learning.
+ We show that using a good representation can indeed benefit learning efficiency
+ and provide constant regret guarantees.
+ Finally, we show how a good representation can be selected from online interaction,
+ a first step towards representation learning for RL.
+ But first of all, some background.
+ The learning problem is modeled as a finite-horizon Markov decision process, or MDP.
+ At each time step, the agent observes a state of the environment,
+ takes an action, and receives a reward and a next state as a result.
+ These are determined, respectively, by a reward function and a transition function,
+ which are time-dependent and unknown to the agent.
+ The interaction is divided into episodes of finite length, called the horizon.
+ At the start of each episode, the state is reset according to a fixed distribution.
+ The agent's behavior is modeled by a policy, which is a mapping from states to actions
+ that can also be time-dependent.
+ The value function, or Q function, of a policy pi gives the total expected reward
+ obtained by taking action A in state S at time H and then following the policy
+ until the end of the episode.
+ An optimal policy is guaranteed to maximize the Q function over all states.
+ We make the extra assumption that every state admits a unique optimal action.
+ When the number of states is very large or even infinite,
+ learning the optimal policy can be very hard.
+ So we look at linear MDPs, where the agent has access to a compact representation.
+ This is a feature map from states and actions to d-dimensional vectors, where d is small.
+ You can think of these features as the last hidden layer of a neural network.
+ In deep reinforcement learning, we learn all the weights of the network simultaneously.
+ Here we will keep the representation fixed and learn only the final parameters,
+ which are the weights of a linear combination.
+ This linear function, at the very least, must be able to represent the optimal Q function,
+ so that we can use it to take optimal actions.
+ But in the end, being able to represent the optimal Q function is not enough
+ for efficient learning, because an exponential number of samples may still be required.
+ To avoid this, extra structural assumptions on the MDP are needed,
+ and several have been proposed in the literature.
+ In low-rank MDPs, both the reward function and the transition function
+ are linear in the same features.
+ These features can be time-independent.
+ We assume, just for simplicity, that the two functions share the same dimension d.
+ A first consequence of the low-rank structure is that the Q function of every policy
+ can be represented as a linear function of the features.
+ A more general structural assumption is Bellman closure.
+ In these MDPs, the set of linear functions of the features
+ must be closed under the Bellman optimality operator.
+ The low-rank structure implies Bellman closure, but the opposite is not true.
+ Indeed, in Bellman closure MDPs, only the optimal Q function
+ is guaranteed to be linearly realizable.
+ Efficient reinforcement learning algorithms have been proposed for these settings.
+ We can evaluate them using the concept of regret, which is the total amount of suboptimality
+ suffered by the agent during the learning process with respect to the optimal policy.
+ In low-rank MDPs, the LSVI-UCB algorithm suffers only sublinear regret in the worst case.
+ Eleanor is a refined version that works in the more general case of Bellman closure
+ and has a better dependence on the feature dimension.
+ It should be noted, though, that Eleanor is computationally intractable.
+ For LSVI-UCB we also have an instance-dependent regret bound
+ that is logarithmic in the total number of interactions.
+ Here Delta denotes the suboptimality gap of a state-action pair,
+ which is assumed to have a well-defined minimum.
+ All these regret bounds ignore the quality of the representation,
+ apart from the structural assumptions that are necessary for them to hold.
+ The question we will try to answer is this:
+ can we achieve even smaller regret with a good representation?
+ To make this concept of a good representation formal, we introduce the Unisoft property.
+ A representation is Unisoft if the optimal features span the whole feature space.
+ The optimal features are the features of the optimal actions
+ in states that are reachable by the optimal policy itself.
+ Intuitively, the Unisoft property guarantees that the optimal features are diverse enough
+ that the agent can quickly converge to the optimal policy without reducing
+ the amount of information it receives about the task in general.
+ We can also measure the degree of diversity of a representation by looking
+ at the smallest eigenvalue of the covariance matrix of the optimal features.
+ This parameter lambda will play an important role in our regret bounds.
+ Note that a higher value of lambda is better, because it denotes more feature diversity,
+ and that lambda can be at most one under common assumptions
+ on the magnitude of the features.
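A minimal sketch of this diversity measure, assuming Phi is a hypothetical array whose rows are the feature vectors of the optimal actions.

import numpy as np

def unisoft_lambda(Phi):
    cov = (Phi.T @ Phi) / len(Phi)        # empirical feature covariance
    return np.linalg.eigvalsh(cov).min()  # positive iff the features span R^d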
+ But in what sense are these representations good?
+ What we have shown is that, in linear MDPs, Unisoft is synonymous with constant regret.
+ First, we showed that the Unisoft property is necessary
+ to achieve constant regret in linear MDPs.
+ This applies to low-rank MDPs, to Bellman closure,
+ and also to linear mixture MDPs, which are another common structural assumption.
+ But Unisoft is also sufficient for constant regret in interesting cases.
+ In low-rank MDPs, LSVI-UCB achieves constant regret
+ if and only if the representation is Unisoft.
+ With high probability, a finite number of interactions is sufficient
+ for the agent to learn the optimal policy perfectly.
+ Hence, the regret can be bounded in terms of this constant time,
+ regardless of the total number of episodes K.
+ In other words, the regret is constant.
+ Note how the time tau depends inversely on the parameter lambda.
+ Indeed, with a more diverse feature map, we can learn the optimal policy faster.
+ We have a similar result for Eleanor in the more general case of Bellman closure MDPs,
+ with an even better dependence on the feature dimension d.
+ Finally, our bound for Eleanor has this polynomial dependence on the parameter lambda,
+ compared to a logarithmic dependence in the case of LSVI-UCB.
+ But this may well be an artifact of our proof.
+ To recap, we have shown that Unisoft is both necessary and sufficient
+ for achieving constant regret in low-rank and Bellman closure MDPs,
+ and we have proven constant regret upper bounds for common algorithms.
+ In the last part of the talk, we show how good representations can be selected online.
+ We focus on low-rank MDPs for simplicity.
+ The agent is given a set of N candidate representations
+ that represent the same low-rank MDP without misspecification.
+ The representations can have different dimensions.
+ This differs from the typical approach to representation selection in RL,
+ where one tries to find an accurate representation from a realizable function class.
+ That allows one to handle misspecification, but it is typically done offline.
+ Our goal is to learn as efficiently as if we used the best candidate representation
+ in the set, without knowing it in advance.
+ Obviously, if one of the candidates is Unisoft, we would like to obtain constant regret.
+ The algorithm we propose is LSVI-Leader.
+ It runs N parallel instances of LSVI-UCB, one for each candidate representation.
+ For each representation, we use all the data collected by the agent
+ to estimate the parameter of the optimal Q function according to that representation.
+ This is done with a combination of least squares and backward induction.
+ An exploration bonus is added to the parameter estimate to make the estimate optimistic,
+ as in LSVI-UCB.
+ But now we have an optimistic estimate for each representation,
+ and the action is chosen to maximize the smallest optimistic estimate,
+ which is also the tightest one.
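A minimal sketch of this selection rule, assuming each candidate representation provides its optimistic Q estimate as a Python callable (hypothetical names, not the paper's code).

def select_action(q_estimates, state, actions):
    # q_estimates: one optimistic Q function per candidate representation
    def tightest(a):
        return min(q(state, a) for q in q_estimates)
    # act greedily with respect to the smallest (tightest) optimistic estimate
    return max(actions, key=tightest)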
+ Note how this is actually more powerful than a model selection algorithm,
+ because we can use a different representation for each state.
+ We can see that the regret of LSVI-Leader is bounded by that of LSVI-UCB
+ run with the best of the candidate representations,
+ up to a factor that is the square root of the number of candidates.
+ This means that if we have a Unisoft representation in the set,
+ LSVI-Leader achieves constant regret.
+ But LSVI-Leader can combine representations across stages, states, and actions,
+ and so it can sometimes achieve constant regret
+ even if no candidate representation is Unisoft.
+ Our theoretical results are also supported by empirical results in small MDPs.
+ These plots show the regret as a function of the number of episodes.
+ On the left we have the regret of LSVI-UCB run with different representations.
+ Of these, the only Unisoft representation is the one in gray in the plot,
+ and only in this case is LSVI-UCB able to achieve constant regret.
+ On the right we have the regret of LSVI-Leader run with various candidate sets.
+ In all these cases, LSVI-Leader achieves constant regret.
+ Obviously, without knowing the best representation in advance,
+ it takes more time to learn the optimal policy,
+ but this was also expected from our regret bound.
+ The orange plot is particularly interesting, because in this case
+ the only Unisoft representation, number 1, is not in the candidate set,
+ but LSVI-Leader is still able to achieve constant regret
+ by combining the remaining representations.
+ In future work, we would like to improve this square root of N factor
+ in the regret of LSVI-Leader, because in the case of linear bandits
+ the dependence on the number of representations is only logarithmic.
+ We would also like to extend LSVI-Leader to handle candidate representations
+ that are misspecified.
+ However, representation selection is only one step towards representation learning,
+ which means learning the representation online from scratch.
+ This is already done in practice with deep reinforcement learning,
+ but the theory for it is lacking.
+ Finally, we could consider multitask reinforcement learning,
+ where a single representation might be good for a collection of MDPs
+ that share some structure. Thank you.
demo_data/nips-2021/25965/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ How many friends do you have?
2
+ At least you have more friends than I do.
3
+ Well, on average.
4
+ Don't get me wrong, I am not a pity person.
5
+ This is a mathematical fact known as the friendship paradox.
6
+ Suppose we have two persons, A who has one friend and B who has three friends.
7
+ Now let me ask in which friend list am I likely to appear?
8
+ Because B has three times more friends, I am three times more likely to appear in
9
+ B's friend list.
10
+ The friendship paradox dictates that on average, your friends have more friends than you do.
11
+ The more friends someone has, the more likely someone appears in your friend list.
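+ As a quick sanity check of this claim, here is a small Python computation on a made-up toy friendship graph (purely illustrative):
+
+ from statistics import mean
+
+ # Undirected toy friendship graph: person -> set of friends.
+ friends = {
+     "A": {"B"},
+     "B": {"A", "C", "D"},
+     "C": {"B"},
+     "D": {"B"},
+ }
+
+ avg_friends = mean(len(f) for f in friends.values())
+ # Average friend count of a friend, over every friendship slot.
+ avg_friends_of_friends = mean(
+     len(friends[f]) for fs in friends.values() for f in fs
+ )
+ print(avg_friends)             # 1.5
+ print(avg_friends_of_friends)  # 2.0: your friends have more friends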
12
+ Beyond an interesting piece of trivia, the friendship paradox has substantial importance
13
+ because it may introduce biases in graph embeddings.
14
+ Hello everyone, my name is Sadamori Kojaku, and we will walk you through a new insight
15
+ into biases in graph embedding arising from the friendship paradox.
16
+ The graph embedding is a technique to map a graph into a vector space that reflects
17
+ the structure of the graph.
18
+ A widespread paradigm is the approach based on Word2Vec.
19
+ In this approach, one somehow generates a sequence of nodes from the graph.
20
+ The nodes in the sentences are then mapped to a vector space by Word2Vec.
21
+ Now the key is that Word2Vec does not directly learn the graph, but through the sentences
22
+ generated from the graph.
23
+ Unlike the word embedding, where the input sentences are the actual data, for graph embedding,
24
+ the input sentence is artificially generated, and how to generate it is a critical modeling
25
+ decision.
26
+ This leads us to the question of how to generate the sentences from the graph.
27
+ A common way is to use random walks.
28
+ The walker starts from a node in the graph, and this node is the first node in the sentence.
29
+ Then the walker moves to one of the neighbors selected randomly.
30
+ This new node is added to the sentence.
31
+ By repeating this process, we can generate a sentence of nodes from this graph.
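+ A minimal sketch of this sentence-generation step (illustrative, not the paper's implementation):
+
+ import random
+
+ def random_walk_sentence(adj, start, length):
+     """Generate a 'sentence' of nodes with a simple random walk.
+     adj maps each node to the list of its neighbors."""
+     sentence = [start]
+     for _ in range(length - 1):
+         sentence.append(random.choice(adj[sentence[-1]]))
+     return sentence
+
+ # A corpus of such walks is what Word2Vec is then trained on.
+ adj = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2]}
+ corpus = [random_walk_sentence(adj, random.choice(list(adj)), 10)
+           for _ in range(5)]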
32
+ The friendship paradox comes into play when the walker follows an edge.
33
+ It is more likely to visit a node with many neighbors.
34
+ In other words, following edges is a biased sampling that preferentially leads random
35
+ walkers to nodes with many neighbors.
36
+ To see this effect, let us consider a graph with core-periphery structure, where core nodes
37
+ have more neighbors than peripheral nodes.
38
+ A sentence can be generated from this graph by running a random walk.
39
+ Now, the core nodes are about 20% of nodes in the graph.
40
+ But when looking at the generated sentence, the core nodes are overrepresented, which is
41
+ because of the bias due to the friendship paradox.
42
+ The fact that the sentence is biased by the friendship paradox leads us to our main question.
43
+ Does the sampling bias have negative impact?
44
+ If so, how can we fix it?
45
+ Surprisingly, it has no effect because Word2Vec itself has an overlooked built-in debiasing
46
+ feature that happens to negate the bias due to the friendship paradox.
47
+ This built-in debiasing feature can be easily utilized to negate other types of biases,
48
+ and we demonstrate how to do this.
49
+ Our starting point is a sentence of words.
50
+ Word2Vec picks a word called center and surrounding words called context, and then models the
51
+ conditional probability using a softmax function, where the conditional probability is reflected
52
+ as a dot similarity of the two vectors of the words.
53
+ We want to fit this model to the data, but it is computationally challenging due to the
54
+ normalization constant, which extends over all unique words in the corpus.
55
+ A common way to reduce this burden is negative sampling.
56
+ Now, it is often underappreciated that negative sampling is actually a simplified version
57
+ of noise contrastive estimation.
58
+ And it is this simplification that gives rise to an interesting feature of Word2Vec.
59
+ How does the noise contrastive estimation, or NCE, work?
60
+ NCE samples k random contexts from so-called noise distribution.
61
+ This noise distribution is roughly proportional to the frequency of a word in the corpus.
62
+ The random contexts are labeled as 0, and the actual context is labeled as 1.
63
+ Then NCE calculates the probability that a word comes from actual data using a Bayesian
64
+ framework.
65
+ By putting the prior likelihood together, we have a posterior like this.
66
+ This function is a sigmoid function and takes the dot similarity and the noise distribution
67
+ as the arguments.
68
+ Now the key feature of the NCE is that it is asymptotically unbiased for the model
69
+ of the Word2Vec.
70
+ Meaning if the data is actually generated from this model, and we increase the amount
71
+ of training data, then the embedding vectors converge to the true vectors.
72
+ Beyond Word2Vec, the noise contrastive estimation is also an unbiased estimator for a more general
73
+ model that takes a real value function f instead of the dot similarity.
74
+ Now the negative sampling simplifies the noise contrastive estimation.
75
+ It estimates the same probability, but simply drops the term of the noise distribution.
76
+ You might be wondering what happens without this term.
77
+ To see this, we rewrite it in form of the noise contrastive estimation, where we define
78
+ a new function f' which consists of the original function f as well as the noise distribution.
79
+ This is asymptotically unbiased for a probability model which now includes the noise distribution.
80
+ So all in all, Word2Vec trained with skip-gram negative sampling is asymptotically unbiased for
81
+ this probability model, or more specifically for Word2Vec, this function.
82
+ In this model, the noise distribution offsets the modeled probability, serving as a baseline.
83
+ The embedding vectors capture the residual from the baseline.
84
+ Now, remind that the baseline probability is roughly proportional to the frequency.
85
+ Therefore, the embedding vectors capture the information other than the frequency.
86
+ In other words, SGNS Word2Vec has a built-in debiasing feature for frequency bias.
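+ The contrast between the two estimators can be sketched as follows, where u and v are the center and context vectors, p_noise is the context word's noise (frequency) probability, and k is the number of negative samples; this is a simplified rendering of the formulas above, not library code.
+
+ import numpy as np
+
+ def sigmoid(x):
+     return 1.0 / (1.0 + np.exp(-x))
+
+ def nce_posterior(u, v, p_noise, k):
+     # NCE keeps the noise term: P(real) = sigmoid(u.v - log(k * p_noise)).
+     return sigmoid(u @ v - np.log(k * p_noise))
+
+ def negative_sampling_prob(u, v):
+     # Negative sampling drops the noise term, so the dot product itself
+     # must absorb log(k * p_noise), which acts as a frequency baseline.
+     return sigmoid(u @ v)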
87
+ Now let us revisit the friendship paradox.
88
+ The sampling bias due to the friendship paradox is that the frequency of a node is determined
89
+ solely by the degree of the node.
90
+ Notice that this frequency is actually accounted for by the baseline probability.
91
+ Therefore, the friendship paradox has no effect thanks to the built-in debiasing feature of
92
+ SGNS Word2Vec.
93
+ This realization leads us to Residual2Vec.
94
+ The key idea is to model the baseline probability explicitly to control what bias to remove
95
+ in embedding.
96
+ So how can we model the baseline more specifically?
97
+ We start from the given graph and randomize the structure, then generate a sequence using
98
+ random walks, then calculate the conditional probability as the baseline, which is based
99
+ on the idea that we should remove biases arising from the trivial structure.
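+ For instance, if the randomization preserves node degrees (a configuration-model-style null model, which is our simplifying assumption here), the baseline visit probability of each node is proportional to its degree:
+
+ import numpy as np
+
+ def degree_baseline(degrees):
+     # Baseline probability that a walk on the degree-preserving
+     # randomized graph visits each node.
+     d = np.asarray(degrees, dtype=float)
+     return d / d.sum()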
100
+ This debiasing feature is useful to predict links in the graph.
101
+ Residual2Vec performs the best or nearly the best for all six graphs of different domains.
102
+ Furthermore, Residual2Vec is the best or the second best performer for a community detection
103
+ benchmark.
104
+ To showcase the debiasing feature, we constructed a citation graph of journals using the
105
+ Web of Science, where the nodes are journals connected by undirected and weighted
106
+ citations.
107
+ When applying GloVe embedding, all journals are concentrated in the center, reflecting
108
+ temporal aspects of the journals.
109
+ This is because the old journals have had time to accumulate many citations, and are therefore well
110
+ connected to many different journals.
111
+ Subject-wise, GloVe separates different fields to some extent.
112
+ With Residual2Vec, we can remove the biases due to time.
113
+ In effect, the old journals now spread out, and the disciplinary separations are more
114
+ clearly visible.
115
+ Beyond eyeballing the embeddings, we test the embeddings quantitatively by predicting
116
+ the journal impact factor as well as the subject categories.
117
+ We find that the impact factor and the subject of journals can be well predicted by removing
118
+ the temporal biases as well as the friendship paradox effect.
119
+ In summary, we show that Word2Vec has a built-in debiasing feature attributed to negative sampling.
120
+ Inspired by this finding, we propose Residual2Vec that can negate other types of structural
121
+ biases.
122
+ We demonstrate that removing biases not only improves the performance, but also enables
123
+ us to control the biases in the final representation.
124
+ Our results highlighted a new potential of negative sampling as a way to mitigate biases
125
+ in representations, which may be useful to address the problem of the biases in AI.
126
+ Although we have not studied the biases in AI, given the wide usage of negative sampling
127
+ to train AI, our approach may lead to methods and studies that expose and mitigate biases
128
+ in AI.
129
+ We believe that our approach contributes to the effort to create transparent and accountable
130
+ machine learning methods, especially because our method enables us to explicitly control
131
+ the biases in the graph representation.
132
+ That's all for the presentation, and finally I'd like to acknowledge Jisung Yoon, Isabel
133
+ Constantino, and Yong-Yeol Ahn for creating and adding momentum to this project for years,
134
+ and for all of you who watched this video.
135
+ If you want to know more in detail, please check out our paper.
136
+ Thanks!
demo_data/nips-2021/25969/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hello everyone, my name is Allen. I'm a PhD student from Stanford University. I'm presenting
2
+ our work Play to Grade, testing coding games as classifying Markov decision process. This
3
+ is joint work with Emma Brunskill and Chris Piech.
4
+ In this talk, we will highlight the central problem that we're trying to solve: scaling
5
+ up quality feedback for students learning to code. Grading interactive
6
+ coding games is very difficult, and we frame this as an instance of identifying if a program
7
+ has the same behavior as a desired MDP. Even with 11 labeled programs, we can achieve 94%
8
+ accuracy on real student assignments from code.org.
9
+ Each year, hundreds of thousands of people, children and adults alike, want to learn coding.
10
+ Modern massive online education platforms like code.org serve over 40% of US K-12 students.
11
+ Scaling up quality feedback for these students is crucial, especially in areas where there
12
+ are shortages of computer science teachers.
13
+ Interactive coding assignments are becoming more popular. It's a lot more fun for students
14
+ to program them. They're also a common type of programs for students to code. For example,
15
+ web pages are interactive. However, in order to grade them, teachers often need to play
16
+ each student's homework for 20 seconds to a couple of minutes. This quickly becomes a scaling
17
+ issue. A 20-student classroom might still be manageable, but in a large university where
18
+ there are hundreds of students taking the same class or on an online education platform
19
+ like code.org, grading these assignments is a real challenge. This places a real burden
20
+ on teachers.
21
+ Why is it difficult to develop automatic grading tools? First of all, each assignment is different
22
+ from the others. Traditional machine learning solutions that rely on collecting a large
23
+ data set simply won't work here. Oftentimes, assignments for the same class can even change
24
+ from year to year. Spending effort to collect a large labeled data set is a hard sell to teachers.
25
+ Second, the same assignment can be written in different coding languages. The solutions
26
+ could end up looking quite different. At last, code solutions can be very long, especially
27
+ when interaction is involved. Unfortunately, current state-of-the-art code analysis solutions
28
+ don't scale beyond 10 lines of code. In this work, we hope to offer a new solution
29
+ inspired by how human teachers grade these assignments.
30
+ Let's take a look at how a teacher plays to grade a student's homework. This is what
31
+ a correct solution for code.org's coding assignment, Bounce, looks like. The teacher
32
+ controls a paddle to bounce a ball into a goal post and gets one score.
33
+ Here's what an incorrect student submission looks like. The student didn't put the boundary
34
+ condition for the wall and the ball goes right through it.
35
+ Here's another incorrect submission. Instead of getting a point after successfully bouncing
36
+ the ball into the goal post, the player gets a point whenever the ball bounces on wall
37
+ and paddle. This is clearly not the correct behavior.
38
+ However, a teacher isn't just playing the game normally. In order to grade it, the teacher
39
+ has to play it in a specific way to expose bugs in the game. Take a look at both programs
40
+ on the left and right. Both have wall boundary problems, but we would never know if the teacher
41
+ didn't try to bounce the ball on the wall. The right panel shows a game, though broken,
42
+ can look like a perfectly correct game.
43
+ Using the Markov Decision Process framework from reinforcement learning, we can characterize
44
+ the intuition we have built up. The MDP framework can be used to describe any interactive environment,
45
+ not just games. It includes a state space, action space, a transition dynamics that defines
46
+ how the game moves from one frame to the next, and a reward function. We can train an agent
47
+ using a reinforcement learning algorithm that learns to maximize the reward. So how does
48
+ the MDP framework help us understand programs with bugs?
49
+ We can treat each program as its own MDP. The teacher's correct program is the correct
50
+ or desired MDP, while the student's program is another MDP or a test MDP. We can frame
51
+ grading as an instance of identifying if a test MDP has the same behavior as a desired
52
+ MDP. Using components from the MDP framework, we can express bugs as distance between two
53
+ MDPs' transition and reward functions. The ball going through the wall is clearly not
54
+ a correct transition. Receive reward when you shouldn't can also be captured by the
55
+ difference in the reward function output. More precisely, we can treat grading as calculating
56
+ a distance between two MDPs. Equation 1 might suggest that we should check over all states.
57
+ However, since distance is non-negative and we're interested in the overall sum, we
58
+ only need to find one state-action pair in the test MDP to know if the overall distance
59
+ is non-zero. If we set this distance as a reward for an RL agent, we can make the task
60
+ of reaching bug states a lot more intelligent and efficient. This RL agent's objective
61
+ is to reach states that have the highest potential to be different between the two MDPs with
62
+ respect to this distance function. We do have one more challenge that remains.
63
+ The distance function D(s, a) requires access to both MDPs' transition and reward functions.
64
+ We cannot assume we have access to the student program's inner mechanism. We can't control
65
+ the randomness in the student's code either, meaning two MDPs can have different random
66
+ initial starting positions. Therefore, when we interact with the student's MDP, we need
67
+ to learn a parametrized distance function that can tell us how far the observed state-action
68
+ pairs from the student MDP are from the correct MDP.
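+ Schematically, and assuming both MDPs expose a step(state, action) -> (next_state, reward) interface (our notation, not the paper's), the quantity being approximated is:
+
+ def mdp_disagreement(ref_mdp, test_mdp, state, action):
+     # D(s, a): disagreement of the two MDPs' dynamics and rewards at (s, a).
+     # In practice the student MDP's internals are unknown, so a learned
+     # classifier predicts this quantity from observed state-action pairs.
+     s_ref, r_ref = ref_mdp.step(state, action)
+     s_test, r_test = test_mdp.step(state, action)
+     return float(s_ref != s_test) + abs(r_ref - r_test)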
69
+ Now we have two parametrized models. The agent requires training to find the bug. The classifier
70
+ requires training to identify the bug. We call this the cold-start problem. So, if I
71
+ have a classifier that can classify which state triggers a bug, then we can simply replace
72
+ reward function in the MDP with this classifier and directly teach our agent. If I have an
73
+ agent that can always reach the bug state, I can probably just collect a dataset of trajectories
74
+ and train a good classifier. But at the beginning, neither the agent nor the classifier can do
75
+ a very good job. Therefore, we introduce a procedure called
76
+ collaborative training. The agent will start out as a random agent, where we can train
77
+ the agent to maximize the original reward in the MDP. It collects trajectories and trains
78
+ the classifier. Then we use the classifier as a reward function to guide the agent on
79
+ how to reach bug states. They both start out bad, but the agent can help the classifier
80
+ learn and the classifier can in return teach the agent.
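+ In outline, one round of this loop might look like the following sketch; rollout, train_classifier, and train_agent are assumed helper functions passed in by the caller, not the paper's API.
+
+ def collaborative_training(env, agent, classifier,
+                            rollout, train_classifier, train_agent,
+                            rounds=3, episodes=100):
+     for _ in range(rounds):
+         # 1. Roll out the current agent to collect trajectories.
+         trajectories = [rollout(env, agent) for _ in range(episodes)]
+         # 2. Train the bug classifier on the collected state-action pairs.
+         train_classifier(classifier, trajectories)
+         # 3. Retrain the agent with the classifier's bug probability as reward.
+         train_agent(agent, env, reward_fn=classifier.bug_probability)
+     return agent, classifier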
81
+ We present two baselines to train the bug classifier. Since we have some training data,
82
+ though not a lot, we can simply apply coarse labeling, creating a dataset where all state-action
83
+ pairs from the correct labeled MDP as non-bug states and all state-action pairs from the
84
+ broken MDP as bug states. This is incredibly noisy because not all state-action pairs from
85
+ the broken MDP are bug states, only a few of them are. But this is a good baseline to
86
+ have. We can also train an unsupervised learning
87
+ model to memorize all state-action pairs from the correct MDP and use log probability or
88
+ reconstruction loss to detect abnormal state-action pairs in the broken MDP.
89
+ Inspired by Hoare triples and the MDP state equivalence literature, we designed two models to fully
90
+ capture this notion of MDP-based state difference. We assume that the students can specify and
91
+ set random seed for their game. Therefore, the game objects, such as a ball, will not
92
+ always appear in the same initial state. Therefore, it is crucial for us to approximate one MDP's
93
+ transition dynamics and reward function. When our agent interacts with a new MDP, this is
94
+ where Hoare LSTM comes in. We train it to model the correct MDP's transition dynamics and
95
+ reward function, and treat states in the new MDP as bug states when sufficient deviation occurs from
96
+ the prediction. We further introduce contrastive Hoare LSTM.
97
+ Sometimes the agent will explore a new region that it might not have visited in the correct
98
+ MDP. The difference between the observed state and the predicted state is in
99
+ fact a function approximation error. In order to reduce this error, we approximate both
100
+ the correct MDP and the broken MDP.
101
+ Let's take a look at how these models work. We introduce a car environment. In here, the
102
+ student miscalculated the boundary of this environment, so whenever the car goes outside
103
+ of the red dotted line, it will get stuck and can only wriggle back and forth. This
104
+ is a task where you will always reach a bug state at the end of each trajectory. Therefore,
105
+ every single agent is already an optimal agent. We create a specific one that only knows how
106
+ to drive north in a straight line.
107
+ As we can see, almost all models, except the Gaussian mixture model, can get close to 100% accuracy
108
+ at classifying bug states and non-bug states. However, the agent that only knows how to
109
+ drive north is not a very interesting agent, and we probably will never use that in real
110
+ life. So what if we make it a little bit harder?
111
+ We can create an agent that drives the car randomly. Now the trajectory will become different
112
+ each time. We see a significant drop in performance for baseline solutions like noisy supervised
113
+ learning and variational autoencoder. However, our LSTM-based models can still do very well
114
+ at close to 100% accuracy. This is a pretty challenging task because we're measuring the
115
+ accuracy of each classifier on every state in a trajectory, even though we're in a toy
116
+ environment.
117
+ Let's make this setting even harder. The car environment can stay the same, but for now,
118
+ bugs can only be triggered if the agent successfully drives the car into some small red rectangular
119
+ areas. Not all agents are optimal now, and it would be unlikely for a single-direction
120
+ agent to ever see a bug state. We can now showcase the power of collaborative training
121
+ through this example.
122
+ We can see at the beginning, the agent is pretty random, and the classifier is pretty
123
+ bad except for the LSTM models. However, after only one round of collaborative training,
124
+ we see a substantial improvement for the two baseline models, both noisy supervised learning
125
+ model and variational autoencoder are able to improve their accuracy by 30% and precision
126
+ by 60%. This shows that the collaborative training is helping both the agent and the
127
+ classifier to be more optimal, even for the weaker classifiers.
128
+ We also notice that this improvement is not monotonic. Just like every other AI training
129
+ scheme, overfitting sometimes happens. Only the most expressive classifiers, our proposed
130
+ Hoare LSTM and contrastive Hoare LSTM, can remain stable and even mildly improve their recall
131
+ in the last round of collaborative training.
132
+ We can directly examine the agent's learning by looking at its trajectory. At first, the
133
+ agent drives the car randomly, but after only one round of collaborative training, the agent
134
+ becomes sharply focused and only visits the possible buggy areas.
135
+ We verify our method on a real student dataset that we obtained from code.org. We use this
136
+ assignment as our motivating examples earlier. Bounce is a simple coding exercise where 450,000
137
+ students have submitted their solutions. We built a simulator that can run and execute
138
+ students' programs that conforms to the OpenAI Gym API. For each student program, we have
139
+ created gold labels for bug behaviors. We further binarize them into a single label
140
+ indicating correct or incorrect.
141
+ Bounce is a lot more complicated than car. Learning to bounce a ball into the goalpost
142
+ and understanding the physics is a lot more difficult for the agent. Therefore, we pre-train
143
+ the agent using the score as a reward. We call this play-to-win agent. Then we use this
144
+ agent to train our bug classifier. We're able to reach 94% accuracy with only 11 labeled
145
+ programs as training data. A similar algorithm that uses code as text input cannot match
146
+ our method's performance due to the smallness of the training dataset.
147
+ In addition to just grading, since we're able to determine bugs at the state level,
148
+ we can simply record a few frames before and after the bug occurs and compile a short video
149
+ for the students to demonstrate what the bug is in their assignment.
150
+ To summarize our work, we provide a fully functional simulator and a massive amount
151
+ of real student programs with gold labels. We demonstrate that our solution achieves
152
+ high performance. However, many problems still remain. For example, can we
153
+ know which bug is triggered in the student program? This is helpful for providing fine-grained
154
+ feedback to the students. Training an RL agent with a classifier has also been explored in
155
+ other areas like SafeRL, where unsafe states are predicted by a classifier.
156
+ At last, we pose this question of creativity. Can our formulation accommodate creativity?
157
+ Creative programs are different but not broken. A ball can move faster or slower than the
158
+ teacher's solution, but it doesn't mean it's wrong. Exploring how we can recognize
159
+ and encourage student creativity is crucial for automated grading. Thanks for listening.
160
+ Come and chat with me during the poster session.
demo_data/nips-2021/25970/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hi, my name is Maxwell Nye, and today I'll be talking about improving coherence and consistency
2
+ in neural sequence models with dual system neurosymbolic reasoning.
3
+ So I first want to give a little bit of a demo, which is to ask this question.
4
+ A bat and a ball cost $1.10 in total.
5
+ The bat costs $1 more than the ball.
6
+ How much does the ball cost?
7
+ So I'll let you think a little bit for this.
8
+ So one answer that sort of might jump out at you is $0.10, but this is actually incorrect
9
+ because the sum of the two objects should be $1.10.
10
+ So the correct answer is actually $0.05.
11
+ And this is an example from a cognitive reflection test, and these are questions designed to
12
+ have a particular answer which comes to mind quite quickly, which is in fact wrong.
13
+ And something that's interesting is that large-scale language models such as GPT-3 predict the
14
+ wrong answers as well.
15
+ And this is true not just for the sort of the classic cognitive reflection test, but
16
+ also for variants with different numbers.
17
+ So this is sort of an interesting thing.
18
+ It talks about how neural language models often have issues with consistency and coherence.
19
+ So another place where we can see this a little more concretely is the CLUTRR dataset.
20
+ In the CLUTRR dataset, models are trained to...
21
+ There are sentences about people and their family relationships and stories about those
22
+ people.
23
+ And this was originally devised as a question-answering data set where you ask what the relations
24
+ are.
25
+ One thing you can do is train models on this dataset and then have them generate new stories.
26
+ And when you do that, you'll see that often the generated stories have inconsistency.
27
+ So if we look at the bottom of the screen here, we can see an example of this.
28
+ Robert and his brother Antonio played harmonicas together.
29
+ Robert's daughter, Elsie, asked him to play with her.
30
+ Elsie doesn't like having to babysit her younger brother, Antonio.
31
+ And so we can see that this is a common sense error because Elsie is not the younger brother
32
+ of Antonio.
33
+ Or Elsie's younger brother is not Antonio.
34
+ So what we've done is we've built a dual system model using large-scale neural networks and
35
+ symbolic deliberative logic in order to try to help with these consistency issues.
36
+ So the model is as follows.
37
+ You use neural generation to generate sentences in a particular story.
38
+ You might generate the next sentence using a model such as GPT-3 or BART.
39
+ What you can then do is parse that sentence into the semantic meaning with respect to
40
+ the family relationships and check whether or not it matches the current state of the
41
+ family relationships that's been described so far, and only accept the candidate sentence
42
+ generations that are actually consistent.
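+ The accept/reject loop can be sketched like this, where generator, parser, and world_model are assumed interfaces standing in for the neural generator, the few-shot parser, and the symbolic checker:
+
+ def generate_consistent_sentence(generator, parser, world_model, max_tries=10):
+     for _ in range(max_tries):
+         candidate = generator.sample_next_sentence()
+         facts = parser.parse(candidate)        # e.g. few-shot GPT-3 parsing
+         if world_model.is_consistent(facts):   # e.g. an SMT solver check
+             world_model.add(facts)
+             return candidate
+     return None  # no consistent candidate found within the budget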
43
+ So this has a few components.
44
+ One of the components here is a symbolic world model.
45
+ In the case of this CLUTRR domain, the symbolic world model that we built encodes people and
46
+ their family relationships.
47
+ So in other words, you could take a sentence and encode what the underlying family relationship
48
+ is.
49
+ And what you can do is you can use SMT solvers such as the Z3 solver to check consistency.
50
+ So given a new sentence, you can check that it doesn't disobey the rules of ancestry that
51
+ we've defined here.
52
+ And so some of those are, for example, what is the relationship between children and grandchildren?
53
+ And then another is what are the rules about whether ancestry, can you be your own ancestor,
54
+ et cetera.
55
+ So one question is how is this semantic parsing done?
56
+ And it turns out we can actually do this quite cheaply using GPT-3.
57
+ So what we can see here in the dotted box is an actual example of a few-shot prompt
58
+ we can use to parse each new sentence, each new candidate sentence from the system one
59
+ generation model and parse it into the semantic form that we can then give to the world model
60
+ solver.
61
+ So the results here show that stories generated with this dual-system neurosymbolic approach
62
+ show improved coherence over sentences constructed by a neural model alone.
63
+ So the example here is that what we've done is we've used human judgments on which of
64
+ the following sentences make more sense given the prior context of the story.
65
+ And we see that if we use a symbolic world model and the parsing scheme described above,
66
+ humans prefer the judgments given by this model.
67
+ We can also apply the same sort of reasoning to a completely different task.
68
+ Here we can discuss a grounded instruction following task, in the
69
+ domain called gSCAN.
70
+ In this domain, the goal is to have an agent, which is shown by this pink triangle, follow
71
+ a command to perform some simple action in this grid world.
72
+ So you can see here, walk to a small yellow cylinder might be an example of a command.
73
+ Prior work has shown that one thing you can do is encode the initial state, encode the
74
+ instruction and then train a neural model to predict the action sequences.
75
+ Other work has also shown that one thing you can do is train a model to predict a distribution
76
+ over the correct target location as part of the neural model.
77
+ That will also increase the performance of the model.
78
+ What we do here is show that if you do both of these things, you predict both an action
79
+ sequence and a target location, like what is the location you should end up in, and
80
+ then check whether or not when you execute the set of instructions, you will end up in
81
+ the predicted target location.
82
+ You can sort of check consistency between these two different predictions and only accept
83
+ those instruction sequences which match the target location prediction.
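+ The agreement check can be sketched as follows, with execute standing in for an assumed environment simulator passed in by the caller:
+
+ def accept_if_consistent(model, state, instruction, execute):
+     actions = model.predict_actions(state, instruction)
+     target = model.predict_target_location(state, instruction)
+     # Keep the action sequence only if executing it really ends
+     # at the model's own predicted target location.
+     final_state = execute(state, actions)
+     return actions if final_state.agent_position == target else None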
84
+ And this also leads to higher accuracy, especially in a low-data regime.
85
+ We have more details about the results in the paper.
86
+ So that's a little bit of an overview of our paper.
87
+ Our takeaways are that you can build systems with combined neural methods and explicit
88
+ world knowledge.
89
+ And if you add just a little bit of world knowledge, you can really help increase coherence
90
+ and consistency for these large sequence models.
91
+ There are some challenges here about parsing in larger scale domains and also what it would
92
+ mean to automatically build a more complete world model.
93
+ Thank you very much.
demo_data/nips-2021/25973/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hi everyone, I'm Jingwen, a PhD student in National University of Singapore.
2
+ In this paper, we introduce the dual-aspect collaborative transformer for solving routing problems.
3
+ Until now, the neural solvers for VRPs could be classified into two types.
4
+ The first one is the neural construction solver.
5
+ It starts from an empty solution and iteratively adds a customer node to the solution,
6
+ until all customers have been visited.
7
+ And in this paper, we focus more on the neural improvement solvers.
8
+ They start from an initial complete solution and iteratively improve the solution
9
+ based on the node features and solution features, until reaching a step limit T.
10
+ Although the transformer has shown its efficiency in processing sequence data,
11
+ its positional encoding method may not be optimal for encoding the VRP solutions,
12
+ because it only learns a unified set of embeddings and combines the node embeddings
13
+ and the positional embeddings together.
14
+ Also, it can only encode linear sequences,
15
+ which cannot capture the circularity and symmetry of VRP solutions.
16
+ So in this paper, we introduce the dual-aspect representation,
17
+ which could better describe the VRP solutions.
18
+ We separate the learning into node feature embeddings and positional feature embeddings
19
+ based on the cross-aspect referential attention.
20
+ And in this table, we compare the performance of dual-aspect and single-aspect.
21
+ We can see the dual-aspect outperforms the single-aspect.
22
+ And here we introduce the cyclic positional encoding.
23
+ In this figure, we describe the embedding vectors and correlations between every two embeddings
24
+ of the original PE and our CPE method in subfigures A and B.
25
+ In subfigure C, we describe the top two principal components after PCA projection.
26
+ And we can see our CPE method can better capture the circularity of VRP solutions.
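+ As a rough illustration of cyclic rather than linear positions (our simplification, not the paper's exact formula), one can place the n positions of a tour on a circle and take sinusoids of the resulting angles, so that position n-1 stays close to position 0:
+
+ import numpy as np
+
+ def cyclic_positional_encoding(n_positions, d_model):
+     angles = 2.0 * np.pi * np.arange(n_positions) / n_positions
+     freqs = np.arange(1, d_model // 2 + 1)
+     phase = np.outer(angles, freqs)    # shape: (n_positions, d_model // 2)
+     return np.concatenate([np.sin(phase), np.cos(phase)], axis=1)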
27
+ And here we did some ablation studies on the CPE method,
28
+ which can achieve better generalization performance.
29
+ And now we introduce our curriculum learning strategy in the training process.
30
+ And in this method, we train with an n-step PPO method and a curriculum learning strategy.
31
+ It gradually prescribes higher quality solutions as the initial states for training.
32
+ And in this graph, we describe two curves.
33
+ The blue one is the PPO method only,
34
+ and the green one is the PPO method with our curriculum learning strategy.
35
+ And we can see the green one is more stable and achieves lower objective values.
36
+ And here is the performance comparison of our method and some baselines on both TSP and CVRP.
37
+ We can see our DACT outperforms the existing transformer-based improvement models.
38
+ So, based on these experiments, we can see our DACT performs very well for the routing problems.
39
+ And in the future, we hope to use this method to solve more combinatorial optimization problems.
40
+ Thank you.
demo_data/nips-2021/25974/transcript_whisper_large-v2.txt ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hi, I am Mohammad Pezeshki from Mila and today I am going to talk about gradient starvation.
2
+ This is joint work with Sékou-Oumar Kaba, Yoshua Bengio, Aaron Courville, Doina Precup, and Guillaume
3
+ Lajoie.
4
+ Let me start with a story.
5
+ Back in 1904, there was a horse called Hans and people believed that he could do arithmetic.
6
+ Here is an article from New York Times published in 1904.
7
+ The article says that Hans is an expert in numbers.
8
+ For example, when two numbers of 5 and 9 are written on a blackboard, Hans replies by tapping
9
+ on the ground 14 times.
10
+ Seven years later, in an article, Oskar Pfungst unveiled that the so-called Clever Hans was
11
+ not actually capable of doing any arithmetic and instead reading subtle hints in his trainer's
12
+ behavior indicating when to stop tapping.
13
+ As the article says, even the trainer was not aware of providing these shortcut signals.
14
+ So Hans was clever but probably not in doing arithmetic.
15
+ Its cleverness was in reading his trainer's clues.
16
+ A similar phenomenon has been observed in many applications of machine learning.
17
+ Essentially, these are situations where the model seemingly has a very good performance but
18
+ in fact it hasn't learned true underlying relationships between the input and the target.
19
+ In this paper by Robert Geirhos and co-authors, they list several instances of what they call
20
+ shortcut learning.
21
+ For example, in a task of image captioning, the model predicts grazing sheep only by seeing
22
+ the green hillside.
23
+ In another instance, the network hallucinates a teapot with high confidence in an image
24
+ of pure noise.
25
+ This is another and indeed dangerous example of the task of pneumonia detection from x-ray
26
+ images.
27
+ The model appears to have a very good performance even on the test set.
28
+ However, the heat maps reveal that the network is not looking at the lungs at all
29
+ and is just latching onto some features in the corner of the image.
30
+ The intuition behind this phenomenon is folk knowledge in one form or another.
31
+ Given strongly correlated and fast-to-learn features in the training data, gradient descent
32
+ is biased towards learning them first.
33
+ However, this intuition is a bit abstract and hand-wavy, so let's look at a more concrete
34
+ example.
35
+ Consider a 2D classification task with red and blue data points as shown.
36
+ If you train a neural network on this data, here is the decision boundary that we learn.
37
+ Now consider slightly different arrangements of the data points such that the blue data
38
+ points are slightly shifted to the left and the red data points are shifted to the right,
39
+ making the data linearly separable.
40
+ Now if we train a neural network on this, we get an almost linear decision boundary.
41
+ Note that the network is only making its predictions based on the feature along the x-axis.
42
+ Indicated in the red circle here, you can see that the decision boundary is very close
43
+ to the data points.
44
+ However, the network is super confident on its predictions and the training loss is indeed
45
+ zero.
46
+ So you can see that slightly perturbing the data points can get the network to predict
47
+ an incorrect label with high confidence.
48
+ This problem will be even more visible when testing the model on OOD, meaning out of distribution
49
+ test data.
50
+ An online interactive demo of this work is available on a blog post we wrote.
51
+ If you wish to play with it a bit, please visit the link provided here.
52
+ So we hypothesize that what is happening here is gradient starvation.
53
+ Gradient starvation is a phenomenon in which a neural network captures statistically dominant
54
+ features while remaining invariant to the rest.
55
+ Here gradient descent leads to parameter updates, predominantly in directions that only capture
56
+ these dominant features, thus starving the gradient from other potentially informative
57
+ features.
58
+ Here, the notions of feature and dominancy of a feature is rather vague.
59
+ To define them more formally, we need to look into the learning dynamics.
60
+ In the interest of time, I will be covering only the general intuition of our results
61
+ and encourage interested audiences to take a look at the full paper for detailed treatment.
62
+ So the two main theorems of the paper can be summarized into these two plots that I
63
+ now explain.
64
+ Let's first start with gradient starvation itself on the left.
65
+ We train a model with common binary cross entropy loss.
66
+ On the x-axis we have training iterations or epochs, and on the y-axis we monitor two
67
+ features z1 and z2.
68
+ Their dynamics depend on several factors, including their strength, meaning how easy
69
+ or how hard it is for the network to learn those features, and their correlation with
70
+ the target.
71
+ Here, z1 has a larger correlation and hence converges to a value around 6, and z2 with
72
+ a smaller correlation converges to a value around 2.
73
+ However, the strength is equal, i.e. kappa is set to be 1.
74
+ Again, it means that both of these features are equally easy for the network to learn.
75
+ Now let's keep their correlation fixed but increase the strength of z1.
76
+ A kappa equal to 2 means that z1 is learned easier than z2.
77
+ We can immediately see that although their correlation is still the same as before, z1
78
+ is overestimated while z2 is underestimated.
79
+ If we make kappa to be 4 or 8, it becomes more evident that simply because z1 is easier
80
+ to learn, it is being overestimated, while z2 is being starved.
81
+ Our theory shows that an increase in the strength of feature z1 has a detrimental effect on
82
+ the learning of feature z2.
83
+ Now our second theory shows that adding this term, indicated in the red rectangle, to the
84
+ loss decouples the features.
85
+ As you can see, spectral decoupling decouples the features at the converged solution.
86
+ Regardless of the value of kappa, all of the experiments on z1 and z2 converge to the same
87
+ place.
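+ In PyTorch-style code, the added term amounts to an L2 penalty on the network's logits rather than on its weights; this is a sketch consistent with the description above, and the coefficient is arbitrary.
+
+ import torch.nn.functional as F
+
+ def spectral_decoupling_loss(logits, targets, lam=1e-2):
+     # Cross-entropy plus an L2 penalty on the logits themselves,
+     # which suppresses the dominant features and decouples the rest.
+     return F.cross_entropy(logits, targets) + 0.5 * lam * (logits ** 2).mean()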
88
+ Again, we refer the interested audience to the paper for more theory as well as more intuition.
89
+ Now let's look at some experiments.
90
+ Recall the task that we studied earlier.
91
+ When the data is not linearly separable, we learn a curved decision boundary.
92
+ On the right, we see how z1 and z2 evolve.
93
+ When the data is linearly separable with a small margin, a linear decision boundary is
94
+ learned.
95
+ We observe that z1 is overestimated, while z2 is heavily underestimated.
96
+ Now let's see what happens if we add spectral decoupling.
97
+ Spectral decoupling suppresses z1 and as a result allows z2 to grow.
98
+ It also appears that other regularization methods do not succeed at learning a curved
99
+ decision boundary.
100
+ So we observed that spectral decoupling leads to a decision boundary with a larger margin.
101
+ What happens in real-world tasks?
102
+ The distance to the decision boundary is not trivial to compute when working with nonlinear
103
+ models.
104
+ However, we can use a proxy.
105
+ The amount of perturbation required to fool the network is a proxy to the margin.
106
+ Look at the plot on the right.
107
+ On the x-axis, we have the amount of perturbation and on the y-axis, we have how many of the
108
+ examples are misclassified.
109
+ You can see that with a fixed amount of perturbation, a model with vanilla binary cross entropy
110
+ is much more vulnerable compared to a model trained with spectral decoupling.
111
+ In another experiment, we studied colored MNIST, a well-known task of OOD generalization
112
+ where the color is spuriously correlated with the labels.
113
+ Another task of OOD generalization is a classification task on the CelebA dataset
114
+ where the training data is again biased with respect to the color of the hair and the gender
115
+ such that most of the male images have black hair while the majority of female images have blonde
116
+ hair.
117
+ Here, we skip the details in the interest of time.
118
+ However, let me just draw your attention to the superiority of spectral decoupling in
119
+ these both tasks.
120
+ Finally, to conclude, we talked about the Clever Hans effect.
121
+ We showed that a similar phenomenon can happen in neural networks and we called that gradient
122
+ starvation.
123
+ To understand gradient starvation, we looked into the learning dynamics.
124
+ We showed that the presence of a strongly correlated feature could result in a starvation
125
+ of other features.
126
+ We also showed that spectral decoupling provides some degree of control over what features
127
+ to learn and decouples essentially the features.
128
+ Thanks for your attention.
129
+ If you're interested to chat more, please visit our poster this afternoon.
130
+ Thank you very much.
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- webvtt-py
2
- transformers
3
- requests
4
- pandas
5
- nltk
6
- sentencepiece
7
  torch
 
1
+ webvtt-py
2
+ transformers
3
+ requests
4
+ pandas
5
+ nltk
6
+ sentencepiece
7
  torch