Spaces:
Running
Running
ScientiaEtVeritas
committed on
Commit
•
e4b6f2e
1
Parent(s):
317a988
txt transcripts
Browse files- app.py +322 -322
- demo_data/nips-2021/25953/transcript_whisper_large-v2.txt +193 -0
- demo_data/nips-2021/25957/transcript_whisper_large-v2.txt +179 -0
- demo_data/nips-2021/25958/transcript_whisper_large-v2.txt +124 -0
- demo_data/nips-2021/25959/transcript_whisper_large-v2.txt +117 -0
- demo_data/nips-2021/25962/transcript_whisper_large-v2.txt +51 -0
- demo_data/nips-2021/25963/transcript_whisper_large-v2.txt +178 -0
- demo_data/nips-2021/25964/transcript_whisper_large-v2.txt +366 -0
- demo_data/nips-2021/25965/transcript_whisper_large-v2.txt +136 -0
- demo_data/nips-2021/25969/transcript_whisper_large-v2.txt +160 -0
- demo_data/nips-2021/25970/transcript_whisper_large-v2.txt +93 -0
- demo_data/nips-2021/25973/transcript_whisper_large-v2.txt +40 -0
- demo_data/nips-2021/25974/transcript_whisper_large-v2.txt +130 -0
- requirements.txt +6 -6
app.py
CHANGED
@@ -1,322 +1,322 @@
|
|
1 |
-
import itertools
|
2 |
-
import json
|
3 |
-
import re
|
4 |
-
from functools import partial
|
5 |
-
from pathlib import Path
|
6 |
-
|
7 |
-
import pandas as pd
|
8 |
-
import requests
|
9 |
-
import streamlit as st
|
10 |
-
import webvtt
|
11 |
-
from transformers import AutoTokenizer
|
12 |
-
|
13 |
-
from generate_text_api import TextGenerator
|
14 |
-
from model_inferences.utils.chunking import Truncater
|
15 |
-
from model_inferences.utils.files import get_captions_from_vtt, get_transcript
|
16 |
-
|
17 |
-
USE_PARAGRAPHING_MODEL = True
|
18 |
-
|
19 |
-
def get_sublist_by_flattened_index(A, i):
|
20 |
-
current_index = 0
|
21 |
-
for sublist in A:
|
22 |
-
sublist_length = len(sublist)
|
23 |
-
if current_index <= i < current_index + sublist_length:
|
24 |
-
return sublist, A.index(sublist)
|
25 |
-
current_index += sublist_length
|
26 |
-
return None, None
|
27 |
-
|
28 |
-
import requests
|
29 |
-
|
30 |
-
|
31 |
-
def get_talk_metadata(video_id):
|
32 |
-
url = "https://www.ted.com/graphql"
|
33 |
-
|
34 |
-
headers = {
|
35 |
-
"Content-Type": "application/json",
|
36 |
-
"Accept": "application/json",
|
37 |
-
"x-operation-name": "Transcript", # Replace with the actual operation name
|
38 |
-
}
|
39 |
-
|
40 |
-
data = {
|
41 |
-
"query": """
|
42 |
-
query GetTalk($videoId: ID!) {
|
43 |
-
video(id: $videoId) {
|
44 |
-
title,
|
45 |
-
presenterDisplayName,
|
46 |
-
nativeDownloads {medium}
|
47 |
-
}
|
48 |
-
}
|
49 |
-
""",
|
50 |
-
"variables": {
|
51 |
-
"videoId": video_id, # Corrected key to "videoId"
|
52 |
-
},
|
53 |
-
}
|
54 |
-
|
55 |
-
response = requests.post(url, json=data, headers=headers)
|
56 |
-
|
57 |
-
if response.status_code == 200:
|
58 |
-
result = response.json()
|
59 |
-
return result
|
60 |
-
else:
|
61 |
-
print(f"Error: {response.status_code}, {response.text}")
|
62 |
-
|
63 |
-
class OfflineTextSegmenterClient:
|
64 |
-
def __init__(self, host_url):
|
65 |
-
self.host_url = host_url.rstrip("/") + "/segment"
|
66 |
-
|
67 |
-
def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
|
68 |
-
payload = {
|
69 |
-
'text': text,
|
70 |
-
'captions': captions,
|
71 |
-
'generate_titles': generate_titles,
|
72 |
-
"prefix_titles": True,
|
73 |
-
"threshold": threshold,
|
74 |
-
}
|
75 |
-
|
76 |
-
headers = {
|
77 |
-
'Content-Type': 'application/json'
|
78 |
-
}
|
79 |
-
|
80 |
-
response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
|
81 |
-
#segments = response["annotated_segments"] if "annotated_segments" in response else response["segments"]
|
82 |
-
return {'segments':response["segments"], 'titles': response["titles"], 'sentences': response["sentences"]}
|
83 |
-
|
84 |
-
class Toc:
|
85 |
-
|
86 |
-
def __init__(self):
|
87 |
-
self._items = []
|
88 |
-
self._placeholder = None
|
89 |
-
|
90 |
-
def title(self, text):
|
91 |
-
self._markdown(text, "h1")
|
92 |
-
|
93 |
-
def header(self, text):
|
94 |
-
self._markdown(text, "h2", " " * 2)
|
95 |
-
|
96 |
-
def subheader(self, text):
|
97 |
-
self._markdown(text, "h3", " " * 4)
|
98 |
-
|
99 |
-
def placeholder(self, sidebar=False):
|
100 |
-
self._placeholder = st.sidebar.empty() if sidebar else st.empty()
|
101 |
-
|
102 |
-
def generate(self):
|
103 |
-
if self._placeholder:
|
104 |
-
self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
|
105 |
-
|
106 |
-
def _markdown(self, text, level, space=""):
|
107 |
-
key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())
|
108 |
-
st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
|
109 |
-
self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
|
110 |
-
|
111 |
-
# custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
|
112 |
-
# st.write(custom_css, unsafe_allow_html=True)
|
113 |
-
|
114 |
-
def concat_prompt(prompt_text, text, model_name):
|
115 |
-
if 'flan' in model_name:
|
116 |
-
input_ = prompt_text + "\n\n" + text
|
117 |
-
elif 'galactica' in model_name:
|
118 |
-
input_ = text + "\n\n" + prompt_text
|
119 |
-
return input_
|
120 |
-
|
121 |
-
endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
|
122 |
-
ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
|
123 |
-
|
124 |
-
client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
|
125 |
-
if USE_PARAGRAPHING_MODEL:
|
126 |
-
paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
|
127 |
-
summarizer = TextGenerator(endpoint)
|
128 |
-
|
129 |
-
tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
|
130 |
-
|
131 |
-
# TLDR PROMPT
|
132 |
-
|
133 |
-
SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
|
134 |
-
|
135 |
-
TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
136 |
-
{system_prompt}
|
137 |
-
<</SYS>>
|
138 |
-
|
139 |
-
{user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
|
140 |
-
|
141 |
-
TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
|
142 |
-
|
143 |
-
TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
144 |
-
TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
145 |
-
|
146 |
-
BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
147 |
-
{system_prompt}
|
148 |
-
<</SYS>>
|
149 |
-
|
150 |
-
{user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
|
151 |
-
|
152 |
-
BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
|
153 |
-
|
154 |
-
BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
155 |
-
BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
156 |
-
|
157 |
-
CONTEXT_LENGTH = 3072
|
158 |
-
MAX_SUMMARY_LENGTH = 1024
|
159 |
-
TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
|
160 |
-
BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
|
161 |
-
|
162 |
-
|
163 |
-
text_generator = TextGenerator(endpoint)
|
164 |
-
temperature = 0.7
|
165 |
-
|
166 |
-
import re
|
167 |
-
|
168 |
-
|
169 |
-
def replace_newlines(text):
|
170 |
-
updated_text = re.sub(r'\n+', r'\n\n', text)
|
171 |
-
return updated_text
|
172 |
-
|
173 |
-
def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
|
174 |
-
all_generated_text = prefix
|
175 |
-
truncater = Truncater(tokenizer, max_length=max_input_length)
|
176 |
-
input_ = truncater(input_)
|
177 |
-
input_ = prompt.format(input=input_)
|
178 |
-
for generated_text in summarizer.generate_text_stream(input_, max_new_tokens=MAX_SUMMARY_LENGTH, do_sample=True, temperature=temperature):
|
179 |
-
all_generated_text += replace_newlines(generated_text)
|
180 |
-
generated_text_box.info(all_generated_text)
|
181 |
-
print(all_generated_text)
|
182 |
-
return all_generated_text.strip()
|
183 |
-
|
184 |
-
st.header("Demo: Intelligent Recap")
|
185 |
-
|
186 |
-
if not hasattr(st, 'global_state'):
|
187 |
-
st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
|
188 |
-
# NIPS 2021 Talks
|
189 |
-
transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
|
190 |
-
# get titles from metadata.json
|
191 |
-
transcripts_map = {}
|
192 |
-
for transcript_file in transcript_files:
|
193 |
-
base_path = transcript_file.parent
|
194 |
-
metadata = base_path / "metadata.json"
|
195 |
-
txt_file = base_path / "transcript_whisper_large-v2.txt"
|
196 |
-
with open(metadata) as f:
|
197 |
-
metadata = json.load(f)
|
198 |
-
title = metadata["title"]
|
199 |
-
transcript = get_transcript(txt_file)
|
200 |
-
captions = get_captions_from_vtt(transcript_file)
|
201 |
-
transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
|
202 |
-
st.global_state['NIPS 2021 Talks'] = transcripts_map
|
203 |
-
|
204 |
-
data = pd.read_json("demo_data/ted_talks.json")
|
205 |
-
video_ids = data.talk_id.tolist()
|
206 |
-
transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
|
207 |
-
transcripts_map = {}
|
208 |
-
for video_id, transcript in zip(video_ids, transcripts):
|
209 |
-
metadata = get_talk_metadata(video_id)
|
210 |
-
title = metadata["data"]["video"]["title"]
|
211 |
-
presenter = metadata["data"]["video"]["presenterDisplayName"]
|
212 |
-
print(metadata["data"])
|
213 |
-
if metadata["data"]["video"]["nativeDownloads"] is None:
|
214 |
-
continue
|
215 |
-
video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
|
216 |
-
transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
|
217 |
-
st.global_state['TED Talks'] = transcripts_map
|
218 |
-
|
219 |
-
def get_lecture_id(path):
|
220 |
-
return int(path.parts[-2].split('-')[1])
|
221 |
-
|
222 |
-
transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
|
223 |
-
sorted_path_list = sorted(transcript_files, key=get_lecture_id)
|
224 |
-
|
225 |
-
transcripts_map = {}
|
226 |
-
for transcript_file in sorted_path_list:
|
227 |
-
base_path = transcript_file.parent
|
228 |
-
lecture_id = base_path.parts[-1]
|
229 |
-
transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
|
230 |
-
video_path = Path(base_path, "video.mp4")
|
231 |
-
transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
|
232 |
-
st.global_state['KIT Lectures'] = transcripts_map
|
233 |
-
|
234 |
-
type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
|
235 |
-
|
236 |
-
transcripts_map = st.global_state[type_of_document]
|
237 |
-
|
238 |
-
selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
|
239 |
-
|
240 |
-
st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
|
241 |
-
|
242 |
-
input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
|
243 |
-
|
244 |
-
toc = Toc()
|
245 |
-
|
246 |
-
summarization_todos = []
|
247 |
-
|
248 |
-
with st.expander("Adjust Thresholds"):
|
249 |
-
threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
|
250 |
-
paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
|
251 |
-
|
252 |
-
if st.button("Process Transcript"):
|
253 |
-
with st.sidebar:
|
254 |
-
st.header("Table of Contents")
|
255 |
-
toc.placeholder()
|
256 |
-
|
257 |
-
st.header(selected_talk, divider='rainbow')
|
258 |
-
# if 'presenter' in transcripts_map[selected_talk]:
|
259 |
-
# st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
|
260 |
-
|
261 |
-
captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
|
262 |
-
result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
|
263 |
-
if USE_PARAGRAPHING_MODEL:
|
264 |
-
presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
|
265 |
-
paragraphs = presult['segments']
|
266 |
-
segments, titles, sentences = result['segments'], result['titles'], result['sentences']
|
267 |
-
|
268 |
-
if USE_PARAGRAPHING_MODEL:
|
269 |
-
prev_chapter_idx = 0
|
270 |
-
prev_paragraph_idx = 0
|
271 |
-
segment = []
|
272 |
-
for i, sentence in enumerate(sentences):
|
273 |
-
chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
|
274 |
-
paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
|
275 |
-
|
276 |
-
if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
|
277 |
-
print("Chapter / Chapter & Paragraph")
|
278 |
-
segment_text = " ".join(segment)
|
279 |
-
toc.subheader(titles[prev_chapter_idx])
|
280 |
-
if len(segment_text) > 1200:
|
281 |
-
generated_text_box = st.info("")
|
282 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
283 |
-
elif len(segment_text) > 450:
|
284 |
-
generated_text_box = st.info("")
|
285 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
286 |
-
st.write(segment_text)
|
287 |
-
segment = []
|
288 |
-
elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
|
289 |
-
print("Paragraph")
|
290 |
-
segment.append("\n\n")
|
291 |
-
|
292 |
-
segment.append(sentence)
|
293 |
-
|
294 |
-
prev_chapter_idx = chapter_idx
|
295 |
-
prev_paragraph_idx = paragraph_idx
|
296 |
-
|
297 |
-
segment_text = " ".join(segment)
|
298 |
-
toc.subheader(titles[prev_chapter_idx])
|
299 |
-
if len(segment_text) > 1200:
|
300 |
-
generated_text_box = st.info("")
|
301 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
302 |
-
elif len(segment_text) > 450:
|
303 |
-
generated_text_box = st.info("")
|
304 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
305 |
-
st.write(segment_text)
|
306 |
-
|
307 |
-
|
308 |
-
else:
|
309 |
-
segments = [" ".join([sentence for sentence in segment]) for segment in segments]
|
310 |
-
for title, segment in zip(titles, segments):
|
311 |
-
toc.subheader(title)
|
312 |
-
if len(segment) > 1200:
|
313 |
-
generated_text_box = st.info("")
|
314 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
315 |
-
elif len(segment) > 450:
|
316 |
-
generated_text_box = st.info("")
|
317 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
318 |
-
st.write(segment)
|
319 |
-
toc.generate()
|
320 |
-
|
321 |
-
for summarization_todo in summarization_todos:
|
322 |
-
summarization_todo()
|
|
|
1 |
+
import itertools
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from functools import partial
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import requests
|
9 |
+
import streamlit as st
|
10 |
+
import webvtt
|
11 |
+
from transformers import AutoTokenizer
|
12 |
+
|
13 |
+
from generate_text_api import TextGenerator
|
14 |
+
from model_inferences.utils.chunking import Truncater
|
15 |
+
from model_inferences.utils.files import get_captions_from_vtt, get_transcript
|
16 |
+
|
17 |
+
USE_PARAGRAPHING_MODEL = True
|
18 |
+
|
19 |
+
def get_sublist_by_flattened_index(A, i):
    """Locate the sublist of *A* that contains flattened index *i*.

    Treats ``A`` (a list of lists) as one flat sequence and returns the
    tuple ``(sublist, position)`` where ``sublist`` is the inner list
    holding element ``i`` of that flat sequence and ``position`` is that
    sublist's index within ``A``.  Returns ``(None, None)`` when ``i`` is
    out of range.
    """
    current_index = 0
    # enumerate() yields the true position.  The original used
    # A.index(sublist), which returns the FIRST equal sublist — wrong
    # whenever A contains duplicate sublists — and costs O(n) per hit.
    for position, sublist in enumerate(A):
        sublist_length = len(sublist)
        if current_index <= i < current_index + sublist_length:
            return sublist, position
        current_index += sublist_length
    return None, None
|
27 |
+
|
28 |
+
import requests
|
29 |
+
|
30 |
+
|
31 |
+
def get_talk_metadata(video_id):
    """Fetch title/presenter/download metadata for one TED talk.

    Queries the public TED GraphQL endpoint for ``video_id`` and returns
    the decoded JSON response (shape ``{"data": {"video": {...}}}``), or
    ``None`` when the request fails.  The failure is printed rather than
    raised, so callers must be prepared for a ``None`` result.
    """
    url = "https://www.ted.com/graphql"

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "x-operation-name": "Transcript",
    }

    data = {
        "query": """
        query GetTalk($videoId: ID!) {
            video(id: $videoId) {
                title,
                presenterDisplayName,
                nativeDownloads {medium}
            }
        }
        """,
        "variables": {
            "videoId": video_id,
        },
    }

    response = requests.post(url, json=data, headers=headers)

    if response.status_code == 200:
        return response.json()
    # The original error branch only printed and fell off the end,
    # returning None implicitly; make that contract explicit.
    print(f"Error: {response.status_code}, {response.text}")
    return None
|
62 |
+
|
63 |
+
class OfflineTextSegmenterClient:
    """Thin HTTP client for an offline text-segmentation service."""

    def __init__(self, host_url):
        # Normalise the base URL and append the fixed segmentation route.
        self.host_url = host_url.rstrip("/") + "/segment"

    def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
        """POST *text* to the service; return its segments, titles and sentences."""
        request_body = {
            'text': text,
            'captions': captions,
            'generate_titles': generate_titles,
            "prefix_titles": True,
            "threshold": threshold,
        }
        request_headers = {
            'Content-Type': 'application/json'
        }
        reply = requests.post(
            self.host_url, data=json.dumps(request_body), headers=request_headers
        ).json()
        return {
            'segments': reply["segments"],
            'titles': reply["titles"],
            'sentences': reply["sentences"],
        }
|
83 |
+
|
84 |
+
class Toc:
    """Accumulates heading anchors and renders a table of contents.

    Every title/header/subheader call renders the heading through
    streamlit and records a markdown link to it; generate() later fills
    the spot reserved by placeholder() with the accumulated link list.
    """

    def __init__(self):
        self._items = []
        self._placeholder = None

    def title(self, text):
        self._markdown(text, "h1")

    def header(self, text):
        self._markdown(text, "h2", "  ")

    def subheader(self, text):
        self._markdown(text, "h3", "    ")

    def placeholder(self, sidebar=False):
        # Reserve the slot (sidebar or main area) that generate() fills later.
        target = st.sidebar if sidebar else st
        self._placeholder = target.empty()

    def generate(self):
        # No-op until placeholder() has been called.
        if self._placeholder:
            toc_markdown = "\n".join(self._items)
            self._placeholder.markdown(toc_markdown, unsafe_allow_html=True)

    def _markdown(self, text, level, space=""):
        # Derive an HTML-id-safe anchor key from the heading text.
        slug = text.replace(" ", "-").replace("'", "-").lower()
        key = re.sub(r'[^\w-]', '', slug)
        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
        self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
|
110 |
+
|
111 |
+
# custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
|
112 |
+
# st.write(custom_css, unsafe_allow_html=True)
|
113 |
+
|
114 |
+
def concat_prompt(prompt_text, text, model_name):
    """Join a prompt and a document in the order the model family expects.

    Flan-style models take the instruction before the document; Galactica
    takes it after.

    Raises:
        ValueError: if ``model_name`` matches no known family.  (The
        original fell through and raised ``UnboundLocalError`` instead.)
    """
    if 'flan' in model_name:
        return prompt_text + "\n\n" + text
    if 'galactica' in model_name:
        return text + "\n\n" + prompt_text
    raise ValueError(f"Unknown model family: {model_name!r}")
|
120 |
+
|
121 |
+
endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
|
122 |
+
ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
|
123 |
+
|
124 |
+
client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
|
125 |
+
if USE_PARAGRAPHING_MODEL:
|
126 |
+
paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
|
127 |
+
summarizer = TextGenerator(endpoint)
|
128 |
+
|
129 |
+
tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
|
130 |
+
|
131 |
+
# TLDR PROMPT
|
132 |
+
|
133 |
+
SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
|
134 |
+
|
135 |
+
TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
136 |
+
{system_prompt}
|
137 |
+
<</SYS>>
|
138 |
+
|
139 |
+
{user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
|
140 |
+
|
141 |
+
TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
|
142 |
+
|
143 |
+
TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
144 |
+
TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
145 |
+
|
146 |
+
BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
147 |
+
{system_prompt}
|
148 |
+
<</SYS>>
|
149 |
+
|
150 |
+
{user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
|
151 |
+
|
152 |
+
BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
|
153 |
+
|
154 |
+
BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
155 |
+
BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
156 |
+
|
157 |
+
CONTEXT_LENGTH = 3072
|
158 |
+
MAX_SUMMARY_LENGTH = 1024
|
159 |
+
TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
|
160 |
+
BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
|
161 |
+
|
162 |
+
|
163 |
+
text_generator = TextGenerator(endpoint)
|
164 |
+
temperature = 0.7
|
165 |
+
|
166 |
+
import re
|
167 |
+
|
168 |
+
|
169 |
+
def replace_newlines(text):
    """Turn every run of one or more newlines in *text* into a paragraph break."""
    return re.sub(r'\n+', r'\n\n', text)
|
172 |
+
|
173 |
+
def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
    """Stream a summary of *input_* into a streamlit info box and return it.

    The input is truncated to *max_input_length* tokens, wrapped in
    *prompt*, then generated chunk by chunk; each partial result is pushed
    to *generated_text_box* so the UI updates live.  Relies on the
    module-level ``tokenizer``, ``temperature`` and ``MAX_SUMMARY_LENGTH``.
    """
    text_so_far = prefix
    truncate = Truncater(tokenizer, max_length=max_input_length)
    prompt_input = prompt.format(input=truncate(input_))
    stream = summarizer.generate_text_stream(
        prompt_input,
        max_new_tokens=MAX_SUMMARY_LENGTH,
        do_sample=True,
        temperature=temperature,
    )
    for chunk in stream:
        text_so_far += replace_newlines(chunk)
        generated_text_box.info(text_so_far)
    print(text_so_far)
    return text_so_far.strip()
|
183 |
+
|
184 |
+
st.header("Demo: Intelligent Recap")
|
185 |
+
|
186 |
+
if not hasattr(st, 'global_state'):
|
187 |
+
st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
|
188 |
+
# NIPS 2021 Talks
|
189 |
+
transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
|
190 |
+
# get titles from metadata.json
|
191 |
+
transcripts_map = {}
|
192 |
+
for transcript_file in transcript_files:
|
193 |
+
base_path = transcript_file.parent
|
194 |
+
metadata = base_path / "metadata.json"
|
195 |
+
txt_file = base_path / "transcript_whisper_large-v2.txt"
|
196 |
+
with open(metadata) as f:
|
197 |
+
metadata = json.load(f)
|
198 |
+
title = metadata["title"]
|
199 |
+
transcript = get_transcript(txt_file)
|
200 |
+
captions = get_captions_from_vtt(transcript_file)
|
201 |
+
transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
|
202 |
+
st.global_state['NIPS 2021 Talks'] = transcripts_map
|
203 |
+
|
204 |
+
data = pd.read_json("demo_data/ted_talks.json")
|
205 |
+
video_ids = data.talk_id.tolist()
|
206 |
+
transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
|
207 |
+
transcripts_map = {}
|
208 |
+
for video_id, transcript in zip(video_ids, transcripts):
|
209 |
+
metadata = get_talk_metadata(video_id)
|
210 |
+
title = metadata["data"]["video"]["title"]
|
211 |
+
presenter = metadata["data"]["video"]["presenterDisplayName"]
|
212 |
+
print(metadata["data"])
|
213 |
+
if metadata["data"]["video"]["nativeDownloads"] is None:
|
214 |
+
continue
|
215 |
+
video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
|
216 |
+
transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
|
217 |
+
st.global_state['TED Talks'] = transcripts_map
|
218 |
+
|
219 |
+
def get_lecture_id(path):
    """Extract the numeric lecture id from a path like .../lecture-<id>/<file>."""
    parent_dir = path.parts[-2]
    return int(parent_dir.split('-')[1])
|
221 |
+
|
222 |
+
transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
|
223 |
+
sorted_path_list = sorted(transcript_files, key=get_lecture_id)
|
224 |
+
|
225 |
+
transcripts_map = {}
|
226 |
+
for transcript_file in sorted_path_list:
|
227 |
+
base_path = transcript_file.parent
|
228 |
+
lecture_id = base_path.parts[-1]
|
229 |
+
transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
|
230 |
+
video_path = Path(base_path, "video.mp4")
|
231 |
+
transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
|
232 |
+
st.global_state['KIT Lectures'] = transcripts_map
|
233 |
+
|
234 |
+
type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
|
235 |
+
|
236 |
+
transcripts_map = st.global_state[type_of_document]
|
237 |
+
|
238 |
+
selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
|
239 |
+
|
240 |
+
st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
|
241 |
+
|
242 |
+
input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
|
243 |
+
|
244 |
+
toc = Toc()
|
245 |
+
|
246 |
+
summarization_todos = []
|
247 |
+
|
248 |
+
with st.expander("Adjust Thresholds"):
|
249 |
+
threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
|
250 |
+
paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
|
251 |
+
|
252 |
+
if st.button("Process Transcript"):
|
253 |
+
with st.sidebar:
|
254 |
+
st.header("Table of Contents")
|
255 |
+
toc.placeholder()
|
256 |
+
|
257 |
+
st.header(selected_talk, divider='rainbow')
|
258 |
+
# if 'presenter' in transcripts_map[selected_talk]:
|
259 |
+
# st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
|
260 |
+
|
261 |
+
captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
|
262 |
+
result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
|
263 |
+
if USE_PARAGRAPHING_MODEL:
|
264 |
+
presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
|
265 |
+
paragraphs = presult['segments']
|
266 |
+
segments, titles, sentences = result['segments'], result['titles'], result['sentences']
|
267 |
+
|
268 |
+
if USE_PARAGRAPHING_MODEL:
|
269 |
+
prev_chapter_idx = 0
|
270 |
+
prev_paragraph_idx = 0
|
271 |
+
segment = []
|
272 |
+
for i, sentence in enumerate(sentences):
|
273 |
+
chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
|
274 |
+
paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
|
275 |
+
|
276 |
+
if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
|
277 |
+
print("Chapter / Chapter & Paragraph")
|
278 |
+
segment_text = " ".join(segment)
|
279 |
+
toc.subheader(titles[prev_chapter_idx])
|
280 |
+
if len(segment_text) > 1200:
|
281 |
+
generated_text_box = st.info("")
|
282 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
283 |
+
elif len(segment_text) > 450:
|
284 |
+
generated_text_box = st.info("")
|
285 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
286 |
+
st.write(segment_text)
|
287 |
+
segment = []
|
288 |
+
elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
|
289 |
+
print("Paragraph")
|
290 |
+
segment.append("\n\n")
|
291 |
+
|
292 |
+
segment.append(sentence)
|
293 |
+
|
294 |
+
prev_chapter_idx = chapter_idx
|
295 |
+
prev_paragraph_idx = paragraph_idx
|
296 |
+
|
297 |
+
segment_text = " ".join(segment)
|
298 |
+
toc.subheader(titles[prev_chapter_idx])
|
299 |
+
if len(segment_text) > 1200:
|
300 |
+
generated_text_box = st.info("")
|
301 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
302 |
+
elif len(segment_text) > 450:
|
303 |
+
generated_text_box = st.info("")
|
304 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
305 |
+
st.write(segment_text)
|
306 |
+
|
307 |
+
|
308 |
+
else:
|
309 |
+
segments = [" ".join([sentence for sentence in segment]) for segment in segments]
|
310 |
+
for title, segment in zip(titles, segments):
|
311 |
+
toc.subheader(title)
|
312 |
+
if len(segment) > 1200:
|
313 |
+
generated_text_box = st.info("")
|
314 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
315 |
+
elif len(segment) > 450:
|
316 |
+
generated_text_box = st.info("")
|
317 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
318 |
+
st.write(segment)
|
319 |
+
toc.generate()
|
320 |
+
|
321 |
+
for summarization_todo in summarization_todos:
|
322 |
+
summarization_todo()
|
demo_data/nips-2021/25953/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi everyone, my name is Zyw Goldfeld and this is a joint work with Christian Greenwald about
|
2 |
+
sliced mutual information, which is a new measure of statistical dependence that has
|
3 |
+
some nice scalability properties to high dimensional settings.
|
4 |
+
And to get started, I think we're all familiar with classic mutual information that is defined
|
5 |
+
between let's say continuous high dimensional random variables, which is the regime that
|
6 |
+
we'll mostly be interested in, like SOH, basically the KL divergence between their joint distributions
|
7 |
+
and the product of their marginals.
|
8 |
+
And mutual information is indeed this fundamental measure of dependence that enjoys many good
|
9 |
+
properties such that the fact that it nullifies if and only if our random variables are independent,
|
10 |
+
it is invariant to bijections and it meets several useful representations, decompositions,
|
11 |
+
variational forms, etc.
|
12 |
+
And in fact, it can be even obtained axiomatically as the unique functional of the joint distribution
|
13 |
+
that satisfies some natural informativeness conditions.
|
14 |
+
And as such, mutual information has seen a variety of applications in information theory
|
15 |
+
and statistics more recently in machine learning.
|
16 |
+
But the problem is that all this nice structure comes with a hefty price, since computing
|
17 |
+
mutual information in high dimensions or estimating it from samples is very, very hard, effectively
|
18 |
+
infeasible.
|
19 |
+
And this is the so-called curse of dimensionality and sort of the problem that we try to tackle
|
20 |
+
in this work.
|
21 |
+
And to address this difficulty, what we propose is sliced mutual information, which is, like
|
22 |
+
I said, a new measure of statistical dependence, not necessarily a proxy of mutual information
|
23 |
+
as such, but rather an alternative notion, which is defined as this average of scalar
|
24 |
+
mutual information terms between projections of our high dimensional variables onto randomly
|
25 |
+
chosen directions from the corresponding unit spheres.
|
26 |
+
And it's of course inspired by the recent popularization of slicing techniques for statistical
|
27 |
+
divergences, in particular the Wasserstein, the sliced Wasserstein distance is a great
|
28 |
+
example.
|
29 |
+
But the way it works for sliced mutual information is roughly so, well, let's say that this is
|
30 |
+
our first high dimensional variable X and this is its distribution.
|
31 |
+
What you do is draw a projection direction uniformly from the sphere.
|
32 |
+
You then project this random variable onto that direction, do the same for your other
|
33 |
+
random variable.
|
34 |
+
And now for these two projected scalar new variables, we just compute the mutual information
|
35 |
+
between them and average everything over the choice of direction.
|
36 |
+
So that's basically the definition.
|
37 |
+
And with that, the goal of this work is effectively to show that sliced mutual information is
|
38 |
+
both a meaningful and a scalable mutual information alternative.
|
39 |
+
Meaningful, well, in the sense that it preserves many of the desired properties that make mutual
|
40 |
+
information appealing to begin with and scalable in the sense that it alleviates the set of
|
41 |
+
computational and statistical difficulties.
|
42 |
+
All right.
|
43 |
+
Yeah, and to address this first point, let me show you that, well, despite those one
|
44 |
+
dimensional projections, sliced mutual information indeed inherits many of the properties of
|
45 |
+
classic mutual information.
|
46 |
+
So we have, well, of course, non-negativity, but furthermore, identification of independence.
|
47 |
+
We have an entropy decomposition for an appropriate definition of sliced entropy.
|
48 |
+
We can represent it as a KL divergence, a sliced KL divergence.
|
49 |
+
To be more precise, we have a chain rule tensorization for independent copies, as well as a Donsker-Varadhan-like
|
50 |
+
variational form that can be readily used for neural estimation of sliced mutual information.
|
51 |
+
We actually make use of that in some of our empirical results.
|
52 |
+
And well, I mean, you are more than welcome to check the paper or visit us as a poster
|
53 |
+
if you want to know more about any of these.
|
54 |
+
But really, the upshot here is that much of the classic structure is still there after
|
55 |
+
the slicing.
|
56 |
+
Now another interesting feature of sliced mutual information comes to light when you
|
57 |
+
think of it in the context of the famous data processing inequality.
|
58 |
+
And for starters, recall that classic mutual information satisfies the DPI, which in particular
|
59 |
+
means that if you process either of your random variables with a deterministic function, say
|
60 |
+
this f over here, you can only lose the informativeness in the classic sense.
|
61 |
+
Now sliced mutual information plays differently with processing and can in some sense benefit
|
62 |
+
from nice transformations that, let's say, give rise to some nicer manifold for your
|
63 |
+
random variable.
|
64 |
+
And to understand this, keep in mind that, well, first of all, sliced mutual information
|
65 |
+
only looks at projections of random variables.
|
66 |
+
And it may very well be the case that some transformations of x, let's say, have more
|
67 |
+
informative projections about y than x itself.
|
68 |
+
And here's a simple example to that effect.
|
69 |
+
So consider a two-dimensional isotropic Gaussian x, so two coordinates, x1 and x2.
|
70 |
+
And let's take y to be, for example, its first coordinate.
|
71 |
+
Now if you look at the mutual information between two fixed projections of x and y,
|
72 |
+
well, projection does nothing to y, right, because it's a scalar.
|
73 |
+
But it does affect x.
|
74 |
+
And if you look at the mutual information between two projections of x and y, you quickly
|
75 |
+
realize that x1 really plays the role of the signal here, whereas x2 behaves like noise.
|
76 |
+
And therefore, any transformation that will effectively improve your signal-to-noise ratio,
|
77 |
+
for example, like this g sub a over here, where a is less than 1, will indeed give rise
|
78 |
+
to a higher sliced mutual information value.
|
79 |
+
So all in all, sliced mutual information can be increased from processing, which means
|
80 |
+
that, well, in particular, it validates the data processing inequality and is different
|
81 |
+
from classic mutual information in that sense.
|
82 |
+
But interestingly, and as I will show you shortly, this is actually a quite useful thing
|
83 |
+
to have, for example, for feature extraction tasks, because we can use sliced mutual information
|
84 |
+
effectively to maximize it in order to extract informative features and land on those nicer
|
85 |
+
manifolds that I mentioned a moment ago.
|
86 |
+
And here's an example theorem that kind of makes this statement precise or formal, where
|
87 |
+
we consider the maximization of sliced mutual information over linear transformations of
|
88 |
+
our random variables.
|
89 |
+
And this would, of course, not affect classic mutual information at all.
|
90 |
+
But what we can show is that for sliced mutual information, this maximization ends up extracting
|
91 |
+
the two most informative projection directions for you, which in particular will be encoded
|
92 |
+
in the optimizing matrices, these A sub x star and A sub y star.
|
93 |
+
And of course, there's nothing special about this particular setup.
|
94 |
+
And we can establish similar results for, well, first of all, rank-constrained matrices
|
95 |
+
that as opposed to what's shown here would extract the, let's say, our most informative
|
96 |
+
features or projection directions.
|
97 |
+
In the paper, we also extend this result to shallow neural networks.
|
98 |
+
And in fact, our argument can be easily extended to cover additional nonlinear cases as well.
|
99 |
+
OK, so that's pretty much for structural properties.
|
100 |
+
But like I said at the beginning, the real premise of this framework is overcoming the
|
101 |
+
curse of dimensionality.
|
102 |
+
And let me show you that this is indeed the case, that sliced mutual information is or
|
103 |
+
can be estimated in a scalable manner, effectively by combining your favorite scalar mutual information
|
104 |
+
estimator with a simple Monte Carlo average step.
|
105 |
+
And this is how it works.
|
106 |
+
So let's say we're giving n IID samples from our high-dimensional random variables.
|
107 |
+
And we're further given a scalar mutual information estimator that achieves, say, error delta
|
108 |
+
of n when applied to n IID samples of some pair of one-dimensional variables, a and b.
|
109 |
+
OK, so let's say we have these.
|
110 |
+
Now, to estimate sliced mutual information, first thing to do is sample, let's say, m
|
111 |
+
random projections from the corresponding spheres in an IID fashion, at which point
|
112 |
+
we will take our high-dimensional n samples and project them onto each of these m random
|
113 |
+
projections that we've generated.
|
114 |
+
And the thing to observe here is that the resulting n times n data set of these projections
|
115 |
+
is nothing but IID samples from the corresponding projected distribution, which is the right
|
116 |
+
thing to have here if what you're trying to estimate is sliced mutual information.
|
117 |
+
So having that, I mean, at this point, per projection direction, we can apply the scalar
|
118 |
+
mutual information estimator and then just take one big, happy Monte Carlo average of
|
119 |
+
the entire thing over the different projection directions.
|
120 |
+
And this would give rise to the proposed sliced mutual information estimator.
|
121 |
+
Now, you can compute this thing very easily, because at the end of the day, it's an average
|
122 |
+
of scalar mutual information estimates.
|
123 |
+
And as far as performance guarantees, we can show that so long that the per-sliced mutual
|
124 |
+
information is bounded, the uniform absolute error of this estimator scales like 1 over
|
125 |
+
the root of m, the number of our Monte Carlo samples, plus the error of the scalar mutual
|
126 |
+
information estimator.
|
127 |
+
And I'm just restating this informally over here.
|
128 |
+
And what this all in all shows is that sliced mutual information can therefore be estimated
|
129 |
+
the rate of scalar mutual information estimation problem plus this m to the minus half Monte
|
130 |
+
Carlo penalty.
|
131 |
+
And the thing is that under appropriate smoothness assumptions, the one-dimensional rate is in
|
132 |
+
fact parametric.
|
133 |
+
And therefore, if you just match the size of your data set and the number of Monte Carlo
|
134 |
+
samples, just equate n and m, the sliced mutual information between high-dimensional variables
|
135 |
+
can be estimated at the parametric n to the minus half rate, perhaps up to some logarithmic
|
136 |
+
factors.
|
137 |
+
And this is, of course, a significant speed up and stands in sharp contrast to the slow,
|
138 |
+
exponentially bad in dimension, curse of dimensionality rate for classic mutual information.
|
139 |
+
Yeah, now this scalability makes, in fact, running empirical experiments with sliced
|
140 |
+
mutual information quite a breeze.
|
141 |
+
So let me quickly show you some sort of proof of concept experiments, let's say.
|
142 |
+
And the first one just relies on the fact that, well, SMI, sliced mutual information
|
143 |
+
can identify independence.
|
144 |
+
And therefore, we examine it as a figure of merit for independence testing, basically
|
145 |
+
by thresholding the computed sliced mutual information value.
|
146 |
+
And the results that we have obtained, of course, we've compared them with the same
|
147 |
+
test, but based on classic mutual information.
|
148 |
+
And this figure over here shows that for a bunch of different settings, well, it presents
|
149 |
+
the area under the ROC curve as a function of the number of samples, the standard way
|
150 |
+
to represent the quality of an independence test.
|
151 |
+
And you basically want this number to be 1, which corresponds to an omniscient test.
|
152 |
+
And what we observe is that sliced mutual information performs consistently well across
|
153 |
+
different setups and across different dimensions, whereas the performance of the mutual information,
|
154 |
+
the classic mutual information-based test, quickly degrades as dimension grows.
|
155 |
+
Now, on top of that, let me also demonstrate how sliced mutual information can be used
|
156 |
+
for feature extraction.
|
157 |
+
And here, what we want to do is maximize the sliced mutual information between linear transformations
|
158 |
+
of x and y that are now chosen to be IID samples from the same MNIST class, which we restrict
|
159 |
+
to be either 0 or 1.
|
160 |
+
And the choice of class is also random, so basically just a fair coin flip.
|
161 |
+
And by observing that sliced mutual information between x and y is at most 1 bit, I mean,
|
162 |
+
it's always upper bounded by mutual information, which equals a single bit in this case, basically
|
163 |
+
the class label, the way to understand what we're doing here is that we're looking for
|
164 |
+
the linear feature that is most informative for classifying or determining this class
|
165 |
+
label.
|
166 |
+
And interestingly enough, this is what this procedure ends up learning, where the figure
|
167 |
+
shows basically the first two rows of the optimal A matrix that we obtained, rearranged
|
168 |
+
in the dimension of an MNIST image.
|
169 |
+
And this really looks like a match filter, if you're familiar, which, when applied to
|
170 |
+
the samples, would indeed be able to tell you whether the sample came from the 0 class
|
171 |
+
or not.
|
172 |
+
And as far as for the value itself, well, the maximized sliced mutual information value
|
173 |
+
ends up being roughly 0.7, which is quite close to the 1 bit upper bound, and is much,
|
174 |
+
much larger than what you would get if you would not learn A, and let's say just instantiate
|
175 |
+
it as a matrix with IID entries drawn according to some distribution.
|
176 |
+
And this is just to say that something meaningful indeed being learned here, and something meaningful
|
177 |
+
indeed happens when you maximize the sliced mutual information as your optimization objective.
|
178 |
+
OK, so yeah, that's basically it.
|
179 |
+
And just to recap, we introduced sliced mutual information, which is this average of scalar
|
180 |
+
mutual information terms between one-dimensional projections.
|
181 |
+
We've seen that it preserves much of the structure of classic mutual information.
|
182 |
+
It can be efficiently computed and estimated from samples, and can also be, in fact, increased
|
183 |
+
by our processing if, indeed, your processing gives rise to more informative projections.
|
184 |
+
And we've presented some proof of concept applications to independence testing, to feature
|
185 |
+
extraction.
|
186 |
+
We have a couple of more in the paper.
|
187 |
+
But let me say this.
|
188 |
+
While this is mostly theoretical work, and a large-scale empirical exploration is sort
|
189 |
+
of beyond its scope, we firmly believe that sliced mutual information will be extremely
|
190 |
+
useful for various such tasks, and are very excited to look into this in the future.
|
191 |
+
And yeah, with that, I'll stop.
|
192 |
+
Thank you guys for listening, and do visit us at the poster, and check out the paper
|
193 |
+
if you would like to know more.
|
demo_data/nips-2021/25957/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay.
|
2 |
+
I'm in the INRIA Paris et Alpes team and my supervisor is Bertrand Thirion.
|
3 |
+
Today I'll talk about shared independent component analysis for multi-subject neuroimaging.
|
4 |
+
This is a joint work with Pierre Abelin, Alexandre Grandfort, Bertrand Thirion and Anna Pouy-Varine.
|
5 |
+
First let us consider two sources that are emitting a signal that is recorded by two
|
6 |
+
sensors.
|
7 |
+
This can be seen as a simplified model of magnetoencephalography where brain sources
|
8 |
+
are recorded by magnetometers.
|
9 |
+
Because propagation time can be neglected, the signal recorded by the sensors can be
|
10 |
+
seen as a linear mixture of the signal emitted by the sources.
|
11 |
+
S is a set of sources that are assumed to be independent.
|
12 |
+
X are the recordings and A describes how the sources are mixed to produce the recordings.
|
13 |
+
At first sight this model may seem ill-defined because if we permute two columns in A and
|
14 |
+
permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing
|
15 |
+
matrix A' that describes X just as well as A and S.
|
16 |
+
And similarly if we scale the column of A by some constant, one column of A by some
|
17 |
+
constant and the corresponding source by the same constant, we'll also get an equivalent
|
18 |
+
description of X.
|
19 |
+
However, these scale and permutation indeterminacies are the only one if the sources contain at
|
20 |
+
most one Gaussian component.
|
21 |
+
Let us consider the more general problem where you have multiple subjects that are exposed
|
22 |
+
to the same stimuli.
|
23 |
+
We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2,
|
24 |
+
and different noise levels, N1 and N2.
|
25 |
+
The interpretation is that they have shared sources because they have shared connective
|
26 |
+
processes.
|
27 |
+
They have different mixing matrices because they have different spatial topography.
|
28 |
+
And they have different noises because we want to model inter-subject variability.
|
29 |
+
This model is called group ICA.
|
30 |
+
There are many methods to provide a solution for the group ICA problem.
|
31 |
+
A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects
|
32 |
+
feature-wise and then perform a PCA, a principal component analysis, on the stacked data.
|
33 |
+
And therefore you obtain reduced data and apply independent component analysis on the
|
34 |
+
reduced data to obtain a set of sources.
|
35 |
+
Another formulation is introduced by Varoko in 2010 and is called K-NICA.
|
36 |
+
You just replace the principal component analysis with a multiset CCA, so a multiset canonical
|
37 |
+
correlation analysis, where you have to solve a generalized eigenvalue problem.
|
38 |
+
There are many different formulations of multiset CCA, but this one with a generalized eigenvalue
|
39 |
+
problem is the fastest to solve.
|
40 |
+
KNICA and Cut-ICA have a lot of advantages.
|
41 |
+
First, they are very fast to fit.
|
42 |
+
And second, they are simple to implement.
|
43 |
+
These are the two reasons why they are so popular in neuroimaging.
|
44 |
+
However, they do not optimize the proper likelihood.
|
45 |
+
So therefore they do not benefit from advantages of such estimators such as asymptotic efficiency.
|
46 |
+
There are a lot of other related work that do optimize the proper likelihood.
|
47 |
+
I want to mention the independent vector analysis, which is a very powerful framework introduced
|
48 |
+
by Li in 2008.
|
49 |
+
So unified approach of Guo in 2008 that we will also mention and talk about later.
|
50 |
+
The approach of Shen in 2015 that also allows to perform dimension reduction.
|
51 |
+
And the multi-view ICA that was introduced by our team last year.
|
52 |
+
I want to quickly say that it's not obvious to design a likelihood-based approach that
|
53 |
+
is tractable.
|
54 |
+
And with this example of the Gaussian mixture noisy ICA by Bermond and Cardozo, we'll see
|
55 |
+
that standard approach leads to intractable algorithms.
|
56 |
+
The model we take here is the same as the group ICA, but we assume that the noise is
|
57 |
+
Gaussian with the same variance for all subjects.
|
58 |
+
We'll also assume that the sources follow a Gaussian mixture model.
|
59 |
+
And we further assume that the weights of the Gaussian mixtures are known.
|
60 |
+
We can solve such model via expectation maximization.
|
61 |
+
And if we write the E-step, we'll get a closed form that involves a large sum.
|
62 |
+
Because of this large size, this sum, and therefore the M algorithm is intractable whenever
|
63 |
+
Q and K are large.
|
64 |
+
Our contribution is shared ICA, what we call Shikha for short, where the data of subject
|
65 |
+
i are assumed as a linear mixture of noisy sources, and the noise here is not on the
|
66 |
+
sensor, but on the sources.
|
67 |
+
The noise is Gaussian with a variance that can be different for each subject and different
|
68 |
+
for each component.
|
69 |
+
S are assumed to be independent, but in contrast to almost all existing work, some components
|
70 |
+
can be Gaussian.
|
71 |
+
We have a few blanket assumptions.
|
72 |
+
We assume that the data are centered, that the mixing metrics are invertible, that the
|
73 |
+
sources have identical variance, and that the number of subjects is greater than 3.
|
74 |
+
We have two algorithms to solve the Shikha model.
|
75 |
+
We have ShikhaJ, that is a FAS algorithm that is based on multiset CCA, and ShikhaML, a
|
76 |
+
maximum likelihood approach.
|
77 |
+
In Shikha, there are two ways to recover the parameters.
|
78 |
+
Either the source are non-Gaussian, in which case we can use classical ICA results to recover
|
79 |
+
the unmixing matrices.
|
80 |
+
When the components are Gaussian, then we need something else, and what we use here
|
81 |
+
is noise diversity.
|
82 |
+
When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix
|
83 |
+
and the noise covariance up to a permutation and sign indeterminacy.
|
84 |
+
Note that the noise diversity in Gaussian components is also a necessary condition.
|
85 |
+
If it does not hold, then Shikha cannot be identified.
|
86 |
+
Let us now focus on this theorem that is at the core of the ShikhaJ algorithm.
|
87 |
+
Namely it shows that we can solve group ICA with multiset CCA.
|
88 |
+
So assume the data follows the Shikha model, and consider the multiset CCA framed as a
|
89 |
+
generalized eigenvalue problem.
|
90 |
+
This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by
|
91 |
+
second-order statistics, and D is formed by the diagonal blocks in C.
|
92 |
+
And so if we solve this eigenvalue problem and take the first k leading eigenvectors,
|
93 |
+
we can recover the correct unmixing matrix from them, up to a permutation and a scaling.
|
94 |
+
And this can only be done if the k first eigenvalues are distinct.
|
95 |
+
Note that the distinct eigenvalue condition is also necessary.
|
96 |
+
If two eigenvalues are the same, then this adds the need to determine IC, and therefore
|
97 |
+
we cannot solve group IC.
|
98 |
+
Note also that the condition that some eigenvalues need to be distinct is stronger than the noise
|
99 |
+
diversity condition we have in the identifiability theorem.
|
100 |
+
And therefore we can exhibit an example which is identifiable, but on which multiset CCA
|
101 |
+
will fail.
|
102 |
+
And I refer you to the paper for more details on this.
|
103 |
+
So in our theorem, in order to recover the correct unmixing matrix, we need to have access
|
104 |
+
to the second-order statistics.
|
105 |
+
However, in practice, we only have access to them, up to some sampling noise.
|
106 |
+
And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in
|
107 |
+
the second-order statistics can lead to a high deviation of the recovered unmixing matrix.
|
108 |
+
Now to show this in practice, we take three subjects, two components, and noise covariance
|
109 |
+
matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon.
|
110 |
+
And we compare the solution of multiset CCA on the true covariance matrices and on the
|
111 |
+
perturbed covariance matrix, where the perturbation scale is given by delta.
|
112 |
+
And for different values of epsilon, 10-4, 10-3, 10-2, 10-1, we show how the performance
|
113 |
+
of the algorithm, so the M-ary distance between the true unmixing matrix and the estimated
|
114 |
+
unmixing matrix, varies when the perturbation scale increases.
|
115 |
+
And we see that when the eigengap is very close, so 10-4, the violet curve, then even
|
116 |
+
with a very small perturbation, you can get to a very bad M-ary distance.
|
117 |
+
So the black dashed curve is a performance of chance.
|
118 |
+
Luckily, there is a large gap between the k-th eigenvalues and the k plus 1.
|
119 |
+
This means that in practice, the span of the p-leading eigenvectors is approximately preserved.
|
120 |
+
We can recover the true unmixing matrix from the unmixing matrix estimated by multiset
|
121 |
+
CCA, just by multiplying by a matrix Q.
|
122 |
+
And in order to estimate Q, we make use of the fact that the unmixed data should have
|
123 |
+
a diagonal covariance.
|
124 |
+
This leads us to a joint diagonalization problem that we can solve efficiently.
|
125 |
+
So if we take the experiments we've done on the previous slide, the results are still
|
126 |
+
shown here.
|
127 |
+
You can see the violet curves, and that is very sensitive to perturbation.
|
128 |
+
And so if we apply joint diagonalization, all these curves move, and they join the dashed
|
129 |
+
curve on the bottom.
|
130 |
+
And therefore, it's much better, because now the new curves that are represented by the
|
131 |
+
dashed line are less sensitive to perturbations.
|
132 |
+
So now we've obtained the correct unmixing matrix, but up to a scaling.
|
133 |
+
And so we need an additional step to find the correct scaling, and another one to find
|
134 |
+
the other parameter that is still unestimated, which are the noise covariance.
|
135 |
+
And luckily, it's very easy to find the noise covariance.
|
136 |
+
We can do this via an EM algorithm.
|
137 |
+
The E-step and the M-step are in closed form, and this yields a very fast algorithm.
|
138 |
+
But the Shikha-J is not a maximum likelihood estimator.
|
139 |
+
So now we will focus on Shikha-ML, which is our maximum likelihood estimator.
|
140 |
+
So I won't go too much into details on this, but we optimize this via an EM using a Gaussian
|
141 |
+
mixture assumption as a source.
|
142 |
+
We assume that the weights are known.
|
143 |
+
What I just want to showcase here is that the E-step of the algorithm, the one that
|
144 |
+
gives you the expectation of the sources given the data, and the variance of the sources
|
145 |
+
given the data, only involves the sum of size 2.
|
146 |
+
So previously we had a sum that had an exponential number of terms, and here we don't have that
|
147 |
+
anymore.
|
148 |
+
So the E-step is much faster than what we had before, and therefore the EM algorithm
|
149 |
+
here is tractable, whereas it was not the case before.
|
150 |
+
I first want to present our synthetic experiment where we generate data according to the Shikha-ML
|
151 |
+
and Shikha-J model.
|
152 |
+
In case A, we have only Gaussian components, but we have noise diversity, and therefore
|
153 |
+
methods that use noise diversity to recover the sources such as Shikha-ML and Shikha-J
|
154 |
+
perform best.
|
155 |
+
In the second case, we have only non-Gaussian components and no noise diversity, so methods
|
156 |
+
that use non-Gaussianity perform well such as Kana-ICA, Shikha-ML, or MultiView-ICA.
|
157 |
+
And the last case, half of the components are Gaussian with noise diversity, and the
|
158 |
+
other half are non-Gaussian but without noise diversity.
|
159 |
+
And in this case, only Shikha-ML is able to correctly recover the sources.
|
160 |
+
MV-ICA doesn't do that, but it's not as good as Shikha-ML.
|
161 |
+
Let us now talk about our experiments on real data.
|
162 |
+
We have this reconstruction experiment on fMRI data where subjects are exposed to a
|
163 |
+
naturalistic stimuli such as movie watching.
|
164 |
+
We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the
|
165 |
+
20% left of the movie, we compute the common sources, and from these common sources computed
|
166 |
+
using 80% of the subject, we try to reconstruct the data of the 20% left of the subject.
|
167 |
+
We compute the R2 score within regions of interest between the reconstructed data and
|
168 |
+
the true data, and plot them as a function of the number of components used.
|
169 |
+
As we see, Shikha-ML outperforms all of the methods.
|
170 |
+
As a take-home message, Shikha is a powerful framework to extract shared sources.
|
171 |
+
Shikha-J is a fast approach to fit the model, but it only uses second-order information.
|
172 |
+
In contrast, Shikha-ML is a bit slower, but is able to use non-gaussianity in addition
|
173 |
+
to second-order information.
|
174 |
+
In practice, Shikha-ML yields the best results.
|
175 |
+
The methods we've introduced work on reduced data.
|
176 |
+
It would be interesting to know how to reduce the data so that they perform optimally.
|
177 |
+
Another way to improve our results would be to learn the density of the shared sources
|
178 |
+
in Shikha-ML instead of having them fixed.
|
179 |
+
Thanks for listening, and have a good day!
|
demo_data/nips-2021/25958/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello everyone, I'm Luigi Carretino, and this is a joint work with Stefano Vigonia,
|
2 |
+
Daniele Calandriello, and Lorenzo Rosasco.
|
3 |
+
The problem that we study in this work is a standard regression problem, where we want
|
4 |
+
to estimate an unknown function f star given n pairs of points, x's and y's, and then
|
5 |
+
given n pairs of points, x's and y's, where y's are noisy evaluations of the functions
|
6 |
+
f star on the input points axis.
|
7 |
+
A well-established method to learn nonlinear functions is kernel ridge regression.
|
8 |
+
The basic idea is to map the input points into a higher dimensional space, where linear
|
9 |
+
relationships can be learned that then translate in nonlinear ones in the input space.
|
10 |
+
To formalize this, we can think about solving a standard empirical risk minimization problem
|
11 |
+
regularized over a spatial function which is a reproducing kernel Hilbert space.
|
12 |
+
Numerically speaking, the solution of this type of problem boils down to solving a linear
|
13 |
+
system. Particularly, we can see here that the linear system is going to be Kc equal
|
14 |
+
y, where K is the kernel matrix evaluated in all the pairs of points of the training
|
15 |
+
sets, c are the weights that we aim to learn, and y's are the output points.
|
16 |
+
We know that this method is optimal from a statistical point of view, but a drawback
|
17 |
+
is that it suffers from computational scalability. In fact, in terms of time complexity, if we
|
18 |
+
have n training points and we want to solve the linear system directly, we'll have to
|
19 |
+
invert the matrix K, and this will cost us n cubed in time.
|
20 |
+
Multiple ways of accelerating this process have been proposed over time.
|
21 |
+
The first one is to solve the methods iteratively instead of inverting directly the matrix K.
|
22 |
+
This allows us to only have matrix vector multiplications, and so the overall cost of
|
23 |
+
an iterative method to solve this linear system is going to be Tn squared.
|
24 |
+
Another method is the one known as sketching, where we can see this as subsampling the linear
|
25 |
+
system, in particular subsampling columns of this linear system, where we can take m
|
26 |
+
columns of the linear system uniformly at random to get a smaller one, and the cost
|
27 |
+
of this will be m squared n.
|
28 |
+
Another method instead is splitting. This allows us to divide the main problem into
|
29 |
+
many, in this case Q, subproblems, each one that can be solved independently and so
|
30 |
+
potentially can be distributed. So we can have a cost which boils down to n over Q to
|
31 |
+
the power of 3.
|
32 |
+
Combinations of these methods have been proposed in the literature. In particular, if
|
33 |
+
we combine iterating and sketching, we can get a solver that can solve the problem in
|
34 |
+
a time complexity of Tmn.
|
35 |
+
If instead we combine sketching and splitting, we can get a solver that can be computed
|
36 |
+
in m squared times n over Q.
|
37 |
+
And in this work, we try to blend all these techniques to derive a new algorithm, which
|
38 |
+
we will call PARC, that can achieve a time complexity of Tm times n over Q to the power
|
39 |
+
of 2.
|
40 |
+
So as we just said, in this work, we propose a new large-scale kernel regression solver
|
41 |
+
that combines the computational benefits of iteration, sketching, and splitting.
|
42 |
+
Notice, though, that these are approximation techniques and they may come at the cost of
|
43 |
+
accuracy. But we are able to show that this new algorithm is able to preserve generalization
|
44 |
+
under suitable partitions.
|
45 |
+
Now also notice that instead of general splitting, we are going to need to focus on a
|
46 |
+
particular type, which is the partitions.
|
47 |
+
So we introduce a new principal partition scheme for kernel methods.
|
48 |
+
We now look at the difference between data splitting and space partitioning.
|
49 |
+
Given a set of points, the procedure of splitting takes groups of points at random and assign
|
50 |
+
them to different splits or clusters.
|
51 |
+
In this picture, for example, we divide the points in four splits.
|
52 |
+
Partitioning instead divides the space in different cells, and then the points are implicitly
|
53 |
+
assigned to a particular cluster based on which cell they belong to.
|
54 |
+
Notice that with the splitting methods, we don't consider local information while we
|
55 |
+
perform the splitting, but we do when we perform partitioning.
|
56 |
+
Now, from this picture, the concept of partitioning a space seems pretty straightforward.
|
57 |
+
However, when you start considering high dimensional feature space, subtle problems can
|
58 |
+
appear.
|
59 |
+
So first, as a recap, remember that there are two important spaces to consider in our
|
60 |
+
regression problem.
|
61 |
+
The input space X with its input space features and the kernel space H with its input space
|
62 |
+
features, and the kernel space H, which potentially has many more implicit features.
|
63 |
+
Traditionally, partition methods are applied directly to the input space.
|
64 |
+
For example, a classical approach is to select a subset of points as centroids and then
|
65 |
+
partition the space in cells by assigning each portion of the space to the closest centroid,
|
66 |
+
which is called a Voronoi partition.
|
67 |
+
Since we are in the input space, closest here is defined according to a simple Euclidean
|
68 |
+
distance.
|
69 |
+
However, remember that our target function and our whole regression does not happen
|
70 |
+
directly on the input data space, but rather on the data mapped in the feature space.
|
71 |
+
And after we apply our feature map to the data, the concept of closest and the partition
|
72 |
+
can radically change.
|
73 |
+
For example, here on the right, we choose a kernel space associated with a cosine similarity
|
74 |
+
and again plot how the centroids partition the input space, but this time we chose closest
|
75 |
+
according to the new cosine distance.
|
76 |
+
The resulting partition is very different from the Euclidean one as it captures the
|
77 |
+
non-linearity of the kernel function.
|
78 |
+
In the paper, we discuss how this difference can impact the regression and we identified
|
79 |
+
sufficient conditions that the partition should satisfy in order to guarantee good generalization
|
80 |
+
of the learning process.
|
81 |
+
Crucially, we will see that these guarantees depend not on how the input space is partitioned,
|
82 |
+
but rather how the feature space is partitioned.
|
83 |
+
As a consequence, for our PARC methods, we focus on choosing centroids solely using the
|
84 |
+
kernel version of the distance.
|
85 |
+
We are now ready to present in more detail how the PARC algorithm works.
|
86 |
+
First of all, PARC partitioned the feature space into Q Voronoi cells and the first thing
|
87 |
+
to do is to identify the centroids in the feature space that allows us to describe the
|
88 |
+
Voronoi cells.
|
89 |
+
Then inside each Voronoi cell, we learn a local estimator using an uniterated and sketched
|
90 |
+
version of kernel ridge regression.
|
91 |
+
And then at prediction time, when a new sample arrives, we can use the Q Voronoi feature
|
92 |
+
to identify the new sample.
|
93 |
+
We use the local estimator corresponding to the Voronoi cell to which the new points fall
|
94 |
+
on.
|
95 |
+
The generalization error of standard kernel ridge regression without partitioning can
|
96 |
+
be upper bounded by two terms, a bias term and a variance term.
|
97 |
+
In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
|
98 |
+
term and a variance term.
|
99 |
+
But this time, these two terms are weighted and they are weighted by a certain quantity
|
100 |
+
that depends on an angle theta, which is the minimum angle between all the subspaces of
|
101 |
+
the partitions.
|
102 |
+
For example, when all the subspaces are orthogonal between each other, we recover the exact same
|
103 |
+
generalization error of standard kernel ridge regression.
|
104 |
+
But we are also able to show that for angles which are small enough, we are able to obtain
|
105 |
+
a generalization error which is of the same order of standard kernel ridge regression.
|
106 |
+
These theoretical results suggest us how to construct a good partition.
|
107 |
+
So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
|
108 |
+
between the Voronoi cells.
|
109 |
+
And in particular, we use the Schur complement to measure the orthogonality.
|
110 |
+
We also use the Schur complement to measure the orthogonality of the Voronoi centroids.
|
111 |
+
And in particular, we use the Schur complement to measure the orthogonality.
|
112 |
+
Given all these ingredients, we are now able to measure the computational complexity of
|
113 |
+
PARC, which has a time complexity that is the sum of two terms.
|
114 |
+
A first term, q squared n log n, which is the cost of computing the centroids with the
|
115 |
+
just mentioned procedure.
|
116 |
+
And a second term, q squared n log n, which is the cost of computing the most expensive
|
117 |
+
local estimator.
|
118 |
+
Empirically, we performed experiments on data set of millions and of billions of points,
|
119 |
+
and we compared with the currently fastest global kernel methods and with some other
|
120 |
+
splitting kernel methods.
|
121 |
+
We can see that PARC is the only method that manages to match the accuracy of the global
|
122 |
+
estimator.
|
123 |
+
Thank you all for your attention.
|
124 |
+
And thank you to the poster for all your questions and more details.
|
demo_data/nips-2021/25959/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello, my name is Pouya Bahshiban and I'm going to tell you about our paper titled
|
2 |
+
Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators
|
3 |
+
at MIWA, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim,
|
4 |
+
Kartika Hoja, Mojtaba Farmazi, Tourez Dale, Lake Richards and Erin Oji. A common assumption in
|
5 |
+
machine learning is that the train and test samples come from the same distribution.
|
6 |
+
While this is a reasonable assumption under most circumstances, it is intentionally violated in the
|
7 |
+
regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
|
8 |
+
perturbations that cause the input to be misclassified. In the case of white box attacks,
|
9 |
+
the model itself is transparent to the attacker and the attacker uses it to identify the possible
|
10 |
+
inputs that would lead to misclassifications. A famous example of this is the image of a panda
|
11 |
+
that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
|
12 |
+
gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
|
13 |
+
and unless the classifier is specifically trained to be robust against these attacks,
|
14 |
+
the attacks could completely break down the classifier's performance.
|
15 |
+
This issue becomes even more critical when we consider the vast usage of these machine learning
|
16 |
+
systems in our societies. For example, the possible security concerns that rise in face
|
17 |
+
recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
|
18 |
+
So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
|
19 |
+
feature learning function f that projects inputs x to latent space with feature space z
|
20 |
+
and a classifier that uses the latent code z to predict the correct class label y hat.
|
21 |
+
The perturbation function or the attack generates a perturbed sample x prime
|
22 |
+
within the epsilon neighborhood of the input x, which we're showing here as b of x and epsilon.
|
23 |
+
By maximizing the classification objective, the opposite of how we normally optimize the classifier's
|
24 |
+
parameter. Many methods have been proposed to defend the models against adversarial attacks.
|
25 |
+
Two of these methods that have withstood the test of time so far are the adversarial training
|
26 |
+
by Alexander Modrianov, which proposes a defense method by solving a minimax optimization problem
|
27 |
+
that involves finding an adversarial input by maximizing the classification loss in the inner
|
28 |
+
loop followed by a classifier training to minimizing the classifier loss on these adversarial inputs.
|
29 |
+
This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
|
30 |
+
The adversarial training method essentially learns to separate the distributions of adversarial
|
31 |
+
examples belonging to different classes. The second method is the trades method by Zhang et al,
|
32 |
+
which proposes to push the decision boundary of the classifier away from the data.
|
33 |
+
Trades achieves this by introducing a regularization term to the original learning
|
34 |
+
objective for classification that penalizes the mismatch between the predicted label
|
35 |
+
for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
|
36 |
+
this procedure, where now the defense method learns to separate the distributions of clean examples
|
37 |
+
belonging to different classes while minimizing the loss of the classifier.
|
38 |
+
The third method is the trade method by Wang et al, which proposes to push the decision boundary
|
39 |
+
of the classifier to the inner loop followed by a classifier training to minimizing the
|
40 |
+
classification loss on these adversarial inputs. The third method is the trade method by Zhang et al,
|
41 |
+
which proposes to push the decision boundary of the classifier to the inner loop followed by a
|
42 |
+
classifier training to minimizing the classification loss on these adversarial inputs to the inner
|
43 |
+
loop. The third method is the trade method by Wang et al, which proposes to push the decision
|
44 |
+
boundary of the classifier to minimizing the classification loss. The fourth method is the
|
45 |
+
trade method by Wang et al, which proposes to push the decision boundary of the classifier
|
46 |
+
for a source domain, but we want the classifier to also perform the same task on a related target
|
47 |
+
domain that we might not have enough data for or that the generating procedure for sampling
|
48 |
+
domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the
|
49 |
+
question of under what conditions can we adapt a classifier trained on the source domain for use
|
50 |
+
in the target domain. Here we consider the original clean distributions as the source domain and the
|
51 |
+
distribution of adversarial images generated from those images as the target domain. Although here
|
52 |
+
the target domain continuously evolves because the adversarial examples are based on the current
|
53 |
+
state of the model at each time step. And similar to the domain adaptation theory, our goal here
|
54 |
+
is to learn how to perform well on both source and target domains, meaning the natural and
|
55 |
+
adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
|
56 |
+
what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
|
57 |
+
feature learning function f that projects inputs x to latent space or feature space z and the
|
58 |
+
classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
|
59 |
+
and adversarial examples as input domains dx and d' x and their induced feature distributions
|
60 |
+
which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
|
61 |
+
as the classification error over the domains dz and d' z, what we are going to refer to as the
|
62 |
+
clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bond
|
63 |
+
on the adversarial error in terms of the natural error and the distance between the two domains.
|
64 |
+
Fortunately, from the prior work, we know that h delta h distance, which measures the distance
|
65 |
+
between two domains, can be estimated using the classifier trained to discriminate between the
|
66 |
+
two domains. Now our defense method called adversarial feature desensitization essentially
|
67 |
+
minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
|
68 |
+
has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
|
69 |
+
For this, we first update the parameters theta and phi in the feature learning function f and
|
70 |
+
task classifier c to minimize the classification loss on the natural domain. This is shown with
|
71 |
+
green arrows and green boxes marked 1 on both the equation and on the diagram.
|
72 |
+
Secondly, we estimate the h delta h distance using an additional domain discriminator
|
73 |
+
network that predicts the domain identity from the latent code z. We update the domain
|
74 |
+
discriminator parameters psi to minimize the domain classification loss. And finally,
|
75 |
+
in the third step, we update the feature learning network parameters theta to maximize the domain
|
76 |
+
classification loss in an adversarial way. These two steps are marked with red arrows in the figure
|
77 |
+
and red boxes on the equation. Similar to previous two methods, adversarial training and trades that
|
78 |
+
I showed you, we here we can also graphically demonstrate this procedure. In our method AFD,
|
79 |
+
we learn to separate the classes from the distributions of clean examples while at the
|
80 |
+
same time we optimize a domain classifier that learns the boundary between the clean and adversarial
|
81 |
+
examples for each class. And finally, we push the adversarial examples to the opposite side of that
|
82 |
+
boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations
|
83 |
+
and hence the name adversarial feature desensitization. We tested our method on four
|
84 |
+
data sets and compared them with a number of other baselines including with adversarial training and
|
85 |
+
trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
|
86 |
+
Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovski and Goodtuner.
|
87 |
+
In the table, we evaluated all methods on several white box and black box attacks with
|
88 |
+
nominal strengths into each data set. Overall, our method AFD and especially AFDWGAN showed superior
|
89 |
+
performance against most attacks in most data sets. However, AFD was behind trades on several attacks
|
90 |
+
especially on CIFAR-100 and TinyImageNet data set that had more classes in it.
|
91 |
+
We also looked at robustness across attack methods and attack strengths which we controlled with the parameter
|
92 |
+
epsilon. The diagrams on the right show the robust accuracy for each defense method across
|
93 |
+
eight attack methods and various epsilon values for each of them. Overall, our results in these
|
94 |
+
diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
|
95 |
+
across attack strengths. To quantify these differences, we also computed the area under
|
96 |
+
the curve for each method for each attack and summarized them in a table on the left.
|
97 |
+
As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
|
98 |
+
compared to other baselines. If you remember from previous slides, the domain adaptation theory
|
99 |
+
predicted a bound on the adversarial error which can also be turned into a bound on the generalization
|
100 |
+
gap between natural and adversarial attacks. We empirically tested this prediction in our experiments
|
101 |
+
under two settings. Under the first setting, we varied the epsilon value for the PGDL-infinity
|
102 |
+
attack which was used during the training. And under the second setting,
|
103 |
+
we used a diverse set of attacks and various attack strengths for each of them.
|
104 |
+
And under both scenarios, we found that the domain discriminator, which was originally trained on a
|
105 |
+
particular attack and attack strength, in our case it was PGDL-infinity attack with a fixed epsilon
|
106 |
+
for each data set, could well predict the generalization gap to unseen attacks and
|
107 |
+
different attack magnitudes. This suggests that the adversarial training against a domain classifier
|
108 |
+
like that used in our proposed method could potentially lead to robust models with better
|
109 |
+
generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
|
110 |
+
and attack strengths, it occasionally was worse compared to other baselines, especially in data
|
111 |
+
sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
|
112 |
+
domain classifiers in these data sets and leaves much space for future work on
|
113 |
+
investigating the effect of domain classifiers on the robustness of feature learning functions.
|
114 |
+
Also, AFD required more backward computations compared to some of the other baselines
|
115 |
+
such as adversarial training, and as a result, its training time was on average about 31%
|
116 |
+
longer than adversarial training. We invite you to read our paper for more details and please
|
117 |
+
get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it.
|
demo_data/nips-2021/25962/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Bonjour à tous, je suis Yannis Hartel et je vais vous présenter un travail sur l'estimation
|
2 |
+
de fonctionnalité en termes de certaines contraintes particulières de la privacité.
|
3 |
+
C'est un travail en lien avec mon conseiller postdoc, le professeur Cristina Gutucia.
|
4 |
+
Nous sommes intéressés par le fonctionnalité de la somme de puissance, qui est la somme de probabilités associées
|
5 |
+
à une distribution discrète, à la puissance gamma, où gamma est un nombre réel positif.
|
6 |
+
Donc, ce fonctionnalité de la somme de puissance est un exemple d'information qui se déroule dans différents domaines
|
7 |
+
comme les statistiques, l'apprentissage de machines, la théorie de l'information, la science de la neurone, etc.
|
8 |
+
Voici donc le problème statistique standard, où l'objectif est d'estimer la somme de puissance fonctionnelle
|
9 |
+
basée sur des exemples NIID, X1, X2 jusqu'à XN, qui suivent une distribution discrète B avec une taille d'alphabet K.
|
10 |
+
Une approche beaucoup utilisée est le estimateur de plug-in, où l'on utilise un estimateur du paramètre P
|
11 |
+
pour construire un estimateur du fonctionnalité, à travers le principe de plug-in.
|
12 |
+
Cette approche n'est pas seulement simple et intuitive, mais elle est aussi théoriquement saine,
|
13 |
+
car elle satisfait une efficacité asymptotique et une quasi-optimalité non-asymptotique.
|
14 |
+
La question intéressante de notre paper est de savoir si cette approche de plug-in
|
15 |
+
fonctionne dans un état de séparation non standard, où l'on impose une contrainte de privé,
|
16 |
+
et plus précisément, le setup de la privé différente local.
|
17 |
+
Ce qui signifie que l'on impose un état de privé fort, où l'on n'a pas accès aux données initiales et sensibles, les XI.
|
18 |
+
Au lieu de ça, l'on a seulement accès à une version privée de XI.
|
19 |
+
Voici la représentation d'un mécanisme simple qui n'est pas interactif.
|
20 |
+
Les termes local ici reflètent le fait que le mécanisme QI ne voit que les données XI.
|
21 |
+
En d'autres mots, il n'y a pas de troisième parti confiant qui a accès à toutes les données sensibles.
|
22 |
+
C'est un mécanisme de privé non-interactif simple, mais bien sûr, nous sommes aussi intéressés par des mécanismes plus sophistiqués,
|
23 |
+
notamment le mécanisme de séquence interactif, où chaque QI voit les données privées dévoilées précédemment,
|
24 |
+
et les données privées de XI, et les données privées de XI.
|
25 |
+
Dans cette étude non-standard, nous retournons au problème original de l'estimation fonctionnelle de la power sum,
|
26 |
+
où nous n'avons qu'accès à des données privées de XI jusqu'à XL.
|
27 |
+
Notre première contribution est de donner une caractérisation tigrée et non-transomatique du erreur de caractérisation de la power sum de l'estimateur.
|
28 |
+
Ce résultat montre que l'estimateur de la power sum n'est pas optimal.
|
29 |
+
Cela contraste avec la performance de l'estimateur de la power sum dans le problème statistique standard.
|
30 |
+
Le message ici est que les bons estimateurs dans le setup standard ne sont pas toujours bons estimateurs dans le setup local privacy.
|
31 |
+
Notre deuxième contribution est la correction du estimateur de plug-in grâce à une attentionnée de troncation de Pk de petites probabilités.
|
32 |
+
Cette correction conduit à une réduction significative du risque d'erreur.
|
33 |
+
En particulier, le risque devient indépendant du size alphabétique K lorsque K est grand.
|
34 |
+
Cette deuxième contribution, par contre, se base sur un mécanisme de privé non-interactif simple.
|
35 |
+
Dans la seconde partie du document, nous examinons un mécanisme de séquence interactive plus sophistiqué,
|
36 |
+
pour lequel nous construisons une procédure de deux pas qui nous permet de réduire le risque grâce à un facteur logarithmique.
|
37 |
+
Enfin, à la fin du document, nous fournissons un lien universel en bas sur le risque d'erreur
|
38 |
+
avec respect à tous les estimateurs et tous les mécanismes non-interactifs et séquentially interactifs.
|
39 |
+
Malheureusement, ce lien bas est un lien d'accords uniquement dans certains cas,
|
40 |
+
ce qui nous laisse avec quelques questions très importantes à poser sur ce problème.
|
41 |
+
Je pense que ce premier travail sur l'estimation fonctionnelle dans le contexte de la privé locale
|
42 |
+
vous donne au moins trois points clés.
|
43 |
+
Le premier point clé est le besoin de construire une procédure statistique prudente pour la configuration de la privé locale,
|
44 |
+
puisque c'est un setup où un bon estimateur dans un cadre standard n'a pas nécessairement de fonction.
|
45 |
+
Le deuxième point clé est que l'approche de type de plug-in analysée dans ce document
|
46 |
+
sert comme un benchmark pour de futurs travaux et des procédures plus sophistiquées.
|
47 |
+
Et le dernier point clé est que notre analyse de l'approche de type de plug-in et des mécanismes non-interactifs
|
48 |
+
montrent des régimes où le problème d'estimation est difficile
|
49 |
+
et espérons que cela incite les gens à amener des développements ici.
|
50 |
+
Merci à tous, et pour plus de détails, veuillez vérifier notre document en ligne.
|
51 |
+
Bye!
|
demo_data/nips-2021/25963/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello, I'm Hassam Murtaghi. I'm a PhD student at Georgia Tech. Along with my collaborator
|
2 |
+
Jay Mundra, we will present our work on reusing combinatorial structure, faster projections
|
3 |
+
over submodular-based polytopes. This is joint work with Swati Gupta.
|
4 |
+
In this talk, we consider a sequence of similar structured optimization problems a setup often
|
5 |
+
encountered in practice. We first start with our main problem of minimizing a convex function
|
6 |
+
over a decision set P. At the next time step, this problem sees some perturbation and we
|
7 |
+
obtain another similar problem, and so on. An example of this setup is the case of iterative
|
8 |
+
projections where at each time step, we are computing the projection of a new point y
|
9 |
+
t that is close to previously projected points y i. These iterative projections form a key
|
10 |
+
step in many optimal learning algorithms and they are currently solved from scratch every
|
11 |
+
iteration. They are not viewed in the context of an iterative environment where previously
|
12 |
+
computed projections can be exploited to speed up subsequent ones.
|
13 |
+
Thus, in this talk, we ask, is it possible to speed up similar iterative optimization
|
14 |
+
problems by reusing structural information from previous minimizers?
|
15 |
+
Let me now give you some more details about our setup. Here is a table that summarizes
|
16 |
+
various widespread first-order optimization algorithms. The first two algorithms are conditional
|
17 |
+
gradient variants and they only solve linear optimization every iteration. Their convergence
|
18 |
+
rates depend on the dimension of the problem and on geometric constants for the underlying
|
19 |
+
decision set, such as the pyramidal width for the waystep-Fraenkel variant given in
|
20 |
+
the second row. On the other hand, the remaining third algorithms
|
21 |
+
are projection-based algorithms that compute the projection every iteration, and their
|
22 |
+
convergence rates, however, are optimal in the sense that they only rely on the condition
|
23 |
+
number of the function and they are dimension-independent. Further, to capture a wide range of combinatorial
|
24 |
+
sets, we consider the case where decision set P is given by a submodular polytope, and
|
25 |
+
the challenge is that these polytopes have an exponential number of constraints. Thus,
|
26 |
+
computing a projection over those polytopes is a big computational bottleneck in projection-based
|
27 |
+
algorithms. Motivated by the trade-off in convergence rates versus runtime, we further
|
28 |
+
ask, is it possible to speed up iterative projections over submodular polytopes by reusing
|
29 |
+
structural information from previous minimizers? I'm now going to give more introduction on
|
30 |
+
the problem and submodularity and review of first-order methods. So, as mentioned, we
|
31 |
+
assume that the combinatorial structure in a problem is given by a submodular function.
|
32 |
+
A set function F, defined over a ground set E of n elements, is submodular if it satisfies
|
33 |
+
the following property. Furthermore, the base polytope associated with F is defined as the
|
34 |
+
following system of linear inequalities, and here we see that V of F is modeled using an
|
35 |
+
exponential number of constraints because we have a constraint for each subset of the
|
36 |
+
concept. An example is the permutahedron, a polytope whose vertices are permutations
|
37 |
+
of 1 through n. And here we have an example in the slide for when n is equal to 3. These
|
38 |
+
polytopes are extensively used in online learning over rankings of items. A special class of
|
39 |
+
submodular polytopes are known as Cardinality-based functions, and a Cardinality-based function
|
40 |
+
F is defined as F of S equal to G Cardinality of S, where G is a concave function. And here
|
41 |
+
we have another table that summarizes various machine and online learning problems in a
|
42 |
+
submodular set function that gives rise to them. We see the permutahedron in the second
|
43 |
+
row of this table, and it is in fact a Cardinality-based polytope. Other non-Cardinality-based examples
|
44 |
+
include spanning trees and independent sets of matroids.
|
45 |
+
So let's go back to our main problem of minimizing a convex function over the base polytope.
|
46 |
+
So there typically exist three main paradigms to solve this problem. The first is a class
|
47 |
+
of methods, known as conditional gradient methods, and as I mentioned before, those
|
48 |
+
assume access to B of F via linear optimization oracle. And these methods are specifically
|
49 |
+
advantageous for base polytopes because linear optimization over base polytopes could be
|
50 |
+
done very efficiently using Edmonds' greedy algorithm. The second class of methods are
|
51 |
+
mere descent variants, and those compute a projection every iteration to ensure feasibility.
|
52 |
+
And again, as I also previously mentioned, although those methods have optimal convergence
|
53 |
+
rates and are robust, they remain of a theoretical nature due to being computationally
|
54 |
+
expensive. The third class of methods are combinatorial algorithms specifically tailored
|
55 |
+
for convex optimization over some modular-based polytopes. Those algorithms require instead
|
56 |
+
solving a some modular function minimization problem every iteration, which again can be
|
57 |
+
very expensive. However, those algorithms enjoy the nice property of returning exact
|
58 |
+
optimal solution. In this talk, we will focus on bridging the efficiency of CG methods and
|
59 |
+
the structural properties and exactness of combinatorial algorithms to speed up iterative
|
60 |
+
projections appearing in mere descent and beyond. So first, let's consider the simpler
|
61 |
+
case when our polytope is cardinality-based. So here we have a cardinality-based some modular
|
62 |
+
function F, and for notation we define this vector c to be the vector of discrete derivatives
|
63 |
+
of the concave function g. We now give the following duality result, which states that
|
64 |
+
the problem of computing a Bregman projection over a cardinality-based polytope is dual
|
65 |
+
to isotonic optimization. Although our results hold for general Bregman projections, we will
|
66 |
+
focus on the case of Euclidean projections for simplicity. To that end, consider a vector
|
67 |
+
y that we're trying to compute its Euclidean projection over a cardinality-based polytope,
|
68 |
+
and let e1 through en be an ordering of the ground set such that y is decreasing. In this
|
69 |
+
case, we have the following primal problem, and the dual to that is the following isotonic
|
70 |
+
regression problem. And further, we can map between the two problems using the following identity here.
|
71 |
+
So just to give you some historical context, previously the best known running time for
|
72 |
+
projections was O n squared using a primal algorithm by Gupta et al. Later on in that
|
73 |
+
year, Lim and Wright used the same Duati approach to compute projections over the permutahedron,
|
74 |
+
and we extended their approach to general cardinality-based polytopes. Now the dual
|
75 |
+
isotonic regression problem could be solved in O n time using a simple algorithm called
|
76 |
+
pool-adjacent violators algorithm, and this basically gives us an O n log n algorithm by
|
77 |
+
solving the problem in the dual space and mapping it back to the primal space. And this is currently
|
78 |
+
the fastest known algorithm. And the key takeaway is that solving projections over these polytopes
|
79 |
+
can be very efficiently done. In fact, computing a projection and solving linear optimization
|
80 |
+
have the same running time. Now let's demonstrate our result with an example. So here we are going
|
81 |
+
to project this vector y onto the probability simplex, and the probability simplex is modeled
|
82 |
+
by this cardinality-based modular function here given on the slide. And we see that y is already
|
83 |
+
ordered for simplicity and c is the vector of discrete derivatives. Now the algorithm will
|
84 |
+
proceed as follows. It initializes the dual iterates by the vector that we're trying to
|
85 |
+
compute the isotonic regression for, c minus y, and here we have an adjacent violation because the
|
86 |
+
second coordinate is strictly smaller than the first coordinate. Now the algorithm will basically
|
87 |
+
average those two coordinates to obtain the following solution z star, and here we see that
|
88 |
+
the ordering constraints are satisfied and z star is in fact the dual optimal. Next it will map it
|
89 |
+
back to a primal optimal. And let's go back to this figure from the previous slide that just compares
|
90 |
+
a basic linear regression fit with an isotonic regression fit. Here in the red stepwise curve,
|
91 |
+
the points at which the curve remains flat is where a block of consecutive adjacent violated
|
92 |
+
points are averaged similar to our example. This very efficient algorithm for computing
|
93 |
+
regimen projections over cardinality-based polytopes unfortunately does not extend to
|
94 |
+
general submodular based polytopes. And now my collaborator Jay will present different combinatorial
|
95 |
+
strategies for dealing with those polytopes. We now describe our toolkit for speeding up
|
96 |
+
projections on general submodular based polytopes. There are two basic objects that we can learn from.
|
97 |
+
First, given projections of previous points, can we do better than computing a new projection from
|
98 |
+
scratch? Second, given an iterative algorithm to compute a projection, can we use the combinatorial
|
99 |
+
structure present in the sequence of iterates to speed up the algorithm and terminate it early?
|
100 |
+
We have the well-known first-order optimality condition on the left. It helps us verify if a
|
101 |
+
point is indeed optimal. This check is reduced to a linear optimization over the base polytope,
|
102 |
+
which can be done using Edmunds-Greedy algorithm. We have an example. Suppose we know the gradient
|
103 |
+
at a point x star and want to check if x star is indeed optimal. We look at the distinct values
|
104 |
+
of the partial derivatives at x star and arrange them in an increasing order. Each time we see a
|
105 |
+
gap in this order, we want that the point x star on the prefix set equal the submodular function
|
106 |
+
value on that set. In the figure, the first such gap is after we have seen even an E5. Therefore,
|
107 |
+
x star S1 must equal f of S1. Similarly, x star S2 must equal f of S2. Finally, xE must equal f of
|
108 |
+
E. These sets S1, S2, and E are called tight sets at x and define the face containing the point x
|
109 |
+
star. This leads us to two interesting observations that we use later. One, that if we know precisely
|
110 |
+
what the tight sets are at the optimal points, we can also calculate the optimal point for all
|
111 |
+
suitable functions h. Two, that knowing the gradient at the optimal point gives us these
|
112 |
+
tight sets. We give an example using our combinatorial idea. Suppose we know a point
|
113 |
+
zk that is close to our optimal x star. If the function is smooth, this implies gradient at zk
|
114 |
+
and x star are close. This gives us a way to learn some tight sets defining the optimal face.
|
115 |
+
In the example, for each coordinate, the blue line in the middle represents the partial derivative
|
116 |
+
value at zk and the blue shade represents the possible variation in that value for the optimal
|
117 |
+
point x star. That is, the corresponding partial derivative for x star lies in the shaded interval.
|
118 |
+
The largest values in these intervals for E1 and E5 are lower than the lowest values in these
|
119 |
+
intervals for every other element. This helps us conclude that the set E1 and E5, that is S1,
|
120 |
+
is a tight set at x star. Similarly, we infer that S2 is also a tight set at x star.
|
121 |
+
We now use that idea to give our first two tools. These apply more generally, but we demonstrate
|
122 |
+
them using Euclidean projections. Suppose we already know the projection xi of a point yi,
|
123 |
+
and we wish to find the projection xt of point yt, given that yt is close to yi.
|
124 |
+
The non-expansiveness of projection implies that the gradients at xi and xt are also close,
|
125 |
+
and therefore we can infer some tight sets at xt even before solving.
|
126 |
+
Suppose we start computing the projection of yt using an iterative algorithm.
|
127 |
+
We now use the iterates zi that converge to xt. An iterate zt that is close to xt also has a
|
128 |
+
gradient that is close to the gradient at xt, and once again we can infer some tight sets at xt
|
129 |
+
as we approach the optimal. We also conducted an experiment to show that tool T1 can recover
|
130 |
+
most tight sets from previous projections. We now give two tools that help us round an
|
131 |
+
approximate solution exactly to the projection. First is our tool T3 called Relax.
|
132 |
+
We give a heuristic to check if we have already found all the tight sets at the optimal.
|
133 |
+
We also show that we can round combinatorially when we know the function f to be integral,
|
134 |
+
and an iterate zt is close enough to the optimal xt. This is our tool T4.
|
135 |
+
We can reuse previously known vertices of the polytope. Suppose that our optimal is xt,
|
136 |
+
and we are given a close by point xi as a convex combination of some vertices in the polytope.
|
137 |
+
We can use those vertices to warm start the search for xt. Now our sixth tool, Restrict.
|
138 |
+
Once we know a few tight sets for xt using our inferred tools T1 and T2,
|
139 |
+
we needn't search over the optimal or the whole base polytope. We can restrict ourselves to the
|
140 |
+
face of the polytope that satisfies these constraints. We show that a simple extension
|
141 |
+
of Edmunds' greedy algorithm provides yellow oracle for each face of the polytope.
|
142 |
+
We now bring together these tools and apply them to the awaystep-frank-wolff algorithm,
|
143 |
+
giving the algorithm we dub adaptive awaystep-frank-wolff, or A2FW for short.
|
144 |
+
First, warm start A2FW using tight sets for the optimal inferred from previous projected points,
|
145 |
+
and active sets from previous projected points. While the algorithm runs and generates new
|
146 |
+
iterates, it keeps inferring new tight sets for the optimal point using these iterates.
|
147 |
+
In each iteration, if a new set has been found, the algorithm checks if all tight sets have been
|
148 |
+
found. If indeed so, then stop and output the exact solution. Otherwise, simply restrict the
|
149 |
+
problem to a low-dimensional face and keep going on. Note that the linear optimization is over a
|
150 |
+
restricted face of the polytope. Let's see an example. Suppose we are optimizing over the
|
151 |
+
polytope P. We look for the best frank-wolff vertex and the best away vertex. We find that
|
152 |
+
the best frank-wolff vertex is the best away vertex. Since the direction opposite to the away
|
153 |
+
vertex is the better direction to move in, we find the next iterate ZT plus 1. Now, ZT plus 1 is
|
154 |
+
close enough to X star that it allows us to detect another tight set and round to the face F new.
|
155 |
+
One way to do that is to round to an arbitrary vertex in F new using our yellow oracle. Another
|
156 |
+
option is to relax to F new and see if the solution obtained is feasible. If feasibility
|
157 |
+
check is uncertain, return to the previous strategy. Eventually, we reach the optimal
|
158 |
+
X star either way. We give this theorem about the primal gap for the modified algorithm.
|
159 |
+
The function h is l-smooth and mu strongly convex and d refers to the diameter of BF.
|
160 |
+
Notice how this compares to the AFW algorithm. When we restrict to a face F of BF, our guarantee
|
161 |
+
depends only on the pyramidal width of F instead of the pyramidal width of BF. This pyramidal width
|
162 |
+
can be much lower for the restricted face. For instance, it depends on the dimension of the face
|
163 |
+
for the probability simplex. Therefore, A2FW leads to a faster convergence. We now show the
|
164 |
+
effectiveness of our toolkit and the A2FW algorithm using experiments. For our computations,
|
165 |
+
we simulate an online recommendation system where we are learning over rankings of items
|
166 |
+
displayed to users. Our loss functions are stochastic model click-through rates. This
|
167 |
+
can be seen as optimization over the permutahedron. We use online mirror descent which performs
|
168 |
+
iterative projections and uses away step Frank-Wulf for these projections. We benchmark the
|
169 |
+
original AFW algorithm against variants modified by our tools. We report significant improvement
|
170 |
+
in both runtime and the number of AFW iterations. The green line stands for OMD with the original
|
171 |
+
unoptimized AFW. The yellow line stands for OMD with A2FW algorithm. We do note that both OMDPAV,
|
172 |
+
that is OMD with projections using the poor adjacent violators algorithm, and OFW were
|
173 |
+
significantly faster than OMD with any AFW variant. However, OFW does not lead to optimum
|
174 |
+
regret rates while OMDPAV works only for cardinality-based submodular polytopes. To
|
175 |
+
conclude, we studied iterative projections for prevalent submodular-based polytopes. We presented
|
176 |
+
an algorithm for cardinality-based polytopes. For general polytopes, we developed a combinatorial
|
177 |
+
toolkit to speed up iterative projections and applied it to the AFW algorithm and computationally
|
178 |
+
showed that our algorithm is orders of magnitude faster than the original AFW variant.
|
demo_data/nips-2021/25964/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
e la possibilità di eseguire un'operazione di modello di un'algebra.
|
2 |
+
Questo è un'operazione che è stata creata per il nostro studio,
|
3 |
+
e che è stato creato per il nostro studio.
|
4 |
+
Ciao a tutti, sono Matteo Papini,
|
5 |
+
e questo è un lavoro insieme con Andrea Tirinzoni,
|
6 |
+
Aldo Pacchiano, Marcello Restelli,
|
7 |
+
Alessandro Lazzarici e Matteo Pirotta.
|
8 |
+
Il nostro lavoro è motivato dall'efficacia
|
9 |
+
di algoritmi di imparazione di rinforzamento profondo
|
10 |
+
per risolvere tasche complesse, come i videoghi.
|
11 |
+
Una caratteristica fondamentale di questi metodi
|
12 |
+
è la possibilità di eseguire neural networks
|
13 |
+
per eseguire rappresentazioni complesse delle tasche
|
14 |
+
che permette di rappresentare e imparare
|
15 |
+
le polizie ottime efficacemente.
|
16 |
+
Capire cosa fa una rappresentazione buona
|
17 |
+
e come trovarne una
|
18 |
+
è fondamentale per disegnare
|
19 |
+
migliori algoritmi di imparazione di rinforzamento.
|
20 |
+
In questo lavoro, per prima volta,
|
21 |
+
ci sono state presentate caratterizzazioni formali
|
22 |
+
di rappresentazioni buone per l'imparazione di rinforzamento.
|
23 |
+
Abbiamo mostrato che usare una rappresentazione buona
|
24 |
+
può davvero beneficiare l'efficienza di imparazione
|
25 |
+
e fornire garantie di regretto costante.
|
26 |
+
Finalmente, abbiamo mostrato come una rappresentazione buona
|
27 |
+
può essere selezionata dall'interazione online,
|
28 |
+
un primo passaggio verso l'apprendimento di rappresentazione per RL.
|
29 |
+
Ma prima di tutto, qualche background.
|
30 |
+
Il problema di imparazione è modellato
|
31 |
+
come un processo di decisione di marco finito di orizzonte, o MDP.
|
32 |
+
In ogni passaggio di tempo, l'agente osserva un stato dell'ambiente,
|
33 |
+
prende un'azione e riceve una rinforza
|
34 |
+
e un stato successivo come risultato.
|
35 |
+
Questi sono determinati rispettivamente
|
36 |
+
da una funzione di rinforza e una funzione di transizione
|
37 |
+
che sono un'unità di tempo e un'unità di non-conoscenza.
|
38 |
+
L'interazione è dividita in due episodi
|
39 |
+
di lunghezza finita, che si chiama l'orizzonte.
|
40 |
+
All'ultimo episodio, il stato è risalto
|
41 |
+
a seconda della distribuzione fissata.
|
42 |
+
Il comportamento dell'agente è modellato da una polizia,
|
43 |
+
che è una mappatura da stati all'azione
|
44 |
+
che può anche essere dipendente del tempo.
|
45 |
+
La funzione di valore, o funzione Q della polizia Pi,
|
46 |
+
dà la rinforza aspettata totale
|
47 |
+
ottenuta prendendo l'azione A in stato S a tempo H
|
48 |
+
e poi seguendo la polizia fino all'ultimo episodio.
|
49 |
+
Un'ottima polizia è garantita
|
50 |
+
che la funzione Q si massima su tutti i stati.
|
51 |
+
Facciamo un'assumzione extra
|
52 |
+
che ogni stato admette un'azione ottima unica.
|
53 |
+
Quando il numero di stati è molto grande o anche infinito,
|
54 |
+
imparare l'ottima polizia può essere molto difficile.
|
55 |
+
Quindi guardiamo i linear MDPs
|
56 |
+
dove l'agente ha accesso a una rappresentazione compatta.
|
57 |
+
Questa è una mappatura di caratteristiche
|
58 |
+
da stati e azioni a vectori d-dimensional
|
59 |
+
dove D è più piccolo.
|
60 |
+
Potete vedere queste caratteristiche
|
61 |
+
come l'ultimo strato scoperto di un'intera rete neurale.
|
62 |
+
Nell'apprendimento di rinforzamento profondo
|
63 |
+
impariamo tutti i pesi della rete simultaneamente.
|
64 |
+
Qui mantendremo la rappresentazione fissa
|
65 |
+
e impareremo solo i finali parametri
|
66 |
+
che sono i pesi di una combinazione lineare.
|
67 |
+
Questa funzione lineare, almeno,
|
68 |
+
deve essere in grado di rappresentare la funzione Q ottima
|
69 |
+
in modo da poterla usare per prendere azioni ottime.
|
70 |
+
Ma, infine,
|
71 |
+
essere in grado di rappresentare la funzione Q ottima
|
72 |
+
non è abbastanza per l'apprendimento efficace
|
73 |
+
perché un numero esponenziale di esempi
|
74 |
+
può ancora essere richiesto.
|
75 |
+
Per evitare questo,
|
76 |
+
ci sono necessità di assumizioni strutturali extra
|
77 |
+
sull'MDP,
|
78 |
+
e alcune sono state proposte nella literatura.
|
79 |
+
Nel MDP di basso rango,
|
80 |
+
sia la funzione di rinforzamento che la funzione di transizione
|
81 |
+
sono lineari nelle stesse funzioni.
|
82 |
+
Queste funzioni possono essere tempo-indipendenti.
|
83 |
+
Assumiamo solo per semplicità
|
84 |
+
che le due funzioni condividono la stessa dimensione D.
|
85 |
+
Una prima conseguenza della struttura di basso rango
|
86 |
+
è che la funzione Q di ogni polizia
|
87 |
+
può essere rappresentata come una funzione lineare delle funzioni.
|
88 |
+
Una assumzione strutturale più forte è la rinforzamento di Bellman.
|
89 |
+
In questi MDP,
|
90 |
+
tutte le funzioni lineare delle funzioni
|
91 |
+
devono essere chiuse sotto l'operatore di optimità di Bellman.
|
92 |
+
La struttura di basso rango implica la chiusura di Bellman,
|
93 |
+
ma l'opposto non è vero.
|
94 |
+
Indeed, nelle MDP di chiusura di Bellman,
|
95 |
+
solo l'ottima funzione Q
|
96 |
+
è garantita di essere realizzabile lineariamente.
|
97 |
+
Le algoritmi di imparazione di rinforzamento efficace
|
98 |
+
sono state proposte per questi settimenti.
|
99 |
+
Possiamo evaluare le funzioni
|
100 |
+
usando il concetto di risalto,
|
101 |
+
che è l'amounto totale di sub-optimità
|
102 |
+
che viene sofferto dall'agente
|
103 |
+
durante il processo di imparazione
|
104 |
+
rispetto alla polizia ottima.
|
105 |
+
Nelle MDP di basso rango,
|
106 |
+
l'algoritmo LSVI-UCB
|
107 |
+
soffre solo un regalo sublineare
|
108 |
+
nel caso più grave.
|
109 |
+
Eleanor è una versione raffinata
|
110 |
+
che funziona nel caso più generale
|
111 |
+
della chiusura di Bellman
|
112 |
+
e ha una migliore dipendenza
|
113 |
+
sulla dimensione di caratteristiche.
|
114 |
+
Doveva essere notato, però,
|
115 |
+
che Eleanor è computazionale intrattabile.
|
116 |
+
Per il LSVI-UCB
|
117 |
+
abbiamo anche un regalo
|
118 |
+
di base di istanze
|
119 |
+
che è logaritmico
|
120 |
+
nel numero totale di interazioni.
|
121 |
+
Qui Delta denuncia
|
122 |
+
il capo di sub-optimità
|
123 |
+
di una pariera di attesa statale
|
124 |
+
che è assumato di avere
|
125 |
+
un minimo ben definito.
|
126 |
+
Tutti questi regali di base
|
127 |
+
ignorano la qualità della rappresentazione,
|
128 |
+
a parte le assumazioni strutturali
|
129 |
+
che sono necessarie
|
130 |
+
per la sua gestione.
|
131 |
+
La domanda che cercheremo di rispondere è questa.
|
132 |
+
Possiamo raggiungere
|
133 |
+
anche piccoli dolori
|
134 |
+
con una buona rappresentazione?
|
135 |
+
Per rendere questo concetto
|
136 |
+
di buona rappresentazione formale
|
137 |
+
introduciamo la proprietà Unisoft.
|
138 |
+
Una rappresentazione è Unisoft
|
139 |
+
se le caratteristiche ottime
|
140 |
+
spostano l'intero spazio di caratteristiche.
|
141 |
+
Le caratteristiche ottime sono
|
142 |
+
le caratteristiche delle azioni ottime
|
143 |
+
in stati che sono raggiuntibili
|
144 |
+
alla propria politica ottimale.
|
145 |
+
Intuitivamente, la proprietà Unisoft
|
146 |
+
garantisce che le caratteristiche ottime
|
147 |
+
sono diverse abbastanza
|
148 |
+
per che l'agente
|
149 |
+
cominci rapidamente alla politica ottimale
|
150 |
+
senza ridurre
|
151 |
+
l'amounto di informazioni che riceve
|
152 |
+
sulla tasca in generale.
|
153 |
+
Possiamo anche misurare
|
154 |
+
il grado di diversità della rappresentazione
|
155 |
+
guardando i più piccoli valori
|
156 |
+
degli eigenvali
|
157 |
+
della matrica di covarianza delle caratteristiche ottime.
|
158 |
+
Questo parametro di Lambda
|
159 |
+
porterà un ruolo importante
|
160 |
+
nelle nostre regrette.
|
161 |
+
Notate che un valore più alto di Lambda
|
162 |
+
è migliore perché denota
|
163 |
+
più diversità di caratteristiche
|
164 |
+
e che Lambda può essere al massimo
|
165 |
+
una sotto assumizioni comuni
|
166 |
+
sulla magnitude di caratteristiche.
|
167 |
+
Ma in quale senso sono queste rappresentazioni
|
168 |
+
ottime?
|
169 |
+
Ciò che abbiamo mostrato in MDP lineari
|
170 |
+
è che Unisoft è sinonimo
|
171 |
+
con regrette costanti.
|
172 |
+
Per prima cosa, abbiamo mostrato
|
173 |
+
che la proprietà di Unisoft
|
174 |
+
è necessaria per raggiungere
|
175 |
+
regrette costanti in MDP
|
176 |
+
con regretti lineari.
|
177 |
+
Questo appartiene a MDPs di basso rango,
|
178 |
+
Bellman closure,
|
179 |
+
e anche a MDPs di mixtura lineare
|
180 |
+
che sono un'altra
|
181 |
+
assumazione strutturale comune.
|
182 |
+
Ma Unisoft è anche sufficiente
|
183 |
+
per regrette costanti
|
184 |
+
in casi interessanti.
|
185 |
+
In MDPs di basso rango,
|
186 |
+
SVI-UCB raggiunge
|
187 |
+
regrette costanti se e solo se
|
188 |
+
la rappresentazione è Unisoft.
|
189 |
+
Con una alta probabilità,
|
190 |
+
un numero finito
|
191 |
+
di interaczioni è sufficiente
|
192 |
+
per l'agente imparare
|
193 |
+
perfettamente la polizia ottimale.
|
194 |
+
Quindi, la regrette può essere
|
195 |
+
rilassata in termini di questo tempo costante
|
196 |
+
regardless of the
|
197 |
+
total number of episodes k.
|
198 |
+
In altri parole, la regrette
|
199 |
+
è costante.
|
200 |
+
Notate come il tempo τ
|
201 |
+
dipende inversamente
|
202 |
+
sul parametro λ.
|
203 |
+
Indeed, con una mappa di
|
204 |
+
più diversità di caratteristiche, possiamo imparare
|
205 |
+
la polizia ottimale più velocemente.
|
206 |
+
Abbiamo un risultato simile
|
207 |
+
per Eleanor nel caso più generale
|
208 |
+
di MDPs di Bellman closure,
|
209 |
+
con anche una migliore
|
210 |
+
dipendenza sulla dimensione d
|
211 |
+
della caratteristica.
|
212 |
+
Infine, la mancanza di
|
213 |
+
lombari per Eleanor
|
214 |
+
dà questa polinomiale
|
215 |
+
dipendenza sul parametro λ
|
216 |
+
rispetto a una dipendenza logaritmica
|
217 |
+
nel caso di LSVI-UCB.
|
218 |
+
Ma questo potrebbe ben essere
|
219 |
+
un artefatto del nostro provo.
|
220 |
+
Per ricapitulare, abbiamo mostrato
|
221 |
+
che l'Unisoft è
|
222 |
+
sia necessario che sufficiente
|
223 |
+
per raggiungere regrette costanti
|
224 |
+
in MDPs di Bellman closure
|
225 |
+
e di low rank, e ha
|
226 |
+
provvinto regrette costanti
|
227 |
+
per i bounds superiori per algoritmi comuni.
|
228 |
+
Nella ultima parte del
|
229 |
+
talco, mostriamo come
|
230 |
+
le representazioni buone possono essere
|
231 |
+
scelte online.
|
232 |
+
Ci concentriamo su MDPs di low rank
|
233 |
+
per semplicità.
|
234 |
+
L'agente è dato un set
|
235 |
+
di N rappresentazioni candidate
|
236 |
+
che rappresentano
|
237 |
+
la stessa MDP di low rank
|
238 |
+
senza misspecificazione.
|
239 |
+
Le rappresentazioni possono avere
|
240 |
+
diverse dimensioni.
|
241 |
+
Questo differe dall'approccio tipico
|
242 |
+
di rappresentazione di lezione in RL
|
243 |
+
dove si cercano di trovare
|
244 |
+
una rappresentazione accurata
|
245 |
+
da una classe di funzioni realizzabili.
|
246 |
+
Questo permette di
|
247 |
+
risolvere le misspecificazioni, ma
|
248 |
+
è tipicamente fatto offline.
|
249 |
+
Il nostro obiettivo è
|
250 |
+
imparare così efficientemente
|
251 |
+
come se usassimo la migliore
|
252 |
+
rappresentazione candidata nel set
|
253 |
+
senza sapere in avanzo.
|
254 |
+
Ovviamente, se una delle candidate
|
255 |
+
è Unisoft, vorremmo
|
256 |
+
ottenere un regalo costante.
|
257 |
+
L'algoritmo che proponiamo
|
258 |
+
è LSVI Leader.
|
259 |
+
Si guida
|
260 |
+
N istanze parallele di LSVI UCB,
|
261 |
+
una per ogni rappresentazione
|
262 |
+
candidata.
|
263 |
+
Per ogni rappresentazione, usiamo
|
264 |
+
tutte le date collezionate
|
265 |
+
dall'agente per esimerare
|
266 |
+
il parametro dell'ottima
|
267 |
+
funzione Q accordo
|
268 |
+
a questa rappresentazione.
|
269 |
+
Questo è fatto con una combinazione
|
270 |
+
di square e induzione sbattuta.
|
271 |
+
Un bonus di esplorazione
|
272 |
+
viene aggiunto all'estimato
|
273 |
+
del parametro per rendere
|
274 |
+
l'estimato ottimista, come nel caso di LSVI UCB.
|
275 |
+
Ma ora
|
276 |
+
abbiamo un parametro ottimista
|
277 |
+
per ogni rappresentazione
|
278 |
+
e l'azione viene scelta
|
279 |
+
per maximizzare il più piccolo
|
280 |
+
parametro ottimista,
|
281 |
+
che è anche l'estimato più tico.
|
282 |
+
Notate come questo
|
283 |
+
è in realtà più potente
|
284 |
+
dell'algoritmo di selezione del modello
|
285 |
+
perché possiamo usare
|
286 |
+
una rappresentazione diversa
|
287 |
+
per ogni stato.
|
288 |
+
Vediamo che il regalo del leader di LSVI
|
289 |
+
è superiore
|
290 |
+
a quello di LSVI UCB
|
291 |
+
se è condannato con la rappresentazione
|
292 |
+
migliore dei candidati,
|
293 |
+
a meno di un fattore,
|
294 |
+
che è il numero di candidati
|
295 |
+
in square.
|
296 |
+
Questo significa che se abbiamo
|
297 |
+
una rappresentazione di Unisoft nel set,
|
298 |
+
il leader di LSVI
|
299 |
+
raggiunge il regalo di selezione.
|
300 |
+
Ma il leader di LSVI
|
301 |
+
può combinare rappresentazioni
|
302 |
+
attraverso stagi, stati e azioni,
|
303 |
+
e quindi
|
304 |
+
a volte può raggiungere
|
305 |
+
il regalo di selezione
|
306 |
+
anche se non c'è una rappresentazione di candidati
|
307 |
+
di Unisoft.
|
308 |
+
I nostri risultati teoretici sono anche supportati
|
309 |
+
dai risultati empirici
|
310 |
+
in MDPs di piccolo regalo di selezione.
|
311 |
+
Questi plotti mostrano il regalo di selezione
|
312 |
+
come funzione del numero di episodi.
|
313 |
+
A sinistra abbiamo
|
314 |
+
il regalo di LSVI-UCB
|
315 |
+
che è gestito con
|
316 |
+
diverse rappresentazioni.
|
317 |
+
Di queste, l'unica rappresentazione
|
318 |
+
in grigio nel plotto
|
319 |
+
è Unisoft, e solo in questo caso
|
320 |
+
LSVI-UCB è in grado
|
321 |
+
di raggiungere regali costanti.
|
322 |
+
A sinistra abbiamo il regalo
|
323 |
+
del leader di LSVI
|
324 |
+
che è gestito con vari set di candidati.
|
325 |
+
In tutti questi casi,
|
326 |
+
il leader di LSVI raggiunge
|
327 |
+
regali costanti.
|
328 |
+
Ovviamente, senza sapere
|
329 |
+
la migliore rappresentazione in avanzo,
|
330 |
+
ci serve più tempo per imparare la polizia ottima,
|
331 |
+
ma questo è stato anche aspettato
|
332 |
+
dalla nostra regola di selezione.
|
333 |
+
Il plotto arancione è particolarmente
|
334 |
+
interessante, perché in questo caso
|
335 |
+
l'unica rappresentazione di Unisoft,
|
336 |
+
numero 1,
|
337 |
+
non è nel set di candidati,
|
338 |
+
ma ancora LSVI-leader è in grado
|
339 |
+
di raggiungere regali costanti
|
340 |
+
combinando le representazioni rimaste.
|
341 |
+
Nel lavoro futuro,
|
342 |
+
vorremmo migliorare questo fattore
|
343 |
+
di sqvrtn nel regalo del leader di LSVI,
|
344 |
+
perché nel caso dei banditi lineari
|
345 |
+
la dipendenza sull'umare
|
346 |
+
delle rappresentazioni è solo logaritmica.
|
347 |
+
Vorremmo anche
|
348 |
+
estendere il leader di LSVI
|
349 |
+
per gestire le rappresentazioni
|
350 |
+
di candidati che sono miscele.
|
351 |
+
Tuttavia, questa
|
352 |
+
selezione delle rappresentazioni è
|
353 |
+
solo un passaggio verso
|
354 |
+
il learning of representation,
|
355 |
+
che significa imparare
|
356 |
+
la rappresentazione online da scratch.
|
357 |
+
Questo è già fatto
|
358 |
+
in pratica con il learning di
|
359 |
+
rinforzamento profondo, ma la teoria
|
360 |
+
di questo è scomoda.
|
361 |
+
Finalmente, possiamo considerare
|
362 |
+
il learning di rinforzamento multitasca,
|
363 |
+
dove una singola rappresentazione
|
364 |
+
potrebbe essere buona per un
|
365 |
+
composto di MDPs che condividono
|
366 |
+
una struttura. Grazie.
|
demo_data/nips-2021/25965/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
How many friends do you have?
|
2 |
+
At least you have more friends than I do.
|
3 |
+
Well, on average.
|
4 |
+
Don't get me wrong, I am not a pity person.
|
5 |
+
This is a mathematical fact known as the friendship paradox.
|
6 |
+
Suppose we have two persons, A who has one friend and B who has three friends.
|
7 |
+
Now let me ask in which friend list am I likely to appear?
|
8 |
+
Because B has three times more friends, I am three times more likely to appear in the
|
9 |
+
B's friend list.
|
10 |
+
The friendship paradox dictates that on average, your friends have more friends than you do.
|
11 |
+
The more friends someone has, the more likely someone appears in your friend list.
|
12 |
+
Beyond an interesting piece of trivia, the friendship paradox has substantial importance
|
13 |
+
because it may introduce biases in graph embeddings.
|
14 |
+
Hello everyone, my name is Sadamori Kojak, and we will walk you through a new insight
|
15 |
+
into biases in graph embedding arising from the friendship paradox.
|
16 |
+
The graph embedding is a technique to map a graph into a vector space that reflects
|
17 |
+
the structure of the graph.
|
18 |
+
A widespread paradigm is the approach based on Word2Vec.
|
19 |
+
In this approach, one somehow generates a sequence of nodes from the graph.
|
20 |
+
The nodes in the sentences are then mapped to a vector space by Word2Vec.
|
21 |
+
Now the key is that Word2Vec does not directly learn the graph, but through the sentences
|
22 |
+
generated from the graph.
|
23 |
+
Unlike the word embedding, where the input sentences are the actual data, for graph embedding,
|
24 |
+
the input sentence is artificially generated, and how to generate it is a critical modeling
|
25 |
+
decision.
|
26 |
+
This leads us to the question of how to generate the sentences from the graph.
|
27 |
+
A common way is to use random walks.
|
28 |
+
The worker starts from a node in the graph, and this node is the first node in the sentence.
|
29 |
+
Then the worker moves to one of the neighbors selected randomly.
|
30 |
+
This new node is added to the sentence.
|
31 |
+
By repeating this process, we can generate a sentence of nodes from this graph.
|
32 |
+
The friendship paradox comes into play when the worker follows an edge.
|
33 |
+
It is more likely to visit a node with many neighbors.
|
34 |
+
In other words, following edges is a bias sampling that preferentially leads random
|
35 |
+
workers to nodes with many neighbors.
|
36 |
+
To see this effect, let us consider a graph with co-peripheral structure, where kernels
|
37 |
+
have more neighbors than periphery.
|
38 |
+
A sentence can be generated from this graph by running a random walk.
|
39 |
+
Now, the kernels are about 20% of nodes in the graph.
|
40 |
+
But when looking at the generated sentence, the kernels are overrepresented, which is
|
41 |
+
because of the bias due to the friendship paradox.
|
42 |
+
The fact that the sentence is biased by the friendship paradox leads us to our main question.
|
43 |
+
Does the sampling bias have negative impact?
|
44 |
+
If so, how can we fix it?
|
45 |
+
Surprisingly, it has no effect because Word2Vec itself has an overlooked built-in devising
|
46 |
+
feature that happens to negate the bias due to the friendship paradox.
|
47 |
+
This built-in devising feature can be easily utilized to negate other types of biases,
|
48 |
+
and we demonstrate how to do this.
|
49 |
+
Our starting point is a sentence of words.
|
50 |
+
Word2Vec picks a word called center and surrounding words called context, and then models the
|
51 |
+
conditional probability using a softmax function, where the conditional probability is reflected
|
52 |
+
as a dot similarity of the two vectors of the words.
|
53 |
+
We want to fit this model to the data, but it is computationally challenging due to the
|
54 |
+
normalization constant, which extends over all unique words in the corpus.
|
55 |
+
A common way to reduce this burden is negative sampling.
|
56 |
+
Now, it is often underappreciated that negative sampling is actually a simplified version
|
57 |
+
of noise contrastive estimation.
|
58 |
+
And it is this simplification that gives rise to an interesting feature of Word2Vec.
|
59 |
+
How does the noise contrastive estimation, or NCE, works?
|
60 |
+
NCE samples k random contexts from so-called noise distribution.
|
61 |
+
This noise distribution is roughly proportional to the frequency of a word in the corpus.
|
62 |
+
The random contexts are labeled as 0, and the actual context is labeled as 1.
|
63 |
+
Then NCE calculates the probability that a word comes from actual data using a Bayesian
|
64 |
+
framework.
|
65 |
+
By putting the prior likelihood together, we have a posterior like this.
|
66 |
+
This function is a sigmoid function and takes the dot similarity and the noise distribution
|
67 |
+
as the arguments.
|
68 |
+
Now the key feature of the NCE is that it is asymptomatically unbiased for the model
|
69 |
+
of the Word2Vec.
|
70 |
+
Meaning if the data is actually generated from this model, and we increase the number
|
71 |
+
of trainings, then the embedding vectors converge to the true vectors.
|
72 |
+
Beyond Word2Vec, the noise contrastive estimation is also an unbiased estimator for a more general
|
73 |
+
model that takes a real value function f instead of the dot similarity.
|
74 |
+
Now the negative sampling simplifies the noise contrastive estimation.
|
75 |
+
It estimates the same probability, but variably drops the term of the noise distribution.
|
76 |
+
You might be wondering what happens without this term.
|
77 |
+
To see this, we rewrite it in form of the noise contrastive estimation, where we define
|
78 |
+
a new function f' which consists of the original function f as well as the noise distribution.
|
79 |
+
This is asymptomatically unbiased for a probability model which now includes the noise distribution.
|
80 |
+
So all in all, Word2Vec trained with skip-gram-negative sampling is asymptomatically unbiased for
|
81 |
+
this probability model, or more specifically for Word2Vec, this function.
|
82 |
+
In this model, the noise distribution offsets the modeled probability, serving as a baseline.
|
83 |
+
The embedding vectors captures the residual from the baseline.
|
84 |
+
Now, remind that the baseline probability is roughly proportional to the frequency.
|
85 |
+
Therefore, the embedding vectors capture the information other than the frequency.
|
86 |
+
In other words, SGNS Word2Vec has a built-in debiasing feature for frequency bias.
|
87 |
+
Now let us revisit the friendship paradox.
|
88 |
+
The sampling bias due to the friendship paradox is that the frequency of a word is determined
|
89 |
+
thoroughly by the degree of noise.
|
90 |
+
Notice that this frequency is actually accounted for by the baseline probability.
|
91 |
+
Therefore, the friendship paradox has no effect thanks to the built-in debiasing feature of
|
92 |
+
SGNS Word2Vec.
|
93 |
+
This realization leads us to Residual2Vec.
|
94 |
+
The key idea is to model the baseline probability explicitly to control what bias to remove
|
95 |
+
in embedding.
|
96 |
+
So how can we model the baseline more specifically?
|
97 |
+
We start from the given graph and randomize the structure, then generate a sequence using
|
98 |
+
random walks, then calculate the conditional probability as the baseline, which is based
|
99 |
+
on the idea that we should remove biases arising from the trivial structure.
|
100 |
+
This debiasing feature is useful to predict links in the graph.
|
101 |
+
Residual2Vec performs the best or nearly the best for all six graphs of different domains.
|
102 |
+
Furthermore, Residual2Vec is the best or the second best performer for a community detection
|
103 |
+
benchmark.
|
104 |
+
To showcase the debiasing feature, we constructed a citation graph of general issues using the
|
105 |
+
web of science, where the nodes are general issues connected by undirected and weighted
|
106 |
+
citations.
|
107 |
+
When applying grove embedding, all genres are concentrated on the center, reflecting
|
108 |
+
temporal aspects of the issues.
|
109 |
+
This is because the old issues have time to accumulate many citations, and therefore well
|
110 |
+
connected to many different issues.
|
111 |
+
For subject-wise, grove separates different fields to some extent.
|
112 |
+
With Residual2Vec, we can remove the biases due to time.
|
113 |
+
In effect, the old genres now spread out, and the disciplinary separations are more
|
114 |
+
clearly visible.
|
115 |
+
Beyond eyeballing the embeddings, we test the embeddings quantitatively by predicting
|
116 |
+
the genre impact factor as well as the subject categories.
|
117 |
+
We find that the impact factor and the subject of genres can be well predicted by removing
|
118 |
+
the temporal biases as well as the friendship paradox effect.
|
119 |
+
In summary, we show that World2Vec has a built-in debiasing feature attributed to negative sampling.
|
120 |
+
Inspired by this finding, we propose Residual2Vec that can negate other types of structural
|
121 |
+
biases.
|
122 |
+
We demonstrate that removing biases not only improves the performance, but also enabling
|
123 |
+
us to control on the biases in the final representation.
|
124 |
+
Our results highlighted a new potential of negative sampling as a way to mitigate biases
|
125 |
+
in representations, which may be useful to address the problem of the biases in AI.
|
126 |
+
Although we have not studied the biases in AI, given the wide usage of negative sampling
|
127 |
+
to train AI, our approach may lead to methods and studies that expose and mitigate biases
|
128 |
+
in AI.
|
129 |
+
We believe that our approach contributes to the effort to create transparent and accountable
|
130 |
+
machine learning methods, especially because our method enables us to explicitly control
|
131 |
+
the biases in the graph representation.
|
132 |
+
That's all for the presentation, and finally I'd like to acknowledge Jason Yoon, Isabel
|
133 |
+
Constantino, and Yongyuan An for creating and adding momentum to this project for years,
|
134 |
+
and for all of you who watched this video.
|
135 |
+
If you want to know more in detail, please check out our paper.
|
136 |
+
Thanks!
|
demo_data/nips-2021/25969/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello everyone, my name is Allen. I'm a PhD student from Stanford University. I'm presenting
|
2 |
+
our work Play to Grade, testing coding games as classifying Markov decision process. This
|
3 |
+
is joint work with Emma Brunskill and Chris Piech.
|
4 |
+
In this talk, we will highlight the central problem that we're trying to solve, which
|
5 |
+
is scaling up quality feedback for students learning to code is crucial. Grading interactive
|
6 |
+
coding game is very difficult, and we frame this as an instance of identifying if a program
|
7 |
+
has the same behavior as a desired MDP. Even with 11 label programs, we can achieve 94%
|
8 |
+
accuracy on real student assignment from code.org.
|
9 |
+
Each year, hundreds of thousands of people, children and adults alike, want to learn coding.
|
10 |
+
Modern massive online education platforms like code.org serves over 40% of US K-12 students.
|
11 |
+
Scaling up quality feedback for these students is crucial, especially in areas where there
|
12 |
+
are shortages of computer science teachers.
|
13 |
+
Interactive coding assignments are becoming more popular. It's a lot more fun for students
|
14 |
+
to program them. They're also a common type of programs for students to code. For example,
|
15 |
+
web pages are interactive. However, in order to grade them, teachers often need to play
|
16 |
+
each student homework for 20 seconds to a couple minutes. This quickly becomes a scaling
|
17 |
+
issue. A 20-student classroom might still be manageable, but in a large university where
|
18 |
+
there are hundreds of students taking the same class or on an online education platform
|
19 |
+
like code.org, grading these assignments is a real challenge. This places a real burden
|
20 |
+
on teachers.
|
21 |
+
Why is it difficult to develop automatic grading tools? First of all, each assignment is different
|
22 |
+
from each other. Traditional machine learning solutions that rely on collecting a large
|
23 |
+
set of data set simply won't work here. Oftentimes, assignments for the same class can even change
|
24 |
+
from year to year. Spending effort to collect a large label data set is a hard sell to teachers.
|
25 |
+
Second, the same assignment can be written in different coding languages. The solutions
|
26 |
+
could end up looking quite different. At last, code solutions can be very long, especially
|
27 |
+
when interaction is involved. Unfortunately, current state-of-the-art code analysis solutions
|
28 |
+
don't scale beyond 10 lines of code. In this work, we hope to offer a new solution
|
29 |
+
inspired by how human teachers grade these assignments.
|
30 |
+
Let's take a look at how a teacher plays to grade a student homework. This is what
|
31 |
+
a correct solution for code.org's coding assignment, Bounce, looks like. The teacher
|
32 |
+
controls a paddle to bounce a ball into a goal post and gets one score.
|
33 |
+
Here's what an incorrect student submission looks like. The student didn't put the boundary
|
34 |
+
condition for the wall and the ball goes right through it.
|
35 |
+
Here's another incorrect submission. Instead of getting a point after successfully bouncing
|
36 |
+
the ball into the goal post, the player gets a point whenever the ball bounces on wall
|
37 |
+
and paddle. This is clearly not the correct behavior.
|
38 |
+
However, a teacher isn't just playing the game normally. In order to grade it, the teacher
|
39 |
+
has to play it in a specific way to expose bugs in the game. Take a look at both programs
|
40 |
+
on the left and right. Both have wall boundary problems, but we would never know if the teacher
|
41 |
+
didn't try to bounce the ball on the wall. The right panel shows a game, though broken,
|
42 |
+
can look like a perfectly correct game.
|
43 |
+
Using the Markov Decision Process framework from reinforcement learning, we can characterize
|
44 |
+
the intuition we have built up. The MDP framework can be used to describe any interactive environment,
|
45 |
+
not just games. It includes a state space, action space, a transition dynamics that defines
|
46 |
+
how the game moves from one frame to the next, and a reward function. We can train an agent
|
47 |
+
using a reinforcement learning algorithm that learns to maximize the reward. So how does
|
48 |
+
the MDP framework help us understand programs with bugs?
|
49 |
+
We can treat each program as its own MDP. The teacher's correct program is the correct
|
50 |
+
or desired MDP, while the student's program is another MDP or a test MDP. We can frame
|
51 |
+
grading as an instance of identifying if a test MDP has the same behavior as a desired
|
52 |
+
MDP. Using components from the MDP framework, we can express bugs as distance between two
|
53 |
+
MDPs' transition and reward functions. The ball going through the wall is clearly not
|
54 |
+
a correct transition. Receive reward when you shouldn't can also be captured by the
|
55 |
+
difference in the reward function output. More precisely, we can treat grading as calculating
|
56 |
+
a distance between two MDPs. Equation 1 might suggest that we should check over all states.
|
57 |
+
However, since distance is non-negative and we're interested in the overall sum, we
|
58 |
+
only need to find one state-action pair in the test MDP to know if the overall distance
|
59 |
+
is non-zero. If we set this distance as a reward for an RL agent, we can make the task
|
60 |
+
of reaching bug states a lot more intelligent and efficient. This RL agent's objective
|
61 |
+
is to reach states that have the highest potential to be different between the two MDPs with
|
62 |
+
respect to this distance function. We do have one more challenge that remains.
|
63 |
+
The distance function DSA requires access to both MDPs' transition and reward functions.
|
64 |
+
We cannot assume we have access to the student program's inner mechanism. We can't control
|
65 |
+
the randomness in the student's code either, meaning two MDPs can have different random
|
66 |
+
initial starting positions. Therefore, when we interact with the student's MDP, we need
|
67 |
+
to learn a parametrized distance function that can tell us how far the observed state-action
|
68 |
+
pairs from the student MDP is from the correct MDP.
|
69 |
+
Now we have two parametrized models. The agent requires training to find the bug. The classifier
|
70 |
+
requires training to identify the bug. We call this the cold start problem. So, if I
|
71 |
+
have a classifier that can classify which state triggers a bug, then we can simply replace
|
72 |
+
reward function in the MDP with this classifier and directly teach our agent. If I have an
|
73 |
+
agent that can always reach the bug state, I can probably just collect a dataset of trajectories
|
74 |
+
and train a good classifier. But at the beginning, neither the agent nor the classifier can do
|
75 |
+
a very good job. Therefore, we introduce a procedure called
|
76 |
+
collaborative training. The agent will start out as a random agent, where we can train
|
77 |
+
the agent to maximize the original reward in the MDP. It collects trajectories and trains
|
78 |
+
the classifier. Then we use the classifier as a reward function to guide the agent on
|
79 |
+
how to reach bug states. They both start out bad, but the agent can help the classifier
|
80 |
+
learn and the classifier can in return teach the agent.
|
81 |
+
We present two baselines to train the bug classifier. Since we have some training data,
|
82 |
+
though not a lot, we can simply apply coarse labeling, creating a dataset where all state-action
|
83 |
+
pairs from the correct labeled MDP as non-bug states and all state-action pairs from the
|
84 |
+
broken MDP as bug states. This is incredibly noisy because not all state-action pairs from
|
85 |
+
the broken MDP are bug states, only a few of them are. But this is a good baseline to
|
86 |
+
have. We can also train an unsupervised learning
|
87 |
+
model to memorize all state-action pairs from the correct MDP and use log probability or
|
88 |
+
reconstruction loss to detect abnormal state-action pairs in the broken MDP.
|
89 |
+
Inspired by Hoare triples and MDP state equivalence literature, we designed two models to fully
|
90 |
+
capture this notion of MDP-based state difference. We assume that the students can specify and
|
91 |
+
set random seed for their game. Therefore, the game objects, such as a ball, will not
|
92 |
+
always appear in the same initial state. Therefore, it is crucial for us to approximate one MDP's
|
93 |
+
transition dynamics and reward function. When our agent interacts with a new MDP, this is
|
94 |
+
where HoareLSTM comes in. We train it to model the correct MDP's transition dynamics and
|
95 |
+
reward function and treat bug states in the new MDP when sufficient deviation occurs from
|
96 |
+
the prediction. We further introduce Contrastive HoareLSTM.
|
97 |
+
Sometimes the agent will explore a new region that it might not have visited in the correct
|
98 |
+
MDP. The predictive difference between the observed state and predictive state is in
|
99 |
+
fact a function approximation error. In order to reduce this error, we approximate both
|
100 |
+
the correct MDP and the broken MDP.
|
101 |
+
Let's take a look at how these models work. We introduce a car environment. In here, the
|
102 |
+
student miscalculated the boundary of this environment, so whenever the car goes outside
|
103 |
+
of the red dotted line, it will get stuck and can only wriggle back and forth. This
|
104 |
+
is a task where you will always reach a bug state at the end of each trajectory. Therefore,
|
105 |
+
every single agent is already an optimal agent. We create a specific one that only knows how
|
106 |
+
to drive north in a straight line.
|
107 |
+
As we can see, almost all models, except Gaussian mixture model, can be close to 100% accuracy
|
108 |
+
at classifying bug states and non-bug states. However, the agent that only knows how to
|
109 |
+
drive north is not a very interesting agent, and we probably will never use that in real
|
110 |
+
life. So what if we make it a little bit harder?
|
111 |
+
We can create an agent that drives the car randomly. Now the trajectory will become different
|
112 |
+
each time. We see a significant drop in performance for baseline solutions like noisy supervised
|
113 |
+
learning and variational autoencoder. However, our LSTM-based models can still do very well
|
114 |
+
at close to 100% accuracy. This is a pretty challenging task because we're measuring the
|
115 |
+
accuracy of each classifier on every state in a trajectory, even though we're in a toy
|
116 |
+
environment.
|
117 |
+
Let's make this setting even harder. The car environment can stay the same, but for now,
|
118 |
+
bugs can only be triggered if the agent successfully drives the car into some small red rectangular
|
119 |
+
areas. Not all agents are optimal now, and it would be unlikely for a single-direction
|
120 |
+
agent to ever see a bug state. We can now showcase the power of collaborative training
|
121 |
+
through this example.
|
122 |
+
We can see at the beginning, the agent is pretty random, and the classifier is pretty
|
123 |
+
bad except for the LSTM models. However, after only one round of collaborative training,
|
124 |
+
we see a substantial improvement for the two baseline models, both noisy supervised learning
|
125 |
+
model and variational autoencoder are able to improve their accuracy by 30% and precision
|
126 |
+
by 60%. This shows that the collaborative training is helping both the agent and the
|
127 |
+
classifier to be more optimal, even for the weaker classifiers.
|
128 |
+
We also notice that this improvement is not monotonic. Just like every other AI training
|
129 |
+
scheme, overfitting sometimes happens. Only the most expressive classifiers, our proposed
|
130 |
+
HoareLSTM and Contrastive HoareLSTM can remain stable and even mildly improve their recall
|
131 |
+
in the last round of collaborative training.
|
132 |
+
We can directly examine the agent's learning by looking at its trajectory. At first, the
|
133 |
+
agent drives the car randomly, but after only one round of collaborative training, the agent
|
134 |
+
becomes sharply focused and only visits the possible buggy areas.
|
135 |
+
We verify our method on a real student dataset that we obtained from code.org. We use this
|
136 |
+
assignment as our motivating examples earlier. Bounce is a simple coding exercise where 450,000
|
137 |
+
students have submitted their solutions. We built a simulator that can run and execute
|
138 |
+
students' programs that conforms to the OpenAI Gym API. For each student program, we have
|
139 |
+
created goal labels for bug behaviors. We further binarize them into a single label
|
140 |
+
indicating correct or incorrect.
|
141 |
+
Bounce is a lot more complicated than car. Learning to bounce a ball into the goalpost
|
142 |
+
and understanding the physics is a lot more difficult for the agent. Therefore, we pre-train
|
143 |
+
the agent using the score as a reward. We call this play-to-win agent. Then we use this
|
144 |
+
agent to train our bug classifier. We're able to reach 94% accuracy with only 11 label
|
145 |
+
programs as training data. A similar algorithm that uses code as text input cannot match
|
146 |
+
our method's performance due to the smallness of the training dataset.
|
147 |
+
In addition to just grading, since we're able to determine bugs at the state level,
|
148 |
+
we can simply record a few frames before and after the bug occurs and compile a short video
|
149 |
+
for the students to demonstrate what the bug is in their assignment.
|
150 |
+
To summarize our work, we provide a fully functional simulator and a massive amount
|
151 |
+
of real student programs with goal labels. We demonstrate that our solution achieves
|
152 |
+
a high performance. However, there are still many problems remain. For example, can we
|
153 |
+
know which bug is triggered in the student program? This is helpful for providing fine-grained
|
154 |
+
feedback to the students. Training an RL agent with a classifier has also been explored in
|
155 |
+
other areas like SafeRL, where unsafe states are predicted by a classifier.
|
156 |
+
At last, we pose this question of creativity. Can our formulation accommodate creativity?
|
157 |
+
Creative programs are different but not broken. A ball can move faster or slower than the
|
158 |
+
teacher's solution, but it doesn't mean it's wrong. Exploring how we can recognize
|
159 |
+
and encourage student creativity is crucial for automated grading. Thanks for listening.
|
160 |
+
Come and chat with me during the poster session.
|
demo_data/nips-2021/25970/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, my name is Maxwell Nye, and today I'll be talking about improving coherence and consistency
|
2 |
+
in neural sequence models with dual system neurosymbolic reasoning.
|
3 |
+
So I first want to give a little bit of a demo, which is to ask this question.
|
4 |
+
A bat and a ball cost $1.10 in total.
|
5 |
+
The bat costs $1 more than the ball.
|
6 |
+
How much does the ball cost?
|
7 |
+
So I'll let you think a little bit for this.
|
8 |
+
So one answer that sort of might jump out at you is $0.10, but this is actually incorrect
|
9 |
+
because the sum of the two objects should be $1.10.
|
10 |
+
So the correct answer is actually $0.05.
|
11 |
+
And this is an example from a cognitive reflection test, and these are questions designed to
|
12 |
+
have a particular answer which comes to mind quite quickly, which is in fact wrong.
|
13 |
+
And something that's interesting is that large-scale language models such as GPT-3 predict the
|
14 |
+
wrong answers as well.
|
15 |
+
And this is true not just for the sort of the classic cognitive reflection test, but
|
16 |
+
also for variants with different numbers.
|
17 |
+
So this is sort of an interesting thing.
|
18 |
+
It talks about how neural language models often have issues with consistency and coherence.
|
19 |
+
So another place that we can see this a little more concretely is the CLUTRR dataset.
|
20 |
+
In the CLUTRR dataset, models are trained to...
|
21 |
+
There are sentences about people and their family relationships and stories about those
|
22 |
+
people.
|
23 |
+
And this was originally devised as a question-answering data set where you ask what the relations
|
24 |
+
are.
|
25 |
+
One thing you can do is ask models to be trained on this data set and then generate new stories.
|
26 |
+
And when you do that, you'll see that often the generated stories have inconsistency.
|
27 |
+
So if we look at the bottom of the screen here, we can see an example of this.
|
28 |
+
Robert and his brother Antonio played harmonicas together.
|
29 |
+
Robert's daughter, Elsie, asked him to play with her.
|
30 |
+
Elsie doesn't like having to babysit her younger brother, Antonio.
|
31 |
+
And so we can see that this is a common sense error because Elsie is not the younger brother
|
32 |
+
of Antonio.
|
33 |
+
Or Elsie's younger brother is not Antonio.
|
34 |
+
So what we've done is we've built a dual system model using large-scale neural networks and
|
35 |
+
symbolic deliberative logic in order to try to help with these consistency issues.
|
36 |
+
So the model is as follows.
|
37 |
+
You use neural generation to generate sentences in a particular story.
|
38 |
+
You might generate the next sentence using a model such as GPT-3 or BART.
|
39 |
+
What you can then do is parse that sentence into the semantic meaning with respect to
|
40 |
+
the family relationships and check whether or not it matches the current state of the
|
41 |
+
family relationships that's been described so far, and only accept the candidate sentence
|
42 |
+
generations that are actually consistent.
|
43 |
+
So this has a few components.
|
44 |
+
One of the components here is a symbolic world model.
|
45 |
+
In the case of this CLUTRR domain, the symbolic world model that we built encodes people and
|
46 |
+
their family relationships.
|
47 |
+
So in other words, you could take a sentence and encode what the underlying family relationship
|
48 |
+
is.
|
49 |
+
And what you can do is you can use SMT solvers such as the Z3 solver to check consistency.
|
50 |
+
So given a new sentence, you can check that it doesn't disobey the rules of ancestry that
|
51 |
+
we've defined here.
|
52 |
+
And so some of those are, for example, what is the relationship between children and grandchildren?
|
53 |
+
And then another is what are the rules about whether ancestry, can you be your own ancestor,
|
54 |
+
et cetera.
|
55 |
+
So one question is how is this semantic parsing done?
|
56 |
+
And it turns out we can actually do this quite cheaply using GPT-3.
|
57 |
+
So what we can see here in the dotted box is an actual example of a few-shot prompt
|
58 |
+
we can use to parse each new sentence, each new candidate sentence from the system one
|
59 |
+
generation model and parse it into the semantic form that we can then give to the world model
|
60 |
+
solver.
|
61 |
+
So the results here show that models that use this dual system neurosymbolic stories
|
62 |
+
show improved coherence over just sentences that were constructed by a neural model.
|
63 |
+
So the example here is that what we've done is we've used human judgments on which of
|
64 |
+
the following sentences make more sense given the prior context of the story.
|
65 |
+
And we see that if we use a symbolic world model and the parsing scheme described above,
|
66 |
+
humans prefer the judgments given by this model.
|
67 |
+
We can also apply the same sort of reasoning to a completely different task.
|
68 |
+
Here we can discuss the grounded instruction following task, the grounded instruction following
|
69 |
+
domain called gscan.
|
70 |
+
In this domain, the goal is to have an agent, which is shown by this pink triangle, follow
|
71 |
+
a command to perform some simple action in this grid world.
|
72 |
+
So you can see here, walk to a small yellow cylinder might be an example of a command.
|
73 |
+
Prior work has shown that one thing you can do is encode the initial state, encode the
|
74 |
+
instruction and then train a neural model to predict the action sequences.
|
75 |
+
Other work has also shown that one thing you can do is train a model to predict a distribution
|
76 |
+
over the correct target location as part of the neural model.
|
77 |
+
That will also increase the performance of the model.
|
78 |
+
What we do here is show that if you do both of these things, you predict both an action
|
79 |
+
sequence and a target location, like what is the location you should end up in, and
|
80 |
+
then check whether or not when you execute the set of instructions, you will end up in
|
81 |
+
the predicted target location.
|
82 |
+
You can sort of check consistency between these two different predictions and only accept
|
83 |
+
those instruction sequences which match the target location prediction.
|
84 |
+
And this leads to also higher accuracy, especially in a low data regime.
|
85 |
+
We have more details about the results of the paper.
|
86 |
+
So that's a little bit of an overview of our paper.
|
87 |
+
Our takeaways are that you can build systems with combined neural methods and explicit
|
88 |
+
world knowledge.
|
89 |
+
And if you add just a little bit of world knowledge, you can really help increase coherence
|
90 |
+
and consistency for these large sequence models.
|
91 |
+
There are some challenges here about parsing in larger scale domains and also what it would
|
92 |
+
mean to automatically build a more complete world model.
|
93 |
+
Thank you very much.
|
demo_data/nips-2021/25973/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi everyone, I'm Jingwen, a PhD student in National University of Singapore.
|
2 |
+
In this paper, we introduce dual-aspect collaborative transformer for solving routine problems.
|
3 |
+
Until now, the neural solvers for VRPs could be classified in two types.
|
4 |
+
The first one is the neural construction solver.
|
5 |
+
It starts from an initial solution and iteratively improves the solution
|
6 |
+
until all customers have been visited.
|
7 |
+
And in this paper, we focus more on the neural improvement solvers.
|
8 |
+
It starts from an incomplete solution and iteratively improves the solution
|
9 |
+
based on the node features and solution features, until reaching a step limit T.
|
10 |
+
Although the transformer has shown the efficiency for processing the sequence data,
|
11 |
+
its positional encoding method may not be optimal for encoding the VRP solutions,
|
12 |
+
because it only learns a unified set of embeddings and combines the node embeddings
|
13 |
+
and the positional embeddings together.
|
14 |
+
Also, it can only encode the linear sequences,
|
15 |
+
which cannot capture the circularity and symmetry of VRP solutions.
|
16 |
+
So in this paper, we introduce the dual-aspect augmentation,
|
17 |
+
which could better describe the VRP solutions.
|
18 |
+
We separate the learnings to node feature embeddings and positional feature embeddings
|
19 |
+
based on the cross-aspect referential attention.
|
20 |
+
And in this table, we compare the performance of dual-aspect and single-aspect.
|
21 |
+
We can see the dual-aspect outperforms the single-aspect.
|
22 |
+
And here we introduce the cyclic positional encoding.
|
23 |
+
In this figure, we describe the embedding vectors and correlations between every two embeddings
|
24 |
+
of the original PE and our CPE method in subfigures A and B.
|
25 |
+
In subfigure C, we describe the top two principal components after PCA projection.
|
26 |
+
And we can see our CPE method can better capture the circularity of VRP solutions.
|
27 |
+
And here we did some ablation studies on the CPE method,
|
28 |
+
which can achieve better generalization performance.
|
29 |
+
And now we introduce our curriculum learning strategy in the training process.
|
30 |
+
And in this method, we're training with an n-step PPO method and a curriculum learning strategy.
|
31 |
+
It gradually prescribes higher quality solutions as the initial stage for training.
|
32 |
+
And in this graph, we describe two curves.
|
33 |
+
The blue one is the PPO method only.
|
34 |
+
And the green one is the PPO method with our curriculum learning strategy.
|
35 |
+
And we can see the green one is more stable and achieves lower objective values.
|
36 |
+
And here is the comparison performance of our method and some baselines on both TSP and CVRP.
|
37 |
+
We can see our DACT outperforms the existing transformer-based improvement models.
|
38 |
+
So, based on these experiments, we can see our DACT performs very well for the routing problems.
|
39 |
+
And in the future, we hope to use this method to solve more combinatorial optimization problems.
|
40 |
+
Thank you.
|
demo_data/nips-2021/25974/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, I am Mohamed Pezeshki from Mila and today I am going to talk about gradient starvation.
|
2 |
+
This is a joint work with Oumar Kaba, Yoshua Bengio, Aaron Courville, Doina Precup, and Guillaume
|
3 |
+
Lajoie.
|
4 |
+
Let me start with a story.
|
5 |
+
Back in 1904, there was a horse called Hans and people believed that he could do arithmetic.
|
6 |
+
Here is an article from New York Times published in 1904.
|
7 |
+
The article says that Hans is an expert in numbers.
|
8 |
+
For example, when two numbers of 5 and 9 are written on a blackboard, Hans replies by tapping
|
9 |
+
on the ground 14 times.
|
10 |
+
Seven years later, in an article, Oskar Pfungst unveiled that the so-called clever Hans was
|
11 |
+
not actually capable of doing any arithmetic and instead reading subtle hints in his trainer's
|
12 |
+
behavior indicating when to stop tapping.
|
13 |
+
As the article says, even the trainer was not aware of providing these shortcut signals.
|
14 |
+
So Hans was clever but probably not in doing arithmetic.
|
15 |
+
Its cleverness was in reading his trainer's clues.
|
16 |
+
A similar phenomenon has been observed in many applications of machine learning.
|
17 |
+
Essentially, the situations where the model seemingly has a very good performance but
|
18 |
+
in fact it hasn't learned true underlying relationships between the input and the target.
|
19 |
+
In this paper by Robert Geirhos and co-authors, they list several instances of what they call
|
20 |
+
shortcut learning.
|
21 |
+
For example, in a task of image captioning, the model predicts grazing sheep only by seeing
|
22 |
+
the green hillside.
|
23 |
+
In another instance, the network hallucinates a teapot with high confidence in an image
|
24 |
+
of pure noise.
|
25 |
+
This is another and indeed dangerous example of the task of pneumonia detection from x-ray
|
26 |
+
images.
|
27 |
+
The model appears to have a very good performance even on the test set.
|
28 |
+
However, the heat maps reveal that the network is not looking at the lung section at all
|
29 |
+
and just latching on some features in the corner of the image.
|
30 |
+
The intuition behind this phenomenon is a folk knowledge in one form or another.
|
31 |
+
Given strongly correlated and fast-to-learn features in training data, gradient descent
|
32 |
+
is biased towards learning them first.
|
33 |
+
However, this intuition is a bit abstract and hand-wavy, so let's look at a more concrete
|
34 |
+
example.
|
35 |
+
Consider a 2D classification task with red and blue data points as shown.
|
36 |
+
If you train a neural network on this data, here is the decision boundary that we learn.
|
37 |
+
Now consider slightly different arrangements of the data points such that the blue data
|
38 |
+
points are slightly shifted to the left and the red data points are shifted to the right,
|
39 |
+
making the data linearly separable.
|
40 |
+
Now if we train a neural network on this, we get an almost linear decision boundary.
|
41 |
+
Note that the network is only making its predictions based on the feature along the x-axis.
|
42 |
+
Indicated in the red circle here, you can see that the decision boundary is very close
|
43 |
+
to the data points.
|
44 |
+
However, the network is super confident on its predictions and the training loss is indeed
|
45 |
+
zero.
|
46 |
+
So you can see that the slightly perturbing data point can get the network to predict
|
47 |
+
an incorrect label with high confidence.
|
48 |
+
This problem will be even more visible when testing the model on OOD, meaning out of distribution
|
49 |
+
test data.
|
50 |
+
An online interactive demo of this work is available on a blog post we wrote.
|
51 |
+
If you wish to play with it a bit, please visit the link provided here.
|
52 |
+
So we hypothesize that what is happening here is gradient starvation.
|
53 |
+
Gradient starvation is a phenomenon in which a neural network captures statistically dominant
|
54 |
+
features while remaining invariant to the rest.
|
55 |
+
Here gradient descent leads to parameter updates, predominantly in directions that only capture
|
56 |
+
these dominant features, thus starving the gradient from other potentially informative
|
57 |
+
features.
|
58 |
+
Here, the notions of feature and dominancy of a feature is rather vague.
|
59 |
+
To define them more formally, we need to look into the learning dynamics.
|
60 |
+
In the interest of time, I will be covering only the general intuition of our results
|
61 |
+
and encourage interested audiences to take a look at the full paper for detailed treatment.
|
62 |
+
So the two main theorems of the paper can be summarized into these two plots that I
|
63 |
+
now explain.
|
64 |
+
Let's first start with gradient starvation itself on the left.
|
65 |
+
We train a model with common binary cross entropy loss.
|
66 |
+
On the x-axis we have training iterations or epochs, and on the y-axis we monitor two
|
67 |
+
features z1 and z2.
|
68 |
+
Their dynamics depend on several factors, including their strength, meaning how easy
|
69 |
+
or how hard it is for the network to learn those features, and their correlation with
|
70 |
+
the target.
|
71 |
+
Here, z1 has a larger correlation and hence converges to a value around 6, and z2 with
|
72 |
+
a smaller correlation converges to a value around 2.
|
73 |
+
However, the strength is equal, i.e. kappa is set to be 1.
|
74 |
+
Again, it means that both of these features are equally easy for the network to learn.
|
75 |
+
Now let's keep their correlation fixed but increase the strength of z1.
|
76 |
+
A kappa equal to 2 means that z1 is learned easier than z2.
|
77 |
+
We can immediately see that although their correlation is still the same as before, z1
|
78 |
+
is overestimated while z2 is underestimated.
|
79 |
+
If we make kappa to be 4 or 8, it becomes more evident that simply because z1 is easier
|
80 |
+
to learn, it is being overestimated, while z2 is being starved.
|
81 |
+
Our theory shows that an increase in the strength of feature z1 has a detrimental effect on
|
82 |
+
the learning of feature z2.
|
83 |
+
Now our second theorem shows that adding this term, indicated in the red rectangle, to the
|
84 |
+
loss decouples the features.
|
85 |
+
As you can see, a spectral decoupling decouples the features at the converged solution.
|
86 |
+
Regardless of the value of kappa, all of the experiments on z1 and z2 converge to the same
|
87 |
+
place.
|
88 |
+
Again, we refer interested audience to the paper for more theory as well as more intuition.
|
89 |
+
Now let's look at some experiments.
|
90 |
+
Recall the task that we studied earlier.
|
91 |
+
When the data is not linearly separable, we learn the curve decision boundary.
|
92 |
+
On the right, we see how z1 and z2 evolve.
|
93 |
+
When the data is linearly separable with a small margin, a linear decision boundary is
|
94 |
+
learned.
|
95 |
+
We observe that z1 is overestimated, while z2 is heavily underestimated.
|
96 |
+
Now let's see what happens if we add spectral decoupling.
|
97 |
+
Spectral decoupling suppresses z1 and as a result allows z2 to grow.
|
98 |
+
It also appears that other regularization methods do not succeed at learning a curve
|
99 |
+
decision boundary.
|
100 |
+
So we observed that spectral decoupling leads to a decision boundary with a larger margin.
|
101 |
+
What happens in real-world tasks?
|
102 |
+
The distance to the decision boundary is not trivial to compute when working with nonlinear
|
103 |
+
models.
|
104 |
+
However, we can use a proxy.
|
105 |
+
The amount of perturbation required to fool the network is a proxy to the margin.
|
106 |
+
Look at the plot on the right.
|
107 |
+
On the x-axis, we have the amount of perturbation and on the y-axis, we have how many of the
|
108 |
+
examples are misclassified.
|
109 |
+
You can see that with a fixed amount of perturbation, a model with vanilla binary cross entropy
|
110 |
+
is much more vulnerable compared to a model trained with spectral decoupling.
|
111 |
+
In another experiment, we studied colored MNIST, a well-known task of OOD generalization
|
112 |
+
where the color is spuriously correlated with the labels.
|
113 |
+
Also another task of OOD generalization is a classification task on the CelebA dataset
|
114 |
+
where the training data is again biased with respect to the color of the hair and the gender
|
115 |
+
such that most of male images have black hair while the majority of females have blonde
|
116 |
+
hair.
|
117 |
+
Here, we skip the details in the interest of time.
|
118 |
+
However, let me just draw your attention to the superiority of spectral decoupling in
|
119 |
+
these both tasks.
|
120 |
+
Finally, to conclude, we talked about the Clever Hans effect.
|
121 |
+
We showed that a similar phenomenon can happen in neural networks and we called that gradient
|
122 |
+
starvation.
|
123 |
+
To understand gradient starvation, we looked into the learning dynamics.
|
124 |
+
We showed that the presence of a strongly correlated feature could result in a starvation
|
125 |
+
of other features.
|
126 |
+
We also showed that spectral decoupling provides some degree of control over what features
|
127 |
+
to learn and decouples essentially the features.
|
128 |
+
Thanks for your attention.
|
129 |
+
If you're interested to chat more, please visit our poster this afternoon.
|
130 |
+
Thank you very much.
|
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
-
webvtt-py
|
2 |
-
transformers
|
3 |
-
requests
|
4 |
-
pandas
|
5 |
-
nltk
|
6 |
-
sentencepiece
|
7 |
torch
|
|
|
1 |
+
webvtt-py
|
2 |
+
transformers
|
3 |
+
requests
|
4 |
+
pandas
|
5 |
+
nltk
|
6 |
+
sentencepiece
|
7 |
torch
|