Spaces:
Running
Running
ScientiaEtVeritas
committed on
Commit
•
e4b6f2e
1
Parent(s):
317a988
txt transcripts
Browse files- app.py +322 -322
- demo_data/nips-2021/25953/transcript_whisper_large-v2.txt +193 -0
- demo_data/nips-2021/25957/transcript_whisper_large-v2.txt +179 -0
- demo_data/nips-2021/25958/transcript_whisper_large-v2.txt +124 -0
- demo_data/nips-2021/25959/transcript_whisper_large-v2.txt +117 -0
- demo_data/nips-2021/25962/transcript_whisper_large-v2.txt +51 -0
- demo_data/nips-2021/25963/transcript_whisper_large-v2.txt +178 -0
- demo_data/nips-2021/25964/transcript_whisper_large-v2.txt +366 -0
- demo_data/nips-2021/25965/transcript_whisper_large-v2.txt +136 -0
- demo_data/nips-2021/25969/transcript_whisper_large-v2.txt +160 -0
- demo_data/nips-2021/25970/transcript_whisper_large-v2.txt +93 -0
- demo_data/nips-2021/25973/transcript_whisper_large-v2.txt +40 -0
- demo_data/nips-2021/25974/transcript_whisper_large-v2.txt +130 -0
- requirements.txt +6 -6
app.py
CHANGED
@@ -1,322 +1,322 @@
|
|
1 |
-
import itertools
|
2 |
-
import json
|
3 |
-
import re
|
4 |
-
from functools import partial
|
5 |
-
from pathlib import Path
|
6 |
-
|
7 |
-
import pandas as pd
|
8 |
-
import requests
|
9 |
-
import streamlit as st
|
10 |
-
import webvtt
|
11 |
-
from transformers import AutoTokenizer
|
12 |
-
|
13 |
-
from generate_text_api import TextGenerator
|
14 |
-
from model_inferences.utils.chunking import Truncater
|
15 |
-
from model_inferences.utils.files import get_captions_from_vtt, get_transcript
|
16 |
-
|
17 |
-
USE_PARAGRAPHING_MODEL = True
|
18 |
-
|
19 |
-
def get_sublist_by_flattened_index(A, i):
|
20 |
-
current_index = 0
|
21 |
-
for sublist in A:
|
22 |
-
sublist_length = len(sublist)
|
23 |
-
if current_index <= i < current_index + sublist_length:
|
24 |
-
return sublist, A.index(sublist)
|
25 |
-
current_index += sublist_length
|
26 |
-
return None, None
|
27 |
-
|
28 |
-
import requests
|
29 |
-
|
30 |
-
|
31 |
-
def get_talk_metadata(video_id):
|
32 |
-
url = "https://www.ted.com/graphql"
|
33 |
-
|
34 |
-
headers = {
|
35 |
-
"Content-Type": "application/json",
|
36 |
-
"Accept": "application/json",
|
37 |
-
"x-operation-name": "Transcript", # Replace with the actual operation name
|
38 |
-
}
|
39 |
-
|
40 |
-
data = {
|
41 |
-
"query": """
|
42 |
-
query GetTalk($videoId: ID!) {
|
43 |
-
video(id: $videoId) {
|
44 |
-
title,
|
45 |
-
presenterDisplayName,
|
46 |
-
nativeDownloads {medium}
|
47 |
-
}
|
48 |
-
}
|
49 |
-
""",
|
50 |
-
"variables": {
|
51 |
-
"videoId": video_id, # Corrected key to "videoId"
|
52 |
-
},
|
53 |
-
}
|
54 |
-
|
55 |
-
response = requests.post(url, json=data, headers=headers)
|
56 |
-
|
57 |
-
if response.status_code == 200:
|
58 |
-
result = response.json()
|
59 |
-
return result
|
60 |
-
else:
|
61 |
-
print(f"Error: {response.status_code}, {response.text}")
|
62 |
-
|
63 |
-
class OfflineTextSegmenterClient:
|
64 |
-
def __init__(self, host_url):
|
65 |
-
self.host_url = host_url.rstrip("/") + "/segment"
|
66 |
-
|
67 |
-
def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
|
68 |
-
payload = {
|
69 |
-
'text': text,
|
70 |
-
'captions': captions,
|
71 |
-
'generate_titles': generate_titles,
|
72 |
-
"prefix_titles": True,
|
73 |
-
"threshold": threshold,
|
74 |
-
}
|
75 |
-
|
76 |
-
headers = {
|
77 |
-
'Content-Type': 'application/json'
|
78 |
-
}
|
79 |
-
|
80 |
-
response = requests.post(self.host_url, data=json.dumps(payload), headers=headers).json()
|
81 |
-
#segments = response["annotated_segments"] if "annotated_segments" in response else response["segments"]
|
82 |
-
return {'segments':response["segments"], 'titles': response["titles"], 'sentences': response["sentences"]}
|
83 |
-
|
84 |
-
class Toc:
|
85 |
-
|
86 |
-
def __init__(self):
|
87 |
-
self._items = []
|
88 |
-
self._placeholder = None
|
89 |
-
|
90 |
-
def title(self, text):
|
91 |
-
self._markdown(text, "h1")
|
92 |
-
|
93 |
-
def header(self, text):
|
94 |
-
self._markdown(text, "h2", " " * 2)
|
95 |
-
|
96 |
-
def subheader(self, text):
|
97 |
-
self._markdown(text, "h3", " " * 4)
|
98 |
-
|
99 |
-
def placeholder(self, sidebar=False):
|
100 |
-
self._placeholder = st.sidebar.empty() if sidebar else st.empty()
|
101 |
-
|
102 |
-
def generate(self):
|
103 |
-
if self._placeholder:
|
104 |
-
self._placeholder.markdown("\n".join(self._items), unsafe_allow_html=True)
|
105 |
-
|
106 |
-
def _markdown(self, text, level, space=""):
|
107 |
-
key = re.sub(r'[^\w-]', '', text.replace(" ", "-").replace("'", "-").lower())
|
108 |
-
st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
|
109 |
-
self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
|
110 |
-
|
111 |
-
# custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
|
112 |
-
# st.write(custom_css, unsafe_allow_html=True)
|
113 |
-
|
114 |
-
def concat_prompt(prompt_text, text, model_name):
|
115 |
-
if 'flan' in model_name:
|
116 |
-
input_ = prompt_text + "\n\n" + text
|
117 |
-
elif 'galactica' in model_name:
|
118 |
-
input_ = text + "\n\n" + prompt_text
|
119 |
-
return input_
|
120 |
-
|
121 |
-
endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
|
122 |
-
ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
|
123 |
-
|
124 |
-
client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
|
125 |
-
if USE_PARAGRAPHING_MODEL:
|
126 |
-
paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
|
127 |
-
summarizer = TextGenerator(endpoint)
|
128 |
-
|
129 |
-
tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
|
130 |
-
|
131 |
-
# TLDR PROMPT
|
132 |
-
|
133 |
-
SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
|
134 |
-
|
135 |
-
TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
136 |
-
{system_prompt}
|
137 |
-
<</SYS>>
|
138 |
-
|
139 |
-
{user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
|
140 |
-
|
141 |
-
TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
|
142 |
-
|
143 |
-
TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
144 |
-
TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
145 |
-
|
146 |
-
BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
147 |
-
{system_prompt}
|
148 |
-
<</SYS>>
|
149 |
-
|
150 |
-
{user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
|
151 |
-
|
152 |
-
BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
|
153 |
-
|
154 |
-
BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
155 |
-
BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
156 |
-
|
157 |
-
CONTEXT_LENGTH = 3072
|
158 |
-
MAX_SUMMARY_LENGTH = 1024
|
159 |
-
TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
|
160 |
-
BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
|
161 |
-
|
162 |
-
|
163 |
-
text_generator = TextGenerator(endpoint)
|
164 |
-
temperature = 0.7
|
165 |
-
|
166 |
-
import re
|
167 |
-
|
168 |
-
|
169 |
-
def replace_newlines(text):
|
170 |
-
updated_text = re.sub(r'\n+', r'\n\n', text)
|
171 |
-
return updated_text
|
172 |
-
|
173 |
-
def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
|
174 |
-
all_generated_text = prefix
|
175 |
-
truncater = Truncater(tokenizer, max_length=max_input_length)
|
176 |
-
input_ = truncater(input_)
|
177 |
-
input_ = prompt.format(input=input_)
|
178 |
-
for generated_text in summarizer.generate_text_stream(input_, max_new_tokens=MAX_SUMMARY_LENGTH, do_sample=True, temperature=temperature):
|
179 |
-
all_generated_text += replace_newlines(generated_text)
|
180 |
-
generated_text_box.info(all_generated_text)
|
181 |
-
print(all_generated_text)
|
182 |
-
return all_generated_text.strip()
|
183 |
-
|
184 |
-
st.header("Demo: Intelligent Recap")
|
185 |
-
|
186 |
-
if not hasattr(st, 'global_state'):
|
187 |
-
st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
|
188 |
-
# NIPS 2021 Talks
|
189 |
-
transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
|
190 |
-
# get titles from metadata.json
|
191 |
-
transcripts_map = {}
|
192 |
-
for transcript_file in transcript_files:
|
193 |
-
base_path = transcript_file.parent
|
194 |
-
metadata = base_path / "metadata.json"
|
195 |
-
txt_file = base_path / "transcript_whisper_large-v2.txt"
|
196 |
-
with open(metadata) as f:
|
197 |
-
metadata = json.load(f)
|
198 |
-
title = metadata["title"]
|
199 |
-
transcript = get_transcript(txt_file)
|
200 |
-
captions = get_captions_from_vtt(transcript_file)
|
201 |
-
transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
|
202 |
-
st.global_state['NIPS 2021 Talks'] = transcripts_map
|
203 |
-
|
204 |
-
data = pd.read_json("demo_data/ted_talks.json")
|
205 |
-
video_ids = data.talk_id.tolist()
|
206 |
-
transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
|
207 |
-
transcripts_map = {}
|
208 |
-
for video_id, transcript in zip(video_ids, transcripts):
|
209 |
-
metadata = get_talk_metadata(video_id)
|
210 |
-
title = metadata["data"]["video"]["title"]
|
211 |
-
presenter = metadata["data"]["video"]["presenterDisplayName"]
|
212 |
-
print(metadata["data"])
|
213 |
-
if metadata["data"]["video"]["nativeDownloads"] is None:
|
214 |
-
continue
|
215 |
-
video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
|
216 |
-
transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
|
217 |
-
st.global_state['TED Talks'] = transcripts_map
|
218 |
-
|
219 |
-
def get_lecture_id(path):
|
220 |
-
return int(path.parts[-2].split('-')[1])
|
221 |
-
|
222 |
-
transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
|
223 |
-
sorted_path_list = sorted(transcript_files, key=get_lecture_id)
|
224 |
-
|
225 |
-
transcripts_map = {}
|
226 |
-
for transcript_file in sorted_path_list:
|
227 |
-
base_path = transcript_file.parent
|
228 |
-
lecture_id = base_path.parts[-1]
|
229 |
-
transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
|
230 |
-
video_path = Path(base_path, "video.mp4")
|
231 |
-
transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
|
232 |
-
st.global_state['KIT Lectures'] = transcripts_map
|
233 |
-
|
234 |
-
type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
|
235 |
-
|
236 |
-
transcripts_map = st.global_state[type_of_document]
|
237 |
-
|
238 |
-
selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
|
239 |
-
|
240 |
-
st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
|
241 |
-
|
242 |
-
input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
|
243 |
-
|
244 |
-
toc = Toc()
|
245 |
-
|
246 |
-
summarization_todos = []
|
247 |
-
|
248 |
-
with st.expander("Adjust Thresholds"):
|
249 |
-
threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
|
250 |
-
paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
|
251 |
-
|
252 |
-
if st.button("Process Transcript"):
|
253 |
-
with st.sidebar:
|
254 |
-
st.header("Table of Contents")
|
255 |
-
toc.placeholder()
|
256 |
-
|
257 |
-
st.header(selected_talk, divider='rainbow')
|
258 |
-
# if 'presenter' in transcripts_map[selected_talk]:
|
259 |
-
# st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
|
260 |
-
|
261 |
-
captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
|
262 |
-
result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
|
263 |
-
if USE_PARAGRAPHING_MODEL:
|
264 |
-
presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
|
265 |
-
paragraphs = presult['segments']
|
266 |
-
segments, titles, sentences = result['segments'], result['titles'], result['sentences']
|
267 |
-
|
268 |
-
if USE_PARAGRAPHING_MODEL:
|
269 |
-
prev_chapter_idx = 0
|
270 |
-
prev_paragraph_idx = 0
|
271 |
-
segment = []
|
272 |
-
for i, sentence in enumerate(sentences):
|
273 |
-
chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
|
274 |
-
paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
|
275 |
-
|
276 |
-
if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
|
277 |
-
print("Chapter / Chapter & Paragraph")
|
278 |
-
segment_text = " ".join(segment)
|
279 |
-
toc.subheader(titles[prev_chapter_idx])
|
280 |
-
if len(segment_text) > 1200:
|
281 |
-
generated_text_box = st.info("")
|
282 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
283 |
-
elif len(segment_text) > 450:
|
284 |
-
generated_text_box = st.info("")
|
285 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
286 |
-
st.write(segment_text)
|
287 |
-
segment = []
|
288 |
-
elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
|
289 |
-
print("Paragraph")
|
290 |
-
segment.append("\n\n")
|
291 |
-
|
292 |
-
segment.append(sentence)
|
293 |
-
|
294 |
-
prev_chapter_idx = chapter_idx
|
295 |
-
prev_paragraph_idx = paragraph_idx
|
296 |
-
|
297 |
-
segment_text = " ".join(segment)
|
298 |
-
toc.subheader(titles[prev_chapter_idx])
|
299 |
-
if len(segment_text) > 1200:
|
300 |
-
generated_text_box = st.info("")
|
301 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
302 |
-
elif len(segment_text) > 450:
|
303 |
-
generated_text_box = st.info("")
|
304 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
305 |
-
st.write(segment_text)
|
306 |
-
|
307 |
-
|
308 |
-
else:
|
309 |
-
segments = [" ".join([sentence for sentence in segment]) for segment in segments]
|
310 |
-
for title, segment in zip(titles, segments):
|
311 |
-
toc.subheader(title)
|
312 |
-
if len(segment) > 1200:
|
313 |
-
generated_text_box = st.info("")
|
314 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
315 |
-
elif len(segment) > 450:
|
316 |
-
generated_text_box = st.info("")
|
317 |
-
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
318 |
-
st.write(segment)
|
319 |
-
toc.generate()
|
320 |
-
|
321 |
-
for summarization_todo in summarization_todos:
|
322 |
-
summarization_todo()
|
|
|
1 |
+
import itertools
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from functools import partial
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import requests
|
9 |
+
import streamlit as st
|
10 |
+
import webvtt
|
11 |
+
from transformers import AutoTokenizer
|
12 |
+
|
13 |
+
from generate_text_api import TextGenerator
|
14 |
+
from model_inferences.utils.chunking import Truncater
|
15 |
+
from model_inferences.utils.files import get_captions_from_vtt, get_transcript
|
16 |
+
|
17 |
+
USE_PARAGRAPHING_MODEL = True
|
18 |
+
|
19 |
+
def get_sublist_by_flattened_index(A, i):
    """Locate the sublist of *A* that contains flattened index *i*.

    Treats ``A`` (a list of lists) as one flat sequence and returns the
    tuple ``(sublist, position)`` where ``sublist`` is the inner list
    holding element ``i`` of that flat sequence and ``position`` is that
    sublist's index within ``A``.  Returns ``(None, None)`` when ``i`` is
    out of range.
    """
    current_index = 0
    # enumerate() yields the true position.  The original used
    # A.index(sublist), which returns the FIRST equal sublist — wrong
    # whenever A contains duplicate sublists — and costs O(n) per hit.
    for position, sublist in enumerate(A):
        sublist_length = len(sublist)
        if current_index <= i < current_index + sublist_length:
            return sublist, position
        current_index += sublist_length
    return None, None
|
27 |
+
|
28 |
+
import requests
|
29 |
+
|
30 |
+
|
31 |
+
def get_talk_metadata(video_id):
    """Fetch title/presenter/download metadata for one TED talk.

    Queries the public TED GraphQL endpoint for ``video_id`` and returns
    the decoded JSON response (shape ``{"data": {"video": {...}}}``), or
    ``None`` when the request fails.  The failure is printed rather than
    raised, so callers must be prepared for a ``None`` result.
    """
    url = "https://www.ted.com/graphql"

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "x-operation-name": "Transcript",
    }

    data = {
        "query": """
        query GetTalk($videoId: ID!) {
            video(id: $videoId) {
                title,
                presenterDisplayName,
                nativeDownloads {medium}
            }
        }
        """,
        "variables": {
            "videoId": video_id,
        },
    }

    response = requests.post(url, json=data, headers=headers)

    if response.status_code == 200:
        return response.json()
    # The original error branch only printed and fell off the end,
    # returning None implicitly; make that contract explicit.
    print(f"Error: {response.status_code}, {response.text}")
    return None
|
62 |
+
|
63 |
+
class OfflineTextSegmenterClient:
    """Thin HTTP client for an offline text-segmentation service."""

    def __init__(self, host_url):
        # Normalise the base URL and append the fixed segmentation route.
        self.host_url = host_url.rstrip("/") + "/segment"

    def segment(self, text, captions=None, generate_titles=False, threshold=0.4):
        """POST *text* to the service; return its segments, titles and sentences."""
        request_body = {
            'text': text,
            'captions': captions,
            'generate_titles': generate_titles,
            "prefix_titles": True,
            "threshold": threshold,
        }
        request_headers = {
            'Content-Type': 'application/json'
        }
        reply = requests.post(
            self.host_url, data=json.dumps(request_body), headers=request_headers
        ).json()
        return {
            'segments': reply["segments"],
            'titles': reply["titles"],
            'sentences': reply["sentences"],
        }
|
83 |
+
|
84 |
+
class Toc:
    """Accumulates heading anchors and renders a table of contents.

    Every title/header/subheader call renders the heading through
    streamlit and records a markdown link to it; generate() later fills
    the spot reserved by placeholder() with the accumulated link list.
    """

    def __init__(self):
        self._items = []
        self._placeholder = None

    def title(self, text):
        self._markdown(text, "h1")

    def header(self, text):
        self._markdown(text, "h2", "  ")

    def subheader(self, text):
        self._markdown(text, "h3", "    ")

    def placeholder(self, sidebar=False):
        # Reserve the slot (sidebar or main area) that generate() fills later.
        target = st.sidebar if sidebar else st
        self._placeholder = target.empty()

    def generate(self):
        # No-op until placeholder() has been called.
        if self._placeholder:
            toc_markdown = "\n".join(self._items)
            self._placeholder.markdown(toc_markdown, unsafe_allow_html=True)

    def _markdown(self, text, level, space=""):
        # Derive an HTML-id-safe anchor key from the heading text.
        slug = text.replace(" ", "-").replace("'", "-").lower()
        key = re.sub(r'[^\w-]', '', slug)
        st.markdown(f"<{level} id='{key}'>{text}</{level}>", unsafe_allow_html=True)
        self._items.append(f"{space}* <a href='#{key}'>{text}</a>")
|
110 |
+
|
111 |
+
# custom_css = "<style type='text/css'>" + Path('style.css').read_text() + "</style>"
|
112 |
+
# st.write(custom_css, unsafe_allow_html=True)
|
113 |
+
|
114 |
+
def concat_prompt(prompt_text, text, model_name):
    """Join a prompt and a document in the order the model family expects.

    Flan-style models take the instruction before the document; Galactica
    takes it after.

    Raises:
        ValueError: if ``model_name`` matches no known family.  (The
        original fell through and raised ``UnboundLocalError`` instead.)
    """
    if 'flan' in model_name:
        return prompt_text + "\n\n" + text
    if 'galactica' in model_name:
        return text + "\n\n" + prompt_text
    raise ValueError(f"Unknown model family: {model_name!r}")
|
120 |
+
|
121 |
+
endpoint = "http://hiaisc.isl.iar.kit.edu/summarize"
|
122 |
+
ENDPOINTS = {"http://hiaisc.isl.iar.kit.edu/summarize": "meta-llama/Llama-2-13b-chat-hf",}
|
123 |
+
|
124 |
+
client = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/chapter")
|
125 |
+
if USE_PARAGRAPHING_MODEL:
|
126 |
+
paragrapher = OfflineTextSegmenterClient("http://hiaisc.isl.iar.kit.edu/paragraph")
|
127 |
+
summarizer = TextGenerator(endpoint)
|
128 |
+
|
129 |
+
tokenizer = AutoTokenizer.from_pretrained(ENDPOINTS[endpoint], use_fast=False)
|
130 |
+
|
131 |
+
# TLDR PROMPT
|
132 |
+
|
133 |
+
SYSTEM_PROMPT = "You are an assistant who replies with a summary to every message."
|
134 |
+
|
135 |
+
TLDR_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
136 |
+
{system_prompt}
|
137 |
+
<</SYS>>
|
138 |
+
|
139 |
+
{user_message} [/INST] Sure! Here is a summary of the research presentation in a single, short sentence:"""
|
140 |
+
|
141 |
+
TLDR_USER_PROMPT = "Summarize the following research presentation in a single, short sentence:\n\n{input}"
|
142 |
+
|
143 |
+
TLDR_PROMPT = TLDR_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
144 |
+
TLDR_PROMPT_LENGTH = tokenizer(TLDR_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
145 |
+
|
146 |
+
BP_PROMPT_TEMPLATE = """<s>[INST] <<SYS>>
|
147 |
+
{system_prompt}
|
148 |
+
<</SYS>>
|
149 |
+
|
150 |
+
{user_message} [/INST] Sure! Here is a summary of the research presentation using three bullet points:\n\n\u2022"""
|
151 |
+
|
152 |
+
BP_USER_PROMPT = "Summarize the following research presentation using three bullet points:\n\n{input}"
|
153 |
+
|
154 |
+
BP_PROMPT = BP_PROMPT_TEMPLATE.format(system_prompt=SYSTEM_PROMPT, user_message=TLDR_USER_PROMPT)
|
155 |
+
BP_PROMPT_LENGTH = tokenizer(BP_PROMPT, return_tensors="pt")["input_ids"].size(1)
|
156 |
+
|
157 |
+
CONTEXT_LENGTH = 3072
|
158 |
+
MAX_SUMMARY_LENGTH = 1024
|
159 |
+
TLDR_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - TLDR_PROMPT_LENGTH - 1
|
160 |
+
BP_MAX_INPUT_LENGTH = CONTEXT_LENGTH - MAX_SUMMARY_LENGTH - BP_PROMPT_LENGTH - 1
|
161 |
+
|
162 |
+
|
163 |
+
text_generator = TextGenerator(endpoint)
|
164 |
+
temperature = 0.7
|
165 |
+
|
166 |
+
import re
|
167 |
+
|
168 |
+
|
169 |
+
def replace_newlines(text):
    """Turn every run of one or more newlines in *text* into a paragraph break."""
    return re.sub(r'\n+', r'\n\n', text)
|
172 |
+
|
173 |
+
def generate_summary(summarizer, generated_text_box, input_, prompt, max_input_length, prefix=""):
    """Stream a summary of *input_* into a streamlit info box and return it.

    The input is truncated to *max_input_length* tokens, wrapped in
    *prompt*, then generated chunk by chunk; each partial result is pushed
    to *generated_text_box* so the UI updates live.  Relies on the
    module-level ``tokenizer``, ``temperature`` and ``MAX_SUMMARY_LENGTH``.
    """
    text_so_far = prefix
    truncate = Truncater(tokenizer, max_length=max_input_length)
    prompt_input = prompt.format(input=truncate(input_))
    stream = summarizer.generate_text_stream(
        prompt_input,
        max_new_tokens=MAX_SUMMARY_LENGTH,
        do_sample=True,
        temperature=temperature,
    )
    for chunk in stream:
        text_so_far += replace_newlines(chunk)
        generated_text_box.info(text_so_far)
    print(text_so_far)
    return text_so_far.strip()
|
183 |
+
|
184 |
+
st.header("Demo: Intelligent Recap")
|
185 |
+
|
186 |
+
if not hasattr(st, 'global_state'):
|
187 |
+
st.global_state = {'NIPS 2021 Talks': None, 'TED Talks': None}
|
188 |
+
# NIPS 2021 Talks
|
189 |
+
transcript_files = itertools.islice(Path("demo_data/nips-2021/").rglob("transcript_whisper_large-v2.vtt"), 15)
|
190 |
+
# get titles from metadata.json
|
191 |
+
transcripts_map = {}
|
192 |
+
for transcript_file in transcript_files:
|
193 |
+
base_path = transcript_file.parent
|
194 |
+
metadata = base_path / "metadata.json"
|
195 |
+
txt_file = base_path / "transcript_whisper_large-v2.txt"
|
196 |
+
with open(metadata) as f:
|
197 |
+
metadata = json.load(f)
|
198 |
+
title = metadata["title"]
|
199 |
+
transcript = get_transcript(txt_file)
|
200 |
+
captions = get_captions_from_vtt(transcript_file)
|
201 |
+
transcripts_map[title] = {"transcript": transcript, "captions": captions, "video": base_path / "video.mp4"}
|
202 |
+
st.global_state['NIPS 2021 Talks'] = transcripts_map
|
203 |
+
|
204 |
+
data = pd.read_json("demo_data/ted_talks.json")
|
205 |
+
video_ids = data.talk_id.tolist()
|
206 |
+
transcripts = data.text.apply(lambda x: " ".join(x)).tolist()
|
207 |
+
transcripts_map = {}
|
208 |
+
for video_id, transcript in zip(video_ids, transcripts):
|
209 |
+
metadata = get_talk_metadata(video_id)
|
210 |
+
title = metadata["data"]["video"]["title"]
|
211 |
+
presenter = metadata["data"]["video"]["presenterDisplayName"]
|
212 |
+
print(metadata["data"])
|
213 |
+
if metadata["data"]["video"]["nativeDownloads"] is None:
|
214 |
+
continue
|
215 |
+
video_url = metadata["data"]["video"]["nativeDownloads"]["medium"]
|
216 |
+
transcripts_map[title] = {"transcript": transcript, "video": video_url, "presenter": presenter}
|
217 |
+
st.global_state['TED Talks'] = transcripts_map
|
218 |
+
|
219 |
+
def get_lecture_id(path):
    """Extract the numeric lecture id from a path like .../lecture-<id>/<file>."""
    parent_dir = path.parts[-2]
    return int(parent_dir.split('-')[1])
|
221 |
+
|
222 |
+
transcript_files = Path("demo_data/lectures/").rglob("English.vtt")
|
223 |
+
sorted_path_list = sorted(transcript_files, key=get_lecture_id)
|
224 |
+
|
225 |
+
transcripts_map = {}
|
226 |
+
for transcript_file in sorted_path_list:
|
227 |
+
base_path = transcript_file.parent
|
228 |
+
lecture_id = base_path.parts[-1]
|
229 |
+
transcript = " ".join([c["text"].strip() for c in get_captions_from_vtt(transcript_file)]).replace("\n", " ")
|
230 |
+
video_path = Path(base_path, "video.mp4")
|
231 |
+
transcripts_map["Machine Translation: " + lecture_id] = {"transcript": transcript, "video": video_path}
|
232 |
+
st.global_state['KIT Lectures'] = transcripts_map
|
233 |
+
|
234 |
+
type_of_document = st.selectbox('What kind of document do you want to test it on?', list(st.global_state.keys()))
|
235 |
+
|
236 |
+
transcripts_map = st.global_state[type_of_document]
|
237 |
+
|
238 |
+
selected_talk = st.selectbox("Choose a document...", list(transcripts_map.keys()))
|
239 |
+
|
240 |
+
st.video(str(transcripts_map[selected_talk]['video']), format="video/mp4", start_time=0)
|
241 |
+
|
242 |
+
input_text = st.text_area("Transcript", value=transcripts_map[selected_talk]['transcript'], height=300)
|
243 |
+
|
244 |
+
toc = Toc()
|
245 |
+
|
246 |
+
summarization_todos = []
|
247 |
+
|
248 |
+
with st.expander("Adjust Thresholds"):
|
249 |
+
threshold = st.slider('Chapter Segmentation Threshold', 0.00, 1.00, value=0.4, step=0.05)
|
250 |
+
paragraphing_threshold = st.slider('Paragraphing Threshold', 0.00, 1.00, value=0.5, step=0.05)
|
251 |
+
|
252 |
+
if st.button("Process Transcript"):
|
253 |
+
with st.sidebar:
|
254 |
+
st.header("Table of Contents")
|
255 |
+
toc.placeholder()
|
256 |
+
|
257 |
+
st.header(selected_talk, divider='rainbow')
|
258 |
+
# if 'presenter' in transcripts_map[selected_talk]:
|
259 |
+
# st.markdown(f"### *by **{transcripts_map[selected_talk]['presenter']}***")
|
260 |
+
|
261 |
+
captions = transcripts_map[selected_talk]['captions'] if 'captions' in transcripts_map[selected_talk] else None
|
262 |
+
result = client.segment(input_text, captions, generate_titles=True, threshold=threshold)
|
263 |
+
if USE_PARAGRAPHING_MODEL:
|
264 |
+
presult = paragrapher.segment(input_text, captions, generate_titles=False, threshold=paragraphing_threshold)
|
265 |
+
paragraphs = presult['segments']
|
266 |
+
segments, titles, sentences = result['segments'], result['titles'], result['sentences']
|
267 |
+
|
268 |
+
if USE_PARAGRAPHING_MODEL:
|
269 |
+
prev_chapter_idx = 0
|
270 |
+
prev_paragraph_idx = 0
|
271 |
+
segment = []
|
272 |
+
for i, sentence in enumerate(sentences):
|
273 |
+
chapter, chapter_idx = get_sublist_by_flattened_index(segments, i)
|
274 |
+
paragraph, paragraph_idx = get_sublist_by_flattened_index(paragraphs, i)
|
275 |
+
|
276 |
+
if (chapter_idx != prev_chapter_idx and paragraph_idx == prev_paragraph_idx) or (paragraph_idx != prev_paragraph_idx and chapter_idx != prev_chapter_idx):
|
277 |
+
print("Chapter / Chapter & Paragraph")
|
278 |
+
segment_text = " ".join(segment)
|
279 |
+
toc.subheader(titles[prev_chapter_idx])
|
280 |
+
if len(segment_text) > 1200:
|
281 |
+
generated_text_box = st.info("")
|
282 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
283 |
+
elif len(segment_text) > 450:
|
284 |
+
generated_text_box = st.info("")
|
285 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
286 |
+
st.write(segment_text)
|
287 |
+
segment = []
|
288 |
+
elif paragraph_idx != prev_paragraph_idx and chapter_idx == prev_chapter_idx:
|
289 |
+
print("Paragraph")
|
290 |
+
segment.append("\n\n")
|
291 |
+
|
292 |
+
segment.append(sentence)
|
293 |
+
|
294 |
+
prev_chapter_idx = chapter_idx
|
295 |
+
prev_paragraph_idx = paragraph_idx
|
296 |
+
|
297 |
+
segment_text = " ".join(segment)
|
298 |
+
toc.subheader(titles[prev_chapter_idx])
|
299 |
+
if len(segment_text) > 1200:
|
300 |
+
generated_text_box = st.info("")
|
301 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
302 |
+
elif len(segment_text) > 450:
|
303 |
+
generated_text_box = st.info("")
|
304 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment_text, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
305 |
+
st.write(segment_text)
|
306 |
+
|
307 |
+
|
308 |
+
else:
|
309 |
+
segments = [" ".join([sentence for sentence in segment]) for segment in segments]
|
310 |
+
for title, segment in zip(titles, segments):
|
311 |
+
toc.subheader(title)
|
312 |
+
if len(segment) > 1200:
|
313 |
+
generated_text_box = st.info("")
|
314 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, BP_PROMPT, BP_MAX_INPUT_LENGTH, prefix="\u2022"))
|
315 |
+
elif len(segment) > 450:
|
316 |
+
generated_text_box = st.info("")
|
317 |
+
summarization_todos.append(partial(generate_summary, summarizer, generated_text_box, segment, TLDR_PROMPT, TLDR_MAX_INPUT_LENGTH))
|
318 |
+
st.write(segment)
|
319 |
+
toc.generate()
|
320 |
+
|
321 |
+
for summarization_todo in summarization_todos:
|
322 |
+
summarization_todo()
|
demo_data/nips-2021/25953/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi everyone, my name is Zyw Goldfeld and this is a joint work with Christian Greenwald about
|
2 |
+
sliced mutual information, which is a new measure of statistical dependence that has
|
3 |
+
some nice scalability properties to high dimensional settings.
|
4 |
+
And to get started, I think we're all familiar with classic mutual information that is defined
|
5 |
+
between let's say continuous high dimensional random variables, which is the regime that
|
6 |
+
we'll mostly be interested in, like SOH, basically the KL divergence between their joint distributions
|
7 |
+
and the product of their marginals.
|
8 |
+
And mutual information is indeed this fundamental measure of dependence that enjoys many good
|
9 |
+
properties such that the fact that it nullifies if and only if our random variables are independent,
|
10 |
+
it is invariant to bijections and it meets several useful representations, decompositions,
|
11 |
+
variational forms, etc.
|
12 |
+
And in fact, it can be even obtained axiomatically as the unique functional of the joint distribution
|
13 |
+
that satisfies some natural informativeness conditions.
|
14 |
+
And as such, mutual information has seen a variety of applications in information theory
|
15 |
+
and statistics more recently in machine learning.
|
16 |
+
But the problem is that all this nice structure comes with a hefty price, since computing
|
17 |
+
mutual information in high dimensions or estimating it from samples is very, very hard, effectively
|
18 |
+
infeasible.
|
19 |
+
And this is the so-called curse of dimensionality and sort of the problem that we try to tackle
|
20 |
+
in this work.
|
21 |
+
And to address this difficulty, what we propose is sliced mutual information, which is, like
|
22 |
+
I said, a new measure of statistical dependence, not necessarily a proxy of mutual information
|
23 |
+
as such, but rather an alternative notion, which is defined as this average of scalar
|
24 |
+
mutual information terms between projections of our high dimensional variables onto randomly
|
25 |
+
chosen directions from the corresponding unit spheres.
|
26 |
+
And it's of course inspired by the recent popularization of slicing techniques for statistical
|
27 |
+
divergences, in particular the Wasserstein, the sliced Wasserstein distance is a great
|
28 |
+
example.
|
29 |
+
But the way it works for sliced mutual information is roughly so, well, let's say that this is
|
30 |
+
our first high dimensional variable X and this is its distribution.
|
31 |
+
What you do is draw a projection direction uniformly from the sphere.
|
32 |
+
You then project this random variable onto that direction, do the same for your other
|
33 |
+
random variable.
|
34 |
+
And now for these two projected scalar new variables, we just compute the mutual information
|
35 |
+
between them and average everything over the choice of direction.
|
36 |
+
So that's basically the definition.
|
37 |
+
And with that, the goal of this work is effectively to show that sliced mutual information is
|
38 |
+
both a meaningful and a scalable mutual information alternative.
|
39 |
+
Meaningful, well, in the sense that it preserves many of the desired properties that make mutual
|
40 |
+
information appealing to begin with and scalable in the sense that it alleviates the set of
|
41 |
+
computational and statistical difficulties.
|
42 |
+
All right.
|
43 |
+
Yeah, and to address this first point, let me show you that, well, despite those one
|
44 |
+
dimensional projections, sliced mutual information indeed inherits many of the properties of
|
45 |
+
classic mutual information.
|
46 |
+
So we have, well, of course, non-negativity, but furthermore, identification of independence.
|
47 |
+
We have an entropy decomposition for an appropriate definition of sliced entropy.
|
48 |
+
We can represent it as a KL divergence, a sliced KL divergence.
|
49 |
+
To be more precise, we have a chain rule tensorization for independent copies, as well as a Donsker-Varadhan-like
|
50 |
+
variational form that can be readily used for neural estimation of sliced mutual information.
|
51 |
+
We actually make use of that in some of our empirical results.
|
52 |
+
And well, I mean, you are more than welcome to check the paper or visit us as a poster
|
53 |
+
if you want to know more about any of these.
|
54 |
+
But really, the upshot here is that much of the classic structure is still there after
|
55 |
+
the slicing.
|
56 |
+
Now another interesting feature of sliced mutual information comes to light when you
|
57 |
+
think of it in the context of the famous data processing inequality.
|
58 |
+
And for starters, recall that classic mutual information satisfies the DPI, which in particular
|
59 |
+
means that if you process either of your random variables with a deterministic function, say
|
60 |
+
this f over here, you can only lose the informativeness in the classic sense.
|
61 |
+
Now sliced mutual information plays differently with processing and can in some sense benefit
|
62 |
+
from nice transformations that, let's say, give rise to some nicer manifold for your
|
63 |
+
random variable.
|
64 |
+
And to understand this, keep in mind that, well, first of all, sliced mutual information
|
65 |
+
only looks at projections of random variables.
|
66 |
+
And it may very well be the case that some transformations of x, let's say, have more
|
67 |
+
informative projections about y than x itself.
|
68 |
+
And here's a simple example to that effect.
|
69 |
+
So consider a two-dimensional isotropic Gaussian x, so two coordinates, x1 and x2.
|
70 |
+
And let's take y to be, for example, its first coordinate.
|
71 |
+
Now if you look at the mutual information between two fixed projections of x and y,
|
72 |
+
well, projection does nothing to y, right, because it's a scalar.
|
73 |
+
But it does affect x.
|
74 |
+
And if you look at the mutual information between two projections of x and y, you quickly
|
75 |
+
realize that x1 really plays the role of the signal here, whereas x2 behaves like noise.
|
76 |
+
And therefore, any transformation that will effectively improve your signal-to-noise ratio,
|
77 |
+
for example, like this g sub a over here, where a is less than 1, will indeed give rise
|
78 |
+
to a higher sliced mutual information value.
|
79 |
+
So all in all, sliced mutual information can be increased from processing, which means
|
80 |
+
that, well, in particular, it validates the data processing inequality and is different
|
81 |
+
from classic mutual information in that sense.
|
82 |
+
But interestingly, and as I will show you shortly, this is actually a quite useful thing
|
83 |
+
to have, for example, for feature extraction tasks, because we can use sliced mutual information
|
84 |
+
effectively to maximize it in order to extract informative features and land on those nicer
|
85 |
+
manifolds that I mentioned a moment ago.
|
86 |
+
And here's an example theorem that kind of makes this statement precise or formal, where
|
87 |
+
we consider the maximization of sliced mutual information over linear transformations of
|
88 |
+
our random variables.
|
89 |
+
And this would, of course, not affect classic mutual information at all.
|
90 |
+
But what we can show is that for sliced mutual information, this maximization ends up extracting
|
91 |
+
the two most informative projection directions for you, which in particular will be encoded
|
92 |
+
in the optimizing matrices, these A sub x star and A sub y star.
|
93 |
+
And of course, there's nothing special about this particular setup.
|
94 |
+
And we can establish similar results for, well, first of all, rank-constrained matrices
|
95 |
+
that as opposed to what's shown here would extract the, let's say, our most informative
|
96 |
+
features or projection directions.
|
97 |
+
In the paper, we also extend this result to shallow neural networks.
|
98 |
+
And in fact, our argument can be easily extended to cover additional nonlinear cases as well.
|
99 |
+
OK, so that's pretty much for structural properties.
|
100 |
+
But like I said at the beginning, the real premise of this framework is overcoming the
|
101 |
+
curse of dimensionality.
|
102 |
+
And let me show you that this is indeed the case, that sliced mutual information is or
|
103 |
+
can be estimated in a scalable manner, effectively by combining your favorite scalar mutual information
|
104 |
+
estimator with a simple Monte Carlo average step.
|
105 |
+
And this is how it works.
|
106 |
+
So let's say we're giving n IID samples from our high-dimensional random variables.
|
107 |
+
And we're further given a scalar mutual information estimator that achieves, say, error delta
|
108 |
+
of n when applied to n IID samples of some pair of one-dimensional variables, a and b.
|
109 |
+
OK, so let's say we have these.
|
110 |
+
Now, to estimate sliced mutual information, first thing to do is sample, let's say, m
|
111 |
+
random projections from the corresponding spheres in an IID fashion, at which point
|
112 |
+
we will take our high-dimensional n samples and project them onto each of these m random
|
113 |
+
projections that we've generated.
|
114 |
+
And the thing to observe here is that the resulting n times n data set of these projections
|
115 |
+
is nothing but IID samples from the corresponding projected distribution, which is the right
|
116 |
+
thing to have here if what you're trying to estimate is sliced mutual information.
|
117 |
+
So having that, I mean, at this point, per projection direction, we can apply the scalar
|
118 |
+
mutual information estimator and then just take one big, happy Monte Carlo average of
|
119 |
+
the entire thing over the different projection directions.
|
120 |
+
And this would give rise to the proposed sliced mutual information estimator.
|
121 |
+
Now, you can compute this thing very easily, because at the end of the day, it's an average
|
122 |
+
of scalar mutual information estimates.
|
123 |
+
And as far as performance guarantees, we can show that so long that the per-sliced mutual
|
124 |
+
information is bounded, the uniform absolute error of this estimator scales like 1 over
|
125 |
+
the root of m, the number of our Monte Carlo samples, plus the error of the scalar mutual
|
126 |
+
information estimator.
|
127 |
+
And I'm just restating this informally over here.
|
128 |
+
And what this all in all shows is that sliced mutual information can therefore be estimated
|
129 |
+
the rate of scalar mutual information estimation problem plus this m to the minus half Monte
|
130 |
+
Carlo penalty.
|
131 |
+
And the thing is that under appropriate smoothness assumptions, the one-dimensional rate is in
|
132 |
+
fact parametric.
|
133 |
+
And therefore, if you just match the size of your data set and the number of Monte Carlo
|
134 |
+
samples, just equate n and m, the sliced mutual information between high-dimensional variables
|
135 |
+
can be estimated at the parametric n to the minus half rate, perhaps up to some logarithmic
|
136 |
+
factors.
|
137 |
+
And this is, of course, a significant speed up and stands in sharp contrast to the slow,
|
138 |
+
exponentially bad in dimension, curse of dimensionality rate for classic mutual information.
|
139 |
+
Yeah, now this scalability makes, in fact, running empirical experiments with sliced
|
140 |
+
mutual information quite a breeze.
|
141 |
+
So let me quickly show you some sort of proof of concept experiments, let's say.
|
142 |
+
And the first one just relies on the fact that, well, SMI, sliced mutual information
|
143 |
+
can identify independence.
|
144 |
+
And therefore, we examine it as a figure of merit for independence testing, basically
|
145 |
+
by thresholding the computed sliced mutual information value.
|
146 |
+
And the results that we have obtained, of course, we've compared them with the same
|
147 |
+
test, but based on classic mutual information.
|
148 |
+
And this figure over here shows that for a bunch of different settings, well, it presents
|
149 |
+
the area under the ROC curve as a function of the number of samples, the standard way
|
150 |
+
to represent the quality of an independence test.
|
151 |
+
And you basically want this number to be 1, which corresponds to an omniscient test.
|
152 |
+
And what we observe is that sliced mutual information performs consistently well across
|
153 |
+
different setups and across different dimensions, whereas the performance of the mutual information,
|
154 |
+
the classic mutual information-based test, quickly degrades as dimension grows.
|
155 |
+
Now, on top of that, let me also demonstrate how sliced mutual information can be used
|
156 |
+
for feature extraction.
|
157 |
+
And here, what we want to do is maximize the sliced mutual information between linear transformations
|
158 |
+
of x and y that are now chosen to be IID samples from the same MNIST class, which we restrict
|
159 |
+
to be either 0 or 1.
|
160 |
+
And the choice of class is also random, so basically just a fair coin flip.
|
161 |
+
And by observing that sliced mutual information between x and y is at most 1 bit, I mean,
|
162 |
+
it's always upper bounded by mutual information, which equals a single bit in this case, basically
|
163 |
+
the class label, the way to understand what we're doing here is that we're looking for
|
164 |
+
the linear feature that is most informative for classifying or determining this class
|
165 |
+
label.
|
166 |
+
And interestingly enough, this is what this procedure ends up learning, where the figure
|
167 |
+
shows basically the first two rows of the optimal A matrix that we obtained, rearranged
|
168 |
+
in the dimension of an MNIST image.
|
169 |
+
And this really looks like a match filter, if you're familiar, which, when applied to
|
170 |
+
the samples, would indeed be able to tell you whether the sample came from the 0 class
|
171 |
+
or not.
|
172 |
+
And as far as for the value itself, well, the maximized sliced mutual information value
|
173 |
+
ends up being roughly 0.7, which is quite close to the 1 bit upper bound, and is much,
|
174 |
+
much larger than what you would get if you would not learn A, and let's say just instantiate
|
175 |
+
it as a matrix with IID entries drawn according to some distribution.
|
176 |
+
And this is just to say that something meaningful indeed being learned here, and something meaningful
|
177 |
+
indeed happens when you maximize the sliced mutual information as your optimization objective.
|
178 |
+
OK, so yeah, that's basically it.
|
179 |
+
And just to recap, we introduced sliced mutual information, which is this average of scalar
|
180 |
+
mutual information terms between one-dimensional projections.
|
181 |
+
We've seen that it preserves much of the structure of classic mutual information.
|
182 |
+
It can be efficiently computed and estimated from samples, and can also be, in fact, increased
|
183 |
+
by our processing if, indeed, your processing gives rise to more informative projections.
|
184 |
+
And we've presented some proof of concept applications to independence testing, to feature
|
185 |
+
extraction.
|
186 |
+
We have a couple of more in the paper.
|
187 |
+
But let me say this.
|
188 |
+
While this is mostly theoretical work, and a large-scale empirical exploration is sort
|
189 |
+
of beyond its scope, we firmly believe that sliced mutual information will be extremely
|
190 |
+
useful for various such tasks, and are very excited to look into this in the future.
|
191 |
+
And yeah, with that, I'll stop.
|
192 |
+
Thank you guys for listening, and do visit us at the poster, and check out the paper
|
193 |
+
if you would like to know more.
|
demo_data/nips-2021/25957/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, I'm Hugo Richard, I'm a third year PhD student at Université Paris-Saclay.
|
2 |
+
I'm in the INRIA Paris et Alpes team and my supervisor is Bertrand Thirion.
|
3 |
+
Today I'll talk about shared independent component analysis for multi-subject neuroimaging.
|
4 |
+
This is a joint work with Pierre Abelin, Alexandre Grandfort, Bertrand Thirion and Anna Pouy-Varine.
|
5 |
+
First let us consider two sources that are emitting a signal that is recorded by two
|
6 |
+
sensors.
|
7 |
+
This can be seen as a simplified model of magnetoencephalography where brain sources
|
8 |
+
are recorded by magnetometers.
|
9 |
+
Because propagation time can be neglected, the signal recorded by the sensors can be
|
10 |
+
seen as a linear mixture of the signal emitted by the sources.
|
11 |
+
S is a set of sources that are assumed to be independent.
|
12 |
+
X are the recordings and A describes how the sources are mixed to produce the recordings.
|
13 |
+
At first sight this model may seem ill-defined because if we permute two columns in A and
|
14 |
+
permute the corresponding sources in S, we'll get a new set of sources S' and a new mixing
|
15 |
+
matrix A' that describes X just as well as A and S.
|
16 |
+
And similarly if we scale the column of A by some constant, one column of A by some
|
17 |
+
constant and the corresponding source by the same constant, we'll also get an equivalent
|
18 |
+
description of X.
|
19 |
+
However, these scale and permutation indeterminacies are the only one if the sources contain at
|
20 |
+
most one Gaussian component.
|
21 |
+
Let us consider the more general problem where you have multiple subjects that are exposed
|
22 |
+
to the same stimuli.
|
23 |
+
We have two subjects, X1 and X2, and they have different mixing matrices, A1 and A2,
|
24 |
+
and different noise levels, N1 and N2.
|
25 |
+
The interpretation is that they have shared sources because they have shared connective
|
26 |
+
processes.
|
27 |
+
They have different mixing matrices because they have different spatial topography.
|
28 |
+
And they have different noises because we want to model inter-subject variability.
|
29 |
+
This model is called group ICA.
|
30 |
+
There are many methods to provide a solution for the group ICA problem.
|
31 |
+
A very popular one introduced by Calhoun in 2001 is to just stack the data of all subjects
|
32 |
+
feature-wise and then perform a PCA, a principal component analysis, on the stacked data.
|
33 |
+
And therefore you obtain reduced data and apply independent component analysis on the
|
34 |
+
reduced data to obtain a set of sources.
|
35 |
+
Another formulation is introduced by Varoko in 2010 and is called K-NICA.
|
36 |
+
You just replace the principal component analysis with a multiset CCA, so a multiset canonical
|
37 |
+
correlation analysis, where you have to solve a generalized eigenvalue problem.
|
38 |
+
There are many different formulations of multiset CCA, but this one with a generalized eigenvalue
|
39 |
+
problem is the fastest to solve.
|
40 |
+
KNICA and Cut-ICA have a lot of advantages.
|
41 |
+
First, they are very fast to fit.
|
42 |
+
And second, they are simple to implement.
|
43 |
+
These are the two reasons why they are so popular in neuroimaging.
|
44 |
+
However, they do not optimize the proper likelihood.
|
45 |
+
So therefore they do not benefit from advantages of such estimators such as asymptotic efficiency.
|
46 |
+
There are a lot of other related work that do optimize the proper likelihood.
|
47 |
+
I want to mention the independent vector analysis, which is a very powerful framework introduced
|
48 |
+
by Li in 2008.
|
49 |
+
So unified approach of Guo in 2008 that we will also mention and talk about later.
|
50 |
+
The approach of Shen in 2015 that also allows to perform dimension reduction.
|
51 |
+
And the multi-view ICA that was introduced by our team last year.
|
52 |
+
I want to quickly say that it's not obvious to design a likelihood-based approach that
|
53 |
+
is tractable.
|
54 |
+
And with this example of the Gaussian mixture noisy ICA by Bermond and Cardozo, we'll see
|
55 |
+
that standard approach leads to intractable algorithms.
|
56 |
+
The model we take here is the same as the group ICA, but we assume that the noise is
|
57 |
+
Gaussian with the same variance for all subjects.
|
58 |
+
We'll also assume that the sources follow a Gaussian mixture model.
|
59 |
+
And we further assume that the weights of the Gaussian mixtures are known.
|
60 |
+
We can solve such model via expectation maximization.
|
61 |
+
And if we write the E-step, we'll get a closed form that involves a large sum.
|
62 |
+
Because of this large size, this sum, and therefore the M algorithm is intractable whenever
|
63 |
+
Q and K are large.
|
64 |
+
Our contribution is shared ICA, what we call Shikha for short, where the data of subject
|
65 |
+
i are assumed as a linear mixture of noisy sources, and the noise here is not on the
|
66 |
+
sensor, but on the sources.
|
67 |
+
The noise is Gaussian with a variance that can be different for each subject and different
|
68 |
+
for each component.
|
69 |
+
S are assumed to be independent, but in contrast to almost all existing work, some components
|
70 |
+
can be Gaussian.
|
71 |
+
We have a few blanket assumptions.
|
72 |
+
We assume that the data are centered, that the mixing metrics are invertible, that the
|
73 |
+
sources have identical variance, and that the number of subjects is greater than 3.
|
74 |
+
We have two algorithms to solve the Shikha model.
|
75 |
+
We have ShikhaJ, that is a FAS algorithm that is based on multiset CCA, and ShikhaML, a
|
76 |
+
maximum likelihood approach.
|
77 |
+
In Shikha, there are two ways to recover the parameters.
|
78 |
+
Either the source are non-Gaussian, in which case we can use classical ICA results to recover
|
79 |
+
the unmixing matrices.
|
80 |
+
When the components are Gaussian, then we need something else, and what we use here
|
81 |
+
is noise diversity.
|
82 |
+
When the noise is sufficiently diverse, then it's possible to recover the unmixing matrix
|
83 |
+
and the noise covariance up to a permutation and sign indeterminacy.
|
84 |
+
Note that the noise diversity in Gaussian components is also a necessary condition.
|
85 |
+
If it does not hold, then Shikha cannot be identified.
|
86 |
+
Let us now focus on this theorem that is at the core of the ShikhaJ algorithm.
|
87 |
+
Namely it shows that we can solve group ICA with multiset CCA.
|
88 |
+
So assume the data follows the Shikha model, and consider the multiset CCA framed as a
|
89 |
+
generalized eigenvalue problem.
|
90 |
+
This generalized eigenvalue problem relies on two matrices, C and D. So C is formed by
|
91 |
+
second-order statistics, and D is formed by the diagonal blocks in C.
|
92 |
+
And so if we solve this eigenvalue problem and take the first k leading eigenvectors,
|
93 |
+
we can recover the correct unmixing matrix from them, up to a permutation and a scaling.
|
94 |
+
And this can only be done if the k first eigenvalues are distinct.
|
95 |
+
Note that the distinct eigenvalue condition is also necessary.
|
96 |
+
If two eigenvalues are the same, then this adds the need to determine IC, and therefore
|
97 |
+
we cannot solve group IC.
|
98 |
+
Note also that the condition that some eigenvalues need to be distinct is stronger than the noise
|
99 |
+
diversity condition we have in the identifiability theorem.
|
100 |
+
And therefore we can exhibit an example which is identifiable, but on which multiset CCA
|
101 |
+
will fail.
|
102 |
+
And I refer you to the paper for more details on this.
|
103 |
+
So in our theorem, in order to recover the correct unmixing matrix, we need to have access
|
104 |
+
to the second-order statistics.
|
105 |
+
However, in practice, we only have access to them, up to some sampling noise.
|
106 |
+
And because the mapping from matrices to eigenvectors is highly non-smooth, a small deviation in
|
107 |
+
the second-order statistics can lead to a high deviation of the recovered unmixing matrix.
|
108 |
+
Now to show this in practice, we take three subjects, two components, and noise covariance
|
109 |
+
matrices with two values, lambda1 and lambda2, that are separated by an eigengap epsilon.
|
110 |
+
And we compare the solution of multiset CCA on the true covariance matrices and on the
|
111 |
+
perturbed covariance matrix, where the perturbation scale is given by delta.
|
112 |
+
And for different values of epsilon, 10-4, 10-3, 10-2, 10-1, we show how the performance
|
113 |
+
of the algorithm, so the M-ary distance between the true unmixing matrix and the estimated
|
114 |
+
unmixing matrix, varies when the perturbation scale increases.
|
115 |
+
And we see that when the eigengap is very close, so 10-4, the violet curve, then even
|
116 |
+
with a very small perturbation, you can get to a very bad M-ary distance.
|
117 |
+
So the black dashed curve is a performance of chance.
|
118 |
+
Luckily, there is a large gap between the k-th eigenvalues and the k plus 1.
|
119 |
+
This means that in practice, the span of the p-leading eigenvectors is approximately preserved.
|
120 |
+
We can recover the true unmixing matrix from the unmixing matrix estimated by multiset
|
121 |
+
CCA, just by multiplying by a matrix Q.
|
122 |
+
And in order to estimate Q, we make use of the fact that the unmixed data should have
|
123 |
+
a diagonal covariance.
|
124 |
+
This leads us to a joint diagonalization problem that we can solve efficiently.
|
125 |
+
So if we take the experiments we've done on the previous slide, the results are still
|
126 |
+
shown here.
|
127 |
+
You can see the violet curves, and that is very sensitive to perturbation.
|
128 |
+
And so if we apply joint diagonalization, all these curves move, and they join the dashed
|
129 |
+
curve on the bottom.
|
130 |
+
And therefore, it's much better, because now the new curves that are represented by the
|
131 |
+
dashed line are less sensitive to perturbations.
|
132 |
+
So now we've obtained the correct unmixing matrix, but up to a scaling.
|
133 |
+
And so we need an additional step to find the correct scaling, and another one to find
|
134 |
+
the other parameter that is still unestimated, which are the noise covariance.
|
135 |
+
And luckily, it's very easy to find the noise covariance.
|
136 |
+
We can do this via an EM algorithm.
|
137 |
+
The E-step and the M-step are in closed form, and this yields a very fast algorithm.
|
138 |
+
But the Shikha-J is not a maximum likelihood estimator.
|
139 |
+
So now we will focus on Shikha-ML, which is our maximum likelihood estimator.
|
140 |
+
So I won't go too much into details on this, but we optimize this via an EM using a Gaussian
|
141 |
+
mixture assumption as a source.
|
142 |
+
We assume that the weights are known.
|
143 |
+
What I just want to showcase here is that the E-step of the algorithm, the one that
|
144 |
+
gives you the expectation of the sources given the data, and the variance of the sources
|
145 |
+
given the data, only involves the sum of size 2.
|
146 |
+
So previously we had a sum that had an exponential number of terms, and here we don't have that
|
147 |
+
anymore.
|
148 |
+
So the E-step is much faster than what we had before, and therefore the EM algorithm
|
149 |
+
here is tractable, whereas it was not the case before.
|
150 |
+
I first want to present our synthetic experiment where we generate data according to the Shikha-ML
|
151 |
+
and Shikha-J model.
|
152 |
+
In case A, we have only Gaussian components, but we have noise diversity, and therefore
|
153 |
+
methods that use noise diversity to recover the sources such as Shikha-ML and Shikha-J
|
154 |
+
perform best.
|
155 |
+
In the second case, we have only non-Gaussian components and no noise diversity, so methods
|
156 |
+
that use non-Gaussianity perform well such as Kana-ICA, Shikha-ML, or MultiView-ICA.
|
157 |
+
And the last case, half of the components are Gaussian with noise diversity, and the
|
158 |
+
other half are non-Gaussian but without noise diversity.
|
159 |
+
And in this case, only Shikha-ML is able to correctly recover the sources.
|
160 |
+
MV-ICA doesn't do that, but it's not as good as Shikha-ML.
|
161 |
+
Let us now talk about our experiments on real data.
|
162 |
+
We have this reconstruction experiment on fMRI data where subjects are exposed to a
|
163 |
+
naturalistic stimuli such as movie watching.
|
164 |
+
We use 80% of the movie to learn the unmixing matrices of all subjects, and then on the
|
165 |
+
20% left of the movie, we compute the common sources, and from these common sources computed
|
166 |
+
using 80% of the subject, we try to reconstruct the data of the 20% left of the subject.
|
167 |
+
We compute the R2 score within regions of interest between the reconstructed data and
|
168 |
+
the true data, and plot them as a function of the number of components used.
|
169 |
+
As we see, Shikha-ML outperforms all of the methods.
|
170 |
+
As a take-home message, Shikha is a powerful framework to extract shared sources.
|
171 |
+
Shikha-J is a fast approach to fit the model, but it only uses second-order information.
|
172 |
+
In contrast, Shikha-ML is a bit slower, but is able to use non-gaussianity in addition
|
173 |
+
to second-order information.
|
174 |
+
In practice, Shikha-ML yields the best results.
|
175 |
+
The methods we've introduced work on reduced data.
|
176 |
+
It would be interesting to know how to reduce the data so that they perform optimally.
|
177 |
+
Another way to improve our results would be to learn the density of the shared sources
|
178 |
+
in Shikha-ML instead of having them fixed.
|
179 |
+
Thanks for listening, and have a good day!
|
demo_data/nips-2021/25958/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello everyone, I'm Luigi Carretino, and this is a joint work with Stefano Vigonia,
|
2 |
+
Daniele Calandriello, and Lorenzo Rosasco.
|
3 |
+
The problem that we study in this work is a standard regression problem, where we want
|
4 |
+
to estimate an unknown function f star given n pairs of points, x's and y's, and then
|
5 |
+
given n pairs of points, x's and y's, where y's are noisy evaluations of the functions
|
6 |
+
f star on the input points axis.
|
7 |
+
A well-established method to learn nonlinear functions is kernel ridge regression.
|
8 |
+
The basic idea is to map the input points into a higher dimensional space, where linear
|
9 |
+
relationships can be learned that then translate in nonlinear ones in the input space.
|
10 |
+
To formalize this, we can think about solving a standard empirical risk minimization problem
|
11 |
+
regularized over a spatial function which is a reproducing kernel Hilbert space.
|
12 |
+
Numerically speaking, the solution of this type of problem boils down to solving a linear
|
13 |
+
system. Particularly, we can see here that the linear system is going to be Kc equal
|
14 |
+
y, where K is the kernel matrix evaluated in all the pairs of points of the training
|
15 |
+
sets, c are the weights that we aim to learn, and y's are the output points.
|
16 |
+
We know that this method is optimal from a statistical point of view, but a drawback
|
17 |
+
is that it suffers from computational scalability. In fact, in terms of time complexity, if we
|
18 |
+
have n training points and we want to solve the linear system directly, we'll have to
|
19 |
+
invert the matrix K, and this will cost us n cubed in time.
|
20 |
+
Multiple ways of accelerating this process have been proposed over time.
|
21 |
+
The first one is to solve the methods iteratively instead of inverting directly the matrix K.
|
22 |
+
This allows us to only have matrix vector multiplications, and so the overall cost of
|
23 |
+
an iterative method to solve this linear system is going to be Tn squared.
|
24 |
+
Another method is the one known as sketching, where we can see this as subsampling the linear
|
25 |
+
system, in particular subsampling columns of this linear system, where we can take m
|
26 |
+
columns of the linear system uniformly at random to get a smaller one, and the cost
|
27 |
+
of this will be m squared n.
|
28 |
+
Another method instead is splitting. This allows us to divide the main problem into
|
29 |
+
many, in this case Q, subproblems, each one that can be solved independently and so
|
30 |
+
potentially can be distributed. So we can have a cost which boils down to n over Q to
|
31 |
+
the power of 3.
|
32 |
+
Combinations of these methods have been proposed in the literature. In particular, if
|
33 |
+
we combine iterating and sketching, we can get a solver that can solve the problem in
|
34 |
+
a time complexity of Tmn.
|
35 |
+
If instead we combine sketching and splitting, we can get a solver that can be computed
|
36 |
+
in m squared times n over Q.
|
37 |
+
And in this work, we try to blend all these techniques to derive a new algorithm, which
|
38 |
+
we will call PARC, that can achieve a time complexity of Tm times n over Q to the power
|
39 |
+
of 2.
|
40 |
+
So as we just said, in this work, we propose a new large-scale kernel regression solver
|
41 |
+
that combines the computational benefits of iteration, sketching, and splitting.
|
42 |
+
Notice, though, that these are approximation techniques and they may come at the cost of
|
43 |
+
accuracy. But we are able to show that this new algorithm is able to preserve generalization
|
44 |
+
under suitable partitions.
|
45 |
+
Now also notice that instead of general splitting, we are going to need to focus on a
|
46 |
+
particular type, which is the partitions.
|
47 |
+
So we introduce a new principal partition scheme for kernel methods.
|
48 |
+
We now look at the difference between data splitting and space partitioning.
|
49 |
+
Given a set of points, the procedure of splitting takes groups of points at random and assign
|
50 |
+
them to different splits or clusters.
|
51 |
+
In this picture, for example, we divide the points in four splits.
|
52 |
+
Partitioning instead divides the space in different cells, and then the points are implicitly
|
53 |
+
assigned to a particular cluster based on which cell they belong to.
|
54 |
+
Notice that with the splitting methods, we don't consider local information while we
|
55 |
+
perform the splitting, but we do when we perform partitioning.
|
56 |
+
Now, from this picture, the concept of partitioning a space seems pretty straightforward.
|
57 |
+
However, when you start considering high dimensional feature space, subtle problems can
|
58 |
+
appear.
|
59 |
+
So first, as a recap, remember that there are two important spaces to consider in our
|
60 |
+
regression problem.
|
61 |
+
The input space X with its input space features and the kernel space H with its input space
|
62 |
+
features, and the kernel space H, which potentially has many more implicit features.
|
63 |
+
Traditionally, partition methods are applied directly to the input space.
|
64 |
+
For example, a classical approach is to select a subset of points as centroids and then
|
65 |
+
partition the space in cells by assigning each portion of the space to the closest centroid,
|
66 |
+
which is called a Voronoi partition.
|
67 |
+
Since we are in the input space, closest here is defined according to a simple Euclidean
|
68 |
+
distance.
|
69 |
+
However, remember that our target function and our whole regression does not happen
|
70 |
+
directly on the input data space, but rather on the data mapped in the feature space.
|
71 |
+
And after we apply our feature map to the data, the concept of closest and the partition
|
72 |
+
can radically change.
|
73 |
+
For example, here on the right, we choose a kernel space associated with a cosine similarity
|
74 |
+
and again plot how the centroids partition the input space, but this time we chose closest
|
75 |
+
according to the new cosine distance.
|
76 |
+
The resulting partition is very different from the Euclidean one as it captures the
|
77 |
+
non-linearity of the kernel function.
|
78 |
+
In the paper, we discuss how this difference can impact the regression and we identified
|
79 |
+
sufficient conditions that the partition should satisfy in order to guarantee good generalization
|
80 |
+
of the learning process.
|
81 |
+
Crucially, we will see that these guarantees depend not on how the input space is partitioned,
|
82 |
+
but rather how the feature space is partitioned.
|
83 |
+
As a consequence, for our PARC methods, we focus on choosing centroids solely using the
|
84 |
+
kernel version of the distance.
|
85 |
+
We are now ready to present in more detail how the PARC algorithm works.
|
86 |
+
First of all, PARC partitioned the feature space into Q Voronoi cells and the first thing
|
87 |
+
to do is to identify the centroids in the feature space that allows us to describe the
|
88 |
+
Voronoi cells.
|
89 |
+
Then inside each Voronoi cell, we learn a local estimator using an uniterated and sketched
|
90 |
+
version of kernel ridge regression.
|
91 |
+
And then at prediction time, when a new sample arrives, we can use the Q Voronoi feature
|
92 |
+
to identify the new sample.
|
93 |
+
We use the local estimator corresponding to the Voronoi cell to which the new points fall
|
94 |
+
on.
|
95 |
+
The generalization error of standard kernel ridge regression without partitioning can
|
96 |
+
be upper bounded by two terms, a bias term and a variance term.
|
97 |
+
In our work, we can show that also the generalization error of PARC can be upper bounded by a bias
|
98 |
+
term and a variance term.
|
99 |
+
But this time, these two terms are weighted and they are weighted by a certain quantity
|
100 |
+
that depends on an angle theta, which is the minimum angle between all the subspaces of
|
101 |
+
the partitions.
|
102 |
+
For example, when all the subspaces are orthogonal between each other, we recover the exact same
|
103 |
+
generalization error of standard kernel ridge regression.
|
104 |
+
But we are also able to show that for angles which are small enough, we are able to obtain
|
105 |
+
a generalization error which is of the same order of standard kernel ridge regression.
|
106 |
+
These theoretical results suggest us how to construct a good partition.
|
107 |
+
So in particular, PARC selects the Voronoi centroids greedily in order to promote orthogonality
|
108 |
+
between the Voronoi cells.
|
109 |
+
And in particular, we use the Schur complement to measure the orthogonality.
|
110 |
+
We also use the Schur complement to measure the orthogonality of the Voronoi centroids.
|
111 |
+
And in particular, we use the Schur complement to measure the orthogonality.
|
112 |
+
Given all these ingredients, we are now able to measure the computational complexity of
|
113 |
+
PARC, which has a time complexity that is the sum of two terms.
|
114 |
+
A first term, q squared n log n, which is the cost of computing the centroids with the
|
115 |
+
just mentioned procedure.
|
116 |
+
And a second term, q squared n log n, which is the cost of computing the most expensive
|
117 |
+
local estimator.
|
118 |
+
Empirically, we performed experiments on data set of millions and of billions of points,
|
119 |
+
and we compared with the currently fastest global kernel methods and with some other
|
120 |
+
splitting kernel methods.
|
121 |
+
We can see that PARC is the only method that manages to match the accuracy of the global
|
122 |
+
estimator.
|
123 |
+
Thank you all for your attention.
|
124 |
+
And thank you to the poster for all your questions and more details.
|
demo_data/nips-2021/25959/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello, my name is Pouya Bahshiban and I'm going to tell you about our paper titled
|
2 |
+
Adversarial Feature Desensitization. This is joint work with a number of wonderful collaborators
|
3 |
+
at MIWA, University of Montreal and McGill University, including Reza Bayat, Adam Ibrahim,
|
4 |
+
Kartika Hoja, Mojtaba Farmazi, Tourez Dale, Lake Richards and Erin Oji. A common assumption in
|
5 |
+
machine learning is that the train and test samples come from the same distribution.
|
6 |
+
While this is a reasonable assumption under most circumstances, it is intentionally violated in the
|
7 |
+
regime of adversarial attacks. Adversarial attacks are algorithms that search for slight input
|
8 |
+
perturbations that cause the input to be misclassified. In the case of white box attacks,
|
9 |
+
the model itself is transparent to the attacker and the attacker uses it to identify the possible
|
10 |
+
inputs that would lead to misclassifications. A famous example of this is the image of a panda
|
11 |
+
that when perturbed with imperceptible noise, alters the model's prediction from a panda to a
|
12 |
+
gibbon. As prior literature has shown, this is a common issue in almost all machine learning methods
|
13 |
+
and unless the classifier is specifically trained to be robust against these attacks,
|
14 |
+
the attacks could completely break down the classifier's performance.
|
15 |
+
This issue becomes even more critical when we consider the vast usage of these machine learning
|
16 |
+
systems in our societies. For example, the possible security concerns that rise in face
|
17 |
+
recognition systems prone to adversarial attacks or the safety in autonomous driving systems.
|
18 |
+
So what is an adversarial attack? To formally define the adversarial attacks, let's assume a
|
19 |
+
feature learning function f that projects inputs x to latent space with feature space z
|
20 |
+
and a classifier that uses the latent code z to predict the correct class label y hat.
|
21 |
+
The perturbation function or the attack generates a perturbed sample x prime
|
22 |
+
within the epsilon neighborhood of the input x, which we're showing here as b of x and epsilon.
|
23 |
+
By maximizing the classification objective, the opposite of how we normally optimize the classifier's
|
24 |
+
parameter. Many methods have been proposed to defend the models against adversarial attacks.
|
25 |
+
Two of these methods that have withstood the test of time so far are the adversarial training
|
26 |
+
by Alexander Modrianov, which proposes a defense method by solving a minimax optimization problem
|
27 |
+
that involves finding an adversarial input by maximizing the classification loss in the inner
|
28 |
+
loop followed by a classifier training to minimizing the classifier loss on these adversarial inputs.
|
29 |
+
This procedure is graphically shown for two hypothetical classes in the diagram on this slide.
|
30 |
+
The adversarial training method essentially learns to separate the distributions of adversarial
|
31 |
+
examples belonging to different classes. The second method is the trades method by Zhang et al,
|
32 |
+
which proposes to push the decision boundary of the classifier away from the data.
|
33 |
+
Trades achieves this by introducing a regularization term to the original learning
|
34 |
+
objective for classification that penalizes the mismatch between the predicted label
|
35 |
+
for the clean and perturbed inputs. The diagram on the right side again graphically illustrates
|
36 |
+
this procedure, where now the defense method learns to separate the distributions of clean examples
|
37 |
+
belonging to different classes while minimizing the loss of the classifier.
|
38 |
+
The third method is the trade method by Wang et al, which proposes to push the decision boundary
|
39 |
+
of the classifier to the inner loop followed by a classifier training to minimizing the
|
40 |
+
classification loss on these adversarial inputs. The third method is the trade method by Zhang et al,
|
41 |
+
which proposes to push the decision boundary of the classifier to the inner loop followed by a
|
42 |
+
classifier training to minimizing the classification loss on these adversarial inputs to the inner
|
43 |
+
loop. The third method is the trade method by Wang et al, which proposes to push the decision
|
44 |
+
boundary of the classifier to minimizing the classification loss. The fourth method is the
|
45 |
+
trade method by Wang et al, which proposes to push the decision boundary of the classifier
|
46 |
+
for a source domain, but we want the classifier to also perform the same task on a related target
|
47 |
+
domain that we might not have enough data for or that the generating procedure for sampling
|
48 |
+
domain might be expensive. The domain adaptation theory proposed by Ben David et al answers the
|
49 |
+
question of under what conditions can we adapt a classifier trained on the source domain for use
|
50 |
+
in the target domain. Here we consider the original clean distributions as the source domain and the
|
51 |
+
distribution of adversarial images generated from those images as the target domain. Although here
|
52 |
+
the target domain continuously evolves because the adversarial examples are based on the current
|
53 |
+
state of the model at each time step. And similar to the domain adaptation theory, our goal here
|
54 |
+
is to learn how to perform well on both source and target domains, meaning the natural and
|
55 |
+
adversarial domains. Now before I tell you about our proposed method, let's dive a bit deeper into
|
56 |
+
what the domain adaptation theory from Ben David et al states. Similar to before, let's assume a
|
57 |
+
feature learning function f that projects inputs x to latent space or feature space z and the
|
58 |
+
classifier that predicts the correct label y, y hat, from those latent codes. Now consider natural
|
59 |
+
and adversarial examples as input domains dx and d' x and their induced feature distributions
|
60 |
+
which go through the f function as dz and d' z. Also consider epsilon z and epsilon' z
|
61 |
+
as the classification error over the domains dz and d' z, what we are going to refer to as the
|
62 |
+
clean accuracy and the adversarial accuracy. The domain adaptation theory now gives a bond
|
63 |
+
on the adversarial error in terms of the natural error and the distance between the two domains.
|
64 |
+
Fortunately, from the prior work, we know that h delta h distance, which measures the distance
|
65 |
+
between two domains, can be estimated using the classifier trained to discriminate between the
|
66 |
+
two domains. Now our defense method called adversarial feature desensitization essentially
|
67 |
+
minimizes the bound on the adversarial error epsilon' z using a three-step procedure which
|
68 |
+
has some conceptual similarities with prior work on adversarial domain adaptation from Ganin et al.
|
69 |
+
For this, we first update the parameters theta and phi in the feature learning function f and
|
70 |
+
task classifier c to minimize the classification loss on the natural domain. This is shown with
|
71 |
+
green arrows and green boxes marked 1 on both the equation and on the diagram.
|
72 |
+
Secondly, we estimate the h delta h distance using an additional domain discriminator
|
73 |
+
network that predicts the domain identity from the latent code z. We update the domain
|
74 |
+
discriminator parameters psi to minimize the domain classification loss. And finally,
|
75 |
+
in the third step, we update the feature learning network parameters theta to maximize the domain
|
76 |
+
classification loss in an adversarial way. These two steps are marked with red arrows in the figure
|
77 |
+
and red boxes on the equation. Similar to previous two methods, adversarial training and trades that
|
78 |
+
I showed you, we here we can also graphically demonstrate this procedure. In our method AFD,
|
79 |
+
we learn to separate the classes from the distributions of clean examples while at the
|
80 |
+
same time we optimize a domain classifier that learns the boundary between the clean and adversarial
|
81 |
+
examples for each class. And finally, we push the adversarial examples to the opposite side of that
|
82 |
+
boundary. This procedure implicitly desensitizes the learned features to adversarial perturbations
|
83 |
+
and hence the name adversarial feature desensitization. We tested our method on four
|
84 |
+
data sets and compared them with a number of other baselines including with adversarial training and
|
85 |
+
trades. We made two versions of our method called AFDTCGAN that uses the adversarial losses from
|
86 |
+
Goodfellow et al and AFDWGAN that uses the Wasserstein losses from Arjovski and Goodtuner.
|
87 |
+
In the table, we evaluated all methods on several white box and black box attacks with
|
88 |
+
nominal strengths into each data set. Overall, our method AFD and especially AFDWGAN showed superior
|
89 |
+
performance against most attacks in most data sets. However, AFD was behind trades on several attacks
|
90 |
+
especially on CIFAR-100 and TinyImageNet data set that had more classes in it.
|
91 |
+
We also looked at robustness across attack methods and attack strengths which we controlled with the parameter
|
92 |
+
epsilon. The diagrams on the right show the robust accuracy for each defense method across
|
93 |
+
eight attack methods and various epsilon values for each of them. Overall, our results in these
|
94 |
+
diagrams showed that AFD's robustness generalizes better than the baselines across attacks and
|
95 |
+
across attack strengths. To quantify these differences, we also computed the area under
|
96 |
+
the curve for each method for each attack and summarized them in a table on the left.
|
97 |
+
As you can see, AFD's robust performance generalizes better to unseen and stronger attacks
|
98 |
+
compared to other baselines. If you remember from previous slides, the domain adaptation theory
|
99 |
+
predicted a bound on the adversarial error which can also be turned into a bound on the generalization
|
100 |
+
gap between natural and adversarial attacks. We empirically tested this prediction in our experiments
|
101 |
+
under two settings. Under the first setting, we varied the epsilon value for the PGDL-infinity
|
102 |
+
attack which was used during the training. And under the second setting,
|
103 |
+
we used a diverse set of attacks and various attack strengths for each of them.
|
104 |
+
And under both scenarios, we found that the domain discriminator, which was originally trained on a
|
105 |
+
particular attack and attack strength, in our case it was PGDL-infinity attack with a fixed epsilon
|
106 |
+
for each data set, could well predict the generalization gap to unseen attacks and
|
107 |
+
different attack magnitudes. This suggests that the adversarial training against a domain classifier
|
108 |
+
like that used in our proposed method could potentially lead to robust models with better
|
109 |
+
generalization capacity. Finally, while we showed that AFD generalizes well to most other attacks
|
110 |
+
and attack strengths, it occasionally was worse compared to other baselines, especially in data
|
111 |
+
sets with more classes like Tiny ImageNet. This could potentially be due to the difficulty of training
|
112 |
+
domain classifiers in these data sets and leaves much space for future work on
|
113 |
+
investigating the effect of domain classifiers on the robustness of feature learning functions.
|
114 |
+
Also, AFD required more backward computations compared to some of the other baselines
|
115 |
+
such as adversarial training, and as a result, its training time was on average about 31%
|
116 |
+
longer than adversarial training. We invite you to read our paper for more details and please
|
117 |
+
get in touch with us if you have any questions. Thanks for watching this video and we hope you enjoyed it.
|
demo_data/nips-2021/25962/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Bonjour à tous, je suis Yannis Hartel et je vais vous présenter un travail sur l'estimation
|
2 |
+
de fonctionnalité en termes de certaines contraintes particulières de la privacité.
|
3 |
+
C'est un travail en lien avec mon conseiller postdoc, le professeur Cristina Gutucia.
|
4 |
+
Nous sommes intéressés par le fonctionnalité de la somme de puissance, qui est la somme de probabilités associées
|
5 |
+
à une distribution discrète, à la puissance gamma, où gamma est un nombre réel positif.
|
6 |
+
Donc, ce fonctionnalité de la somme de puissance est un exemple d'information qui se déroule dans différents domaines
|
7 |
+
comme les statistiques, l'apprentissage de machines, la théorie de l'information, la science de la neurone, etc.
|
8 |
+
Voici donc le problème statistique standard, où l'objectif est d'estimer la somme de puissance fonctionnelle
|
9 |
+
basée sur des exemples NIID, X1, X2 jusqu'à XN, qui suivent une distribution discrète B avec une taille d'alphabet K.
|
10 |
+
Une approche beaucoup utilisée est le estimateur de plug-in, où l'on utilise un estimateur du paramètre P
|
11 |
+
pour construire un estimateur du fonctionnalité, à travers le principe de plug-in.
|
12 |
+
Cette approche n'est pas seulement simple et intuitive, mais elle est aussi théoriquement saine,
|
13 |
+
car elle satisfait une efficacité asymptotique et une quasi-optimalité non-asymptotique.
|
14 |
+
La question intéressante de notre paper est de savoir si cette approche de plug-in
|
15 |
+
fonctionne dans un état de séparation non standard, où l'on impose une contrainte de privé,
|
16 |
+
et plus précisément, le setup de la privé différente local.
|
17 |
+
Ce qui signifie que l'on impose un état de privé fort, où l'on n'a pas accès aux données initiales et sensibles, les XI.
|
18 |
+
Au lieu de ça, l'on a seulement accès à une version privée de XI.
|
19 |
+
Voici la représentation d'un mécanisme simple qui n'est pas interactif.
|
20 |
+
Les termes local ici reflètent le fait que le mécanisme QI ne voit que les données XI.
|
21 |
+
En d'autres mots, il n'y a pas de troisième parti confiant qui a accès à toutes les données sensibles.
|
22 |
+
C'est un mécanisme de privé non-interactif simple, mais bien sûr, nous sommes aussi intéressés par des mécanismes plus sophistiqués,
|
23 |
+
notamment le mécanisme de séquence interactif, où chaque QI voit les données privées dévoilées précédemment,
|
24 |
+
et les données privées de XI, et les données privées de XI.
|
25 |
+
Dans cette étude non-standard, nous retournons au problème original de l'estimation fonctionnelle de la power sum,
|
26 |
+
où nous n'avons qu'accès à des données privées de XI jusqu'à XL.
|
27 |
+
Notre première contribution est de donner une caractérisation tigrée et non-transomatique du erreur de caractérisation de la power sum de l'estimateur.
|
28 |
+
Ce résultat montre que l'estimateur de la power sum n'est pas optimal.
|
29 |
+
Cela contraste avec la performance de l'estimateur de la power sum dans le problème statistique standard.
|
30 |
+
Le message ici est que les bons estimateurs dans le setup standard ne sont pas toujours bons estimateurs dans le setup local privacy.
|
31 |
+
Notre deuxième contribution est la correction du estimateur de plug-in grâce à une attentionnée de troncation de Pk de petites probabilités.
|
32 |
+
Cette correction conduit à une réduction significative du risque d'erreur.
|
33 |
+
En particulier, le risque devient indépendant du size alphabétique K lorsque K est grand.
|
34 |
+
Cette deuxième contribution, par contre, se base sur un mécanisme de privé non-interactif simple.
|
35 |
+
Dans la seconde partie du document, nous examinons un mécanisme de séquence interactive plus sophistiqué,
|
36 |
+
pour lequel nous construisons une procédure de deux pas qui nous permet de réduire le risque grâce à un facteur logarithmique.
|
37 |
+
Enfin, à la fin du document, nous fournissons un lien universel en bas sur le risque d'erreur
|
38 |
+
avec respect à tous les estimateurs et tous les mécanismes non-interactifs et séquentially interactifs.
|
39 |
+
Malheureusement, ce lien bas est un lien d'accords uniquement dans certains cas,
|
40 |
+
ce qui nous laisse avec quelques questions très importantes à poser sur ce problème.
|
41 |
+
Je pense que ce premier travail sur l'estimation fonctionnelle dans le contexte de la privé locale
|
42 |
+
vous donne au moins trois points clés.
|
43 |
+
Le premier point clé est le besoin de construire une procédure statistique prudente pour la configuration de la privé locale,
|
44 |
+
puisque c'est un setup où un bon estimateur dans un cadre standard n'a pas nécessairement de fonction.
|
45 |
+
Le deuxième point clé est que l'approche de type de plug-in analysée dans ce document
|
46 |
+
sert comme un benchmark pour de futurs travaux et des procédures plus sophistiquées.
|
47 |
+
Et le dernier point clé est que notre analyse de l'approche de type de plug-in et des mécanismes non-interactifs
|
48 |
+
montrent des régimes où le problème d'estimation est difficile
|
49 |
+
et espérons que cela incite les gens à amener des développements ici.
|
50 |
+
Merci à tous, et pour plus de détails, veuillez vérifier notre document en ligne.
|
51 |
+
Bye!
|
demo_data/nips-2021/25963/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello, I'm Hassam Murtaghi. I'm a PhD student at Georgia Tech. Along with my collaborator
|
2 |
+
Jay Mundra, we will present our work on reusing combinatorial structure, faster projections
|
3 |
+
over submodular-based polytopes. This is joint work with Swati Gupta.
|
4 |
+
In this talk, we consider a sequence of similar structured optimization problems a setup often
|
5 |
+
encountered in practice. We first start with our main problem of minimizing a convex function
|
6 |
+
over a decision set P. At the next time step, this problem sees some perturbation and we
|
7 |
+
obtain another similar problem, and so on. An example of this setup is the case of iterative
|
8 |
+
projections where at each time step, we are computing the projection of a new point y
|
9 |
+
t that is close to previously projected points y i. These iterative projections form a key
|
10 |
+
step in many optimal learning algorithms and they are currently solved from scratch every
|
11 |
+
iteration. They are not viewed in the context of an iterative environment where previously
|
12 |
+
computed projections can be exploited to speed up subsequent ones.
|
13 |
+
Thus, in this talk, we ask, is it possible to speed up similar iterative optimization
|
14 |
+
problems by reusing structural information from previous minimizers?
|
15 |
+
Let me now give you some more details about our setup. Here is a table that summarizes
|
16 |
+
various widespread first-order optimization algorithms. The first two algorithms are conditional
|
17 |
+
gradient variants and they only solve linear optimization every iteration. Their convergence
|
18 |
+
rates depend on the dimension of the problem and on geometric constants for the underlying
|
19 |
+
decision set, such as the pyramidal width for the waystep-Fraenkel variant given in
|
20 |
+
the second row. On the other hand, the remaining third algorithms
|
21 |
+
are projection-based algorithms that compute the projection every iteration, and their
|
22 |
+
convergence rates, however, are optimal in the sense that they only rely on the condition
|
23 |
+
number of the function and they are dimension-independent. Further, to capture a wide range of combinatorial
|
24 |
+
sets, we consider the case where decision set P is given by a submodular polytope, and
|
25 |
+
the challenge is that these polytopes have an exponential number of constraints. Thus,
|
26 |
+
computing a projection over those polytopes is a big computational bottleneck in projection-based
|
27 |
+
algorithms. Motivated by the trade-off in convergence rates versus runtime, we further
|
28 |
+
ask, is it possible to speed up iterative projections over submodular polytopes by reusing
|
29 |
+
structural information from previous minimizers? I'm now going to give more introduction on
|
30 |
+
the problem and submodularity and review of first-order methods. So, as mentioned, we
|
31 |
+
assume that the combinatorial structure in a problem is given by a submodular function.
|
32 |
+
A set function F, defined over a ground set E of n elements, is submodular if it satisfies
|
33 |
+
the following property. Furthermore, the base polytope associated with F is defined as the
|
34 |
+
following system of linear inequalities, and here we see that V of F is modeled using an
|
35 |
+
exponential number of constraints because we have a constraint for each subset of the
|
36 |
+
concept. An example is the permutahedron, a polytope whose vertices are permutations
|
37 |
+
of 1 through n. And here we have an example in the slide for when n is equal to 3. These
|
38 |
+
polytopes are extensively used in online learning over rankings of items. A special class of
|
39 |
+
submodular polytopes are known as Cardinality-based functions, and a Cardinality-based function
|
40 |
+
F is defined as F of S equal to G Cardinality of S, where G is a concave function. And here
|
41 |
+
we have another table that summarizes various machine and online learning problems in a
|
42 |
+
submodular set function that gives rise to them. We see the permutahedron in the second
|
43 |
+
row of this table, and it is in fact a Cardinality-based polytope. Other non-Cardinality-based examples
|
44 |
+
include spanning trees and independent sets of matroids.
|
45 |
+
So let's go back to our main problem of minimizing a convex function over the base polytope.
|
46 |
+
So there typically exist three main paradigms to solve this problem. The first is a class
|
47 |
+
of methods, known as conditional gradient methods, and as I mentioned before, those
|
48 |
+
assume access to B of F via linear optimization oracle. And these methods are specifically
|
49 |
+
advantageous for base polytopes because linear optimization over base polytopes could be
|
50 |
+
done very efficiently using Edmonds' greedy algorithm. The second class of methods are
|
51 |
+
mere descent variants, and those compute a projection every iteration to ensure feasibility.
|
52 |
+
And again, as I also previously mentioned, although those methods have optimal convergence
|
53 |
+
rates and are robust, they remain of a theoretical nature due to being computationally
|
54 |
+
expensive. The third class of methods are combinatorial algorithms specifically tailored
|
55 |
+
for convex optimization over some modular-based polytopes. Those algorithms require instead
|
56 |
+
solving a some modular function minimization problem every iteration, which again can be
|
57 |
+
very expensive. However, those algorithms enjoy the nice property of returning exact
|
58 |
+
optimal solution. In this talk, we will focus on bridging the efficiency of CG methods and
|
59 |
+
the structural properties and exactness of combinatorial algorithms to speed up iterative
|
60 |
+
projections appearing in mere descent and beyond. So first, let's consider the simpler
|
61 |
+
case when our polytope is cardinality-based. So here we have a cardinality-based some modular
|
62 |
+
function F, and for notation we define this vector c to be the vector of discrete derivatives
|
63 |
+
of the concave function g. We now give the following duality result, which states that
|
64 |
+
the problem of computing a Bregman projection over a cardinality-based polytope is dual
|
65 |
+
to isotonic optimization. Although our results hold for general Bregman projections, we will
|
66 |
+
focus on the case of Euclidean projections for simplicity. To that end, consider a vector
|
67 |
+
y that we're trying to compute its Euclidean projection over a cardinality-based polytope,
|
68 |
+
and let e1 through en be an ordering of the ground set such that y is decreasing. In this
|
69 |
+
case, we have the following primal problem, and the dual to that is the following isotonic
|
70 |
+
regression problem. And further, we can map between the two problems using the following identity here.
|
71 |
+
So just to give you some historical context, previously the best known running time for
|
72 |
+
projections was O n squared using a primal algorithm by Gupta et al. Later on in that
|
73 |
+
year, Lim and Wright used the same Duati approach to compute projections over the permutahedron,
|
74 |
+
and we extended their approach to general cardinality-based polytopes. Now the dual
|
75 |
+
isotonic regression problem could be solved in O n time using a simple algorithm called
|
76 |
+
pool-adjacent violators algorithm, and this basically gives us an O n log n algorithm by
|
77 |
+
solving the problem in the dual space and mapping it back to the primal space. And this is currently
|
78 |
+
the fastest known algorithm. And the key takeaway is that solving projections over these polytopes
|
79 |
+
can be very efficiently done. In fact, computing a projection and solving linear optimization
|
80 |
+
have the same running time. Now let's demonstrate our result with an example. So here we are going
|
81 |
+
to project this vector y onto the probability simplex, and the probability simplex is modeled
|
82 |
+
by this cardinality-based modular function here given on the slide. And we see that y is already
|
83 |
+
ordered for simplicity and c is the vector of discrete derivatives. Now the algorithm will
|
84 |
+
proceed as follows. It initializes the dual iterates by the vector that we're trying to
|
85 |
+
compute the isotonic regression for, c minus y, and here we have an adjacent violation because the
|
86 |
+
second coordinate is strictly smaller than the first coordinate. Now the algorithm will basically
|
87 |
+
average those two coordinates to obtain the following solution z star, and here we see that
|
88 |
+
the ordering constraints are satisfied and z star is in fact the dual optimal. Next it will map it
|
89 |
+
back to a primal optimal. And let's go back to this figure from the previous slide that just compares
|
90 |
+
a basic linear regression fit with an isotonic regression fit. Here in the red stepwise curve,
|
91 |
+
the points at which the curve remains flat is where a block of consecutive adjacent violated
|
92 |
+
points are averaged similar to our example. This very efficient algorithm for computing
|
93 |
+
regimen projections over cardinality-based polytopes unfortunately does not extend to
|
94 |
+
general submodular based polytopes. And now my collaborator Jay will present different combinatorial
|
95 |
+
strategies for dealing with those polytopes. We now describe our toolkit for speeding up
|
96 |
+
projections on general submodular based polytopes. There are two basic objects that we can learn from.
|
97 |
+
First, given projections of previous points, can we do better than computing a new projection from
|
98 |
+
scratch? Second, given an iterative algorithm to compute a projection, can we use the combinatorial
|
99 |
+
structure present in the sequence of iterates to speed up the algorithm and terminate it early?
|
100 |
+
We have the well-known first-order optimality condition on the left. It helps us verify if a
|
101 |
+
point is indeed optimal. This check is reduced to a linear optimization over the base polytope,
|
102 |
+
which can be done using Edmunds-Greedy algorithm. We have an example. Suppose we know the gradient
|
103 |
+
at a point x star and want to check if x star is indeed optimal. We look at the distinct values
|
104 |
+
of the partial derivatives at x star and arrange them in an increasing order. Each time we see a
|
105 |
+
gap in this order, we want that the point x star on the prefix set equal the submodular function
|
106 |
+
value on that set. In the figure, the first such gap is after we have seen even an E5. Therefore,
|
107 |
+
x star S1 must equal f of S1. Similarly, x star S2 must equal f of S2. Finally, xE must equal f of
|
108 |
+
E. These sets S1, S2, and E are called tight sets at x and define the face containing the point x
|
109 |
+
star. This leads us to two interesting observations that we use later. One, that if we know precisely
|
110 |
+
what the tight sets are at the optimal points, we can also calculate the optimal point for all
|
111 |
+
suitable functions h. Two, that knowing the gradient at the optimal point gives us these
|
112 |
+
tight sets. We give an example using our combinatorial idea. Suppose we know a point
|
113 |
+
zk that is close to our optimal x star. If the function is smooth, this implies gradient at zk
|
114 |
+
and x star are close. This gives us a way to learn some tight sets defining the optimal face.
|
115 |
+
In the example, for each coordinate, the blue line in the middle represents the partial derivative
|
116 |
+
value at zk and the blue shade represents the possible variation in that value for the optimal
|
117 |
+
point x star. That is, the corresponding partial derivative for x star lies in the shaded interval.
|
118 |
+
The largest values in these intervals for E1 and E5 are lower than the lowest values in these
|
119 |
+
intervals for every other element. This helps us conclude that the set E1 and E5, that is S1,
|
120 |
+
is a tight set at x star. Similarly, we infer that S2 is also a tight set at x star.
|
121 |
+
We now use that idea to give our first two tools. These apply more generally, but we demonstrate
|
122 |
+
them using Euclidean projections. Suppose we already know the projection xi of a point yi,
|
123 |
+
and we wish to find the projection xt of point yt, given that yt is close to yi.
|
124 |
+
The non-expansiveness of projection implies that the gradients at xi and xt are also close,
|
125 |
+
and therefore we can infer some tight sets at xt even before solving.
|
126 |
+
Suppose we start computing the projection of yt using an iterative algorithm.
|
127 |
+
We now use the iterates zi that converge to xt. An iterate zt that is close to xt also has a
|
128 |
+
gradient that is close to the gradient at xt, and once again we can infer some tight sets at xt
|
129 |
+
as we approach the optimal. We also conducted an experiment to show that tool T1 can recover
|
130 |
+
most tight sets from previous projections. We now give two tools that help us round an
|
131 |
+
approximate solution exactly to the projection. First is our tool T3 called Relax.
|
132 |
+
We give a heuristic to check if we have already found all the tight sets at the optimal.
|
133 |
+
We also show that we can round combinatorially when we know the function f to be integral,
|
134 |
+
and an iterate zt is close enough to the optimal xt. This is our tool T4.
|
135 |
+
We can reuse previously known vertices of the polytope. Suppose that our optimal is xt,
|
136 |
+
and we are given a close by point xi as a convex combination of some vertices in the polytope.
|
137 |
+
We can use those vertices to warm start the search for xt. Now our sixth tool, Restrict.
|
138 |
+
Once we know a few tight sets for xt using our inferred tools T1 and T2,
|
139 |
+
we needn't search over the optimal or the whole base polytope. We can restrict ourselves to the
|
140 |
+
face of the polytope that satisfies these constraints. We show that a simple extension
|
141 |
+
of Edmunds' greedy algorithm provides yellow oracle for each face of the polytope.
|
142 |
+
We now bring together these tools and apply them to the awaystep-frank-wolff algorithm,
|
143 |
+
giving the algorithm we dub adaptive awaystep-frank-wolff, or A2FW for short.
|
144 |
+
First, warm start A2FW using tight sets for the optimal inferred from previous projected points,
|
145 |
+
and active sets from previous projected points. While the algorithm runs and generates new
|
146 |
+
iterates, it keeps inferring new tight sets for the optimal point using these iterates.
|
147 |
+
In each iteration, if a new set has been found, the algorithm checks if all tight sets have been
|
148 |
+
found. If indeed so, then stop and output the exact solution. Otherwise, simply restrict the
|
149 |
+
problem to a low-dimensional face and keep going on. Note that the linear optimization is over a
|
150 |
+
restricted face of the polytope. Let's see an example. Suppose we are optimizing over the
|
151 |
+
polytope P. We look for the best frank-wolff vertex and the best away vertex. We find that
|
152 |
+
the best frank-wolff vertex is the best away vertex. Since the direction opposite to the away
|
153 |
+
vertex is the better direction to move in, we find the next iterate ZT plus 1. Now, ZT plus 1 is
|
154 |
+
close enough to X star that it allows us to detect another tight set and round to the face F new.
|
155 |
+
One way to do that is to round to an arbitrary vertex in F new using our yellow oracle. Another
|
156 |
+
option is to relax to F new and see if the solution obtained is feasible. If feasibility
|
157 |
+
check is uncertain, return to the previous strategy. Eventually, we reach the optimal
|
158 |
+
X star either way. We give this theorem about the primal gap for the modified algorithm.
|
159 |
+
The function h is l-smooth and mu strongly convex and d refers to the diameter of BF.
|
160 |
+
Notice how this compares to the AFW algorithm. When we restrict to a face F of BF, our guarantee
|
161 |
+
depends only on the pyramidal width of F instead of the pyramidal width of BF. This pyramidal width
|
162 |
+
can be much lower for the restricted face. For instance, it depends on the dimension of the face
|
163 |
+
for the probability simplex. Therefore, A2FW leads to a faster convergence. We now show the
|
164 |
+
effectiveness of our toolkit and the A2FW algorithm using experiments. For our computations,
|
165 |
+
we simulate an online recommendation system where we are learning over rankings of items
|
166 |
+
displayed to users. Our loss functions are stochastic model click-through rates. This
|
167 |
+
can be seen as optimization over the permutahedron. We use online mirror descent which performs
|
168 |
+
iterative projections and uses away step Frank-Wulf for these projections. We benchmark the
|
169 |
+
original AFW algorithm against variants modified by our tools. We report significant improvement
|
170 |
+
in both runtime and the number of AFW iterations. The green line stands for OMD with the original
|
171 |
+
unoptimized AFW. The yellow line stands for OMD with A2FW algorithm. We do note that both OMDPAV,
|
172 |
+
that is OMD with projections using the poor adjacent violators algorithm, and OFW were
|
173 |
+
significantly faster than OMD with any AFW variant. However, OFW does not lead to optimum
|
174 |
+
regret rates while OMDPAV works only for cardinality-based submodular polytopes. To
|
175 |
+
conclude, we studied iterative projections for prevalent submodular-based polytopes. We presented
|
176 |
+
an algorithm for cardinality-based polytopes. For general polytopes, we developed a combinatorial
|
177 |
+
toolkit to speed up iterative projections and applied it to the AFW algorithm and computationally
|
178 |
+
showed that our algorithm is orders of magnitude faster than the original AFW variant.
|
demo_data/nips-2021/25964/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
e la possibilità di eseguire un'operazione di modello di un'algebra.
|
2 |
+
Questo è un'operazione che è stata creata per il nostro studio,
|
3 |
+
e che è stato creato per il nostro studio.
|
4 |
+
Ciao a tutti, sono Matteo Papini,
|
5 |
+
e questo è un lavoro insieme con Andrea Tirinzoni,
|
6 |
+
Aldo Pacchiano, Marcello Restelli,
|
7 |
+
Alessandro Lazzarici e Matteo Pirotta.
|
8 |
+
Il nostro lavoro è motivato dall'efficacia
|
9 |
+
di algoritmi di imparazione di rinforzamento profondo
|
10 |
+
per risolvere tasche complesse, come i videoghi.
|
11 |
+
Una caratteristica fondamentale di questi metodi
|
12 |
+
è la possibilità di eseguire neural networks
|
13 |
+
per eseguire rappresentazioni complesse delle tasche
|
14 |
+
che permette di rappresentare e imparare
|
15 |
+
le polizie ottime efficacemente.
|
16 |
+
Capire cosa fa una rappresentazione buona
|
17 |
+
e come trovarne una
|
18 |
+
è fondamentale per disegnare
|
19 |
+
migliori algoritmi di imparazione di rinforzamento.
|
20 |
+
In questo lavoro, per prima volta,
|
21 |
+
ci sono state presentate caratterizzazioni formali
|
22 |
+
di rappresentazioni buone per l'imparazione di rinforzamento.
|
23 |
+
Abbiamo mostrato che usare una rappresentazione buona
|
24 |
+
può davvero beneficiare l'efficienza di imparazione
|
25 |
+
e fornire garantie di regretto costante.
|
26 |
+
Finalmente, abbiamo mostrato come una rappresentazione buona
|
27 |
+
può essere selezionata dall'interazione online,
|
28 |
+
un primo passaggio verso l'apprendimento di rappresentazione per RL.
|
29 |
+
Ma prima di tutto, qualche background.
|
30 |
+
Il problema di imparazione è modellato
|
31 |
+
come un processo di decisione di marco finito di orizzonte, o MDP.
|
32 |
+
In ogni passaggio di tempo, l'agente osserva un stato dell'ambiente,
|
33 |
+
prende un'azione e riceve una rinforza
|
34 |
+
e un stato successivo come risultato.
|
35 |
+
Questi sono determinati rispettivamente
|
36 |
+
da una funzione di rinforza e una funzione di transizione
|
37 |
+
che sono un'unità di tempo e un'unità di non-conoscenza.
|
38 |
+
L'interazione è dividita in due episodi
|
39 |
+
di lunghezza finita, che si chiama l'orizzonte.
|
40 |
+
All'ultimo episodio, il stato è risalto
|
41 |
+
a seconda della distribuzione fissata.
|
42 |
+
Il comportamento dell'agente è modellato da una polizia,
|
43 |
+
che è una mappatura da stati all'azione
|
44 |
+
che può anche essere dipendente del tempo.
|
45 |
+
La funzione di valore, o funzione Q della polizia Pi,
|
46 |
+
dà la rinforza aspettata totale
|
47 |
+
ottenuta prendendo l'azione A in stato S a tempo H
|
48 |
+
e poi seguendo la polizia fino all'ultimo episodio.
|
49 |
+
Un'ottima polizia è garantita
|
50 |
+
che la funzione Q si massima su tutti i stati.
|
51 |
+
Facciamo un'assumzione extra
|
52 |
+
che ogni stato admette un'azione ottima unica.
|
53 |
+
Quando il numero di stati è molto grande o anche infinito,
|
54 |
+
imparare l'ottima polizia può essere molto difficile.
|
55 |
+
Quindi guardiamo i linear MDPs
|
56 |
+
dove l'agente ha accesso a una rappresentazione compatta.
|
57 |
+
Questa è una mappatura di caratteristiche
|
58 |
+
da stati e azioni a vectori d-dimensional
|
59 |
+
dove D è più piccolo.
|
60 |
+
Potete vedere queste caratteristiche
|
61 |
+
come l'ultimo strato scoperto di un'intera rete neurale.
|
62 |
+
Nell'apprendimento di rinforzamento profondo
|
63 |
+
impariamo tutti i pesi della rete simultaneamente.
|
64 |
+
Qui mantendremo la rappresentazione fissa
|
65 |
+
e impareremo solo i finali parametri
|
66 |
+
che sono i pesi di una combinazione lineare.
|
67 |
+
Questa funzione lineare, almeno,
|
68 |
+
deve essere in grado di rappresentare la funzione Q ottima
|
69 |
+
in modo da poterla usare per prendere azioni ottime.
|
70 |
+
Ma, infine,
|
71 |
+
essere in grado di rappresentare la funzione Q ottima
|
72 |
+
non è abbastanza per l'apprendimento efficace
|
73 |
+
perché un numero esponenziale di esempi
|
74 |
+
può ancora essere richiesto.
|
75 |
+
Per evitare questo,
|
76 |
+
ci sono necessità di assumizioni strutturali extra
|
77 |
+
sull'MDP,
|
78 |
+
e alcune sono state proposte nella literatura.
|
79 |
+
Nel MDP di basso rango,
|
80 |
+
sia la funzione di rinforzamento che la funzione di transizione
|
81 |
+
sono lineari nelle stesse funzioni.
|
82 |
+
Queste funzioni possono essere tempo-indipendenti.
|
83 |
+
Assumiamo solo per semplicità
|
84 |
+
che le due funzioni condividono la stessa dimensione D.
|
85 |
+
Una prima conseguenza della struttura di basso rango
|
86 |
+
è che la funzione Q di ogni polizia
|
87 |
+
può essere rappresentata come una funzione lineare delle funzioni.
|
88 |
+
Una assumzione strutturale più forte è la rinforzamento di Bellman.
|
89 |
+
In questi MDP,
|
90 |
+
tutte le funzioni lineare delle funzioni
|
91 |
+
devono essere chiuse sotto l'operatore di optimità di Bellman.
|
92 |
+
La struttura di basso rango implica la chiusura di Bellman,
|
93 |
+
ma l'opposto non è vero.
|
94 |
+
Indeed, nelle MDP di chiusura di Bellman,
|
95 |
+
solo l'ottima funzione Q
|
96 |
+
è garantita di essere realizzabile lineariamente.
|
97 |
+
Le algoritmi di imparazione di rinforzamento efficace
|
98 |
+
sono state proposte per questi settimenti.
|
99 |
+
Possiamo evaluare le funzioni
|
100 |
+
usando il concetto di risalto,
|
101 |
+
che è l'amounto totale di sub-optimità
|
102 |
+
che viene sofferto dall'agente
|
103 |
+
durante il processo di imparazione
|
104 |
+
rispetto alla polizia ottima.
|
105 |
+
Nelle MDP di basso rango,
|
106 |
+
l'algoritmo LSVI-UCB
|
107 |
+
soffre solo un regalo sublineare
|
108 |
+
nel caso più grave.
|
109 |
+
Eleanor è una versione raffinata
|
110 |
+
che funziona nel caso più generale
|
111 |
+
della chiusura di Bellman
|
112 |
+
e ha una migliore dipendenza
|
113 |
+
sulla dimensione di caratteristiche.
|
114 |
+
Doveva essere notato, però,
|
115 |
+
che Eleanor è computazionale intrattabile.
|
116 |
+
Per il LSVI-UCB
|
117 |
+
abbiamo anche un regalo
|
118 |
+
di base di istanze
|
119 |
+
che è logaritmico
|
120 |
+
nel numero totale di interazioni.
|
121 |
+
Qui Delta denuncia
|
122 |
+
il capo di sub-optimità
|
123 |
+
di una pariera di attesa statale
|
124 |
+
che è assumato di avere
|
125 |
+
un minimo ben definito.
|
126 |
+
Tutti questi regali di base
|
127 |
+
ignorano la qualità della rappresentazione,
|
128 |
+
a parte le assumazioni strutturali
|
129 |
+
che sono necessarie
|
130 |
+
per la sua gestione.
|
131 |
+
La domanda che cercheremo di rispondere è questa.
|
132 |
+
Possiamo raggiungere
|
133 |
+
anche piccoli dolori
|
134 |
+
con una buona rappresentazione?
|
135 |
+
Per rendere questo concetto
|
136 |
+
di buona rappresentazione formale
|
137 |
+
introduciamo la proprietà Unisoft.
|
138 |
+
Una rappresentazione è Unisoft
|
139 |
+
se le caratteristiche ottime
|
140 |
+
spostano l'intero spazio di caratteristiche.
|
141 |
+
Le caratteristiche ottime sono
|
142 |
+
le caratteristiche delle azioni ottime
|
143 |
+
in stati che sono raggiuntibili
|
144 |
+
alla propria politica ottimale.
|
145 |
+
Intuitivamente, la proprietà Unisoft
|
146 |
+
garantisce che le caratteristiche ottime
|
147 |
+
sono diverse abbastanza
|
148 |
+
per che l'agente
|
149 |
+
cominci rapidamente alla politica ottimale
|
150 |
+
senza ridurre
|
151 |
+
l'amounto di informazioni che riceve
|
152 |
+
sulla tasca in generale.
|
153 |
+
Possiamo anche misurare
|
154 |
+
il grado di diversità della rappresentazione
|
155 |
+
guardando i più piccoli valori
|
156 |
+
degli eigenvali
|
157 |
+
della matrica di covarianza delle caratteristiche ottime.
|
158 |
+
Questo parametro di Lambda
|
159 |
+
porterà un ruolo importante
|
160 |
+
nelle nostre regrette.
|
161 |
+
Notate che un valore più alto di Lambda
|
162 |
+
è migliore perché denota
|
163 |
+
più diversità di caratteristiche
|
164 |
+
e che Lambda può essere al massimo
|
165 |
+
una sotto assumizioni comuni
|
166 |
+
sulla magnitude di caratteristiche.
|
167 |
+
Ma in quale senso sono queste rappresentazioni
|
168 |
+
ottime?
|
169 |
+
Ciò che abbiamo mostrato in MDP lineari
|
170 |
+
è che Unisoft è sinonimo
|
171 |
+
con regrette costanti.
|
172 |
+
Per prima cosa, abbiamo mostrato
|
173 |
+
che la proprietà di Unisoft
|
174 |
+
è necessaria per raggiungere
|
175 |
+
regrette costanti in MDP
|
176 |
+
con regretti lineari.
|
177 |
+
Questo appartiene a MDPs di basso rango,
|
178 |
+
Bellman closure,
|
179 |
+
e anche a MDPs di mixtura lineare
|
180 |
+
che sono un'altra
|
181 |
+
assumazione strutturale comune.
|
182 |
+
Ma Unisoft è anche sufficiente
|
183 |
+
per regrette costanti
|
184 |
+
in casi interessanti.
|
185 |
+
In MDPs di basso rango,
|
186 |
+
SVI-UCB raggiunge
|
187 |
+
regrette costanti se e solo se
|
188 |
+
la rappresentazione è Unisoft.
|
189 |
+
Con una alta probabilità,
|
190 |
+
un numero finito
|
191 |
+
di interaczioni è sufficiente
|
192 |
+
per l'agente imparare
|
193 |
+
perfettamente la polizia ottimale.
|
194 |
+
Quindi, la regrette può essere
|
195 |
+
rilassata in termini di questo tempo costante
|
196 |
+
regardless of the
|
197 |
+
total number of episodes k.
|
198 |
+
In altri parole, la regrette
|
199 |
+
è costante.
|
200 |
+
Notate come il tempo τ
|
201 |
+
dipende inversamente
|
202 |
+
sul parametro λ.
|
203 |
+
Indeed, con una mappa di
|
204 |
+
più diversità di caratteristiche, possiamo imparare
|
205 |
+
la polizia ottimale più velocemente.
|
206 |
+
Abbiamo un risultato simile
|
207 |
+
per Eleanor nel caso più generale
|
208 |
+
di MDPs di Bellman closure,
|
209 |
+
con anche una migliore
|
210 |
+
dipendenza sulla dimensione d
|
211 |
+
della caratteristica.
|
212 |
+
Infine, la mancanza di
|
213 |
+
lombari per Eleanor
|
214 |
+
dà questa polinomiale
|
215 |
+
dipendenza sul parametro λ
|
216 |
+
rispetto a una dipendenza logaritmica
|
217 |
+
nel caso di LSVI-UCB.
|
218 |
+
Ma questo potrebbe ben essere
|
219 |
+
un artefatto del nostro provo.
|
220 |
+
Per ricapitulare, abbiamo mostrato
|
221 |
+
che l'Unisoft è
|
222 |
+
sia necessario che sufficiente
|
223 |
+
per raggiungere regrette costanti
|
224 |
+
in MDPs di Bellman closure
|
225 |
+
e di low rank, e ha
|
226 |
+
provvinto regrette costanti
|
227 |
+
per i bounds superiori per algoritmi comuni.
|
228 |
+
Nella ultima parte del
|
229 |
+
talco, mostriamo come
|
230 |
+
le representazioni buone possono essere
|
231 |
+
scelte online.
|
232 |
+
Ci concentriamo su MDPs di low rank
|
233 |
+
per semplicità.
|
234 |
+
L'agente è dato un set
|
235 |
+
di N rappresentazioni candidate
|
236 |
+
che rappresentano
|
237 |
+
la stessa MDP di low rank
|
238 |
+
senza misspecificazione.
|
239 |
+
Le rappresentazioni possono avere
|
240 |
+
diverse dimensioni.
|
241 |
+
Questo differe dall'approccio tipico
|
242 |
+
di rappresentazione di lezione in RL
|
243 |
+
dove si cercano di trovare
|
244 |
+
una rappresentazione accurata
|
245 |
+
da una classe di funzioni realizzabili.
|
246 |
+
Questo permette di
|
247 |
+
risolvere le misspecificazioni, ma
|
248 |
+
è tipicamente fatto offline.
|
249 |
+
Il nostro obiettivo è
|
250 |
+
imparare così efficientemente
|
251 |
+
come se usassimo la migliore
|
252 |
+
rappresentazione candidata nel set
|
253 |
+
senza sapere in avanzo.
|
254 |
+
Ovviamente, se una delle candidate
|
255 |
+
è Unisoft, vorremmo
|
256 |
+
ottenere un regalo costante.
|
257 |
+
L'algoritmo che proponiamo
|
258 |
+
è LSVI Leader.
|
259 |
+
Si guida
|
260 |
+
N istanze parallele di LSVI UCB,
|
261 |
+
una per ogni rappresentazione
|
262 |
+
candidata.
|
263 |
+
Per ogni rappresentazione, usiamo
|
264 |
+
tutte le date collezionate
|
265 |
+
dall'agente per esimerare
|
266 |
+
il parametro dell'ottima
|
267 |
+
funzione Q accordo
|
268 |
+
a questa rappresentazione.
|
269 |
+
Questo è fatto con una combinazione
|
270 |
+
di square e induzione sbattuta.
|
271 |
+
Un bonus di esplorazione
|
272 |
+
viene aggiunto all'estimato
|
273 |
+
del parametro per rendere
|
274 |
+
l'estimato ottimista, come nel caso di LSVI UCB.
|
275 |
+
Ma ora
|
276 |
+
abbiamo un parametro ottimista
|
277 |
+
per ogni rappresentazione
|
278 |
+
e l'azione viene scelta
|
279 |
+
per maximizzare il più piccolo
|
280 |
+
parametro ottimista,
|
281 |
+
che è anche l'estimato più tico.
|
282 |
+
Notate come questo
|
283 |
+
è in realtà più potente
|
284 |
+
dell'algoritmo di selezione del modello
|
285 |
+
perché possiamo usare
|
286 |
+
una rappresentazione diversa
|
287 |
+
per ogni stato.
|
288 |
+
Vediamo che il regalo del leader di LSVI
|
289 |
+
è superiore
|
290 |
+
a quello di LSVI UCB
|
291 |
+
se è condannato con la rappresentazione
|
292 |
+
migliore dei candidati,
|
293 |
+
a meno di un fattore,
|
294 |
+
che è il numero di candidati
|
295 |
+
in square.
|
296 |
+
Questo significa che se abbiamo
|
297 |
+
una rappresentazione di Unisoft nel set,
|
298 |
+
il leader di LSVI
|
299 |
+
raggiunge il regalo di selezione.
|
300 |
+
Ma il leader di LSVI
|
301 |
+
può combinare rappresentazioni
|
302 |
+
attraverso stagi, stati e azioni,
|
303 |
+
e quindi
|
304 |
+
a volte può raggiungere
|
305 |
+
il regalo di selezione
|
306 |
+
anche se non c'è una rappresentazione di candidati
|
307 |
+
di Unisoft.
|
308 |
+
I nostri risultati teoretici sono anche supportati
|
309 |
+
dai risultati empirici
|
310 |
+
in MDPs di piccolo regalo di selezione.
|
311 |
+
Questi plotti mostrano il regalo di selezione
|
312 |
+
come funzione del numero di episodi.
|
313 |
+
A sinistra abbiamo
|
314 |
+
il regalo di LSVI-UCB
|
315 |
+
che è gestito con
|
316 |
+
diverse rappresentazioni.
|
317 |
+
Di queste, l'unica rappresentazione
|
318 |
+
in grigio nel plotto
|
319 |
+
è Unisoft, e solo in questo caso
|
320 |
+
LSVI-UCB è in grado
|
321 |
+
di raggiungere regali costanti.
|
322 |
+
A sinistra abbiamo il regalo
|
323 |
+
del leader di LSVI
|
324 |
+
che è gestito con vari set di candidati.
|
325 |
+
In tutti questi casi,
|
326 |
+
il leader di LSVI raggiunge
|
327 |
+
regali costanti.
|
328 |
+
Ovviamente, senza sapere
|
329 |
+
la migliore rappresentazione in avanzo,
|
330 |
+
ci serve più tempo per imparare la polizia ottima,
|
331 |
+
ma questo è stato anche aspettato
|
332 |
+
dalla nostra regola di selezione.
|
333 |
+
Il plotto arancione è particolarmente
|
334 |
+
interessante, perché in questo caso
|
335 |
+
l'unica rappresentazione di Unisoft,
|
336 |
+
numero 1,
|
337 |
+
non è nel set di candidati,
|
338 |
+
ma ancora LSVI-leader è in grado
|
339 |
+
di raggiungere regali costanti
|
340 |
+
combinando le representazioni rimaste.
|
341 |
+
Nel lavoro futuro,
|
342 |
+
vorremmo migliorare questo fattore
|
343 |
+
di sqvrtn nel regalo del leader di LSVI,
|
344 |
+
perché nel caso dei banditi lineari
|
345 |
+
la dipendenza sull'umare
|
346 |
+
delle rappresentazioni è solo logaritmica.
|
347 |
+
Vorremmo anche
|
348 |
+
estendere il leader di LSVI
|
349 |
+
per gestire le rappresentazioni
|
350 |
+
di candidati che sono miscele.
|
351 |
+
Tuttavia, questa
|
352 |
+
selezione delle rappresentazioni è
|
353 |
+
solo un passaggio verso
|
354 |
+
il learning of representation,
|
355 |
+
che significa imparare
|
356 |
+
la rappresentazione online da scratch.
|
357 |
+
Questo è già fatto
|
358 |
+
in pratica con il learning di
|
359 |
+
rinforzamento profondo, ma la teoria
|
360 |
+
di questo è scomoda.
|
361 |
+
Finalmente, possiamo considerare
|
362 |
+
il learning di rinforzamento multitasca,
|
363 |
+
dove una singola rappresentazione
|
364 |
+
potrebbe essere buona per un
|
365 |
+
composto di MDPs che condividono
|
366 |
+
una struttura. Grazie.
|
demo_data/nips-2021/25965/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
How many friends do you have?
|
2 |
+
At least you have more friends than I do.
|
3 |
+
Well, on average.
|
4 |
+
Don't get me wrong, I am not a pity person.
|
5 |
+
This is a mathematical fact known as the friendship paradox.
|
6 |
+
Suppose we have two persons, A who has one friend and B who has three friends.
|
7 |
+
Now let me ask in which friend list am I likely to appear?
|
8 |
+
Because B has three times more friends, I am three times more likely to appear in the
|
9 |
+
B's friend list.
|
10 |
+
The friendship paradox dictates that on average, your friends have more friends than you do.
|
11 |
+
The more friends someone has, the more likely someone appears in your friend list.
|
12 |
+
Beyond an interesting piece of trivia, the friendship paradox has substantial importance
|
13 |
+
because it may introduce biases in graph embeddings.
|
14 |
+
Hello everyone, my name is Sadamori Kojak, and we will walk you through a new insight
|
15 |
+
into biases in graph embedding arising from the friendship paradox.
|
16 |
+
The graph embedding is a technique to map a graph into a vector space that reflects
|
17 |
+
the structure of the graph.
|
18 |
+
A widespread paradigm is the approach based on Word2Vec.
|
19 |
+
In this approach, one somehow generates a sequence of nodes from the graph.
|
20 |
+
The nodes in the sentences are then mapped to a vector space by Word2Vec.
|
21 |
+
Now the key is that Word2Vec does not directly learn the graph, but through the sentences
|
22 |
+
generated from the graph.
|
23 |
+
Unlike the word embedding, where the input sentences are the actual data, for graph embedding,
|
24 |
+
the input sentence is artificially generated, and how to generate it is a critical modeling
|
25 |
+
decision.
|
26 |
+
This leads us to the question of how to generate the sentences from the graph.
|
27 |
+
A common way is to use random walks.
|
28 |
+
The worker starts from a node in the graph, and this node is the first node in the sentence.
|
29 |
+
Then the worker moves to one of the neighbors selected randomly.
|
30 |
+
This new node is added to the sentence.
|
31 |
+
By repeating this process, we can generate a sentence of nodes from this graph.
|
32 |
+
The friendship paradox comes into play when the worker follows an edge.
|
33 |
+
It is more likely to visit a node with many neighbors.
|
34 |
+
In other words, following edges is a bias sampling that preferentially leads random
|
35 |
+
workers to nodes with many neighbors.
|
36 |
+
To see this effect, let us consider a graph with co-peripheral structure, where kernels
|
37 |
+
have more neighbors than periphery.
|
38 |
+
A sentence can be generated from this graph by running a random walk.
|
39 |
+
Now, the kernels are about 20% of nodes in the graph.
|
40 |
+
But when looking at the generated sentence, the kernels are overrepresented, which is
|
41 |
+
because of the bias due to the friendship paradox.
|
42 |
+
The fact that the sentence is biased by the friendship paradox leads us to our main question.
|
43 |
+
Does the sampling bias have negative impact?
|
44 |
+
If so, how can we fix it?
|
45 |
+
Surprisingly, it has no effect because Word2Vec itself has an overlooked built-in devising
|
46 |
+
feature that happens to negate the bias due to the friendship paradox.
|
47 |
+
This built-in devising feature can be easily utilized to negate other types of biases,
|
48 |
+
and we demonstrate how to do this.
|
49 |
+
Our starting point is a sentence of words.
|
50 |
+
Word2Vec picks a word called center and surrounding words called context, and then models the
|
51 |
+
conditional probability using a softmax function, where the conditional probability is reflected
|
52 |
+
as a dot similarity of the two vectors of the words.
|
53 |
+
We want to fit this model to the data, but it is computationally challenging due to the
|
54 |
+
normalization constant, which extends over all unique words in the corpus.
|
55 |
+
A common way to reduce this burden is negative sampling.
|
56 |
+
Now, it is often underappreciated that negative sampling is actually a simplified version
|
57 |
+
of noise contrastive estimation.
|
58 |
+
And it is this simplification that gives rise to an interesting feature of Word2Vec.
|
59 |
+
How does the noise contrastive estimation, or NCE, works?
|
60 |
+
NCE samples k random contexts from so-called noise distribution.
|
61 |
+
This noise distribution is roughly proportional to the frequency of a word in the corpus.
|
62 |
+
The random contexts are labeled as 0, and the actual context is labeled as 1.
|
63 |
+
Then NCE calculates the probability that a word comes from actual data using a Bayesian
|
64 |
+
framework.
|
65 |
+
By putting the prior likelihood together, we have a posterior like this.
|
66 |
+
This function is a sigmoid function and takes the dot similarity and the noise distribution
|
67 |
+
as the arguments.
|
68 |
+
Now the key feature of the NCE is that it is asymptomatically unbiased for the model
|
69 |
+
of the Word2Vec.
|
70 |
+
Meaning if the data is actually generated from this model, and we increase the number
|
71 |
+
of trainings, then the embedding vectors converge to the true vectors.
|
72 |
+
Beyond Word2Vec, the noise contrastive estimation is also an unbiased estimator for a more general
|
73 |
+
model that takes a real value function f instead of the dot similarity.
|
74 |
+
Now the negative sampling simplifies the noise contrastive estimation.
|
75 |
+
It estimates the same probability, but variably drops the term of the noise distribution.
|
76 |
+
You might be wondering what happens without this term.
|
77 |
+
To see this, we rewrite it in form of the noise contrastive estimation, where we define
|
78 |
+
a new function f' which consists of the original function f as well as the noise distribution.
|
79 |
+
This is asymptomatically unbiased for a probability model which now includes the noise distribution.
|
80 |
+
So all in all, Word2Vec trained with skip-gram-negative sampling is asymptomatically unbiased for
|
81 |
+
this probability model, or more specifically for Word2Vec, this function.
|
82 |
+
In this model, the noise distribution offsets the modeled probability, serving as a baseline.
|
83 |
+
The embedding vectors captures the residual from the baseline.
|
84 |
+
Now, remind that the baseline probability is roughly proportional to the frequency.
|
85 |
+
Therefore, the embedding vectors capture the information other than the frequency.
|
86 |
+
In other words, SGNS Word2Vec has a built-in debiasing feature for frequency bias.
|
87 |
+
Now let us revisit the friendship paradox.
|
88 |
+
The sampling bias due to the friendship paradox is that the frequency of a word is determined
|
89 |
+
thoroughly by the degree of noise.
|
90 |
+
Notice that this frequency is actually accounted for by the baseline probability.
|
91 |
+
Therefore, the friendship paradox has no effect thanks to the built-in debiasing feature of
|
92 |
+
SGNS Word2Vec.
|
93 |
+
This realization leads us to Residual2Vec.
|
94 |
+
The key idea is to model the baseline probability explicitly to control what bias to remove
|
95 |
+
in embedding.
|
96 |
+
So how can we model the baseline more specifically?
|
97 |
+
We start from the given graph and randomize the structure, then generate a sequence using
|
98 |
+
random walks, then calculate the conditional probability as the baseline, which is based
|
99 |
+
on the idea that we should remove biases arising from the trivial structure.
|
100 |
+
This debiasing feature is useful to predict links in the graph.
|
101 |
+
Residual2Vec performs the best or nearly the best for all six graphs of different domains.
|
102 |
+
Furthermore, Residual2Vec is the best or the second best performer for a community detection
|
103 |
+
benchmark.
|
104 |
+
To showcase the debiasing feature, we constructed a citation graph of general issues using the
|
105 |
+
web of science, where the nodes are general issues connected by undirected and weighted
|
106 |
+
citations.
|
107 |
+
When applying grove embedding, all genres are concentrated on the center, reflecting
|
108 |
+
temporal aspects of the issues.
|
109 |
+
This is because the old issues have time to accumulate many citations, and therefore well
|
110 |
+
connected to many different issues.
|
111 |
+
For subject-wise, grove separates different fields to some extent.
|
112 |
+
With Residual2Vec, we can remove the biases due to time.
|
113 |
+
In effect, the old genres now spread out, and the disciplinary separations are more
|
114 |
+
clearly visible.
|
115 |
+
Beyond eyeballing the embeddings, we test the embeddings quantitatively by predicting
|
116 |
+
the genre impact factor as well as the subject categories.
|
117 |
+
We find that the impact factor and the subject of genres can be well predicted by removing
|
118 |
+
the temporal biases as well as the friendship paradox effect.
|
119 |
+
In summary, we show that World2Vec has a built-in debiasing feature attributed to negative sampling.
|
120 |
+
Inspired by this finding, we propose Residual2Vec that can negate other types of structural
|
121 |
+
biases.
|
122 |
+
We demonstrate that removing biases not only improves the performance, but also enabling
|
123 |
+
us to control on the biases in the final representation.
|
124 |
+
Our results highlighted a new potential of negative sampling as a way to mitigate biases
|
125 |
+
in representations, which may be useful to address the problem of the biases in AI.
|
126 |
+
Although we have not studied the biases in AI, given the wide usage of negative sampling
|
127 |
+
to train AI, our approach may lead to methods and studies that expose and mitigate biases
|
128 |
+
in AI.
|
129 |
+
We believe that our approach contributes to the effort to create transparent and accountable
|
130 |
+
machine learning methods, especially because our method enables us to explicitly control
|
131 |
+
the biases in the graph representation.
|
132 |
+
That's all for the presentation, and finally I'd like to acknowledge Jason Yoon, Isabel
|
133 |
+
Constantino, and Yongyuan An for creating and adding momentum to this project for years,
|
134 |
+
and for all of you who watched this video.
|
135 |
+
If you want to know more in detail, please check out our paper.
|
136 |
+
Thanks!
|
demo_data/nips-2021/25969/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello everyone, my name is Allen. I'm a PhD student from Stanford University. I'm presenting
|
2 |
+
our work Play to Grade, testing coding games as classifying Markov decision process. This
|
3 |
+
is joint work with Emma Brunskill and Chris Piech.
|
4 |
+
In this talk, we will highlight the central problem that we're trying to solve, which
|
5 |
+
is scaling up quality feedback for students learning to code is crucial. Grading interactive
|
6 |
+
coding game is very difficult, and we frame this as an instance of identifying if a program
|
7 |
+
has the same behavior as a desired MDP. Even with 11 label programs, we can achieve 94%
|
8 |
+
accuracy on real student assignment from code.org.
|
9 |
+
Each year, hundreds of thousands of people, children and adults alike, want to learn coding.
|
10 |
+
Modern massive online education platforms like code.org serves over 40% of US K-12 students.
|
11 |
+
Scaling up quality feedback for these students is crucial, especially in areas where there
|
12 |
+
are shortages of computer science teachers.
|
13 |
+
Interactive coding assignments are becoming more popular. It's a lot more fun for students
|
14 |
+
to program them. They're also a common type of programs for students to code. For example,
|
15 |
+
web pages are interactive. However, in order to grade them, teachers often need to play
|
16 |
+
each student homework for 20 seconds to a couple minutes. This quickly becomes a scaling
|
17 |
+
issue. A 20-student classroom might still be manageable, but in a large university where
|
18 |
+
there are hundreds of students taking the same class or on an online education platform
|
19 |
+
like code.org, grading these assignments is a real challenge. This places a real burden
|
20 |
+
on teachers.
|
21 |
+
Why is it difficult to develop automatic grading tools? First of all, each assignment is different
|
22 |
+
from each other. Traditional machine learning solutions that rely on collecting a large
|
23 |
+
set of data set simply won't work here. Oftentimes, assignments for the same class can even change
|
24 |
+
from year to year. Spending effort to collect a large label data set is a hard sell to teachers.
|
25 |
+
Second, the same assignment can be written in different coding languages. The solutions
|
26 |
+
could end up looking quite different. At last, code solutions can be very long, especially
|
27 |
+
when interaction is involved. Unfortunately, current state-of-the-art code analysis solutions
|
28 |
+
don't scale beyond 10 lines of code. In this work, we hope to offer a new solution
|
29 |
+
inspired by how human teachers grade these assignments.
|
30 |
+
Let's take a look at how a teacher plays to grade a student homework. This is what
|
31 |
+
a correct solution for code.org's coding assignment, Bounce, looks like. The teacher
|
32 |
+
controls a paddle to bounce a ball into a goal post and gets one score.
|
33 |
+
Here's what an incorrect student submission looks like. The student didn't put the boundary
|
34 |
+
condition for the wall and the ball goes right through it.
|
35 |
+
Here's another incorrect submission. Instead of getting a point after successfully bouncing
|
36 |
+
the ball into the goal post, the player gets a point whenever the ball bounces on wall
|
37 |
+
and paddle. This is clearly not the correct behavior.
|
38 |
+
However, a teacher isn't just playing the game normally. In order to grade it, the teacher
|
39 |
+
has to play it in a specific way to expose bugs in the game. Take a look at both programs
|
40 |
+
on the left and right. Both have wall boundary problems, but we would never know if the teacher
|
41 |
+
didn't try to bounce the ball on the wall. The right panel shows a game, though broken,
|
42 |
+
can look like a perfectly correct game.
|
43 |
+
Using the Markov Decision Process framework from reinforcement learning, we can characterize
|
44 |
+
the intuition we have built up. The MDP framework can be used to describe any interactive environment,
|
45 |
+
not just games. It includes a state space, action space, a transition dynamics that defines
|
46 |
+
how the game moves from one frame to the next, and a reward function. We can train an agent
|
47 |
+
using a reinforcement learning algorithm that learns to maximize the reward. So how does
|
48 |
+
the MDP framework help us understand programs with bugs?
|
49 |
+
We can treat each program as its own MDP. The teacher's correct program is the correct
|
50 |
+
or desired MDP, while the student's program is another MDP or a test MDP. We can frame
|
51 |
+
grading as an instance of identifying if a test MDP has the same behavior as a desired
|
52 |
+
MDP. Using components from the MDP framework, we can express bugs as distance between two
|
53 |
+
MDPs' transition and reward functions. The ball going through the wall is clearly not
|
54 |
+
a correct transition. Receive reward when you shouldn't can also be captured by the
|
55 |
+
difference in the reward function output. More precisely, we can treat grading as calculating
|
56 |
+
a distance between two MDPs. Equation 1 might suggest that we should check over all states.
|
57 |
+
However, since distance is non-negative and we're interested in the overall sum, we
|
58 |
+
only need to find one state-action pair in the test MDP to know if the overall distance
|
59 |
+
is non-zero. If we set this distance as a reward for an RL agent, we can make the task
|
60 |
+
of reaching bug states a lot more intelligent and efficient. This RL agent's objective
|
61 |
+
is to reach states that have the highest potential to be different between the two MDPs with
|
62 |
+
respect to this distance function. We do have one more challenge that remains.
|
63 |
+
The distance function DSA requires access to both MDPs' transition and reward functions.
|
64 |
+
We cannot assume we have access to the student program's inner mechanism. We can't control
|
65 |
+
the randomness in the student's code either, meaning two MDPs can have different random
|
66 |
+
initial starting positions. Therefore, when we interact with the student's MDP, we need
|
67 |
+
to learn a parametrized distance function that can tell us how far the observed state-action
|
68 |
+
pairs from the student MDP is from the correct MDP.
|
69 |
+
Now we have two parametrized models. The agent requires training to find the bug. The classifier
|
70 |
+
requires training to identify the bug. We call this the cold start problem. So, if I
|
71 |
+
have a classifier that can classify which state triggers a bug, then we can simply replace
|
72 |
+
reward function in the MDP with this classifier and directly teach our agent. If I have an
|
73 |
+
agent that can always reach the bug state, I can probably just collect a dataset of trajectories
|
74 |
+
and train a good classifier. But at the beginning, neither the agent nor the classifier can do
|
75 |
+
a very good job. Therefore, we introduce a procedure called
|
76 |
+
collaborative training. The agent will start out as a random agent, where we can train
|
77 |
+
the agent to maximize the original reward in the MDP. It collects trajectories and trains
|
78 |
+
the classifier. Then we use the classifier as a reward function to guide the agent on
|
79 |
+
how to reach bug states. They both start out bad, but the agent can help the classifier
|
80 |
+
learn and the classifier can in return teach the agent.
|
81 |
+
We present two baselines to train the bug classifier. Since we have some training data,
|
82 |
+
though not a lot, we can simply apply coarse labeling, creating a dataset where all state-action
|
83 |
+
pairs from the correct labeled MDP as non-bug states and all state-action pairs from the
|
84 |
+
broken MDP as bug states. This is incredibly noisy because not all state-action pairs from
|
85 |
+
the broken MDP are bug states, only a few of them are. But this is a good baseline to
|
86 |
+
have. We can also train an unsupervised learning
|
87 |
+
model to memorize all state-action pairs from the correct MDP and use log probability or
|
88 |
+
reconstruction loss to detect abnormal state-action pairs in the broken MDP.
|
89 |
+
Inspired by Hoare triples and MDP state equivalence literature, we designed two models to fully
|
90 |
+
capture this notion of MDP-based state difference. We assume that the students can specify and
|
91 |
+
set random seed for their game. Therefore, the game objects, such as a ball, will not
|
92 |
+
always appear in the same initial state. Therefore, it is crucial for us to approximate one MDP's
|
93 |
+
transition dynamics and reward function. When our agent interacts with a new MDP, this is
|
94 |
+
where HoareLSTM comes in. We train it to model the correct MDP's transition dynamics and
|
95 |
+
reward function and treat bug states in the new MDP when sufficient deviation occurs from
|
96 |
+
the prediction. We further introduce Contrastive HoareLSTM.
|
97 |
+
Sometimes the agent will explore a new region that it might not have visited in the correct
|
98 |
+
MDP. The predictive difference between the observed state and predictive state is in
|
99 |
+
fact a function approximation error. In order to reduce this error, we approximate both
|
100 |
+
the correct MDP and the broken MDP.
|
101 |
+
Let's take a look at how these models work. We introduce a car environment. In here, the
|
102 |
+
student miscalculated the boundary of this environment, so whenever the car goes outside
|
103 |
+
of the red dotted line, it will get stuck and can only wriggle back and forth. This
|
104 |
+
is a task where you will always reach a bug state at the end of each trajectory. Therefore,
|
105 |
+
every single agent is already an optimal agent. We create a specific one that only knows how
|
106 |
+
to drive north in a straight line.
|
107 |
+
As we can see, almost all models, except Gaussian mixture model, can be close to 100% accuracy
|
108 |
+
at classifying bug states and non-bug states. However, the agent that only knows how to
|
109 |
+
drive north is not a very interesting agent, and we probably will never use that in real
|
110 |
+
life. So what if we make it a little bit harder?
|
111 |
+
We can create an agent that drives the car randomly. Now the trajectory will become different
|
112 |
+
each time. We see a significant drop in performance for baseline solutions like noisy supervised
|
113 |
+
learning and variational autoencoder. However, our LSTM-based models can still do very well
|
114 |
+
at close to 100% accuracy. This is a pretty challenging task because we're measuring the
|
115 |
+
accuracy of each classifier on every state in a trajectory, even though we're in a toy
|
116 |
+
environment.
|
117 |
+
Let's make this setting even harder. The car environment can stay the same, but for now,
|
118 |
+
bugs can only be triggered if the agent successfully drives the car into some small red rectangular
|
119 |
+
areas. Not all agents are optimal now, and it would be unlikely for a single-direction
|
120 |
+
agent to ever see a bug state. We can now showcase the power of collaborative training
|
121 |
+
through this example.
|
122 |
+
We can see at the beginning, the agent is pretty random, and the classifier is pretty
|
123 |
+
bad except for the LSTM models. However, after only one round of collaborative training,
|
124 |
+
we see a substantial improvement for the two baseline models, both noisy supervised learning
|
125 |
+
model and variational autoencoder are able to improve their accuracy by 30% and precision
|
126 |
+
by 60%. This shows that the collaborative training is helping both the agent and the
|
127 |
+
classifier to be more optimal, even for the weaker classifiers.
|
128 |
+
We also notice that this improvement is not monotonic. Just like every other AI training
|
129 |
+
scheme, overfitting sometimes happens. Only the most expressive classifiers, our proposed
|
130 |
+
HoareLSTM and Contrastive HoareLSTM can remain stable and even mildly improve their recall
|
131 |
+
in the last round of collaborative training.
|
132 |
+
We can directly examine the agent's learning by looking at its trajectory. At first, the
|
133 |
+
agent drives the car randomly, but after only one round of collaborative training, the agent
|
134 |
+
becomes sharply focused and only visits the possible buggy areas.
|
135 |
+
We verify our method on a real student dataset that we obtained from code.org. We use this
|
136 |
+
assignment as our motivating examples earlier. Bounce is a simple coding exercise where 450,000
|
137 |
+
students have submitted their solutions. We built a simulator that can run and execute
|
138 |
+
students' programs that conforms to the OpenAI Gym API. For each student program, we have
|
139 |
+
created goal labels for bug behaviors. We further binarize them into a single label
|
140 |
+
indicating correct or incorrect.
|
141 |
+
Bounce is a lot more complicated than car. Learning to bounce a ball into the goalpost
|
142 |
+
and understanding the physics is a lot more difficult for the agent. Therefore, we pre-train
|
143 |
+
the agent using the score as a reward. We call this play-to-win agent. Then we use this
|
144 |
+
agent to train our bug classifier. We're able to reach 94% accuracy with only 11 label
|
145 |
+
programs as training data. A similar algorithm that uses code as text input cannot match
|
146 |
+
our method's performance due to the smallness of the training dataset.
|
147 |
+
In addition to just grading, since we're able to determine bugs at the state level,
|
148 |
+
we can simply record a few frames before and after the bug occurs and compile a short video
|
149 |
+
for the students to demonstrate what the bug is in their assignment.
|
150 |
+
To summarize our work, we provide a fully functional simulator and a massive amount
|
151 |
+
of real student programs with goal labels. We demonstrate that our solution achieves
|
152 |
+
a high performance. However, there are still many problems remain. For example, can we
|
153 |
+
know which bug is triggered in the student program? This is helpful for providing fine-grained
|
154 |
+
feedback to the students. Training an RL agent with a classifier has also been explored in
|
155 |
+
other areas like SafeRL, where unsafe states are predicted by a classifier.
|
156 |
+
At last, we pose this question of creativity. Can our formulation accommodate creativity?
|
157 |
+
Creative programs are different but not broken. A ball can move faster or slower than the
|
158 |
+
teacher's solution, but it doesn't mean it's wrong. Exploring how we can recognize
|
159 |
+
and encourage student creativity is crucial for automated grading. Thanks for listening.
|
160 |
+
Come and chat with me during the poster session.
|
demo_data/nips-2021/25970/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, my name is Maxwell Nye, and today I'll be talking about improving coherence and consistency
|
2 |
+
in neural sequence models with dual system neurosymbolic reasoning.
|
3 |
+
So I first want to give a little bit of a demo, which is to ask this question.
|
4 |
+
A bat and a ball cost $1.10 in total.
|
5 |
+
The bat costs $1 more than the ball.
|
6 |
+
How much does the ball cost?
|
7 |
+
So I'll let you think a little bit for this.
|
8 |
+
So one answer that sort of might jump out at you is $0.10, but this is actually incorrect
|
9 |
+
because the sum of the two objects should be $1.10.
|
10 |
+
So the correct answer is actually $0.05.
|
11 |
+
And this is an example from a cognitive reflection test, and these are questions designed to
|
12 |
+
have a particular answer which comes to mind quite quickly, which is in fact wrong.
|
13 |
+
And something that's interesting is that large-scale language models such as GPT-3 predict the
|
14 |
+
wrong answers as well.
|
15 |
+
And this is true not just for the sort of the classic cognitive reflection test, but
|
16 |
+
also for variants with different numbers.
|
17 |
+
So this is sort of an interesting thing.
|
18 |
+
It talks about how neural language models often have issues with consistency and coherence.
|
19 |
+
So another place that we can see this a little more concretely is the CLUTRR dataset.
|
20 |
+
In the CLUTRR dataset, models are trained to...
|
21 |
+
There are sentences about people and their family relationships and stories about those
|
22 |
+
people.
|
23 |
+
And this was originally devised as a question-answering data set where you ask what the relations
|
24 |
+
are.
|
25 |
+
One thing you can do is ask models to be trained on this data set and then generate new stories.
|
26 |
+
And when you do that, you'll see that often the generated stories have inconsistency.
|
27 |
+
So if we look at the bottom of the screen here, we can see an example of this.
|
28 |
+
Robert and his brother Antonio played harmonicas together.
|
29 |
+
Robert's daughter, Elsie, asked him to play with her.
|
30 |
+
Elsie doesn't like having to babysit her younger brother, Antonio.
|
31 |
+
And so we can see that this is a common sense error because Elsie is not the younger brother
|
32 |
+
of Antonio.
|
33 |
+
Or Elsie's younger brother is not Antonio.
|
34 |
+
So what we've done is we've built a dual system model using large-scale neural networks and
|
35 |
+
symbolic deliberative logic in order to try to help with these consistency issues.
|
36 |
+
So the model is as follows.
|
37 |
+
You use neural generation to generate sentences in a particular story.
|
38 |
+
You might generate the next sentence using a model such as GPT-3 or BART.
|
39 |
+
What you can then do is parse that sentence into the semantic meaning with respect to
|
40 |
+
the family relationships and check whether or not it matches the current state of the
|
41 |
+
family relationships that's been described so far, and only accept the candidate sentence
|
42 |
+
generations that are actually consistent.
|
43 |
+
So this has a few components.
|
44 |
+
One of the components here is a symbolic world model.
|
45 |
+
In the case of this CLUTRR domain, the symbolic world model that we built encodes people and
|
46 |
+
their family relationships.
|
47 |
+
So in other words, you could take a sentence and encode what the underlying family relationship
|
48 |
+
is.
|
49 |
+
And what you can do is you can use SMT solvers such as the Z3 solver to check consistency.
|
50 |
+
So given a new sentence, you can check that it doesn't disobey the rules of ancestry that
|
51 |
+
we've defined here.
|
52 |
+
And so some of those are, for example, what is the relationship between children and grandchildren?
|
53 |
+
And then another is what are the rules about whether ancestry, can you be your own ancestor,
|
54 |
+
et cetera.
|
55 |
+
So one question is how is this semantic parsing done?
|
56 |
+
And it turns out we can actually do this quite cheaply using GPT-3.
|
57 |
+
So what we can see here in the dotted box is an actual example of a few-shot prompt
|
58 |
+
we can use to parse each new sentence, each new candidate sentence from the system one
|
59 |
+
generation model and parse it into the semantic form that we can then give to the world model
|
60 |
+
solver.
|
61 |
+
So the results here show that models that use this dual system neurosymbolic stories
|
62 |
+
show improved coherence over just sentences that were constructed by a neural model.
|
63 |
+
So the example here is that what we've done is we've used human judgments on which of
|
64 |
+
the following sentences make more sense given the prior context of the story.
|
65 |
+
And we see that if we use a symbolic world model and the parsing scheme described above,
|
66 |
+
humans prefer the judgments given by this model.
|
67 |
+
We can also apply the same sort of reasoning to a completely different task.
|
68 |
+
Here we can discuss the grounded instruction following task, the grounded instruction following
|
69 |
+
domain called gscan.
|
70 |
+
In this domain, the goal is to have an agent, which is shown by this pink triangle, follow
|
71 |
+
a command to perform some simple action in this grid world.
|
72 |
+
So you can see here, walk to a small yellow cylinder might be an example of a command.
|
73 |
+
Prior work has shown that one thing you can do is encode the initial state, encode the
|
74 |
+
instruction and then train a neural model to predict the action sequences.
|
75 |
+
Other work has also shown that one thing you can do is train a model to predict a distribution
|
76 |
+
over the correct target location as part of the neural model.
|
77 |
+
That will also increase the performance of the model.
|
78 |
+
What we do here is show that if you do both of these things, you predict both an action
|
79 |
+
sequence and a target location, like what is the location you should end up in, and
|
80 |
+
then check whether or not when you execute the set of instructions, you will end up in
|
81 |
+
the predicted target location.
|
82 |
+
You can sort of check consistency between these two different predictions and only accept
|
83 |
+
those instruction sequences which match the target location prediction.
|
84 |
+
And this leads to also higher accuracy, especially in a low data regime.
|
85 |
+
We have more details about the results of the paper.
|
86 |
+
So that's a little bit of an overview of our paper.
|
87 |
+
Our takeaways are that you can build systems with combined neural methods and explicit
|
88 |
+
world knowledge.
|
89 |
+
And if you add just a little bit of world knowledge, you can really help increase coherence
|
90 |
+
and consistency for these large sequence models.
|
91 |
+
There are some challenges here about parsing in larger scale domains and also what it would
|
92 |
+
mean to automatically build a more complete world model.
|
93 |
+
Thank you very much.
|
demo_data/nips-2021/25973/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi everyone, I'm Jingwen, a PhD student in National University of Singapore.
|
2 |
+
In this paper, we introduce dual-aspect collaborative transformer for solving routine problems.
|
3 |
+
Until now, the neural solvers for VRPs could be classified in two types.
|
4 |
+
The first one is the neural construction solver.
|
5 |
+
It starts from an initial solution and iteratively improves the solution
|
6 |
+
until all customers have been visited.
|
7 |
+
And in this paper, we focus more on the neural improvement solvers.
|
8 |
+
It starts from an incomplete solution and iteratively improves the solution
|
9 |
+
based on the node features and solution features, until reaching a step limit T.
|
10 |
+
Although the transformer has shown the efficiency for processing the sequence data,
|
11 |
+
its positional encoding method may not be optimal for encoding the VRP solutions,
|
12 |
+
because it only learns a unified set of embeddings and combines the node embeddings
|
13 |
+
and the positional embeddings together.
|
14 |
+
Also, it can only encode the linear sequences,
|
15 |
+
which cannot capture the circularity and symmetry of VRP solutions.
|
16 |
+
So in this paper, we introduce the dual-aspect augmentation,
|
17 |
+
which could better describe the VRP solutions.
|
18 |
+
We separate the learnings to node feature embeddings and positional feature embeddings
|
19 |
+
based on the cross-aspect referential attention.
|
20 |
+
And in this table, we compare the performance of dual-aspect and single-aspect.
|
21 |
+
We can see the dual-aspect outperforms the single-aspect.
|
22 |
+
And here we introduce the cyclic positional encoding.
|
23 |
+
In this figure, we describe the embedding vectors and correlations between every two embeddings
|
24 |
+
of the original PE and our CPE method in subfigures A and B.
|
25 |
+
In subfigure C, we describe the top two principal components after PCA projection.
|
26 |
+
And we can see our CPE method can better capture the circularity of VRP solutions.
|
27 |
+
And here we did some ablation studies on the CPE method,
|
28 |
+
which can achieve better generalization performance.
|
29 |
+
And now we introduce our curriculum learning strategy in the training process.
|
30 |
+
And in this method, we're training with an n-step PPO method and a curriculum learning strategy.
|
31 |
+
It gradually prescribes higher quality solutions as the initial stage for training.
|
32 |
+
And in this graph, we describe two curves.
|
33 |
+
The blue one is the PPO method only.
|
34 |
+
And the green one is the PPO method with our curriculum learning strategy.
|
35 |
+
And we can see the green one is more stable and achieves lower objective values.
|
36 |
+
And here is the comparison performance of our method and some baselines on both TSP and CVRP.
|
37 |
+
We can see our DACT outperforms the existing transformer-based improvement models.
|
38 |
+
So, based on these experiments, we can see our DACT performs very well for the routing problems.
|
39 |
+
And in the future, we hope to use this method to solve more combinatorial optimization problems.
|
40 |
+
Thank you.
|
demo_data/nips-2021/25974/transcript_whisper_large-v2.txt
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, I am Mohamed Pezeshki from Mila and today I am going to talk about gradient starvation.
|
2 |
+
This is a joint work with Oumar Kaba, Yoshua Bengio, Aaron Courville, Doina Precup, and Guillaume
|
3 |
+
Lajoie.
|
4 |
+
Let me start with a story.
|
5 |
+
Back in 1904, there was a horse called Hans and people believed that he could do arithmetic.
|
6 |
+
Here is an article from New York Times published in 1904.
|
7 |
+
The article says that Hans is an expert in numbers.
|
8 |
+
For example, when two numbers of 5 and 9 are written on a blackboard, Hans replies by tapping
|
9 |
+
on the ground 14 times.
|
10 |
+
Seven years later, in an article, Oskar Pfungst unveiled that the so-called clever Hans was
|
11 |
+
not actually capable of doing any arithmetic and instead reading subtle hints in his trainer's
|
12 |
+
behavior indicating when to stop tapping.
|
13 |
+
As the article says, even the trainer was not aware of providing these shortcut signals.
|
14 |
+
So Hans was clever but probably not in doing arithmetic.
|
15 |
+
Its cleverness was in reading his trainer's clues.
|
16 |
+
A similar phenomenon has been observed in many applications of machine learning.
|
17 |
+
Essentially, the situations where the model seemingly has a very good performance but
|
18 |
+
in fact it hasn't learned true underlying relationships between the input and the target.
|
19 |
+
In this paper by Robert Geirhos and co-authors, they list several instances of what they call
|
20 |
+
shortcut learning.
|
21 |
+
For example, in a task of image captioning, the model predicts grazing sheep only by seeing
|
22 |
+
the green hillside.
|
23 |
+
In another instance, the network hallucinates a teapot with high confidence in an image
|
24 |
+
of pure noise.
|
25 |
+
This is another and indeed dangerous example of the task of pneumonia detection from x-ray
|
26 |
+
images.
|
27 |
+
The model appears to have a very good performance even on the test set.
|
28 |
+
However, the heat maps reveal that the network is not looking at the lung section at all
|
29 |
+
and just latching on some features in the corner of the image.
|
30 |
+
The intuition behind this phenomenon is a folk knowledge in one form or another.
|
31 |
+
Given strongly correlated and fast-to-learn features in training data, gradient descent
|
32 |
+
is biased towards learning them first.
|
33 |
+
However, this intuition is a bit abstract and hand-wavy, so let's look at a more concrete
|
34 |
+
example.
|
35 |
+
Consider a 2D classification task with red and blue data points as shown.
|
36 |
+
If you train a neural network on this data, here is the decision boundary that we learn.
|
37 |
+
Now consider slightly different arrangements of the data points such that the blue data
|
38 |
+
points are slightly shifted to the left and the red data points are shifted to the right,
|
39 |
+
making the data linearly separable.
|
40 |
+
Now if we train a neural network on this, we get an almost linear decision boundary.
|
41 |
+
Note that the network is only making its predictions based on the feature along the x-axis.
|
42 |
+
Indicated in the red circle here, you can see that the decision boundary is very close
|
43 |
+
to the data points.
|
44 |
+
However, the network is super confident on its predictions and the training loss is indeed
|
45 |
+
zero.
|
46 |
+
So you can see that the slightly perturbing data point can get the network to predict
|
47 |
+
an incorrect label with high confidence.
|
48 |
+
This problem will be even more visible when testing the model on OOD, meaning out of distribution
|
49 |
+
test data.
|
50 |
+
An online interactive demo of this work is available on a blog post we wrote.
|
51 |
+
If you wish to play with it a bit, please visit the link provided here.
|
52 |
+
So we hypothesize that what is happening here is gradient starvation.
|
53 |
+
Gradient starvation is a phenomenon in which a neural network captures statistically dominant
|
54 |
+
features while remaining invariant to the rest.
|
55 |
+
Here gradient descent leads to parameter updates, predominantly in directions that only capture
|
56 |
+
these dominant features, thus starving the gradient from other potentially informative
|
57 |
+
features.
|
58 |
+
Here, the notions of feature and dominancy of a feature is rather vague.
|
59 |
+
To define them more formally, we need to look into the learning dynamics.
|
60 |
+
In the interest of time, I will be covering only the general intuition of our results
|
61 |
+
and encourage interested audiences to take a look at the full paper for detailed treatment.
|
62 |
+
So the two main theorems of the paper can be summarized into these two plots that I
|
63 |
+
now explain.
|
64 |
+
Let's first start with gradient starvation itself on the left.
|
65 |
+
We train a model with common binary cross entropy loss.
|
66 |
+
On the x-axis we have training iterations or epochs, and on the y-axis we monitor two
|
67 |
+
features z1 and z2.
|
68 |
+
Their dynamics depend on several factors, including their strength, meaning how easy
|
69 |
+
or how hard it is for the network to learn those features, and their correlation with
|
70 |
+
the target.
|
71 |
+
Here, z1 has a larger correlation and hence converges to a value around 6, and z2 with
|
72 |
+
a smaller correlation converges to a value around 2.
|
73 |
+
However, the strength is equal, i.e. kappa is set to be 1.
|
74 |
+
Again, it means that both of these features are equally easy for the network to learn.
|
75 |
+
Now let's keep their correlation fixed but increase the strength of z1.
|
76 |
+
A kappa equal to 2 means that z1 is learned easier than z2.
|
77 |
+
We can immediately see that although their correlation is still the same as before, z1
|
78 |
+
is overestimated while z2 is underestimated.
|
79 |
+
If we make kappa to be 4 or 8, it becomes more evident that simply because z1 is easier
|
80 |
+
to learn, it is being overestimated, while z2 is being starved.
|
81 |
+
Our theory shows that an increase in the strength of feature z1 has a detrimental effect on
|
82 |
+
the learning of feature z2.
|
83 |
+
Now our second theorem shows that adding this term, indicated in the red rectangle, to the
|
84 |
+
loss decouples the features.
|
85 |
+
As you can see, a spectral decoupling decouples the features at the converged solution.
|
86 |
+
Regardless of the value of kappa, all of the experiments on z1 and z2 converge to the same
|
87 |
+
place.
|
88 |
+
Again, we refer interested audience to the paper for more theory as well as more intuition.
|
89 |
+
Now let's look at some experiments.
|
90 |
+
Recall the task that we studied earlier.
|
91 |
+
When the data is not linearly separable, we learn the curve decision boundary.
|
92 |
+
On the right, we see how z1 and z2 evolve.
|
93 |
+
When the data is linearly separable with a small margin, a linear decision boundary is
|
94 |
+
learned.
|
95 |
+
We observe that z1 is overestimated, while z2 is heavily underestimated.
|
96 |
+
Now let's see what happens if we add spectral decoupling.
|
97 |
+
Spectral decoupling suppresses z1 and as a result allows z2 to grow.
|
98 |
+
It also appears that other regularization methods do not succeed at learning a curve
|
99 |
+
decision boundary.
|
100 |
+
So we observed that spectral decoupling leads to a decision boundary with a larger margin.
|
101 |
+
What happens in real-world tasks?
|
102 |
+
The distance to the decision boundary is not trivial to compute when working with nonlinear
|
103 |
+
models.
|
104 |
+
However, we can use a proxy.
|
105 |
+
The amount of perturbation required to fool the network is a proxy to the margin.
|
106 |
+
Look at the plot on the right.
|
107 |
+
On the x-axis, we have the amount of perturbation and on the y-axis, we have how many of the
|
108 |
+
examples are misclassified.
|
109 |
+
You can see that with a fixed amount of perturbation, a model with vanilla binary cross entropy
|
110 |
+
is much more vulnerable compared to a model trained with spectral decoupling.
|
111 |
+
In another experiment, we studied colored MNIST, a well-known task of OOD generalization
|
112 |
+
where the color is spuriously correlated with the labels.
|
113 |
+
Also another task of OOD generalization is a classification task on the CelebA dataset
|
114 |
+
where the training data is again biased with respect to the color of the hair and the gender
|
115 |
+
such that most of male images have black hair while the majority of females have blonde
|
116 |
+
hair.
|
117 |
+
Here, we skip the details in the interest of time.
|
118 |
+
However, let me just draw your attention to the superiority of spectral decoupling in
|
119 |
+
these both tasks.
|
120 |
+
Finally, to conclude, we talked about the Clever Hans effect.
|
121 |
+
We showed that a similar phenomenon can happen in neural networks and we called that gradient
|
122 |
+
starvation.
|
123 |
+
To understand gradient starvation, we looked into the learning dynamics.
|
124 |
+
We showed that the presence of a strongly correlated feature could result in a starvation
|
125 |
+
of other features.
|
126 |
+
We also showed that spectral decoupling provides some degree of control over what features
|
127 |
+
to learn and decouples essentially the features.
|
128 |
+
Thanks for your attention.
|
129 |
+
If you're interested to chat more, please visit our poster this afternoon.
|
130 |
+
Thank you very much.
|
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
-
webvtt-py
|
2 |
-
transformers
|
3 |
-
requests
|
4 |
-
pandas
|
5 |
-
nltk
|
6 |
-
sentencepiece
|
7 |
torch
|
|
|
1 |
+
webvtt-py
|
2 |
+
transformers
|
3 |
+
requests
|
4 |
+
pandas
|
5 |
+
nltk
|
6 |
+
sentencepiece
|
7 |
torch
|